{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.979706088173548, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantages": -4.896095973094816e-08, "completion_length": 200.90625, "delta_ref_entropy_loss": 0.0, "delta_ref_ppl": 0.0, "entropy_loss": -0.83984375, "epoch": 0.0013995801259622112, "grad_norm": 4.0278045522033405, "k1_kl": 0.0, "k3_kl": 0.0, "kimi_kl": 0.0, "learning_rate": 9.985994397759104e-07, "loss": 0.0, "ppl": 0.62109375, "reward": 0.15700868889689445, "reward_std": 0.16773776710033417, "rewards/single_object_detection_bbox_reward": 0.15700869634747505, "step": 1, "temperature": 0.9 }, { "advantages": -1.4874552345389702e-07, "completion_length": 40.5625, "delta_ref_entropy_loss": -0.000278472900390625, "delta_ref_ppl": 0.0006961822509765625, "entropy_loss": -0.759765625, "epoch": 0.0027991602519244225, "grad_norm": 7.120188718326727, "k1_kl": 0.00033473968505859375, "k3_kl": 0.0003490447998046875, "kimi_kl": 0.0007534027099609375, "learning_rate": 9.971988795518206e-07, "loss": 0.0, "ppl": 0.583984375, "reward": 0.26757121458649635, "reward_std": 0.21258887648582458, "rewards/single_object_detection_bbox_reward": 0.26757125928997993, "step": 2, "temperature": 0.9 }, { "advantages": 1.5486564919342527e-07, "completion_length": 39.4375, "delta_ref_entropy_loss": 0.00139617919921875, "delta_ref_ppl": -0.002651214599609375, "entropy_loss": -0.865234375, "epoch": 0.004198740377886634, "grad_norm": 9.41337139148115, "k1_kl": 0.0026092529296875, "k3_kl": 0.001140594482421875, "kimi_kl": 0.001682281494140625, "learning_rate": 9.95798319327731e-07, "loss": 0.0, "ppl": 0.662109375, "reward": 0.2049536295235157, "reward_std": 0.162916399538517, "rewards/single_object_detection_bbox_reward": 0.204953633248806, "step": 3, "temperature": 0.9 }, { "advantages": 3.9913827798443435e-09, "completion_length": 27.6875, "delta_ref_entropy_loss": 0.008941650390625, "delta_ref_ppl": -0.0044708251953125, "entropy_loss": -0.751953125, "epoch": 0.005598320503848845, "grad_norm": 2.8534816927916586, "k1_kl": 0.0039825439453125, "k3_kl": 0.00133514404296875, "kimi_kl": 0.00176239013671875, "learning_rate": 9.943977591036415e-07, "loss": 0.0001, "ppl": 0.59765625, "reward": 0.2264171913266182, "reward_std": 0.19678117334842682, "rewards/single_object_detection_bbox_reward": 0.2264171987771988, "step": 4, "temperature": 0.9 }, { "advantages": -4.523567098679848e-08, "completion_length": 72.875, "delta_ref_entropy_loss": 0.015899658203125, "delta_ref_ppl": -0.01116943359375, "entropy_loss": -0.791015625, "epoch": 0.006997900629811057, "grad_norm": 5.264279860804034, "k1_kl": 0.010955810546875, "k3_kl": 0.00506591796875, "kimi_kl": 0.006622314453125, "learning_rate": 9.929971988795519e-07, "loss": 0.0002, "ppl": 0.634765625, "reward": 0.41646139323711395, "reward_std": 0.308459147810936, "rewards/single_object_detection_bbox_reward": 0.41646140813827515, "step": 5, "temperature": 0.9 }, { "advantages": 2.6875309089291477e-08, "completion_length": 92.75, "delta_ref_entropy_loss": 0.0301513671875, "delta_ref_ppl": -0.02166748046875, "entropy_loss": -0.775390625, "epoch": 0.008397480755773267, "grad_norm": 7.145391470130097, "k1_kl": 0.021240234375, "k3_kl": 0.01483154296875, "kimi_kl": 0.019500732421875, "learning_rate": 9.91596638655462e-07, "loss": 0.0006, "ppl": 0.65234375, "reward": 0.5305007845163345, "reward_std": 0.2634095028042793, "rewards/single_object_detection_bbox_reward": 0.5305007696151733, "step": 6, "temperature": 0.9 }, { "advantages": 8.783702867276588e-07, "completion_length": 61.25, "delta_ref_entropy_loss": 0.03125, "delta_ref_ppl": -0.02490234375, "entropy_loss": -0.796875, "epoch": 0.00979706088173548, "grad_norm": 11.34359942315115, "k1_kl": 0.02484130859375, "k3_kl": 0.014129638671875, "kimi_kl": 0.0194091796875, "learning_rate": 9.901960784313725e-07, "loss": 0.0006, "ppl": 0.677734375, "reward": 0.4637238532304764, "reward_std": 0.18800602853298187, "rewards/single_object_detection_bbox_reward": 0.46372388303279877, "step": 7, "temperature": 0.9 }, { "advantages": -2.4147864792212204e-06, "completion_length": 41.6875, "delta_ref_entropy_loss": 0.037353515625, "delta_ref_ppl": -0.02984619140625, "entropy_loss": -0.74609375, "epoch": 0.01119664100769769, "grad_norm": 4.139262879578042, "k1_kl": 0.0284423828125, "k3_kl": 0.013580322265625, "kimi_kl": 0.02032470703125, "learning_rate": 9.88795518207283e-07, "loss": 0.0005, "ppl": 0.6328125, "reward": 0.629896491765976, "reward_std": 0.1617755964398384, "rewards/single_object_detection_bbox_reward": 0.6298965811729431, "step": 8, "temperature": 0.9 }, { "advantages": 2.6537373489787797e-06, "completion_length": 94.1875, "delta_ref_entropy_loss": 0.03741455078125, "delta_ref_ppl": -0.02923583984375, "entropy_loss": -0.73828125, "epoch": 0.012596221133659902, "grad_norm": 3.5436552133317774, "k1_kl": 0.0294189453125, "k3_kl": 0.0181884765625, "kimi_kl": 0.02703857421875, "learning_rate": 9.873949579831934e-07, "loss": 0.0007, "ppl": 0.61328125, "reward": 0.6739962697029114, "reward_std": 0.27868078649044037, "rewards/single_object_detection_bbox_reward": 0.6739963591098785, "step": 9, "temperature": 0.9 }, { "advantages": 2.087759156665925e-06, "completion_length": 69.15625, "delta_ref_entropy_loss": 0.0604248046875, "delta_ref_ppl": -0.072265625, "entropy_loss": -0.6875, "epoch": 0.013995801259622114, "grad_norm": 3.0354791587250545, "k1_kl": 0.072265625, "k3_kl": 0.044189453125, "kimi_kl": 0.084716796875, "learning_rate": 9.859943977591036e-07, "loss": 0.0018, "ppl": 0.587890625, "reward": 0.9059661328792572, "reward_std": 0.07600058522075415, "rewards/single_object_detection_bbox_reward": 0.9059662222862244, "step": 10, "temperature": 0.9 }, { "advantages": 7.200454277267454e-07, "completion_length": 69.25, "delta_ref_entropy_loss": 0.07177734375, "delta_ref_ppl": -0.0751953125, "entropy_loss": -0.67578125, "epoch": 0.015395381385584325, "grad_norm": 3.028128667141219, "k1_kl": 0.0751953125, "k3_kl": 0.044677734375, "kimi_kl": 0.080078125, "learning_rate": 9.84593837535014e-07, "loss": 0.0018, "ppl": 0.58203125, "reward": 0.8288761675357819, "reward_std": 0.12490098550915718, "rewards/single_object_detection_bbox_reward": 0.8288761675357819, "step": 11, "temperature": 0.9 }, { "advantages": 3.21386130508472e-06, "completion_length": 41.90625, "delta_ref_entropy_loss": 0.076904296875, "delta_ref_ppl": -0.079833984375, "entropy_loss": -0.69140625, "epoch": 0.016794961511546535, "grad_norm": 3.73542693456845, "k1_kl": 0.08056640625, "k3_kl": 0.043212890625, "kimi_kl": 0.076171875, "learning_rate": 9.831932773109242e-07, "loss": 0.0017, "ppl": 0.59375, "reward": 0.8460124433040619, "reward_std": 0.07444469630718231, "rewards/single_object_detection_bbox_reward": 0.8460125029087067, "step": 12, "temperature": 0.9 }, { "advantages": 3.238341825806401e-07, "completion_length": 79.15625, "delta_ref_entropy_loss": 0.0672607421875, "delta_ref_ppl": -0.0718994140625, "entropy_loss": -0.685546875, "epoch": 0.01819454163750875, "grad_norm": 4.035374210736438, "k1_kl": 0.07275390625, "k3_kl": 0.0458984375, "kimi_kl": 0.0780029296875, "learning_rate": 9.817927170868347e-07, "loss": 0.0018, "ppl": 0.58984375, "reward": 0.7728854417800903, "reward_std": 0.15086935833096504, "rewards/single_object_detection_bbox_reward": 0.7728854417800903, "step": 13, "temperature": 0.9 }, { "advantages": -7.659463188147697e-07, "completion_length": 105.8125, "delta_ref_entropy_loss": 0.0552978515625, "delta_ref_ppl": -0.082275390625, "entropy_loss": -0.70703125, "epoch": 0.01959412176347096, "grad_norm": 3.7132623901928365, "k1_kl": 0.08154296875, "k3_kl": 0.0494384765625, "kimi_kl": 0.1064453125, "learning_rate": 9.80392156862745e-07, "loss": 0.002, "ppl": 0.60546875, "reward": 0.8635495901107788, "reward_std": 0.13047760352492332, "rewards/single_object_detection_bbox_reward": 0.863549679517746, "step": 14, "temperature": 0.9 }, { "advantages": 1.1239733765933124e-06, "completion_length": 54.9375, "delta_ref_entropy_loss": 0.0648193359375, "delta_ref_ppl": -0.0859375, "entropy_loss": -0.6484375, "epoch": 0.02099370188943317, "grad_norm": 3.6833997443280233, "k1_kl": 0.0859375, "k3_kl": 0.05615234375, "kimi_kl": 0.11865234375, "learning_rate": 9.789915966386553e-07, "loss": 0.0022, "ppl": 0.556640625, "reward": 0.8876737058162689, "reward_std": 0.13931188732385635, "rewards/single_object_detection_bbox_reward": 0.8876737058162689, "step": 15, "temperature": 0.9 }, { "advantages": -1.1747969494990684e-06, "completion_length": 31.09375, "delta_ref_entropy_loss": 0.07666015625, "delta_ref_ppl": -0.111328125, "entropy_loss": -0.69921875, "epoch": 0.02239328201539538, "grad_norm": 2.1525360756214535, "k1_kl": 0.111083984375, "k3_kl": 0.064697265625, "kimi_kl": 0.1484375, "learning_rate": 9.775910364145657e-07, "loss": 0.0026, "ppl": 0.609375, "reward": 0.8448176085948944, "reward_std": 0.07923123985528946, "rewards/single_object_detection_bbox_reward": 0.8448176681995392, "step": 16, "temperature": 0.9 }, { "advantages": -6.326341690510162e-07, "completion_length": 65.625, "delta_ref_entropy_loss": 0.077392578125, "delta_ref_ppl": -0.110595703125, "entropy_loss": -0.708984375, "epoch": 0.023792862141357594, "grad_norm": 3.0710736404617913, "k1_kl": 0.111083984375, "k3_kl": 0.0732421875, "kimi_kl": 0.17578125, "learning_rate": 9.761904761904762e-07, "loss": 0.0029, "ppl": 0.615234375, "reward": 0.7932409942150116, "reward_std": 0.15396752953529358, "rewards/single_object_detection_bbox_reward": 0.7932410538196564, "step": 17, "temperature": 0.9 }, { "advantages": -2.1266087628646346e-06, "completion_length": 59.3125, "delta_ref_entropy_loss": 0.0859375, "delta_ref_ppl": -0.089111328125, "entropy_loss": -0.689453125, "epoch": 0.025192442267319804, "grad_norm": 2.071766427103985, "k1_kl": 0.08837890625, "k3_kl": 0.04345703125, "kimi_kl": 0.08837890625, "learning_rate": 9.747899159663866e-07, "loss": 0.0017, "ppl": 0.59765625, "reward": 0.8474541902542114, "reward_std": 0.06185236759483814, "rewards/single_object_detection_bbox_reward": 0.8474542498588562, "step": 18, "temperature": 0.9 }, { "advantages": 1.411619038549361e-06, "completion_length": 43.71875, "delta_ref_entropy_loss": 0.095458984375, "delta_ref_ppl": -0.099853515625, "entropy_loss": -0.69140625, "epoch": 0.026592022393282014, "grad_norm": 4.912516716283916, "k1_kl": 0.099853515625, "k3_kl": 0.0537109375, "kimi_kl": 0.111083984375, "learning_rate": 9.733893557422968e-07, "loss": 0.0021, "ppl": 0.595703125, "reward": 0.8445721864700317, "reward_std": 0.10113547742366791, "rewards/single_object_detection_bbox_reward": 0.8445722758769989, "step": 19, "temperature": 0.9 }, { "advantages": 2.2258611522829597e-06, "completion_length": 80.625, "delta_ref_entropy_loss": 0.09423828125, "delta_ref_ppl": -0.1025390625, "entropy_loss": -0.63671875, "epoch": 0.02799160251924423, "grad_norm": 2.6597199246159624, "k1_kl": 0.102294921875, "k3_kl": 0.0552978515625, "kimi_kl": 0.111328125, "learning_rate": 9.719887955182072e-07, "loss": 0.0022, "ppl": 0.544921875, "reward": 0.9065327346324921, "reward_std": 0.07566798850893974, "rewards/single_object_detection_bbox_reward": 0.9065327644348145, "step": 20, "temperature": 0.9 }, { "advantages": -4.5589574710902525e-06, "completion_length": 32.53125, "delta_ref_entropy_loss": 0.09521484375, "delta_ref_ppl": -0.097412109375, "entropy_loss": -0.625, "epoch": 0.02939118264520644, "grad_norm": 3.8808555545905095, "k1_kl": 0.097900390625, "k3_kl": 0.048828125, "kimi_kl": 0.099609375, "learning_rate": 9.705882352941176e-07, "loss": 0.002, "ppl": 0.541015625, "reward": 0.9619888067245483, "reward_std": 0.0374802858568728, "rewards/single_object_detection_bbox_reward": 0.9619888365268707, "step": 21, "temperature": 0.9 }, { "advantages": 1.544665110486676e-06, "completion_length": 66.75, "delta_ref_entropy_loss": 0.084228515625, "delta_ref_ppl": -0.103759765625, "entropy_loss": -0.701171875, "epoch": 0.03079076277116865, "grad_norm": 2.2610053986718874, "k1_kl": 0.1044921875, "k3_kl": 0.0604248046875, "kimi_kl": 0.127197265625, "learning_rate": 9.691876750700279e-07, "loss": 0.0024, "ppl": 0.615234375, "reward": 0.9300380647182465, "reward_std": 0.10000704601407051, "rewards/single_object_detection_bbox_reward": 0.9300382435321808, "step": 22, "temperature": 0.9 }, { "advantages": -1.5523818319707061e-06, "completion_length": 113.375, "delta_ref_entropy_loss": 0.105224609375, "delta_ref_ppl": -0.1279296875, "entropy_loss": -0.623046875, "epoch": 0.03219034289713086, "grad_norm": 4.040511840819169, "k1_kl": 0.127685546875, "k3_kl": 0.0711669921875, "kimi_kl": 0.174072265625, "learning_rate": 9.677871148459383e-07, "loss": 0.0028, "ppl": 0.546875, "reward": 0.9139650464057922, "reward_std": 0.06844944134354591, "rewards/single_object_detection_bbox_reward": 0.9139651358127594, "step": 23, "temperature": 0.9 }, { "advantages": 8.198300065487274e-07, "completion_length": 70.6875, "delta_ref_entropy_loss": 0.10400390625, "delta_ref_ppl": -0.119384765625, "entropy_loss": -0.626953125, "epoch": 0.03358992302309307, "grad_norm": 2.006395755515778, "k1_kl": 0.11962890625, "k3_kl": 0.0654296875, "kimi_kl": 0.15673828125, "learning_rate": 9.663865546218487e-07, "loss": 0.0026, "ppl": 0.54296875, "reward": 0.958545982837677, "reward_std": 0.04924641363322735, "rewards/single_object_detection_bbox_reward": 0.9585460424423218, "step": 24, "temperature": 0.9 }, { "advantages": -3.940160041793206e-06, "completion_length": 76.4375, "delta_ref_entropy_loss": 0.099609375, "delta_ref_ppl": -0.138671875, "entropy_loss": -0.65625, "epoch": 0.03498950314905528, "grad_norm": 2.5269558898804156, "k1_kl": 0.138671875, "k3_kl": 0.08056640625, "kimi_kl": 0.1865234375, "learning_rate": 9.649859943977591e-07, "loss": 0.0032, "ppl": 0.572265625, "reward": 0.910751610994339, "reward_std": 0.05238910764455795, "rewards/single_object_detection_bbox_reward": 0.9107517004013062, "step": 25, "temperature": 0.9 }, { "advantages": -2.8759241104125977e-06, "completion_length": 80.21875, "delta_ref_entropy_loss": 0.11767578125, "delta_ref_ppl": -0.1357421875, "entropy_loss": -0.611328125, "epoch": 0.0363890832750175, "grad_norm": 2.0222302132442604, "k1_kl": 0.135498046875, "k3_kl": 0.0758056640625, "kimi_kl": 0.18994140625, "learning_rate": 9.635854341736694e-07, "loss": 0.003, "ppl": 0.533203125, "reward": 0.9260647296905518, "reward_std": 0.04111409652978182, "rewards/single_object_detection_bbox_reward": 0.9260647594928741, "step": 26, "temperature": 0.9 }, { "advantages": -4.352203518465103e-06, "completion_length": 84.21875, "delta_ref_entropy_loss": 0.10400390625, "delta_ref_ppl": -0.1181640625, "entropy_loss": -0.611328125, "epoch": 0.03778866340097971, "grad_norm": 3.5011065356668567, "k1_kl": 0.1181640625, "k3_kl": 0.06640625, "kimi_kl": 0.14599609375, "learning_rate": 9.621848739495798e-07, "loss": 0.0027, "ppl": 0.533203125, "reward": 0.9547756016254425, "reward_std": 0.033334456384181976, "rewards/single_object_detection_bbox_reward": 0.9547756910324097, "step": 27, "temperature": 0.9 }, { "advantages": 8.840380360197742e-06, "completion_length": 100.28125, "delta_ref_entropy_loss": 0.09814453125, "delta_ref_ppl": -0.105712890625, "entropy_loss": -0.662109375, "epoch": 0.03918824352694192, "grad_norm": 2.4032982478914366, "k1_kl": 0.105712890625, "k3_kl": 0.0555419921875, "kimi_kl": 0.114013671875, "learning_rate": 9.607843137254902e-07, "loss": 0.0022, "ppl": 0.578125, "reward": 0.9107533693313599, "reward_std": 0.08306529000401497, "rewards/single_object_detection_bbox_reward": 0.9107534289360046, "step": 28, "temperature": 0.9 }, { "advantages": -8.118472578644287e-07, "completion_length": 88.375, "delta_ref_entropy_loss": 0.100830078125, "delta_ref_ppl": -0.108642578125, "entropy_loss": -0.65234375, "epoch": 0.04058782365290413, "grad_norm": 3.73540873265276, "k1_kl": 0.108642578125, "k3_kl": 0.0576171875, "kimi_kl": 0.117919921875, "learning_rate": 9.593837535014006e-07, "loss": 0.0023, "ppl": 0.568359375, "reward": 0.8670237362384796, "reward_std": 0.09506590850651264, "rewards/single_object_detection_bbox_reward": 0.8670237958431244, "step": 29, "temperature": 0.9 }, { "advantages": 8.373388482141308e-06, "completion_length": 91.40625, "delta_ref_entropy_loss": 0.095458984375, "delta_ref_ppl": -0.0849609375, "entropy_loss": -0.654296875, "epoch": 0.04198740377886634, "grad_norm": 1.9101920130461814, "k1_kl": 0.085205078125, "k3_kl": 0.0406494140625, "kimi_kl": 0.0791015625, "learning_rate": 9.579831932773109e-07, "loss": 0.0016, "ppl": 0.572265625, "reward": 0.9136916100978851, "reward_std": 0.06986285746097565, "rewards/single_object_detection_bbox_reward": 0.9136916995048523, "step": 30, "temperature": 0.9 }, { "advantages": -4.148244101997989e-06, "completion_length": 67.5, "delta_ref_entropy_loss": 0.10888671875, "delta_ref_ppl": -0.1083984375, "entropy_loss": -0.640625, "epoch": 0.04338698390482855, "grad_norm": 2.672414494081636, "k1_kl": 0.108154296875, "k3_kl": 0.0538330078125, "kimi_kl": 0.098876953125, "learning_rate": 9.565826330532213e-07, "loss": 0.0022, "ppl": 0.55859375, "reward": 0.8932546377182007, "reward_std": 0.06755099818110466, "rewards/single_object_detection_bbox_reward": 0.8932547867298126, "step": 31, "temperature": 0.9 }, { "advantages": -3.272933213338547e-08, "completion_length": 87.375, "delta_ref_entropy_loss": 0.14306640625, "delta_ref_ppl": -0.16748046875, "entropy_loss": -0.65234375, "epoch": 0.04478656403079076, "grad_norm": 2.076770596567632, "k1_kl": 0.16650390625, "k3_kl": 0.09326171875, "kimi_kl": 0.205078125, "learning_rate": 9.551820728291317e-07, "loss": 0.0037, "ppl": 0.56640625, "reward": 0.84548419713974, "reward_std": 0.050869280472397804, "rewards/single_object_detection_bbox_reward": 0.8454842865467072, "step": 32, "temperature": 0.9 }, { "advantages": -7.484640946131549e-06, "completion_length": 21.0625, "delta_ref_entropy_loss": 0.14404296875, "delta_ref_ppl": -0.14208984375, "entropy_loss": -0.62109375, "epoch": 0.04618614415675298, "grad_norm": 4.340496957601052, "k1_kl": 0.142578125, "k3_kl": 0.073974609375, "kimi_kl": 0.15576171875, "learning_rate": 9.53781512605042e-07, "loss": 0.003, "ppl": 0.541015625, "reward": 0.9453826546669006, "reward_std": 0.05231573339551687, "rewards/single_object_detection_bbox_reward": 0.9453827440738678, "step": 33, "temperature": 0.9 }, { "advantages": 6.188505267346045e-06, "completion_length": 32.0, "delta_ref_entropy_loss": 0.13427734375, "delta_ref_ppl": -0.12744140625, "entropy_loss": -0.6171875, "epoch": 0.04758572428271519, "grad_norm": 2.145111229686203, "k1_kl": 0.127685546875, "k3_kl": 0.0650634765625, "kimi_kl": 0.125732421875, "learning_rate": 9.523809523809522e-07, "loss": 0.0026, "ppl": 0.541015625, "reward": 0.9421988427639008, "reward_std": 0.04547931253910065, "rewards/single_object_detection_bbox_reward": 0.9421988725662231, "step": 34, "temperature": 0.9 }, { "advantages": 3.2520455306439544e-06, "completion_length": 71.375, "delta_ref_entropy_loss": 0.11572265625, "delta_ref_ppl": -0.14453125, "entropy_loss": -0.625, "epoch": 0.0489853044086774, "grad_norm": 2.3692764986822006, "k1_kl": 0.144287109375, "k3_kl": 0.0799560546875, "kimi_kl": 0.212646484375, "learning_rate": 9.509803921568627e-07, "loss": 0.0032, "ppl": 0.541015625, "reward": 0.8915755450725555, "reward_std": 0.06342761404812336, "rewards/single_object_detection_bbox_reward": 0.8915755748748779, "step": 35, "temperature": 0.9 }, { "advantages": -6.533627242788498e-06, "completion_length": 41.5, "delta_ref_entropy_loss": 0.15478515625, "delta_ref_ppl": -0.1845703125, "entropy_loss": -0.5703125, "epoch": 0.05038488453463961, "grad_norm": 2.535406685606816, "k1_kl": 0.1845703125, "k3_kl": 0.10498046875, "kimi_kl": 0.23779296875, "learning_rate": 9.49579831932773e-07, "loss": 0.0042, "ppl": 0.4970703125, "reward": 0.9095452725887299, "reward_std": 0.045821938663721085, "rewards/single_object_detection_bbox_reward": 0.9095453321933746, "step": 36, "temperature": 0.9 }, { "advantages": -4.674308115681924e-06, "completion_length": 75.53125, "delta_ref_entropy_loss": 0.118896484375, "delta_ref_ppl": -0.123779296875, "entropy_loss": -0.63671875, "epoch": 0.05178446466060182, "grad_norm": 2.184884953343415, "k1_kl": 0.123046875, "k3_kl": 0.0653076171875, "kimi_kl": 0.1240234375, "learning_rate": 9.481792717086834e-07, "loss": 0.0026, "ppl": 0.564453125, "reward": 0.927015870809555, "reward_std": 0.055495524778962135, "rewards/single_object_detection_bbox_reward": 0.9270159304141998, "step": 37, "temperature": 0.9 }, { "advantages": -1.7009942325785232e-06, "completion_length": 95.21875, "delta_ref_entropy_loss": 0.132080078125, "delta_ref_ppl": -0.135009765625, "entropy_loss": -0.619140625, "epoch": 0.05318404478656403, "grad_norm": 2.533380233794715, "k1_kl": 0.135986328125, "k3_kl": 0.075439453125, "kimi_kl": 0.16845703125, "learning_rate": 9.467787114845937e-07, "loss": 0.003, "ppl": 0.5390625, "reward": 0.928099513053894, "reward_std": 0.06711981631815434, "rewards/single_object_detection_bbox_reward": 0.9280995726585388, "step": 38, "temperature": 0.9 }, { "advantages": -5.822894991069916e-06, "completion_length": 80.53125, "delta_ref_entropy_loss": 0.123291015625, "delta_ref_ppl": -0.113037109375, "entropy_loss": -0.630859375, "epoch": 0.05458362491252624, "grad_norm": 6.642736574823134, "k1_kl": 0.1123046875, "k3_kl": 0.0543212890625, "kimi_kl": 0.115478515625, "learning_rate": 9.453781512605042e-07, "loss": 0.0022, "ppl": 0.548828125, "reward": 0.9468332827091217, "reward_std": 0.03535185754299164, "rewards/single_object_detection_bbox_reward": 0.9468333423137665, "step": 39, "temperature": 0.9 }, { "advantages": 3.127647545397849e-06, "completion_length": 116.96875, "delta_ref_entropy_loss": 0.12890625, "delta_ref_ppl": -0.12939453125, "entropy_loss": -0.599609375, "epoch": 0.05598320503848846, "grad_norm": 1.9779853855955272, "k1_kl": 0.129638671875, "k3_kl": 0.0697021484375, "kimi_kl": 0.193359375, "learning_rate": 9.439775910364145e-07, "loss": 0.0028, "ppl": 0.53125, "reward": 0.9483802914619446, "reward_std": 0.04442845098674297, "rewards/single_object_detection_bbox_reward": 0.948380321264267, "step": 40, "temperature": 0.9 }, { "advantages": -8.175815082722693e-06, "completion_length": 107.84375, "delta_ref_entropy_loss": 0.140625, "delta_ref_ppl": -0.1171875, "entropy_loss": -0.634765625, "epoch": 0.05738278516445067, "grad_norm": 6.96508451207209, "k1_kl": 0.11669921875, "k3_kl": 0.0550537109375, "kimi_kl": 0.108154296875, "learning_rate": 9.425770308123249e-07, "loss": 0.0022, "ppl": 0.5546875, "reward": 0.8918345272541046, "reward_std": 0.0568095063790679, "rewards/single_object_detection_bbox_reward": 0.8918345868587494, "step": 41, "temperature": 0.9 }, { "advantages": 3.1088879666185676e-06, "completion_length": 53.78125, "delta_ref_entropy_loss": 0.1435546875, "delta_ref_ppl": -0.13623046875, "entropy_loss": -0.625, "epoch": 0.05878236529041288, "grad_norm": 4.3355575497324, "k1_kl": 0.135986328125, "k3_kl": 0.06884765625, "kimi_kl": 0.14013671875, "learning_rate": 9.411764705882352e-07, "loss": 0.0028, "ppl": 0.541015625, "reward": 0.9487521350383759, "reward_std": 0.056458424776792526, "rewards/single_object_detection_bbox_reward": 0.9487521946430206, "step": 42, "temperature": 0.9 }, { "advantages": 2.2901222109794617e-06, "completion_length": 40.0, "delta_ref_entropy_loss": 0.13916015625, "delta_ref_ppl": -0.13037109375, "entropy_loss": -0.583984375, "epoch": 0.06018194541637509, "grad_norm": 4.189790034191799, "k1_kl": 0.130615234375, "k3_kl": 0.06884765625, "kimi_kl": 0.1298828125, "learning_rate": 9.397759103641457e-07, "loss": 0.0028, "ppl": 0.509765625, "reward": 0.9541997611522675, "reward_std": 0.04190229997038841, "rewards/single_object_detection_bbox_reward": 0.9541997611522675, "step": 43, "temperature": 0.9 }, { "advantages": 4.7257967707992066e-06, "completion_length": 78.625, "delta_ref_entropy_loss": 0.129638671875, "delta_ref_ppl": -0.13623046875, "entropy_loss": -0.615234375, "epoch": 0.0615815255423373, "grad_norm": 2.058847742864203, "k1_kl": 0.13623046875, "k3_kl": 0.073974609375, "kimi_kl": 0.16943359375, "learning_rate": 9.38375350140056e-07, "loss": 0.003, "ppl": 0.544921875, "reward": 0.9534803032875061, "reward_std": 0.05468878149986267, "rewards/single_object_detection_bbox_reward": 0.9534803032875061, "step": 44, "temperature": 0.9 }, { "advantages": 1.4873222369260475e-06, "completion_length": 107.125, "delta_ref_entropy_loss": 0.1044921875, "delta_ref_ppl": -0.105224609375, "entropy_loss": -0.666015625, "epoch": 0.06298110566829951, "grad_norm": 1.9287463624239525, "k1_kl": 0.105712890625, "k3_kl": 0.05908203125, "kimi_kl": 0.10595703125, "learning_rate": 9.369747899159663e-07, "loss": 0.0024, "ppl": 0.587890625, "reward": 0.9044331312179565, "reward_std": 0.09606552869081497, "rewards/single_object_detection_bbox_reward": 0.9044331908226013, "step": 45, "temperature": 0.9 }, { "advantages": -1.889786631181778e-06, "completion_length": 108.40625, "delta_ref_entropy_loss": 0.15185546875, "delta_ref_ppl": -0.16259765625, "entropy_loss": -0.599609375, "epoch": 0.06438068579426172, "grad_norm": 2.696092032730364, "k1_kl": 0.16259765625, "k3_kl": 0.0888671875, "kimi_kl": 0.19970703125, "learning_rate": 9.355742296918767e-07, "loss": 0.0036, "ppl": 0.5234375, "reward": 0.905423492193222, "reward_std": 0.05821524187922478, "rewards/single_object_detection_bbox_reward": 0.9054235816001892, "step": 46, "temperature": 0.9 }, { "advantages": 9.397045005243854e-06, "completion_length": 91.6875, "delta_ref_entropy_loss": 0.118408203125, "delta_ref_ppl": -0.097900390625, "entropy_loss": -0.646484375, "epoch": 0.06578026592022393, "grad_norm": 1.7809694837690733, "k1_kl": 0.09814453125, "k3_kl": 0.0478515625, "kimi_kl": 0.08984375, "learning_rate": 9.34173669467787e-07, "loss": 0.0019, "ppl": 0.56640625, "reward": 0.9249379634857178, "reward_std": 0.031931765377521515, "rewards/single_object_detection_bbox_reward": 0.9249379932880402, "step": 47, "temperature": 0.9 }, { "advantages": 7.2920565798995085e-06, "completion_length": 157.5625, "delta_ref_entropy_loss": 0.1474609375, "delta_ref_ppl": -0.1484375, "entropy_loss": -0.646484375, "epoch": 0.06717984604618614, "grad_norm": 2.1005689590371492, "k1_kl": 0.1484375, "k3_kl": 0.078125, "kimi_kl": 0.19140625, "learning_rate": 9.327731092436975e-07, "loss": 0.0031, "ppl": 0.56640625, "reward": 0.9466767311096191, "reward_std": 0.047439140267670155, "rewards/single_object_detection_bbox_reward": 0.9466768801212311, "step": 48, "temperature": 0.9 }, { "advantages": 6.134222871878592e-06, "completion_length": 44.3125, "delta_ref_entropy_loss": 0.141357421875, "delta_ref_ppl": -0.117431640625, "entropy_loss": -0.6171875, "epoch": 0.06857942617214835, "grad_norm": 1.8106850634724307, "k1_kl": 0.11767578125, "k3_kl": 0.0562744140625, "kimi_kl": 0.1162109375, "learning_rate": 9.313725490196078e-07, "loss": 0.0022, "ppl": 0.537109375, "reward": 0.9466363489627838, "reward_std": 0.037708748830482364, "rewards/single_object_detection_bbox_reward": 0.9466363489627838, "step": 49, "temperature": 0.9 }, { "advantages": -2.2641782564392088e-06, "completion_length": 90.5625, "delta_ref_entropy_loss": 0.14453125, "delta_ref_ppl": -0.126220703125, "entropy_loss": -0.609375, "epoch": 0.06997900629811056, "grad_norm": 3.0153399966814547, "k1_kl": 0.126220703125, "k3_kl": 0.061767578125, "kimi_kl": 0.11474609375, "learning_rate": 9.299719887955182e-07, "loss": 0.0025, "ppl": 0.53515625, "reward": 0.9452129900455475, "reward_std": 0.05837794579565525, "rewards/single_object_detection_bbox_reward": 0.9452130794525146, "step": 50, "temperature": 0.9 }, { "advantages": -7.094549573594122e-06, "completion_length": 212.0, "delta_ref_entropy_loss": 0.125244140625, "delta_ref_ppl": -0.14453125, "entropy_loss": -0.552734375, "epoch": 0.07137858642407278, "grad_norm": 2.0996154370118143, "k1_kl": 0.14501953125, "k3_kl": 0.0810546875, "kimi_kl": 0.17626953125, "learning_rate": 9.285714285714285e-07, "loss": 0.0032, "ppl": 0.490234375, "reward": 0.9279657900333405, "reward_std": 0.037254695780575275, "rewards/single_object_detection_bbox_reward": 0.9279658794403076, "step": 51, "temperature": 0.9 }, { "advantages": -3.854345138165627e-06, "completion_length": 66.84375, "delta_ref_entropy_loss": 0.12890625, "delta_ref_ppl": -0.13525390625, "entropy_loss": -0.609375, "epoch": 0.072778166550035, "grad_norm": 2.35072204579431, "k1_kl": 0.1357421875, "k3_kl": 0.07275390625, "kimi_kl": 0.15380859375, "learning_rate": 9.27170868347339e-07, "loss": 0.0029, "ppl": 0.529296875, "reward": 0.9125744998455048, "reward_std": 0.036206657998263836, "rewards/single_object_detection_bbox_reward": 0.9125744700431824, "step": 52, "temperature": 0.9 }, { "advantages": -3.2346165426133666e-06, "completion_length": 94.90625, "delta_ref_entropy_loss": 0.14697265625, "delta_ref_ppl": -0.1474609375, "entropy_loss": -0.60546875, "epoch": 0.0741777466759972, "grad_norm": 10.437209275570634, "k1_kl": 0.14697265625, "k3_kl": 0.07421875, "kimi_kl": 0.140625, "learning_rate": 9.257703081232493e-07, "loss": 0.003, "ppl": 0.5234375, "reward": 0.8632356524467468, "reward_std": 0.06414942070841789, "rewards/single_object_detection_bbox_reward": 0.863235741853714, "step": 53, "temperature": 0.9 }, { "advantages": 8.621387905805022e-07, "completion_length": 67.0, "delta_ref_entropy_loss": 0.171875, "delta_ref_ppl": -0.130126953125, "entropy_loss": -0.611328125, "epoch": 0.07557732680195942, "grad_norm": 2.848382660717718, "k1_kl": 0.1298828125, "k3_kl": 0.0556640625, "kimi_kl": 0.107666015625, "learning_rate": 9.243697478991597e-07, "loss": 0.0022, "ppl": 0.529296875, "reward": 0.917338639497757, "reward_std": 0.02012170571833849, "rewards/single_object_detection_bbox_reward": 0.9173387587070465, "step": 54, "temperature": 0.9 }, { "advantages": 6.115064479672583e-06, "completion_length": 109.0, "delta_ref_entropy_loss": 0.14404296875, "delta_ref_ppl": -0.099365234375, "entropy_loss": -0.58203125, "epoch": 0.07697690692792163, "grad_norm": 4.059117244349785, "k1_kl": 0.098876953125, "k3_kl": 0.0458984375, "kimi_kl": 0.07861328125, "learning_rate": 9.2296918767507e-07, "loss": 0.0018, "ppl": 0.51171875, "reward": 0.9407348334789276, "reward_std": 0.04800436459481716, "rewards/single_object_detection_bbox_reward": 0.9407349526882172, "step": 55, "temperature": 0.9 }, { "advantages": 1.8961729892907897e-06, "completion_length": 55.5, "delta_ref_entropy_loss": 0.13916015625, "delta_ref_ppl": -0.136474609375, "entropy_loss": -0.595703125, "epoch": 0.07837648705388384, "grad_norm": 3.896106456112329, "k1_kl": 0.13671875, "k3_kl": 0.0709228515625, "kimi_kl": 0.14453125, "learning_rate": 9.215686274509803e-07, "loss": 0.0028, "ppl": 0.521484375, "reward": 0.945097416639328, "reward_std": 0.037606099620461464, "rewards/single_object_detection_bbox_reward": 0.9450975060462952, "step": 56, "temperature": 0.9 }, { "advantages": 1.2687274590916786e-06, "completion_length": 68.5625, "delta_ref_entropy_loss": 0.19384765625, "delta_ref_ppl": -0.1572265625, "entropy_loss": -0.587890625, "epoch": 0.07977606717984605, "grad_norm": 3.417838956013578, "k1_kl": 0.15673828125, "k3_kl": 0.07177734375, "kimi_kl": 0.13623046875, "learning_rate": 9.201680672268907e-07, "loss": 0.0029, "ppl": 0.5126953125, "reward": 0.890575498342514, "reward_std": 0.05038749938830733, "rewards/single_object_detection_bbox_reward": 0.890575498342514, "step": 57, "temperature": 0.9 }, { "advantages": 1.9847815337925567e-06, "completion_length": 135.75, "delta_ref_entropy_loss": 0.143310546875, "delta_ref_ppl": -0.164794921875, "entropy_loss": -0.61328125, "epoch": 0.08117564730580826, "grad_norm": 2.1734040109091737, "k1_kl": 0.16552734375, "k3_kl": 0.0946044921875, "kimi_kl": 0.22265625, "learning_rate": 9.18767507002801e-07, "loss": 0.0038, "ppl": 0.529296875, "reward": 0.8967481851577759, "reward_std": 0.03652935288846493, "rewards/single_object_detection_bbox_reward": 0.8967481553554535, "step": 58, "temperature": 0.9 }, { "advantages": -9.538073754811194e-06, "completion_length": 131.96875, "delta_ref_entropy_loss": 0.14794921875, "delta_ref_ppl": -0.129638671875, "entropy_loss": -0.61328125, "epoch": 0.08257522743177047, "grad_norm": 1.7197537255264934, "k1_kl": 0.130126953125, "k3_kl": 0.060546875, "kimi_kl": 0.11279296875, "learning_rate": 9.173669467787114e-07, "loss": 0.0024, "ppl": 0.5390625, "reward": 0.9593584537506104, "reward_std": 0.0345267690718174, "rewards/single_object_detection_bbox_reward": 0.9593584537506104, "step": 59, "temperature": 0.9 }, { "advantages": -9.290209163737018e-06, "completion_length": 49.9375, "delta_ref_entropy_loss": 0.162109375, "delta_ref_ppl": -0.157958984375, "entropy_loss": -0.599609375, "epoch": 0.08397480755773268, "grad_norm": 1.7904027063279646, "k1_kl": 0.157958984375, "k3_kl": 0.08251953125, "kimi_kl": 0.208251953125, "learning_rate": 9.159663865546218e-07, "loss": 0.0033, "ppl": 0.53125, "reward": 0.933628648519516, "reward_std": 0.017884302418679, "rewards/single_object_detection_bbox_reward": 0.9336287379264832, "step": 60, "temperature": 0.9 }, { "advantages": -7.256332992255921e-07, "completion_length": 73.71875, "delta_ref_entropy_loss": 0.13818359375, "delta_ref_ppl": -0.14013671875, "entropy_loss": -0.603515625, "epoch": 0.08537438768369489, "grad_norm": 2.4012182898772987, "k1_kl": 0.14013671875, "k3_kl": 0.07568359375, "kimi_kl": 0.197265625, "learning_rate": 9.145658263305322e-07, "loss": 0.003, "ppl": 0.53125, "reward": 0.9596521556377411, "reward_std": 0.04218217730522156, "rewards/single_object_detection_bbox_reward": 0.9596522152423859, "step": 61, "temperature": 0.9 }, { "advantages": -1.889254463094403e-06, "completion_length": 31.9375, "delta_ref_entropy_loss": 0.16748046875, "delta_ref_ppl": -0.17431640625, "entropy_loss": -0.58203125, "epoch": 0.0867739678096571, "grad_norm": 2.3678204696111935, "k1_kl": 0.1748046875, "k3_kl": 0.092529296875, "kimi_kl": 0.2158203125, "learning_rate": 9.131652661064425e-07, "loss": 0.0037, "ppl": 0.5068359375, "reward": 0.9587394595146179, "reward_std": 0.0324792442843318, "rewards/single_object_detection_bbox_reward": 0.9587395191192627, "step": 62, "temperature": 0.9 }, { "advantages": -7.208436727523804e-07, "completion_length": 31.9375, "delta_ref_entropy_loss": 0.189453125, "delta_ref_ppl": -0.17822265625, "entropy_loss": -0.58203125, "epoch": 0.08817354793561931, "grad_norm": 2.075230868048321, "k1_kl": 0.17822265625, "k3_kl": 0.0927734375, "kimi_kl": 0.1943359375, "learning_rate": 9.117647058823529e-07, "loss": 0.0037, "ppl": 0.505859375, "reward": 0.9546421468257904, "reward_std": 0.028678488917648792, "rewards/single_object_detection_bbox_reward": 0.9546422064304352, "step": 63, "temperature": 0.9 }, { "advantages": 3.942022658520727e-06, "completion_length": 69.15625, "delta_ref_entropy_loss": 0.15576171875, "delta_ref_ppl": -0.14208984375, "entropy_loss": -0.591796875, "epoch": 0.08957312806158152, "grad_norm": 1.5670889904660803, "k1_kl": 0.142578125, "k3_kl": 0.0736083984375, "kimi_kl": 0.1611328125, "learning_rate": 9.103641456582632e-07, "loss": 0.0029, "ppl": 0.5244140625, "reward": 0.9815987944602966, "reward_std": 0.009665529243648052, "rewards/single_object_detection_bbox_reward": 0.9815988838672638, "step": 64, "temperature": 0.9 }, { "advantages": 2.214951280166133e-06, "completion_length": 70.875, "delta_ref_entropy_loss": 0.16455078125, "delta_ref_ppl": -0.16162109375, "entropy_loss": -0.59765625, "epoch": 0.09097270818754374, "grad_norm": 3.3137060651902845, "k1_kl": 0.16162109375, "k3_kl": 0.08544921875, "kimi_kl": 0.18359375, "learning_rate": 9.089635854341736e-07, "loss": 0.0034, "ppl": 0.521484375, "reward": 0.9589613974094391, "reward_std": 0.02127282926812768, "rewards/single_object_detection_bbox_reward": 0.9589614868164062, "step": 65, "temperature": 0.9 }, { "advantages": 6.393929197656689e-06, "completion_length": 145.6875, "delta_ref_entropy_loss": 0.15283203125, "delta_ref_ppl": -0.14990234375, "entropy_loss": -0.609375, "epoch": 0.09237228831350595, "grad_norm": 1.9432982640984968, "k1_kl": 0.150390625, "k3_kl": 0.081298828125, "kimi_kl": 0.182373046875, "learning_rate": 9.07563025210084e-07, "loss": 0.0032, "ppl": 0.541015625, "reward": 0.8990254998207092, "reward_std": 0.03795382287353277, "rewards/single_object_detection_bbox_reward": 0.899025559425354, "step": 66, "temperature": 0.9 }, { "advantages": -5.707411219191272e-06, "completion_length": 66.40625, "delta_ref_entropy_loss": 0.16845703125, "delta_ref_ppl": -0.177734375, "entropy_loss": -0.576171875, "epoch": 0.09377186843946816, "grad_norm": 3.0881082799553834, "k1_kl": 0.17724609375, "k3_kl": 0.100830078125, "kimi_kl": 0.2568359375, "learning_rate": 9.061624649859943e-07, "loss": 0.004, "ppl": 0.5087890625, "reward": 0.9548870921134949, "reward_std": 0.04616040363907814, "rewards/single_object_detection_bbox_reward": 0.9548871517181396, "step": 67, "temperature": 0.9 }, { "advantages": 4.170462716501788e-06, "completion_length": 28.59375, "delta_ref_entropy_loss": 0.158203125, "delta_ref_ppl": -0.162109375, "entropy_loss": -0.59375, "epoch": 0.09517144856543037, "grad_norm": 3.0388276146371576, "k1_kl": 0.16259765625, "k3_kl": 0.0849609375, "kimi_kl": 0.18310546875, "learning_rate": 9.047619047619047e-07, "loss": 0.0034, "ppl": 0.5185546875, "reward": 0.924724817276001, "reward_std": 0.047751178964972496, "rewards/single_object_detection_bbox_reward": 0.9247248768806458, "step": 68, "temperature": 0.9 }, { "advantages": 3.619385665842856e-06, "completion_length": 65.4375, "delta_ref_entropy_loss": 0.1552734375, "delta_ref_ppl": -0.139892578125, "entropy_loss": -0.5546875, "epoch": 0.09657102869139259, "grad_norm": 4.399417463632052, "k1_kl": 0.139892578125, "k3_kl": 0.07421875, "kimi_kl": 0.152587890625, "learning_rate": 9.033613445378151e-07, "loss": 0.003, "ppl": 0.478515625, "reward": 0.9385094344615936, "reward_std": 0.03819224191829562, "rewards/single_object_detection_bbox_reward": 0.9385094344615936, "step": 69, "temperature": 0.9 }, { "advantages": -7.929014486762753e-06, "completion_length": 68.6875, "delta_ref_entropy_loss": 0.1767578125, "delta_ref_ppl": -0.164306640625, "entropy_loss": -0.595703125, "epoch": 0.0979706088173548, "grad_norm": 2.668804287031023, "k1_kl": 0.1640625, "k3_kl": 0.08447265625, "kimi_kl": 0.252197265625, "learning_rate": 9.019607843137255e-07, "loss": 0.0034, "ppl": 0.525390625, "reward": 0.9418520033359528, "reward_std": 0.032641260884702206, "rewards/single_object_detection_bbox_reward": 0.9418520927429199, "step": 70, "temperature": 0.9 }, { "advantages": -3.138024950999352e-06, "completion_length": 66.40625, "delta_ref_entropy_loss": 0.1376953125, "delta_ref_ppl": -0.14208984375, "entropy_loss": -0.564453125, "epoch": 0.099370188943317, "grad_norm": 1.99657484984093, "k1_kl": 0.14208984375, "k3_kl": 0.075927734375, "kimi_kl": 0.15087890625, "learning_rate": 9.005602240896358e-07, "loss": 0.003, "ppl": 0.4970703125, "reward": 0.9403776228427887, "reward_std": 0.045329438522458076, "rewards/single_object_detection_bbox_reward": 0.9403776824474335, "step": 71, "temperature": 0.9 }, { "advantages": 5.879572483991069e-06, "completion_length": 51.03125, "delta_ref_entropy_loss": 0.185546875, "delta_ref_ppl": -0.18310546875, "entropy_loss": -0.576171875, "epoch": 0.10076976906927922, "grad_norm": 3.2356183307297117, "k1_kl": 0.18359375, "k3_kl": 0.1015625, "kimi_kl": 0.232421875, "learning_rate": 8.991596638655462e-07, "loss": 0.0041, "ppl": 0.509765625, "reward": 0.9392561912536621, "reward_std": 0.04589370358735323, "rewards/single_object_detection_bbox_reward": 0.9392561912536621, "step": 72, "temperature": 0.9 }, { "advantages": 1.0794295121741015e-05, "completion_length": 102.09375, "delta_ref_entropy_loss": 0.1337890625, "delta_ref_ppl": -0.122802734375, "entropy_loss": -0.564453125, "epoch": 0.10216934919524143, "grad_norm": 1.7181101208424208, "k1_kl": 0.12255859375, "k3_kl": 0.062255859375, "kimi_kl": 0.1591796875, "learning_rate": 8.977591036414566e-07, "loss": 0.0025, "ppl": 0.4892578125, "reward": 0.9501882791519165, "reward_std": 0.019998079631477594, "rewards/single_object_detection_bbox_reward": 0.9501883089542389, "step": 73, "temperature": 0.9 }, { "advantages": -1.3483555676430115e-05, "completion_length": 34.0, "delta_ref_entropy_loss": 0.15234375, "delta_ref_ppl": -0.126220703125, "entropy_loss": -0.5625, "epoch": 0.10356892932120364, "grad_norm": 2.90334370713229, "k1_kl": 0.1259765625, "k3_kl": 0.0567626953125, "kimi_kl": 0.10400390625, "learning_rate": 8.96358543417367e-07, "loss": 0.0023, "ppl": 0.4951171875, "reward": 0.9666232764720917, "reward_std": 0.018561851233243942, "rewards/single_object_detection_bbox_reward": 0.9666233062744141, "step": 74, "temperature": 0.9 }, { "advantages": -4.828242595067422e-06, "completion_length": 164.96875, "delta_ref_entropy_loss": 0.138671875, "delta_ref_ppl": -0.1357421875, "entropy_loss": -0.6328125, "epoch": 0.10496850944716585, "grad_norm": 11.310816311563181, "k1_kl": 0.134521484375, "k3_kl": 0.067138671875, "kimi_kl": 0.141845703125, "learning_rate": 8.949579831932773e-07, "loss": 0.0027, "ppl": 0.552734375, "reward": 0.9068467319011688, "reward_std": 0.06598371267318726, "rewards/single_object_detection_bbox_reward": 0.906846821308136, "step": 75, "temperature": 0.9 }, { "advantages": 6.4974386759786285e-06, "completion_length": 89.03125, "delta_ref_entropy_loss": 0.138671875, "delta_ref_ppl": -0.142578125, "entropy_loss": -0.611328125, "epoch": 0.10636808957312806, "grad_norm": 1.9389484464348896, "k1_kl": 0.14208984375, "k3_kl": 0.0753173828125, "kimi_kl": 0.193115234375, "learning_rate": 8.935574229691877e-07, "loss": 0.003, "ppl": 0.5390625, "reward": 0.8889987170696259, "reward_std": 0.032990531995892525, "rewards/single_object_detection_bbox_reward": 0.8889987468719482, "step": 76, "temperature": 0.9 }, { "advantages": -6.669334197795251e-06, "completion_length": 69.40625, "delta_ref_entropy_loss": 0.1640625, "delta_ref_ppl": -0.18505859375, "entropy_loss": -0.560546875, "epoch": 0.10776766969909027, "grad_norm": 2.3434540458007853, "k1_kl": 0.1865234375, "k3_kl": 0.099853515625, "kimi_kl": 0.21875, "learning_rate": 8.92156862745098e-07, "loss": 0.004, "ppl": 0.4892578125, "reward": 0.9652335047721863, "reward_std": 0.03539320174604654, "rewards/single_object_detection_bbox_reward": 0.9652335345745087, "step": 77, "temperature": 0.9 }, { "advantages": 3.4262029657838866e-06, "completion_length": 127.71875, "delta_ref_entropy_loss": 0.14404296875, "delta_ref_ppl": -0.12451171875, "entropy_loss": -0.5625, "epoch": 0.10916724982505248, "grad_norm": 2.2162562886258548, "k1_kl": 0.124267578125, "k3_kl": 0.0595703125, "kimi_kl": 0.13330078125, "learning_rate": 8.907563025210084e-07, "loss": 0.0024, "ppl": 0.4892578125, "reward": 0.9675173461437225, "reward_std": 0.0315967109054327, "rewards/single_object_detection_bbox_reward": 0.9675173461437225, "step": 78, "temperature": 0.9 }, { "advantages": -5.778191663807775e-06, "completion_length": 98.9375, "delta_ref_entropy_loss": 0.1572265625, "delta_ref_ppl": -0.15869140625, "entropy_loss": -0.5859375, "epoch": 0.11056682995101469, "grad_norm": 4.3840146057780265, "k1_kl": 0.15966796875, "k3_kl": 0.081298828125, "kimi_kl": 0.19384765625, "learning_rate": 8.893557422969187e-07, "loss": 0.0033, "ppl": 0.51171875, "reward": 0.9227761030197144, "reward_std": 0.03840775415301323, "rewards/single_object_detection_bbox_reward": 0.9227762222290039, "step": 79, "temperature": 0.9 }, { "advantages": 9.712402970762923e-08, "completion_length": 86.71875, "delta_ref_entropy_loss": 0.16943359375, "delta_ref_ppl": -0.1474609375, "entropy_loss": -0.59765625, "epoch": 0.11196641007697691, "grad_norm": 10.494786880035727, "k1_kl": 0.1474609375, "k3_kl": 0.0653076171875, "kimi_kl": 0.12158203125, "learning_rate": 8.87955182072829e-07, "loss": 0.0026, "ppl": 0.521484375, "reward": 0.9370253086090088, "reward_std": 0.01974444603547454, "rewards/single_object_detection_bbox_reward": 0.9370253384113312, "step": 80, "temperature": 0.9 }, { "advantages": -6.9340960067165724e-06, "completion_length": 58.09375, "delta_ref_entropy_loss": 0.2021484375, "delta_ref_ppl": -0.1767578125, "entropy_loss": -0.5546875, "epoch": 0.11336599020293912, "grad_norm": 3.2137719189676686, "k1_kl": 0.17626953125, "k3_kl": 0.0849609375, "kimi_kl": 0.17626953125, "learning_rate": 8.865546218487394e-07, "loss": 0.0034, "ppl": 0.4873046875, "reward": 0.9563844501972198, "reward_std": 0.020615200512111187, "rewards/single_object_detection_bbox_reward": 0.956384539604187, "step": 81, "temperature": 0.9 }, { "advantages": 4.241775513946777e-06, "completion_length": 61.375, "delta_ref_entropy_loss": 0.1806640625, "delta_ref_ppl": -0.15380859375, "entropy_loss": -0.544921875, "epoch": 0.11476557032890133, "grad_norm": 4.359733571165506, "k1_kl": 0.15380859375, "k3_kl": 0.07568359375, "kimi_kl": 0.12890625, "learning_rate": 8.851540616246498e-07, "loss": 0.003, "ppl": 0.4794921875, "reward": 0.9321529269218445, "reward_std": 0.04578855633735657, "rewards/single_object_detection_bbox_reward": 0.9321529567241669, "step": 82, "temperature": 0.9 }, { "advantages": -5.777393425887567e-06, "completion_length": 89.75, "delta_ref_entropy_loss": 0.14697265625, "delta_ref_ppl": -0.128662109375, "entropy_loss": -0.587890625, "epoch": 0.11616515045486354, "grad_norm": 2.973949426303086, "k1_kl": 0.12841796875, "k3_kl": 0.0634765625, "kimi_kl": 0.121826171875, "learning_rate": 8.837535014005602e-07, "loss": 0.0025, "ppl": 0.5107421875, "reward": 0.7996354699134827, "reward_std": 0.050381191074848175, "rewards/single_object_detection_bbox_reward": 0.7996355891227722, "step": 83, "temperature": 0.9 }, { "advantages": 1.0131459532658482e-05, "completion_length": 88.125, "delta_ref_entropy_loss": 0.18798828125, "delta_ref_ppl": -0.17041015625, "entropy_loss": -0.56640625, "epoch": 0.11756473058082575, "grad_norm": 2.2393942732631498, "k1_kl": 0.17041015625, "k3_kl": 0.0814208984375, "kimi_kl": 0.16015625, "learning_rate": 8.823529411764705e-07, "loss": 0.0032, "ppl": 0.5029296875, "reward": 0.948837548494339, "reward_std": 0.017462439835071564, "rewards/single_object_detection_bbox_reward": 0.9488376080989838, "step": 84, "temperature": 0.9 }, { "advantages": 1.7295991305843472e-06, "completion_length": 74.6875, "delta_ref_entropy_loss": 0.18994140625, "delta_ref_ppl": -0.17919921875, "entropy_loss": -0.56640625, "epoch": 0.11896431070678797, "grad_norm": 2.614056967256866, "k1_kl": 0.17919921875, "k3_kl": 0.09326171875, "kimi_kl": 0.189453125, "learning_rate": 8.809523809523809e-07, "loss": 0.0037, "ppl": 0.49609375, "reward": 0.9426228106021881, "reward_std": 0.03586062602698803, "rewards/single_object_detection_bbox_reward": 0.9426228404045105, "step": 85, "temperature": 0.9 }, { "advantages": 1.0551885679888073e-05, "completion_length": 113.5625, "delta_ref_entropy_loss": 0.129150390625, "delta_ref_ppl": -0.10107421875, "entropy_loss": -0.568359375, "epoch": 0.12036389083275018, "grad_norm": 3.1958308019994486, "k1_kl": 0.100830078125, "k3_kl": 0.044677734375, "kimi_kl": 0.0721435546875, "learning_rate": 8.795518207282913e-07, "loss": 0.0018, "ppl": 0.4931640625, "reward": 0.9312670826911926, "reward_std": 0.051133742555975914, "rewards/single_object_detection_bbox_reward": 0.9312671720981598, "step": 86, "temperature": 0.9 }, { "advantages": -1.6961513665592065e-05, "completion_length": 140.75, "delta_ref_entropy_loss": 0.21044921875, "delta_ref_ppl": -0.19140625, "entropy_loss": -0.576171875, "epoch": 0.12176347095871239, "grad_norm": 12.448714691689702, "k1_kl": 0.19189453125, "k3_kl": 0.094970703125, "kimi_kl": 0.18701171875, "learning_rate": 8.781512605042016e-07, "loss": 0.0038, "ppl": 0.4990234375, "reward": 0.9310014545917511, "reward_std": 0.07924620807170868, "rewards/single_object_detection_bbox_reward": 0.9310014843940735, "step": 87, "temperature": 0.9 }, { "advantages": -1.8392291281088546e-06, "completion_length": 79.6875, "delta_ref_entropy_loss": 0.150390625, "delta_ref_ppl": -0.1376953125, "entropy_loss": -0.6015625, "epoch": 0.1231630510846746, "grad_norm": 2.844436049928766, "k1_kl": 0.13818359375, "k3_kl": 0.071533203125, "kimi_kl": 0.136962890625, "learning_rate": 8.76750700280112e-07, "loss": 0.0029, "ppl": 0.529296875, "reward": 0.9165406823158264, "reward_std": 0.0423041176982224, "rewards/single_object_detection_bbox_reward": 0.9165406823158264, "step": 88, "temperature": 0.9 }, { "advantages": 7.776543952786596e-06, "completion_length": 86.71875, "delta_ref_entropy_loss": 0.150390625, "delta_ref_ppl": -0.126220703125, "entropy_loss": -0.62890625, "epoch": 0.1245626312106368, "grad_norm": 2.37818204723995, "k1_kl": 0.126708984375, "k3_kl": 0.0621337890625, "kimi_kl": 0.106201171875, "learning_rate": 8.753501400560223e-07, "loss": 0.0025, "ppl": 0.560546875, "reward": 0.88956019282341, "reward_std": 0.043342080898582935, "rewards/single_object_detection_bbox_reward": 0.8895602226257324, "step": 89, "temperature": 0.9 }, { "advantages": 7.395766090212419e-06, "completion_length": 37.53125, "delta_ref_entropy_loss": 0.19189453125, "delta_ref_ppl": -0.17041015625, "entropy_loss": -0.53125, "epoch": 0.12596221133659902, "grad_norm": 3.0058547780720053, "k1_kl": 0.1708984375, "k3_kl": 0.079345703125, "kimi_kl": 0.15283203125, "learning_rate": 8.739495798319328e-07, "loss": 0.0032, "ppl": 0.46875, "reward": 0.9535475373268127, "reward_std": 0.01775090442970395, "rewards/single_object_detection_bbox_reward": 0.9535476267337799, "step": 90, "temperature": 0.9 }, { "advantages": -1.7205121366714593e-05, "completion_length": 89.5, "delta_ref_entropy_loss": 0.18212890625, "delta_ref_ppl": -0.138671875, "entropy_loss": -0.560546875, "epoch": 0.12736179146256124, "grad_norm": 1.8756456064450235, "k1_kl": 0.13916015625, "k3_kl": 0.0594482421875, "kimi_kl": 0.092529296875, "learning_rate": 8.725490196078431e-07, "loss": 0.0024, "ppl": 0.4912109375, "reward": 0.9601150453090668, "reward_std": 0.026506650261580944, "rewards/single_object_detection_bbox_reward": 0.9601151347160339, "step": 91, "temperature": 0.9 }, { "advantages": 6.356409926411288e-06, "completion_length": 67.4375, "delta_ref_entropy_loss": 0.22705078125, "delta_ref_ppl": -0.2177734375, "entropy_loss": -0.564453125, "epoch": 0.12876137158852344, "grad_norm": 14.917979526548594, "k1_kl": 0.2177734375, "k3_kl": 0.11279296875, "kimi_kl": 0.240234375, "learning_rate": 8.711484593837535e-07, "loss": 0.0045, "ppl": 0.50390625, "reward": 0.9565789699554443, "reward_std": 0.029134005308151245, "rewards/single_object_detection_bbox_reward": 0.9565789699554443, "step": 92, "temperature": 0.9 }, { "advantages": 7.723059297859436e-06, "completion_length": 21.0, "delta_ref_entropy_loss": 0.2060546875, "delta_ref_ppl": -0.16259765625, "entropy_loss": -0.53515625, "epoch": 0.13016095171448566, "grad_norm": 7.28320603108388, "k1_kl": 0.162109375, "k3_kl": 0.068115234375, "kimi_kl": 0.109130859375, "learning_rate": 8.697478991596638e-07, "loss": 0.0027, "ppl": 0.46875, "reward": 0.9774503707885742, "reward_std": 0.019763534888625145, "rewards/single_object_detection_bbox_reward": 0.977450430393219, "step": 93, "temperature": 0.9 }, { "advantages": 1.7181571685398467e-06, "completion_length": 130.0, "delta_ref_entropy_loss": 0.1787109375, "delta_ref_ppl": -0.1513671875, "entropy_loss": -0.578125, "epoch": 0.13156053184044786, "grad_norm": 2.492086862640161, "k1_kl": 0.150390625, "k3_kl": 0.0645751953125, "kimi_kl": 0.1123046875, "learning_rate": 8.683473389355742e-07, "loss": 0.0026, "ppl": 0.513671875, "reward": 0.9279113113880157, "reward_std": 0.026246548746712506, "rewards/single_object_detection_bbox_reward": 0.9279113709926605, "step": 94, "temperature": 0.9 }, { "advantages": 4.375885851004568e-06, "completion_length": 108.40625, "delta_ref_entropy_loss": 0.16943359375, "delta_ref_ppl": -0.16650390625, "entropy_loss": -0.634765625, "epoch": 0.13296011196641008, "grad_norm": 2.8999291074468654, "k1_kl": 0.16748046875, "k3_kl": 0.08154296875, "kimi_kl": 0.150390625, "learning_rate": 8.669467787114846e-07, "loss": 0.0033, "ppl": 0.5625, "reward": 0.929175615310669, "reward_std": 0.03620412480086088, "rewards/single_object_detection_bbox_reward": 0.9291756451129913, "step": 95, "temperature": 0.9 }, { "advantages": 8.690569757163757e-07, "completion_length": 79.0, "delta_ref_entropy_loss": 0.1474609375, "delta_ref_ppl": -0.120361328125, "entropy_loss": -0.5859375, "epoch": 0.13435969209237228, "grad_norm": 2.1412049994843465, "k1_kl": 0.120849609375, "k3_kl": 0.0537109375, "kimi_kl": 0.091552734375, "learning_rate": 8.65546218487395e-07, "loss": 0.0021, "ppl": 0.513671875, "reward": 0.9514386653900146, "reward_std": 0.046078477054834366, "rewards/single_object_detection_bbox_reward": 0.9514386653900146, "step": 96, "temperature": 0.9 }, { "advantages": -1.4075743820285425e-05, "completion_length": 68.0, "delta_ref_entropy_loss": 0.21142578125, "delta_ref_ppl": -0.17431640625, "entropy_loss": -0.5546875, "epoch": 0.1357592722183345, "grad_norm": 2.480417632736826, "k1_kl": 0.1748046875, "k3_kl": 0.0762939453125, "kimi_kl": 0.137939453125, "learning_rate": 8.641456582633053e-07, "loss": 0.0031, "ppl": 0.48828125, "reward": 0.971817135810852, "reward_std": 0.01462101167999208, "rewards/single_object_detection_bbox_reward": 0.971817135810852, "step": 97, "temperature": 0.9 }, { "advantages": -1.1359472864569398e-06, "completion_length": 98.0, "delta_ref_entropy_loss": 0.1708984375, "delta_ref_ppl": -0.14599609375, "entropy_loss": -0.55859375, "epoch": 0.1371588523442967, "grad_norm": 1.845358666004261, "k1_kl": 0.14599609375, "k3_kl": 0.066162109375, "kimi_kl": 0.126953125, "learning_rate": 8.627450980392156e-07, "loss": 0.0026, "ppl": 0.4912109375, "reward": 0.9823255836963654, "reward_std": 0.015480469446629286, "rewards/single_object_detection_bbox_reward": 0.9823256731033325, "step": 98, "temperature": 0.9 }, { "advantages": 1.8084955115682533e-06, "completion_length": 62.3125, "delta_ref_entropy_loss": 0.19677734375, "delta_ref_ppl": -0.19287109375, "entropy_loss": -0.560546875, "epoch": 0.13855843247025892, "grad_norm": 2.983554409571776, "k1_kl": 0.19287109375, "k3_kl": 0.103271484375, "kimi_kl": 0.27783203125, "learning_rate": 8.613445378151261e-07, "loss": 0.0041, "ppl": 0.494140625, "reward": 0.9469702839851379, "reward_std": 0.03160835988819599, "rewards/single_object_detection_bbox_reward": 0.9469703137874603, "step": 99, "temperature": 0.9 }, { "advantages": -1.096007054002257e-05, "completion_length": 98.09375, "delta_ref_entropy_loss": 0.1767578125, "delta_ref_ppl": -0.16357421875, "entropy_loss": -0.572265625, "epoch": 0.13995801259622112, "grad_norm": 2.0571911924841526, "k1_kl": 0.16357421875, "k3_kl": 0.0771484375, "kimi_kl": 0.1513671875, "learning_rate": 8.599439775910364e-07, "loss": 0.0031, "ppl": 0.5009765625, "reward": 0.9339576065540314, "reward_std": 0.028953278437256813, "rewards/single_object_detection_bbox_reward": 0.9339576363563538, "step": 100, "temperature": 0.9 }, { "advantages": -1.6826073078846093e-05, "completion_length": 66.875, "delta_ref_entropy_loss": 0.19384765625, "delta_ref_ppl": -0.18994140625, "entropy_loss": -0.541015625, "epoch": 0.14135759272218335, "grad_norm": 2.7010655607728324, "k1_kl": 0.1904296875, "k3_kl": 0.09619140625, "kimi_kl": 0.20068359375, "learning_rate": 8.585434173669467e-07, "loss": 0.0039, "ppl": 0.4765625, "reward": 0.8856777846813202, "reward_std": 0.04447423852980137, "rewards/single_object_detection_bbox_reward": 0.885677844285965, "step": 101, "temperature": 0.9 }, { "advantages": -6.641660377226799e-07, "completion_length": 31.03125, "delta_ref_entropy_loss": 0.20947265625, "delta_ref_ppl": -0.173828125, "entropy_loss": -0.564453125, "epoch": 0.14275717284814557, "grad_norm": 3.6426232038939967, "k1_kl": 0.17431640625, "k3_kl": 0.076171875, "kimi_kl": 0.128662109375, "learning_rate": 8.57142857142857e-07, "loss": 0.003, "ppl": 0.4931640625, "reward": 0.9495174586772919, "reward_std": 0.03382150735706091, "rewards/single_object_detection_bbox_reward": 0.9495175182819366, "step": 102, "temperature": 0.9 }, { "advantages": -3.9022418150125304e-06, "completion_length": 77.90625, "delta_ref_entropy_loss": 0.16064453125, "delta_ref_ppl": -0.12548828125, "entropy_loss": -0.564453125, "epoch": 0.14415675297410777, "grad_norm": 7.109074685758896, "k1_kl": 0.125, "k3_kl": 0.0535888671875, "kimi_kl": 0.089599609375, "learning_rate": 8.557422969187675e-07, "loss": 0.0021, "ppl": 0.494140625, "reward": 0.9700618386268616, "reward_std": 0.03855510242283344, "rewards/single_object_detection_bbox_reward": 0.9700619280338287, "step": 103, "temperature": 0.9 }, { "advantages": 6.5446699863969116e-06, "completion_length": 50.5, "delta_ref_entropy_loss": 0.2177734375, "delta_ref_ppl": -0.1923828125, "entropy_loss": -0.541015625, "epoch": 0.14555633310007, "grad_norm": 2.5998673170025968, "k1_kl": 0.19189453125, "k3_kl": 0.088623046875, "kimi_kl": 0.17138671875, "learning_rate": 8.543417366946778e-07, "loss": 0.0035, "ppl": 0.4677734375, "reward": 0.9859899282455444, "reward_std": 0.012297755281906575, "rewards/single_object_detection_bbox_reward": 0.9859899878501892, "step": 104, "temperature": 0.9 }, { "advantages": -1.78872478500125e-05, "completion_length": 70.34375, "delta_ref_entropy_loss": 0.197265625, "delta_ref_ppl": -0.157470703125, "entropy_loss": -0.533203125, "epoch": 0.1469559132260322, "grad_norm": 2.665440582961025, "k1_kl": 0.15771484375, "k3_kl": 0.0709228515625, "kimi_kl": 0.11865234375, "learning_rate": 8.529411764705882e-07, "loss": 0.0029, "ppl": 0.46484375, "reward": 0.9253762662410736, "reward_std": 0.024457083083689213, "rewards/single_object_detection_bbox_reward": 0.9253763854503632, "step": 105, "temperature": 0.9 }, { "advantages": -8.408512712776428e-06, "completion_length": 41.75, "delta_ref_entropy_loss": 0.20703125, "delta_ref_ppl": -0.22314453125, "entropy_loss": -0.5390625, "epoch": 0.1483554933519944, "grad_norm": 2.6257864119860317, "k1_kl": 0.2236328125, "k3_kl": 0.119384765625, "kimi_kl": 0.259765625, "learning_rate": 8.515406162464985e-07, "loss": 0.0048, "ppl": 0.46875, "reward": 0.9298695027828217, "reward_std": 0.046540794894099236, "rewards/single_object_detection_bbox_reward": 0.9298695921897888, "step": 106, "temperature": 0.9 }, { "advantages": -1.9076146600127686e-06, "completion_length": 62.65625, "delta_ref_entropy_loss": 0.1923828125, "delta_ref_ppl": -0.17529296875, "entropy_loss": -0.5703125, "epoch": 0.1497550734779566, "grad_norm": 2.4223017472975363, "k1_kl": 0.17578125, "k3_kl": 0.082763671875, "kimi_kl": 0.161376953125, "learning_rate": 8.50140056022409e-07, "loss": 0.0033, "ppl": 0.50390625, "reward": 0.9655231833457947, "reward_std": 0.03624452743679285, "rewards/single_object_detection_bbox_reward": 0.9655232429504395, "step": 107, "temperature": 0.9 }, { "advantages": 2.3301690816879272e-06, "completion_length": 80.5, "delta_ref_entropy_loss": 0.17041015625, "delta_ref_ppl": -0.1650390625, "entropy_loss": -0.595703125, "epoch": 0.15115465360391883, "grad_norm": 2.35843697230024, "k1_kl": 0.1640625, "k3_kl": 0.08203125, "kimi_kl": 0.16650390625, "learning_rate": 8.487394957983193e-07, "loss": 0.0033, "ppl": 0.5234375, "reward": 0.9459192156791687, "reward_std": 0.029893978498876095, "rewards/single_object_detection_bbox_reward": 0.9459192454814911, "step": 108, "temperature": 0.9 }, { "advantages": -5.681600669049658e-06, "completion_length": 47.375, "delta_ref_entropy_loss": 0.2080078125, "delta_ref_ppl": -0.1923828125, "entropy_loss": -0.580078125, "epoch": 0.15255423372988103, "grad_norm": 4.5861494552329924, "k1_kl": 0.19189453125, "k3_kl": 0.09326171875, "kimi_kl": 0.1904296875, "learning_rate": 8.473389355742296e-07, "loss": 0.0037, "ppl": 0.5048828125, "reward": 0.9537060856819153, "reward_std": 0.023371422663331032, "rewards/single_object_detection_bbox_reward": 0.9537061154842377, "step": 109, "temperature": 0.9 }, { "advantages": -1.1295612978301506e-06, "completion_length": 79.5, "delta_ref_entropy_loss": 0.20263671875, "delta_ref_ppl": -0.181640625, "entropy_loss": -0.533203125, "epoch": 0.15395381385584325, "grad_norm": 3.8017755659467065, "k1_kl": 0.18212890625, "k3_kl": 0.0908203125, "kimi_kl": 0.18994140625, "learning_rate": 8.4593837535014e-07, "loss": 0.0036, "ppl": 0.470703125, "reward": 0.9483364224433899, "reward_std": 0.008753636211622506, "rewards/single_object_detection_bbox_reward": 0.9483364224433899, "step": 110, "temperature": 0.9 }, { "advantages": 7.131802703952417e-06, "completion_length": 79.09375, "delta_ref_entropy_loss": 0.19482421875, "delta_ref_ppl": -0.169921875, "entropy_loss": -0.55859375, "epoch": 0.15535339398180545, "grad_norm": 3.4644841896085667, "k1_kl": 0.17041015625, "k3_kl": 0.078857421875, "kimi_kl": 0.14990234375, "learning_rate": 8.445378151260503e-07, "loss": 0.0031, "ppl": 0.490234375, "reward": 0.9528076648712158, "reward_std": 0.03025669325143099, "rewards/single_object_detection_bbox_reward": 0.952807754278183, "step": 111, "temperature": 0.9 }, { "advantages": 1.2083511364835431e-05, "completion_length": 86.375, "delta_ref_entropy_loss": 0.158203125, "delta_ref_ppl": -0.18115234375, "entropy_loss": -0.603515625, "epoch": 0.15675297410776767, "grad_norm": 3.136292685822501, "k1_kl": 0.18115234375, "k3_kl": 0.10107421875, "kimi_kl": 0.2685546875, "learning_rate": 8.431372549019608e-07, "loss": 0.004, "ppl": 0.52734375, "reward": 0.8961102068424225, "reward_std": 0.04285774379968643, "rewards/single_object_detection_bbox_reward": 0.8961102068424225, "step": 112, "temperature": 0.9 }, { "advantages": -1.5328505469369702e-05, "completion_length": 70.46875, "delta_ref_entropy_loss": 0.16357421875, "delta_ref_ppl": -0.172607421875, "entropy_loss": -0.580078125, "epoch": 0.15815255423372987, "grad_norm": 2.7412182616875427, "k1_kl": 0.172119140625, "k3_kl": 0.0936279296875, "kimi_kl": 0.224365234375, "learning_rate": 8.417366946778711e-07, "loss": 0.0038, "ppl": 0.51171875, "reward": 0.9635868668556213, "reward_std": 0.026052240282297134, "rewards/single_object_detection_bbox_reward": 0.9635869562625885, "step": 113, "temperature": 0.9 }, { "advantages": 8.69243331180769e-06, "completion_length": 142.625, "delta_ref_entropy_loss": 0.177734375, "delta_ref_ppl": -0.12255859375, "entropy_loss": -0.533203125, "epoch": 0.1595521343596921, "grad_norm": 8.488422130196815, "k1_kl": 0.12255859375, "k3_kl": 0.0587158203125, "kimi_kl": 0.088134765625, "learning_rate": 8.403361344537815e-07, "loss": 0.0023, "ppl": 0.4765625, "reward": 0.9534128606319427, "reward_std": 0.031845537945628166, "rewards/single_object_detection_bbox_reward": 0.9534128904342651, "step": 114, "temperature": 0.9 }, { "advantages": 1.1588317647692747e-06, "completion_length": 90.875, "delta_ref_entropy_loss": 0.18701171875, "delta_ref_ppl": -0.15234375, "entropy_loss": -0.580078125, "epoch": 0.1609517144856543, "grad_norm": 3.430878669107607, "k1_kl": 0.15185546875, "k3_kl": 0.0689697265625, "kimi_kl": 0.117919921875, "learning_rate": 8.389355742296918e-07, "loss": 0.0028, "ppl": 0.5078125, "reward": 0.9659456014633179, "reward_std": 0.024519706144928932, "rewards/single_object_detection_bbox_reward": 0.9659456312656403, "step": 115, "temperature": 0.9 }, { "advantages": -1.3310994745552307e-05, "completion_length": 78.25, "delta_ref_entropy_loss": 0.17333984375, "delta_ref_ppl": -0.1630859375, "entropy_loss": -0.56640625, "epoch": 0.16235129461161651, "grad_norm": 2.9394590629853647, "k1_kl": 0.1630859375, "k3_kl": 0.081298828125, "kimi_kl": 0.158203125, "learning_rate": 8.375350140056023e-07, "loss": 0.0033, "ppl": 0.4990234375, "reward": 0.9276168048381805, "reward_std": 0.03461664728820324, "rewards/single_object_detection_bbox_reward": 0.9276168942451477, "step": 116, "temperature": 0.9 }, { "advantages": 1.2971997875865782e-07, "completion_length": 195.5, "delta_ref_entropy_loss": 0.17626953125, "delta_ref_ppl": -0.18310546875, "entropy_loss": -0.6171875, "epoch": 0.16375087473757874, "grad_norm": 2.038294363964098, "k1_kl": 0.18359375, "k3_kl": 0.09521484375, "kimi_kl": 0.19580078125, "learning_rate": 8.361344537815126e-07, "loss": 0.0038, "ppl": 0.544921875, "reward": 0.9014261364936829, "reward_std": 0.04672816023230553, "rewards/single_object_detection_bbox_reward": 0.90142622590065, "step": 117, "temperature": 0.9 }, { "advantages": -8.233424267700684e-06, "completion_length": 49.0, "delta_ref_entropy_loss": 0.19384765625, "delta_ref_ppl": -0.1640625, "entropy_loss": -0.51953125, "epoch": 0.16515045486354094, "grad_norm": 1.5851692532540451, "k1_kl": 0.1640625, "k3_kl": 0.075927734375, "kimi_kl": 0.1455078125, "learning_rate": 8.34733893557423e-07, "loss": 0.003, "ppl": 0.453125, "reward": 0.9663197994232178, "reward_std": 0.031653992831707, "rewards/single_object_detection_bbox_reward": 0.9663199186325073, "step": 118, "temperature": 0.9 }, { "advantages": -1.59894796070148e-06, "completion_length": 65.03125, "delta_ref_entropy_loss": 0.18798828125, "delta_ref_ppl": -0.1552734375, "entropy_loss": -0.599609375, "epoch": 0.16655003498950316, "grad_norm": 2.8554706012229625, "k1_kl": 0.155517578125, "k3_kl": 0.06787109375, "kimi_kl": 0.1162109375, "learning_rate": 8.333333333333333e-07, "loss": 0.0027, "ppl": 0.521484375, "reward": 0.9587060809135437, "reward_std": 0.030262704007327557, "rewards/single_object_detection_bbox_reward": 0.9587061703205109, "step": 119, "temperature": 0.9 }, { "advantages": -5.562656767210683e-06, "completion_length": 88.5, "delta_ref_entropy_loss": 0.1943359375, "delta_ref_ppl": -0.17236328125, "entropy_loss": -0.552734375, "epoch": 0.16794961511546536, "grad_norm": 5.427248720263248, "k1_kl": 0.1728515625, "k3_kl": 0.077880859375, "kimi_kl": 0.14892578125, "learning_rate": 8.319327731092437e-07, "loss": 0.0031, "ppl": 0.48046875, "reward": 0.9797366857528687, "reward_std": 0.006814709049649537, "rewards/single_object_detection_bbox_reward": 0.9797366857528687, "step": 120, "temperature": 0.9 }, { "advantages": 4.078128483797627e-06, "completion_length": 40.28125, "delta_ref_entropy_loss": 0.236328125, "delta_ref_ppl": -0.1845703125, "entropy_loss": -0.54296875, "epoch": 0.16934919524142758, "grad_norm": 2.808556062589046, "k1_kl": 0.1845703125, "k3_kl": 0.083740234375, "kimi_kl": 0.15185546875, "learning_rate": 8.305322128851541e-07, "loss": 0.0034, "ppl": 0.4736328125, "reward": 0.9725812673568726, "reward_std": 0.023832130944356322, "rewards/single_object_detection_bbox_reward": 0.9725813269615173, "step": 121, "temperature": 0.9 }, { "advantages": 7.016584390839853e-06, "completion_length": 41.40625, "delta_ref_entropy_loss": 0.2001953125, "delta_ref_ppl": -0.16015625, "entropy_loss": -0.609375, "epoch": 0.17074877536738978, "grad_norm": 2.683354714904703, "k1_kl": 0.16064453125, "k3_kl": 0.0723876953125, "kimi_kl": 0.114990234375, "learning_rate": 8.291316526610644e-07, "loss": 0.0029, "ppl": 0.537109375, "reward": 0.9546248614788055, "reward_std": 0.043212974444031715, "rewards/single_object_detection_bbox_reward": 0.9546249210834503, "step": 122, "temperature": 0.9 }, { "advantages": -2.969988145196112e-06, "completion_length": 80.0, "delta_ref_entropy_loss": 0.23828125, "delta_ref_ppl": -0.1787109375, "entropy_loss": -0.529296875, "epoch": 0.172148355493352, "grad_norm": 2.637010693852366, "k1_kl": 0.17919921875, "k3_kl": 0.07666015625, "kimi_kl": 0.1240234375, "learning_rate": 8.277310924369747e-07, "loss": 0.0031, "ppl": 0.46875, "reward": 0.9667004346847534, "reward_std": 0.02709413506090641, "rewards/single_object_detection_bbox_reward": 0.9667005240917206, "step": 123, "temperature": 0.9 }, { "advantages": -8.213999535655603e-06, "completion_length": 90.53125, "delta_ref_entropy_loss": 0.1767578125, "delta_ref_ppl": -0.15576171875, "entropy_loss": -0.5625, "epoch": 0.1735479356193142, "grad_norm": 1.8943588860598357, "k1_kl": 0.15625, "k3_kl": 0.0755615234375, "kimi_kl": 0.13427734375, "learning_rate": 8.26330532212885e-07, "loss": 0.003, "ppl": 0.494140625, "reward": 0.9223918318748474, "reward_std": 0.03227350953966379, "rewards/single_object_detection_bbox_reward": 0.9223918914794922, "step": 124, "temperature": 0.9 }, { "advantages": 1.2328848413289961e-05, "completion_length": 91.9375, "delta_ref_entropy_loss": 0.1826171875, "delta_ref_ppl": -0.1728515625, "entropy_loss": -0.59765625, "epoch": 0.17494751574527642, "grad_norm": 1.70978312930698, "k1_kl": 0.1728515625, "k3_kl": 0.08447265625, "kimi_kl": 0.18115234375, "learning_rate": 8.249299719887955e-07, "loss": 0.0034, "ppl": 0.529296875, "reward": 0.9320000112056732, "reward_std": 0.031562854535877705, "rewards/single_object_detection_bbox_reward": 0.9320001006126404, "step": 125, "temperature": 0.9 }, { "advantages": 4.7867322336969664e-06, "completion_length": 78.3125, "delta_ref_entropy_loss": 0.173828125, "delta_ref_ppl": -0.159423828125, "entropy_loss": -0.61328125, "epoch": 0.17634709587123862, "grad_norm": 2.7235591013646614, "k1_kl": 0.15966796875, "k3_kl": 0.07568359375, "kimi_kl": 0.161865234375, "learning_rate": 8.235294117647058e-07, "loss": 0.003, "ppl": 0.537109375, "reward": 0.9541514217853546, "reward_std": 0.028819449245929718, "rewards/single_object_detection_bbox_reward": 0.9541515111923218, "step": 126, "temperature": 0.9 }, { "advantages": -1.537453954369994e-05, "completion_length": 51.5, "delta_ref_entropy_loss": 0.18798828125, "delta_ref_ppl": -0.16796875, "entropy_loss": -0.54296875, "epoch": 0.17774667599720084, "grad_norm": 2.1729919699579208, "k1_kl": 0.1689453125, "k3_kl": 0.079345703125, "kimi_kl": 0.14013671875, "learning_rate": 8.221288515406162e-07, "loss": 0.0032, "ppl": 0.4765625, "reward": 0.9296923577785492, "reward_std": 0.03996267169713974, "rewards/single_object_detection_bbox_reward": 0.9296923875808716, "step": 127, "temperature": 0.9 }, { "advantages": -3.333470033339836e-07, "completion_length": 65.25, "delta_ref_entropy_loss": 0.21240234375, "delta_ref_ppl": -0.1787109375, "entropy_loss": -0.578125, "epoch": 0.17914625612316304, "grad_norm": 3.2672116379718594, "k1_kl": 0.1787109375, "k3_kl": 0.079833984375, "kimi_kl": 0.137451171875, "learning_rate": 8.207282913165265e-07, "loss": 0.0032, "ppl": 0.51171875, "reward": 0.9426396191120148, "reward_std": 0.03895847126841545, "rewards/single_object_detection_bbox_reward": 0.9426397085189819, "step": 128, "temperature": 0.9 }, { "advantages": -1.8698296798902447e-05, "completion_length": 98.59375, "delta_ref_entropy_loss": 0.18115234375, "delta_ref_ppl": -0.16552734375, "entropy_loss": -0.53515625, "epoch": 0.18054583624912526, "grad_norm": 2.126047009444628, "k1_kl": 0.1650390625, "k3_kl": 0.082763671875, "kimi_kl": 0.173095703125, "learning_rate": 8.19327731092437e-07, "loss": 0.0033, "ppl": 0.4736328125, "reward": 0.9774347245693207, "reward_std": 0.018101769499480724, "rewards/single_object_detection_bbox_reward": 0.9774347841739655, "step": 129, "temperature": 0.9 }, { "advantages": 7.731308187430841e-06, "completion_length": 67.84375, "delta_ref_entropy_loss": 0.2099609375, "delta_ref_ppl": -0.20166015625, "entropy_loss": -0.5703125, "epoch": 0.1819454163750875, "grad_norm": 3.2521711304344953, "k1_kl": 0.20166015625, "k3_kl": 0.098876953125, "kimi_kl": 0.21533203125, "learning_rate": 8.179271708683473e-07, "loss": 0.0039, "ppl": 0.4970703125, "reward": 0.9646835923194885, "reward_std": 0.048521244898438454, "rewards/single_object_detection_bbox_reward": 0.9646836519241333, "step": 130, "temperature": 0.9 }, { "advantages": 2.1196903503550857e-06, "completion_length": 99.0625, "delta_ref_entropy_loss": 0.1630859375, "delta_ref_ppl": -0.12158203125, "entropy_loss": -0.56640625, "epoch": 0.18334499650104968, "grad_norm": 2.4103598552492134, "k1_kl": 0.12158203125, "k3_kl": 0.052978515625, "kimi_kl": 0.0777587890625, "learning_rate": 8.165266106442576e-07, "loss": 0.0021, "ppl": 0.5, "reward": 0.9544437527656555, "reward_std": 0.051724448800086975, "rewards/single_object_detection_bbox_reward": 0.9544437527656555, "step": 131, "temperature": 0.9 }, { "advantages": -5.0663950332818786e-06, "completion_length": 41.5, "delta_ref_entropy_loss": 0.20556640625, "delta_ref_ppl": -0.18603515625, "entropy_loss": -0.552734375, "epoch": 0.1847445766270119, "grad_norm": 2.442703377487576, "k1_kl": 0.18701171875, "k3_kl": 0.0889892578125, "kimi_kl": 0.179443359375, "learning_rate": 8.15126050420168e-07, "loss": 0.0036, "ppl": 0.482421875, "reward": 0.9460774064064026, "reward_std": 0.017303330067079514, "rewards/single_object_detection_bbox_reward": 0.9460774660110474, "step": 132, "temperature": 0.9 }, { "advantages": -3.5217299227952026e-06, "completion_length": 39.5, "delta_ref_entropy_loss": 0.1943359375, "delta_ref_ppl": -0.142578125, "entropy_loss": -0.525390625, "epoch": 0.1861441567529741, "grad_norm": 2.6437962574727454, "k1_kl": 0.1416015625, "k3_kl": 0.063232421875, "kimi_kl": 0.102294921875, "learning_rate": 8.137254901960784e-07, "loss": 0.0025, "ppl": 0.466796875, "reward": 0.9825178682804108, "reward_std": 0.026022614911198616, "rewards/single_object_detection_bbox_reward": 0.9825178980827332, "step": 133, "temperature": 0.9 }, { "advantages": -1.5944242932164343e-05, "completion_length": 68.71875, "delta_ref_entropy_loss": 0.177734375, "delta_ref_ppl": -0.1953125, "entropy_loss": -0.6015625, "epoch": 0.18754373687893633, "grad_norm": 2.4265878716047977, "k1_kl": 0.19482421875, "k3_kl": 0.1025390625, "kimi_kl": 0.23486328125, "learning_rate": 8.123249299719888e-07, "loss": 0.0041, "ppl": 0.53125, "reward": 0.9556589126586914, "reward_std": 0.02876918762922287, "rewards/single_object_detection_bbox_reward": 0.9556589722633362, "step": 134, "temperature": 0.9 }, { "advantages": 1.0712869880080689e-06, "completion_length": 55.53125, "delta_ref_entropy_loss": 0.2080078125, "delta_ref_ppl": -0.19921875, "entropy_loss": -0.544921875, "epoch": 0.18894331700489853, "grad_norm": 2.589300473704273, "k1_kl": 0.19921875, "k3_kl": 0.09765625, "kimi_kl": 0.205078125, "learning_rate": 8.109243697478991e-07, "loss": 0.0039, "ppl": 0.482421875, "reward": 0.9672265350818634, "reward_std": 0.02249018754810095, "rewards/single_object_detection_bbox_reward": 0.967226654291153, "step": 135, "temperature": 0.9 }, { "advantages": 2.6207418386547943e-06, "completion_length": 83.96875, "delta_ref_entropy_loss": 0.169921875, "delta_ref_ppl": -0.14208984375, "entropy_loss": -0.599609375, "epoch": 0.19034289713086075, "grad_norm": 3.0916687188414764, "k1_kl": 0.1416015625, "k3_kl": 0.0592041015625, "kimi_kl": 0.0908203125, "learning_rate": 8.095238095238095e-07, "loss": 0.0024, "ppl": 0.52734375, "reward": 0.9134499728679657, "reward_std": 0.026618992909789085, "rewards/single_object_detection_bbox_reward": 0.9134500026702881, "step": 136, "temperature": 0.9 }, { "advantages": 1.6875562778295716e-06, "completion_length": 23.125, "delta_ref_entropy_loss": 0.20166015625, "delta_ref_ppl": -0.18017578125, "entropy_loss": -0.5390625, "epoch": 0.19174247725682295, "grad_norm": 2.3371396233471824, "k1_kl": 0.1796875, "k3_kl": 0.086181640625, "kimi_kl": 0.1689453125, "learning_rate": 8.081232492997198e-07, "loss": 0.0034, "ppl": 0.4765625, "reward": 0.9635791778564453, "reward_std": 0.02127858530730009, "rewards/single_object_detection_bbox_reward": 0.9635792374610901, "step": 137, "temperature": 0.9 }, { "advantages": 4.564811433738214e-06, "completion_length": 80.1875, "delta_ref_entropy_loss": 0.20849609375, "delta_ref_ppl": -0.216796875, "entropy_loss": -0.552734375, "epoch": 0.19314205738278517, "grad_norm": 2.487312102695842, "k1_kl": 0.2158203125, "k3_kl": 0.110107421875, "kimi_kl": 0.2412109375, "learning_rate": 8.067226890756303e-07, "loss": 0.0044, "ppl": 0.4853515625, "reward": 0.9108118116855621, "reward_std": 0.04736736789345741, "rewards/single_object_detection_bbox_reward": 0.9108118712902069, "step": 138, "temperature": 0.9 }, { "advantages": -2.461884832882788e-06, "completion_length": 65.375, "delta_ref_entropy_loss": 0.16552734375, "delta_ref_ppl": -0.138427734375, "entropy_loss": -0.576171875, "epoch": 0.19454163750874737, "grad_norm": 2.379764545384359, "k1_kl": 0.138916015625, "k3_kl": 0.0606689453125, "kimi_kl": 0.103515625, "learning_rate": 8.053221288515406e-07, "loss": 0.0024, "ppl": 0.509765625, "reward": 0.9515462219715118, "reward_std": 0.022300844080746174, "rewards/single_object_detection_bbox_reward": 0.951546311378479, "step": 139, "temperature": 0.9 }, { "advantages": 1.555654648655036e-05, "completion_length": 98.40625, "delta_ref_entropy_loss": 0.22265625, "delta_ref_ppl": -0.19677734375, "entropy_loss": -0.552734375, "epoch": 0.1959412176347096, "grad_norm": 2.829520087886082, "k1_kl": 0.197265625, "k3_kl": 0.0966796875, "kimi_kl": 0.2109375, "learning_rate": 8.03921568627451e-07, "loss": 0.0039, "ppl": 0.4853515625, "reward": 0.9145411252975464, "reward_std": 0.02674452494829893, "rewards/single_object_detection_bbox_reward": 0.914541095495224, "step": 140, "temperature": 0.9 }, { "advantages": -1.4629747511207825e-06, "completion_length": 71.0, "delta_ref_entropy_loss": 0.2119140625, "delta_ref_ppl": -0.22216796875, "entropy_loss": -0.5625, "epoch": 0.1973407977606718, "grad_norm": 3.239480982619146, "k1_kl": 0.22216796875, "k3_kl": 0.11572265625, "kimi_kl": 0.24169921875, "learning_rate": 8.025210084033613e-07, "loss": 0.0046, "ppl": 0.494140625, "reward": 0.9858669340610504, "reward_std": 0.013250474818050861, "rewards/single_object_detection_bbox_reward": 0.9858669936656952, "step": 141, "temperature": 0.9 }, { "advantages": 8.495125712215668e-06, "completion_length": 67.53125, "delta_ref_entropy_loss": 0.18115234375, "delta_ref_ppl": -0.17138671875, "entropy_loss": -0.607421875, "epoch": 0.198740377886634, "grad_norm": 2.444332778202203, "k1_kl": 0.1708984375, "k3_kl": 0.0830078125, "kimi_kl": 0.1513671875, "learning_rate": 8.011204481792717e-07, "loss": 0.0033, "ppl": 0.5400390625, "reward": 0.9181573390960693, "reward_std": 0.03757694619707763, "rewards/single_object_detection_bbox_reward": 0.9181573987007141, "step": 142, "temperature": 0.9 }, { "advantages": 5.228179134064703e-06, "completion_length": 80.46875, "delta_ref_entropy_loss": 0.189453125, "delta_ref_ppl": -0.16748046875, "entropy_loss": -0.572265625, "epoch": 0.2001399580125962, "grad_norm": 2.7808020436932943, "k1_kl": 0.16748046875, "k3_kl": 0.0758056640625, "kimi_kl": 0.16455078125, "learning_rate": 7.997198879551821e-07, "loss": 0.003, "ppl": 0.5009765625, "reward": 0.8852032423019409, "reward_std": 0.05188259109854698, "rewards/single_object_detection_bbox_reward": 0.8852032721042633, "step": 143, "temperature": 0.9 }, { "advantages": 8.174085678547272e-06, "completion_length": 88.5, "delta_ref_entropy_loss": 0.19482421875, "delta_ref_ppl": -0.19970703125, "entropy_loss": -0.548828125, "epoch": 0.20153953813855843, "grad_norm": 2.081493432921667, "k1_kl": 0.19873046875, "k3_kl": 0.10888671875, "kimi_kl": 0.2197265625, "learning_rate": 7.983193277310924e-07, "loss": 0.0043, "ppl": 0.484375, "reward": 0.896990180015564, "reward_std": 0.030945589765906334, "rewards/single_object_detection_bbox_reward": 0.8969903290271759, "step": 144, "temperature": 0.9 }, { "advantages": 1.4595421816920862e-05, "completion_length": 177.0625, "delta_ref_entropy_loss": 0.22021484375, "delta_ref_ppl": -0.1826171875, "entropy_loss": -0.52734375, "epoch": 0.20293911826452066, "grad_norm": 6.0915150236266875, "k1_kl": 0.18359375, "k3_kl": 0.0848388671875, "kimi_kl": 0.18212890625, "learning_rate": 7.969187675070028e-07, "loss": 0.0034, "ppl": 0.4775390625, "reward": 0.9524249136447906, "reward_std": 0.02822862658649683, "rewards/single_object_detection_bbox_reward": 0.9524250030517578, "step": 145, "temperature": 0.9 }, { "advantages": 3.108222614400802e-06, "completion_length": 60.5, "delta_ref_entropy_loss": 0.1953125, "delta_ref_ppl": -0.15966796875, "entropy_loss": -0.55078125, "epoch": 0.20433869839048285, "grad_norm": 2.4539126656327275, "k1_kl": 0.15966796875, "k3_kl": 0.082763671875, "kimi_kl": 0.126708984375, "learning_rate": 7.955182072829131e-07, "loss": 0.0033, "ppl": 0.4892578125, "reward": 0.9397244155406952, "reward_std": 0.03853663429617882, "rewards/single_object_detection_bbox_reward": 0.9397244453430176, "step": 146, "temperature": 0.9 }, { "advantages": -2.559008635216742e-06, "completion_length": 77.8125, "delta_ref_entropy_loss": 0.18359375, "delta_ref_ppl": -0.16845703125, "entropy_loss": -0.5625, "epoch": 0.20573827851644508, "grad_norm": 2.428902405741733, "k1_kl": 0.1689453125, "k3_kl": 0.0771484375, "kimi_kl": 0.13720703125, "learning_rate": 7.941176470588235e-07, "loss": 0.0031, "ppl": 0.4970703125, "reward": 0.9626089334487915, "reward_std": 0.02317740209400654, "rewards/single_object_detection_bbox_reward": 0.9626089930534363, "step": 147, "temperature": 0.9 }, { "advantages": 1.4756141808902612e-05, "completion_length": 51.84375, "delta_ref_entropy_loss": 0.21142578125, "delta_ref_ppl": -0.19140625, "entropy_loss": -0.56640625, "epoch": 0.20713785864240727, "grad_norm": 9.938502257016719, "k1_kl": 0.19091796875, "k3_kl": 0.0892333984375, "kimi_kl": 0.175048828125, "learning_rate": 7.927170868347338e-07, "loss": 0.0036, "ppl": 0.4990234375, "reward": 0.9458918869495392, "reward_std": 0.028210192918777466, "rewards/single_object_detection_bbox_reward": 0.9458919763565063, "step": 148, "temperature": 0.9 }, { "advantages": 1.4545662907039514e-05, "completion_length": 67.875, "delta_ref_entropy_loss": 0.20654296875, "delta_ref_ppl": -0.1767578125, "entropy_loss": -0.591796875, "epoch": 0.2085374387683695, "grad_norm": 4.756019682359689, "k1_kl": 0.177734375, "k3_kl": 0.0859375, "kimi_kl": 0.15869140625, "learning_rate": 7.913165266106442e-07, "loss": 0.0034, "ppl": 0.525390625, "reward": 0.936804860830307, "reward_std": 0.02654985710978508, "rewards/single_object_detection_bbox_reward": 0.9368049204349518, "step": 149, "temperature": 0.9 }, { "advantages": 8.19843296540057e-06, "completion_length": 87.90625, "delta_ref_entropy_loss": 0.19189453125, "delta_ref_ppl": -0.2119140625, "entropy_loss": -0.544921875, "epoch": 0.2099370188943317, "grad_norm": 2.2210105303973204, "k1_kl": 0.21142578125, "k3_kl": 0.108154296875, "kimi_kl": 0.27880859375, "learning_rate": 7.899159663865545e-07, "loss": 0.0043, "ppl": 0.4814453125, "reward": 0.9400824010372162, "reward_std": 0.021692122507374734, "rewards/single_object_detection_bbox_reward": 0.9400824904441833, "step": 150, "temperature": 0.9 }, { "advantages": 7.051442025840515e-07, "completion_length": 106.1875, "delta_ref_entropy_loss": 0.189453125, "delta_ref_ppl": -0.18310546875, "entropy_loss": -0.583984375, "epoch": 0.21133659902029392, "grad_norm": 2.610390078778413, "k1_kl": 0.18310546875, "k3_kl": 0.085205078125, "kimi_kl": 0.1650390625, "learning_rate": 7.88515406162465e-07, "loss": 0.0034, "ppl": 0.51171875, "reward": 0.9158325791358948, "reward_std": 0.04501317674294114, "rewards/single_object_detection_bbox_reward": 0.9158326089382172, "step": 151, "temperature": 0.9 }, { "advantages": 1.4502029443974607e-07, "completion_length": 219.59375, "delta_ref_entropy_loss": 0.1767578125, "delta_ref_ppl": -0.1845703125, "entropy_loss": -0.6484375, "epoch": 0.21273617914625612, "grad_norm": 2.2268216606578997, "k1_kl": 0.18408203125, "k3_kl": 0.097412109375, "kimi_kl": 0.21923828125, "learning_rate": 7.871148459383753e-07, "loss": 0.0039, "ppl": 0.57421875, "reward": 0.9065601825714111, "reward_std": 0.05999067425727844, "rewards/single_object_detection_bbox_reward": 0.9065602421760559, "step": 152, "temperature": 0.9 }, { "advantages": -5.037124903850554e-07, "completion_length": 75.5, "delta_ref_entropy_loss": 0.18310546875, "delta_ref_ppl": -0.18603515625, "entropy_loss": -0.59375, "epoch": 0.21413575927221834, "grad_norm": 2.5137989340879336, "k1_kl": 0.18603515625, "k3_kl": 0.098876953125, "kimi_kl": 0.2578125, "learning_rate": 7.857142857142856e-07, "loss": 0.0039, "ppl": 0.51953125, "reward": 0.9590257704257965, "reward_std": 0.02217921894043684, "rewards/single_object_detection_bbox_reward": 0.9590258598327637, "step": 153, "temperature": 0.9 }, { "advantages": -7.110115632258385e-06, "completion_length": 32.5, "delta_ref_entropy_loss": 0.2099609375, "delta_ref_ppl": -0.1982421875, "entropy_loss": -0.5322265625, "epoch": 0.21553533939818054, "grad_norm": 2.2854528901909057, "k1_kl": 0.1982421875, "k3_kl": 0.094970703125, "kimi_kl": 0.1767578125, "learning_rate": 7.84313725490196e-07, "loss": 0.0038, "ppl": 0.46875, "reward": 0.9468192756175995, "reward_std": 0.029405130073428154, "rewards/single_object_detection_bbox_reward": 0.9468193650245667, "step": 154, "temperature": 0.9 }, { "advantages": -1.086361226043664e-05, "completion_length": 88.5, "delta_ref_entropy_loss": 0.19482421875, "delta_ref_ppl": -0.17041015625, "entropy_loss": -0.587890625, "epoch": 0.21693491952414276, "grad_norm": 3.3236591870063013, "k1_kl": 0.16943359375, "k3_kl": 0.077392578125, "kimi_kl": 0.1416015625, "learning_rate": 7.829131652661064e-07, "loss": 0.0031, "ppl": 0.5234375, "reward": 0.9477995932102203, "reward_std": 0.04087623953819275, "rewards/single_object_detection_bbox_reward": 0.9477995932102203, "step": 155, "temperature": 0.9 }, { "advantages": -1.0572507562756073e-05, "completion_length": 51.625, "delta_ref_entropy_loss": 0.1796875, "delta_ref_ppl": -0.177734375, "entropy_loss": -0.578125, "epoch": 0.21833449965010496, "grad_norm": 2.848716982419972, "k1_kl": 0.177734375, "k3_kl": 0.086181640625, "kimi_kl": 0.17236328125, "learning_rate": 7.815126050420168e-07, "loss": 0.0035, "ppl": 0.505859375, "reward": 0.902997612953186, "reward_std": 0.0668251495808363, "rewards/single_object_detection_bbox_reward": 0.9029976725578308, "step": 156, "temperature": 0.9 }, { "advantages": -5.800942972200573e-06, "completion_length": 78.65625, "delta_ref_entropy_loss": 0.20751953125, "delta_ref_ppl": -0.173828125, "entropy_loss": -0.546875, "epoch": 0.21973407977606718, "grad_norm": 3.3441784685212737, "k1_kl": 0.17431640625, "k3_kl": 0.0810546875, "kimi_kl": 0.133056640625, "learning_rate": 7.801120448179271e-07, "loss": 0.0033, "ppl": 0.4814453125, "reward": 0.9708406627178192, "reward_std": 0.022656922810710967, "rewards/single_object_detection_bbox_reward": 0.970840722322464, "step": 157, "temperature": 0.9 }, { "advantages": -1.1654836953312042e-06, "completion_length": 107.0, "delta_ref_entropy_loss": 0.1884765625, "delta_ref_ppl": -0.16259765625, "entropy_loss": -0.556640625, "epoch": 0.22113365990202938, "grad_norm": 2.0814620746141252, "k1_kl": 0.16162109375, "k3_kl": 0.072021484375, "kimi_kl": 0.131103515625, "learning_rate": 7.787114845938375e-07, "loss": 0.0029, "ppl": 0.4892578125, "reward": 0.9289256930351257, "reward_std": 0.016465121414512396, "rewards/single_object_detection_bbox_reward": 0.9289257824420929, "step": 158, "temperature": 0.9 }, { "advantages": 8.029330729186768e-07, "completion_length": 41.59375, "delta_ref_entropy_loss": 0.19091796875, "delta_ref_ppl": -0.1982421875, "entropy_loss": -0.509765625, "epoch": 0.2225332400279916, "grad_norm": 2.9222535079320617, "k1_kl": 0.19873046875, "k3_kl": 0.103759765625, "kimi_kl": 0.22705078125, "learning_rate": 7.773109243697479e-07, "loss": 0.0042, "ppl": 0.4521484375, "reward": 0.9037781953811646, "reward_std": 0.03812484350055456, "rewards/single_object_detection_bbox_reward": 0.9037782847881317, "step": 159, "temperature": 0.9 }, { "advantages": 6.741977358615259e-06, "completion_length": 50.5, "delta_ref_entropy_loss": 0.1962890625, "delta_ref_ppl": -0.18359375, "entropy_loss": -0.583984375, "epoch": 0.22393282015395383, "grad_norm": 2.6855679528803127, "k1_kl": 0.18359375, "k3_kl": 0.082275390625, "kimi_kl": 0.1611328125, "learning_rate": 7.759103641456583e-07, "loss": 0.0033, "ppl": 0.515625, "reward": 0.9511028826236725, "reward_std": 0.01101148611633107, "rewards/single_object_detection_bbox_reward": 0.9511029422283173, "step": 160, "temperature": 0.9 }, { "advantages": 6.493048203992657e-06, "completion_length": 21.65625, "delta_ref_entropy_loss": 0.18701171875, "delta_ref_ppl": -0.1982421875, "entropy_loss": -0.58984375, "epoch": 0.22533240027991602, "grad_norm": 1.924256216329314, "k1_kl": 0.19873046875, "k3_kl": 0.099365234375, "kimi_kl": 0.25439453125, "learning_rate": 7.745098039215686e-07, "loss": 0.004, "ppl": 0.5234375, "reward": 0.945155680179596, "reward_std": 0.032420193776488304, "rewards/single_object_detection_bbox_reward": 0.9451557397842407, "step": 161, "temperature": 0.9 }, { "advantages": -4.402228796607233e-06, "completion_length": 42.0, "delta_ref_entropy_loss": 0.208984375, "delta_ref_ppl": -0.1943359375, "entropy_loss": -0.5390625, "epoch": 0.22673198040587825, "grad_norm": 2.868349769199714, "k1_kl": 0.1943359375, "k3_kl": 0.0888671875, "kimi_kl": 0.1650390625, "learning_rate": 7.73109243697479e-07, "loss": 0.0036, "ppl": 0.4755859375, "reward": 0.9600699245929718, "reward_std": 0.03601648099720478, "rewards/single_object_detection_bbox_reward": 0.9600699543952942, "step": 162, "temperature": 0.9 }, { "advantages": -1.5351123693108093e-05, "completion_length": 31.53125, "delta_ref_entropy_loss": 0.22802734375, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.5146484375, "epoch": 0.22813156053184044, "grad_norm": 28.692038424941682, "k1_kl": 0.21435546875, "k3_kl": 0.190673828125, "kimi_kl": 0.251953125, "learning_rate": 7.717086834733894e-07, "loss": 0.0077, "ppl": 0.4521484375, "reward": 0.9247898161411285, "reward_std": 0.018508149310946465, "rewards/single_object_detection_bbox_reward": 0.9247898757457733, "step": 163, "temperature": 0.9 }, { "advantages": -8.343586387127289e-06, "completion_length": 49.5, "delta_ref_entropy_loss": 0.20654296875, "delta_ref_ppl": -0.22216796875, "entropy_loss": -0.5703125, "epoch": 0.22953114065780267, "grad_norm": 4.050769005440449, "k1_kl": 0.22216796875, "k3_kl": 0.10986328125, "kimi_kl": 0.2509765625, "learning_rate": 7.703081232492997e-07, "loss": 0.0044, "ppl": 0.5, "reward": 0.9541065692901611, "reward_std": 0.02372945920797065, "rewards/single_object_detection_bbox_reward": 0.9541065990924835, "step": 164, "temperature": 0.9 }, { "advantages": -1.3056344869255554e-05, "completion_length": 133.0625, "delta_ref_entropy_loss": 0.19970703125, "delta_ref_ppl": -0.19091796875, "entropy_loss": -0.533203125, "epoch": 0.23093072078376486, "grad_norm": 2.7461135324964894, "k1_kl": 0.1904296875, "k3_kl": 0.09619140625, "kimi_kl": 0.21240234375, "learning_rate": 7.689075630252101e-07, "loss": 0.0039, "ppl": 0.47265625, "reward": 0.8709410429000854, "reward_std": 0.0272610099054873, "rewards/single_object_detection_bbox_reward": 0.8709411025047302, "step": 165, "temperature": 0.9 }, { "advantages": -3.255105639254907e-06, "completion_length": 106.53125, "delta_ref_entropy_loss": 0.21240234375, "delta_ref_ppl": -0.18994140625, "entropy_loss": -0.529296875, "epoch": 0.2323303009097271, "grad_norm": 2.2901550239126056, "k1_kl": 0.18994140625, "k3_kl": 0.08740234375, "kimi_kl": 0.169189453125, "learning_rate": 7.675070028011204e-07, "loss": 0.0035, "ppl": 0.4619140625, "reward": 0.9782087206840515, "reward_std": 0.011037688032956794, "rewards/single_object_detection_bbox_reward": 0.9782087802886963, "step": 166, "temperature": 0.9 }, { "advantages": -3.3708557793943328e-06, "completion_length": 70.0, "delta_ref_entropy_loss": 0.228515625, "delta_ref_ppl": -0.22265625, "entropy_loss": -0.57421875, "epoch": 0.23372988103568929, "grad_norm": 26.602835989094597, "k1_kl": 0.22265625, "k3_kl": 0.105712890625, "kimi_kl": 0.2451171875, "learning_rate": 7.661064425770309e-07, "loss": 0.0042, "ppl": 0.5078125, "reward": 0.9463842809200287, "reward_std": 0.011603296268731356, "rewards/single_object_detection_bbox_reward": 0.9463843107223511, "step": 167, "temperature": 0.9 }, { "advantages": -5.02488478559826e-06, "completion_length": 40.5, "delta_ref_entropy_loss": 0.263671875, "delta_ref_ppl": -0.259765625, "entropy_loss": -0.53515625, "epoch": 0.2351294611616515, "grad_norm": 3.0913452277704425, "k1_kl": 0.2607421875, "k3_kl": 0.140869140625, "kimi_kl": 0.345703125, "learning_rate": 7.647058823529411e-07, "loss": 0.0056, "ppl": 0.4736328125, "reward": 0.9260867834091187, "reward_std": 0.040385862812399864, "rewards/single_object_detection_bbox_reward": 0.9260869026184082, "step": 168, "temperature": 0.9 }, { "advantages": 6.264342118811328e-06, "completion_length": 47.4375, "delta_ref_entropy_loss": 0.2138671875, "delta_ref_ppl": -0.1689453125, "entropy_loss": -0.5703125, "epoch": 0.2365290412876137, "grad_norm": 2.9160986836994454, "k1_kl": 0.169921875, "k3_kl": 0.074951171875, "kimi_kl": 0.135498046875, "learning_rate": 7.633053221288515e-07, "loss": 0.003, "ppl": 0.505859375, "reward": 0.910872220993042, "reward_std": 0.034245869785081595, "rewards/single_object_detection_bbox_reward": 0.9108722805976868, "step": 169, "temperature": 0.9 }, { "advantages": -3.1420164532391937e-06, "completion_length": 78.53125, "delta_ref_entropy_loss": 0.205078125, "delta_ref_ppl": -0.18896484375, "entropy_loss": -0.5703125, "epoch": 0.23792862141357593, "grad_norm": 2.3698946217329206, "k1_kl": 0.18896484375, "k3_kl": 0.0950927734375, "kimi_kl": 0.20947265625, "learning_rate": 7.619047619047618e-07, "loss": 0.0038, "ppl": 0.505859375, "reward": 0.9604765176773071, "reward_std": 0.017504170071333647, "rewards/single_object_detection_bbox_reward": 0.9604765772819519, "step": 170, "temperature": 0.9 }, { "advantages": -7.306092669523423e-06, "completion_length": 80.6875, "delta_ref_entropy_loss": 0.1904296875, "delta_ref_ppl": -0.185546875, "entropy_loss": -0.552734375, "epoch": 0.23932820153953813, "grad_norm": 2.572083554935536, "k1_kl": 0.185546875, "k3_kl": 0.095703125, "kimi_kl": 0.203125, "learning_rate": 7.605042016806722e-07, "loss": 0.0038, "ppl": 0.486328125, "reward": 0.965759813785553, "reward_std": 0.03524491656571627, "rewards/single_object_detection_bbox_reward": 0.9657598435878754, "step": 171, "temperature": 0.9 }, { "advantages": -2.79928963209386e-06, "completion_length": 69.1875, "delta_ref_entropy_loss": 0.1865234375, "delta_ref_ppl": -0.15283203125, "entropy_loss": -0.53515625, "epoch": 0.24072778166550035, "grad_norm": 2.6958648258856046, "k1_kl": 0.1533203125, "k3_kl": 0.069091796875, "kimi_kl": 0.119873046875, "learning_rate": 7.591036414565826e-07, "loss": 0.0028, "ppl": 0.470703125, "reward": 0.9322323203086853, "reward_std": 0.03738449327647686, "rewards/single_object_detection_bbox_reward": 0.9322323799133301, "step": 172, "temperature": 0.9 }, { "advantages": 3.2040156838775147e-06, "completion_length": 68.0, "delta_ref_entropy_loss": 0.2001953125, "delta_ref_ppl": -0.1708984375, "entropy_loss": -0.55859375, "epoch": 0.24212736179146255, "grad_norm": 1.7132862282682497, "k1_kl": 0.1708984375, "k3_kl": 0.0791015625, "kimi_kl": 0.15234375, "learning_rate": 7.577030812324929e-07, "loss": 0.0032, "ppl": 0.4912109375, "reward": 0.9616701006889343, "reward_std": 0.010663798078894615, "rewards/single_object_detection_bbox_reward": 0.9616701900959015, "step": 173, "temperature": 0.9 }, { "advantages": -1.7849463347374694e-06, "completion_length": 70.0, "delta_ref_entropy_loss": 0.2529296875, "delta_ref_ppl": -0.21533203125, "entropy_loss": -0.515625, "epoch": 0.24352694191742477, "grad_norm": 3.801769519616892, "k1_kl": 0.2158203125, "k3_kl": 0.0986328125, "kimi_kl": 0.179443359375, "learning_rate": 7.563025210084033e-07, "loss": 0.004, "ppl": 0.458984375, "reward": 0.9315775334835052, "reward_std": 0.03504119021818042, "rewards/single_object_detection_bbox_reward": 0.9315776526927948, "step": 174, "temperature": 0.9 }, { "advantages": 7.051974762362079e-06, "completion_length": 51.78125, "delta_ref_entropy_loss": 0.21533203125, "delta_ref_ppl": -0.19091796875, "entropy_loss": -0.6171875, "epoch": 0.244926522043387, "grad_norm": 2.4517497031713154, "k1_kl": 0.1904296875, "k3_kl": 0.094482421875, "kimi_kl": 0.21337890625, "learning_rate": 7.549019607843136e-07, "loss": 0.0038, "ppl": 0.548828125, "reward": 0.9190386831760406, "reward_std": 0.029360241256654263, "rewards/single_object_detection_bbox_reward": 0.919038712978363, "step": 175, "temperature": 0.9 }, { "advantages": -6.426924528568634e-06, "completion_length": 140.6875, "delta_ref_entropy_loss": 0.22607421875, "delta_ref_ppl": -0.18017578125, "entropy_loss": -0.5390625, "epoch": 0.2463261021693492, "grad_norm": 3.0885676621748774, "k1_kl": 0.18115234375, "k3_kl": 0.076904296875, "kimi_kl": 0.12646484375, "learning_rate": 7.535014005602241e-07, "loss": 0.0031, "ppl": 0.4794921875, "reward": 0.9728661477565765, "reward_std": 0.013553782366216183, "rewards/single_object_detection_bbox_reward": 0.9728662073612213, "step": 176, "temperature": 0.9 }, { "advantages": -9.463035951284837e-06, "completion_length": 131.125, "delta_ref_entropy_loss": 0.19482421875, "delta_ref_ppl": -0.1845703125, "entropy_loss": -0.57421875, "epoch": 0.24772568229531142, "grad_norm": 2.2062218963374733, "k1_kl": 0.18505859375, "k3_kl": 0.088623046875, "kimi_kl": 0.185546875, "learning_rate": 7.521008403361344e-07, "loss": 0.0035, "ppl": 0.50390625, "reward": 0.9472881853580475, "reward_std": 0.021736339665949345, "rewards/single_object_detection_bbox_reward": 0.9472882449626923, "step": 177, "temperature": 0.9 }, { "advantages": 7.323920726776123e-06, "completion_length": 79.09375, "delta_ref_entropy_loss": 0.23046875, "delta_ref_ppl": -0.18896484375, "entropy_loss": -0.568359375, "epoch": 0.2491252624212736, "grad_norm": 2.759151068706165, "k1_kl": 0.1884765625, "k3_kl": 0.085205078125, "kimi_kl": 0.15576171875, "learning_rate": 7.507002801120448e-07, "loss": 0.0034, "ppl": 0.498046875, "reward": 0.9555157721042633, "reward_std": 0.0155285126529634, "rewards/single_object_detection_bbox_reward": 0.9555157721042633, "step": 178, "temperature": 0.9 }, { "advantages": 9.053253961610608e-06, "completion_length": 40.1875, "delta_ref_entropy_loss": 0.24072265625, "delta_ref_ppl": -0.19140625, "entropy_loss": -0.5234375, "epoch": 0.25052484254723584, "grad_norm": 2.797813326422486, "k1_kl": 0.19091796875, "k3_kl": 0.0908203125, "kimi_kl": 0.14453125, "learning_rate": 7.492997198879551e-07, "loss": 0.0036, "ppl": 0.46875, "reward": 0.9704452753067017, "reward_std": 0.034053447656333447, "rewards/single_object_detection_bbox_reward": 0.9704453349113464, "step": 179, "temperature": 0.9 }, { "advantages": 1.5821838132978883e-06, "completion_length": 109.0, "delta_ref_entropy_loss": 0.201171875, "delta_ref_ppl": -0.18408203125, "entropy_loss": -0.57421875, "epoch": 0.25192442267319803, "grad_norm": 2.867476442967632, "k1_kl": 0.1845703125, "k3_kl": 0.082763671875, "kimi_kl": 0.14794921875, "learning_rate": 7.478991596638656e-07, "loss": 0.0033, "ppl": 0.5087890625, "reward": 0.895580381155014, "reward_std": 0.05575796030461788, "rewards/single_object_detection_bbox_reward": 0.8955805003643036, "step": 180, "temperature": 0.9 }, { "advantages": -4.6300101530505344e-08, "completion_length": 45.0625, "delta_ref_entropy_loss": 0.2119140625, "delta_ref_ppl": -0.1728515625, "entropy_loss": -0.546875, "epoch": 0.25332400279916023, "grad_norm": 2.816474986012522, "k1_kl": 0.171875, "k3_kl": 0.076904296875, "kimi_kl": 0.14208984375, "learning_rate": 7.464985994397759e-07, "loss": 0.0031, "ppl": 0.4833984375, "reward": 0.9478709995746613, "reward_std": 0.030483032576739788, "rewards/single_object_detection_bbox_reward": 0.947871059179306, "step": 181, "temperature": 0.9 }, { "advantages": 1.1658296443783911e-05, "completion_length": 117.375, "delta_ref_entropy_loss": 0.21728515625, "delta_ref_ppl": -0.1962890625, "entropy_loss": -0.607421875, "epoch": 0.2547235829251225, "grad_norm": 3.11759814114824, "k1_kl": 0.19677734375, "k3_kl": 0.093017578125, "kimi_kl": 0.164306640625, "learning_rate": 7.450980392156863e-07, "loss": 0.0037, "ppl": 0.5390625, "reward": 0.9003268182277679, "reward_std": 0.043751793913543224, "rewards/single_object_detection_bbox_reward": 0.9003269076347351, "step": 182, "temperature": 0.9 }, { "advantages": -6.432245982068707e-06, "completion_length": 63.75, "delta_ref_entropy_loss": 0.2216796875, "delta_ref_ppl": -0.19970703125, "entropy_loss": -0.5048828125, "epoch": 0.2561231630510847, "grad_norm": 2.79420014141938, "k1_kl": 0.2001953125, "k3_kl": 0.099609375, "kimi_kl": 0.19140625, "learning_rate": 7.436974789915966e-07, "loss": 0.004, "ppl": 0.4462890625, "reward": 0.9249684512615204, "reward_std": 0.038427422288805246, "rewards/single_object_detection_bbox_reward": 0.9249685704708099, "step": 183, "temperature": 0.9 }, { "advantages": -1.8187399746238953e-06, "completion_length": 106.71875, "delta_ref_entropy_loss": 0.23974609375, "delta_ref_ppl": -0.20556640625, "entropy_loss": -0.5400390625, "epoch": 0.2575227431770469, "grad_norm": 2.8109957588177985, "k1_kl": 0.20654296875, "k3_kl": 0.095947265625, "kimi_kl": 0.1650390625, "learning_rate": 7.42296918767507e-07, "loss": 0.0038, "ppl": 0.4697265625, "reward": 0.9473485052585602, "reward_std": 0.03562370361760259, "rewards/single_object_detection_bbox_reward": 0.9473486244678497, "step": 184, "temperature": 0.9 }, { "advantages": -2.3297434836422326e-05, "completion_length": 109.75, "delta_ref_entropy_loss": 0.21435546875, "delta_ref_ppl": -0.18359375, "entropy_loss": -0.54296875, "epoch": 0.25892232330300907, "grad_norm": 2.619166588098137, "k1_kl": 0.18408203125, "k3_kl": 0.08837890625, "kimi_kl": 0.16357421875, "learning_rate": 7.408963585434174e-07, "loss": 0.0036, "ppl": 0.4765625, "reward": 0.9215331971645355, "reward_std": 0.037805890664458275, "rewards/single_object_detection_bbox_reward": 0.9215332865715027, "step": 185, "temperature": 0.9 }, { "advantages": -3.0318540211737854e-06, "completion_length": 87.625, "delta_ref_entropy_loss": 0.21337890625, "delta_ref_ppl": -0.18505859375, "entropy_loss": -0.521484375, "epoch": 0.2603219034289713, "grad_norm": 2.8741042835517154, "k1_kl": 0.18408203125, "k3_kl": 0.085205078125, "kimi_kl": 0.1611328125, "learning_rate": 7.394957983193277e-07, "loss": 0.0034, "ppl": 0.4599609375, "reward": 0.9316982626914978, "reward_std": 0.018972303485497832, "rewards/single_object_detection_bbox_reward": 0.931698352098465, "step": 186, "temperature": 0.9 }, { "advantages": 5.372400892156293e-06, "completion_length": 124.875, "delta_ref_entropy_loss": 0.22998046875, "delta_ref_ppl": -0.24267578125, "entropy_loss": -0.544921875, "epoch": 0.2617214835549335, "grad_norm": 5.547180859450594, "k1_kl": 0.2431640625, "k3_kl": 0.123779296875, "kimi_kl": 0.25732421875, "learning_rate": 7.380952380952381e-07, "loss": 0.0049, "ppl": 0.4794921875, "reward": 0.9490662515163422, "reward_std": 0.01883362978696823, "rewards/single_object_detection_bbox_reward": 0.9490663111209869, "step": 187, "temperature": 0.9 }, { "advantages": -1.416674706433696e-06, "completion_length": 51.25, "delta_ref_entropy_loss": 0.21630859375, "delta_ref_ppl": -0.20556640625, "entropy_loss": -0.5390625, "epoch": 0.2631210636808957, "grad_norm": 2.179398016963401, "k1_kl": 0.205078125, "k3_kl": 0.099609375, "kimi_kl": 0.19921875, "learning_rate": 7.366946778711484e-07, "loss": 0.004, "ppl": 0.470703125, "reward": 0.9575863182544708, "reward_std": 0.020031790249049664, "rewards/single_object_detection_bbox_reward": 0.9575863480567932, "step": 188, "temperature": 0.9 }, { "advantages": -1.5538053830255194e-05, "completion_length": 54.125, "delta_ref_entropy_loss": 0.2236328125, "delta_ref_ppl": -0.1923828125, "entropy_loss": -0.517578125, "epoch": 0.26452064380685797, "grad_norm": 1.6580674542485194, "k1_kl": 0.19287109375, "k3_kl": 0.088623046875, "kimi_kl": 0.171630859375, "learning_rate": 7.352941176470589e-07, "loss": 0.0036, "ppl": 0.458984375, "reward": 0.978038102388382, "reward_std": 0.031134886667132378, "rewards/single_object_detection_bbox_reward": 0.9780381917953491, "step": 189, "temperature": 0.9 }, { "advantages": -7.565266741949017e-06, "completion_length": 50.40625, "delta_ref_entropy_loss": 0.20947265625, "delta_ref_ppl": -0.205078125, "entropy_loss": -0.50390625, "epoch": 0.26592022393282017, "grad_norm": 2.8469797995035484, "k1_kl": 0.2060546875, "k3_kl": 0.0970458984375, "kimi_kl": 0.187255859375, "learning_rate": 7.338935574229691e-07, "loss": 0.0039, "ppl": 0.439453125, "reward": 0.977291464805603, "reward_std": 0.032572416588664055, "rewards/single_object_detection_bbox_reward": 0.977291464805603, "step": 190, "temperature": 0.9 }, { "advantages": 1.7495560769020813e-06, "completion_length": 96.28125, "delta_ref_entropy_loss": 0.21728515625, "delta_ref_ppl": -0.171875, "entropy_loss": -0.560546875, "epoch": 0.26731980405878236, "grad_norm": 4.619888087976672, "k1_kl": 0.17138671875, "k3_kl": 0.075927734375, "kimi_kl": 0.120849609375, "learning_rate": 7.324929971988795e-07, "loss": 0.003, "ppl": 0.4951171875, "reward": 0.9223224222660065, "reward_std": 0.03753358591347933, "rewards/single_object_detection_bbox_reward": 0.9223224818706512, "step": 191, "temperature": 0.9 }, { "advantages": 4.395576638671628e-06, "completion_length": 32.71875, "delta_ref_entropy_loss": 0.2373046875, "delta_ref_ppl": -0.19775390625, "entropy_loss": -0.521484375, "epoch": 0.26871938418474456, "grad_norm": 3.937376098511523, "k1_kl": 0.19775390625, "k3_kl": 0.091064453125, "kimi_kl": 0.171875, "learning_rate": 7.310924369747898e-07, "loss": 0.0036, "ppl": 0.455078125, "reward": 0.9690508544445038, "reward_std": 0.036797983571887016, "rewards/single_object_detection_bbox_reward": 0.9690509140491486, "step": 192, "temperature": 0.9 }, { "advantages": 8.65444849296182e-06, "completion_length": 89.03125, "delta_ref_entropy_loss": 0.20751953125, "delta_ref_ppl": -0.15283203125, "entropy_loss": -0.541015625, "epoch": 0.2701189643107068, "grad_norm": 3.4055579938143827, "k1_kl": 0.15283203125, "k3_kl": 0.061767578125, "kimi_kl": 0.095458984375, "learning_rate": 7.296918767507003e-07, "loss": 0.0025, "ppl": 0.4716796875, "reward": 0.9404144883155823, "reward_std": 0.021571168210357428, "rewards/single_object_detection_bbox_reward": 0.940414547920227, "step": 193, "temperature": 0.9 }, { "advantages": -1.7774948446458438e-07, "completion_length": 102.3125, "delta_ref_entropy_loss": 0.2060546875, "delta_ref_ppl": -0.17919921875, "entropy_loss": -0.58984375, "epoch": 0.271518544436669, "grad_norm": 2.1172355333038277, "k1_kl": 0.1796875, "k3_kl": 0.07666015625, "kimi_kl": 0.134033203125, "learning_rate": 7.282913165266106e-07, "loss": 0.0031, "ppl": 0.517578125, "reward": 0.9541440606117249, "reward_std": 0.027768636122345924, "rewards/single_object_detection_bbox_reward": 0.9541441202163696, "step": 194, "temperature": 0.9 }, { "advantages": 7.435280622303253e-06, "completion_length": 61.0, "delta_ref_entropy_loss": 0.2216796875, "delta_ref_ppl": -0.18408203125, "entropy_loss": -0.529296875, "epoch": 0.2729181245626312, "grad_norm": 3.3702779140156056, "k1_kl": 0.18408203125, "k3_kl": 0.0791015625, "kimi_kl": 0.124755859375, "learning_rate": 7.268907563025209e-07, "loss": 0.0032, "ppl": 0.4638671875, "reward": 0.9253318011760712, "reward_std": 0.02412689570337534, "rewards/single_object_detection_bbox_reward": 0.9253318011760712, "step": 195, "temperature": 0.9 }, { "advantages": 6.323547609099478e-06, "completion_length": 80.6875, "delta_ref_entropy_loss": 0.21875, "delta_ref_ppl": -0.18408203125, "entropy_loss": -0.5625, "epoch": 0.2743177046885934, "grad_norm": 4.275250769827655, "k1_kl": 0.18408203125, "k3_kl": 0.08642578125, "kimi_kl": 0.15478515625, "learning_rate": 7.254901960784313e-07, "loss": 0.0034, "ppl": 0.4990234375, "reward": 0.9466573596000671, "reward_std": 0.029067616909742355, "rewards/single_object_detection_bbox_reward": 0.9466574192047119, "step": 196, "temperature": 0.9 }, { "advantages": -4.716616899713699e-06, "completion_length": 60.5625, "delta_ref_entropy_loss": 0.22119140625, "delta_ref_ppl": -0.17529296875, "entropy_loss": -0.50390625, "epoch": 0.27571728481455565, "grad_norm": 2.6928776638789325, "k1_kl": 0.17529296875, "k3_kl": 0.077880859375, "kimi_kl": 0.140869140625, "learning_rate": 7.240896358543416e-07, "loss": 0.0031, "ppl": 0.44140625, "reward": 0.9904437363147736, "reward_std": 0.01456612441688776, "rewards/single_object_detection_bbox_reward": 0.9904438257217407, "step": 197, "temperature": 0.9 }, { "advantages": 1.0193725017870747e-05, "completion_length": 60.5, "delta_ref_entropy_loss": 0.22509765625, "delta_ref_ppl": -0.23876953125, "entropy_loss": -0.5390625, "epoch": 0.27711686494051785, "grad_norm": 3.284947098098428, "k1_kl": 0.2392578125, "k3_kl": 0.127197265625, "kimi_kl": 0.24365234375, "learning_rate": 7.226890756302521e-07, "loss": 0.0051, "ppl": 0.4697265625, "reward": 0.9131086468696594, "reward_std": 0.02756919330568053, "rewards/single_object_detection_bbox_reward": 0.9131087064743042, "step": 198, "temperature": 0.9 }, { "advantages": -6.334989166134619e-06, "completion_length": 53.71875, "delta_ref_entropy_loss": 0.25927734375, "delta_ref_ppl": -0.23486328125, "entropy_loss": -0.513671875, "epoch": 0.27851644506648005, "grad_norm": 2.654432455007358, "k1_kl": 0.234375, "k3_kl": 0.11572265625, "kimi_kl": 0.23046875, "learning_rate": 7.212885154061624e-07, "loss": 0.0046, "ppl": 0.4541015625, "reward": 0.9552677273750305, "reward_std": 0.019588098861277103, "rewards/single_object_detection_bbox_reward": 0.9552678465843201, "step": 199, "temperature": 0.9 }, { "advantages": -2.0187349036859814e-05, "completion_length": 75.625, "delta_ref_entropy_loss": 0.20849609375, "delta_ref_ppl": -0.18994140625, "entropy_loss": -0.478515625, "epoch": 0.27991602519244224, "grad_norm": 2.2694707824666343, "k1_kl": 0.189453125, "k3_kl": 0.08984375, "kimi_kl": 0.162841796875, "learning_rate": 7.198879551820728e-07, "loss": 0.0036, "ppl": 0.4228515625, "reward": 0.9488168060779572, "reward_std": 0.032012284733355045, "rewards/single_object_detection_bbox_reward": 0.9488168358802795, "step": 200, "temperature": 0.9 }, { "advantages": -1.671351537879673e-05, "completion_length": 33.0, "delta_ref_entropy_loss": 0.23974609375, "delta_ref_ppl": -0.19970703125, "entropy_loss": -0.552734375, "epoch": 0.2813156053184045, "grad_norm": 2.727875569413167, "k1_kl": 0.19970703125, "k3_kl": 0.089111328125, "kimi_kl": 0.16748046875, "learning_rate": 7.184873949579831e-07, "loss": 0.0036, "ppl": 0.4951171875, "reward": 0.9604962468147278, "reward_std": 0.02085621003061533, "rewards/single_object_detection_bbox_reward": 0.960496336221695, "step": 201, "temperature": 0.9 }, { "advantages": 5.688784426638449e-06, "completion_length": 164.9375, "delta_ref_entropy_loss": 0.2099609375, "delta_ref_ppl": -0.22265625, "entropy_loss": -0.5625, "epoch": 0.2827151854443667, "grad_norm": 5.052843478492764, "k1_kl": 0.22265625, "k3_kl": 0.123779296875, "kimi_kl": 0.29443359375, "learning_rate": 7.170868347338936e-07, "loss": 0.005, "ppl": 0.501953125, "reward": 0.9262741804122925, "reward_std": 0.051669150590896606, "rewards/single_object_detection_bbox_reward": 0.926274299621582, "step": 202, "temperature": 0.9 }, { "advantages": -4.652621555578662e-06, "completion_length": 52.0, "delta_ref_entropy_loss": 0.2626953125, "delta_ref_ppl": -0.228515625, "entropy_loss": -0.5166015625, "epoch": 0.2841147655703289, "grad_norm": 2.49220670262175, "k1_kl": 0.2294921875, "k3_kl": 0.108154296875, "kimi_kl": 0.228515625, "learning_rate": 7.156862745098039e-07, "loss": 0.0043, "ppl": 0.4521484375, "reward": 0.9508625566959381, "reward_std": 0.013735507382079959, "rewards/single_object_detection_bbox_reward": 0.9508626163005829, "step": 203, "temperature": 0.9 }, { "advantages": -9.198274440791465e-06, "completion_length": 137.78125, "delta_ref_entropy_loss": 0.20263671875, "delta_ref_ppl": -0.16796875, "entropy_loss": -0.548828125, "epoch": 0.28551434569629114, "grad_norm": 2.540824450599785, "k1_kl": 0.16845703125, "k3_kl": 0.0802001953125, "kimi_kl": 0.143310546875, "learning_rate": 7.142857142857143e-07, "loss": 0.0032, "ppl": 0.48828125, "reward": 0.9775774776935577, "reward_std": 0.019285129383206367, "rewards/single_object_detection_bbox_reward": 0.9775775372982025, "step": 204, "temperature": 0.9 }, { "advantages": 1.2622880603885278e-05, "completion_length": 57.09375, "delta_ref_entropy_loss": 0.2138671875, "delta_ref_ppl": -0.19775390625, "entropy_loss": -0.533203125, "epoch": 0.28691392582225334, "grad_norm": 2.2776469472364242, "k1_kl": 0.19775390625, "k3_kl": 0.090087890625, "kimi_kl": 0.1845703125, "learning_rate": 7.128851540616246e-07, "loss": 0.0036, "ppl": 0.4677734375, "reward": 0.9839803874492645, "reward_std": 0.01436405023559928, "rewards/single_object_detection_bbox_reward": 0.9839805066585541, "step": 205, "temperature": 0.9 }, { "advantages": -7.338290060943109e-06, "completion_length": 32.46875, "delta_ref_entropy_loss": 0.23095703125, "delta_ref_ppl": -0.2080078125, "entropy_loss": -0.546875, "epoch": 0.28831350594821553, "grad_norm": 9.100477933894902, "k1_kl": 0.2080078125, "k3_kl": 0.107666015625, "kimi_kl": 0.20361328125, "learning_rate": 7.11484593837535e-07, "loss": 0.0043, "ppl": 0.4853515625, "reward": 0.9525587558746338, "reward_std": 0.04317737743258476, "rewards/single_object_detection_bbox_reward": 0.9525587856769562, "step": 206, "temperature": 0.9 }, { "advantages": -1.5561070085823303e-06, "completion_length": 103.125, "delta_ref_entropy_loss": 0.19677734375, "delta_ref_ppl": -0.1591796875, "entropy_loss": -0.57421875, "epoch": 0.28971308607417773, "grad_norm": 4.0682114280063235, "k1_kl": 0.15869140625, "k3_kl": 0.068115234375, "kimi_kl": 0.116455078125, "learning_rate": 7.100840336134454e-07, "loss": 0.0027, "ppl": 0.5078125, "reward": 0.9792561233043671, "reward_std": 0.013496311381459236, "rewards/single_object_detection_bbox_reward": 0.9792561531066895, "step": 207, "temperature": 0.9 }, { "advantages": -2.222455103151333e-05, "completion_length": 79.09375, "delta_ref_entropy_loss": 0.18994140625, "delta_ref_ppl": -0.17236328125, "entropy_loss": -0.552734375, "epoch": 0.29111266620014, "grad_norm": 2.2031133218686723, "k1_kl": 0.171875, "k3_kl": 0.085205078125, "kimi_kl": 0.154296875, "learning_rate": 7.086834733893557e-07, "loss": 0.0034, "ppl": 0.494140625, "reward": 0.9178899228572845, "reward_std": 0.021509647369384766, "rewards/single_object_detection_bbox_reward": 0.9178899526596069, "step": 208, "temperature": 0.9 }, { "advantages": 5.263502657726349e-06, "completion_length": 78.5625, "delta_ref_entropy_loss": 0.197265625, "delta_ref_ppl": -0.1591796875, "entropy_loss": -0.52734375, "epoch": 0.2925122463261022, "grad_norm": 3.526403241443841, "k1_kl": 0.15869140625, "k3_kl": 0.0728759765625, "kimi_kl": 0.12060546875, "learning_rate": 7.072829131652661e-07, "loss": 0.0029, "ppl": 0.470703125, "reward": 0.971653938293457, "reward_std": 0.017245569732040167, "rewards/single_object_detection_bbox_reward": 0.9716539978981018, "step": 209, "temperature": 0.9 }, { "advantages": 1.2338696251390502e-06, "completion_length": 50.0, "delta_ref_entropy_loss": 0.21533203125, "delta_ref_ppl": -0.185546875, "entropy_loss": -0.533203125, "epoch": 0.2939118264520644, "grad_norm": 28.40544545134297, "k1_kl": 0.18603515625, "k3_kl": 0.092041015625, "kimi_kl": 0.1787109375, "learning_rate": 7.058823529411765e-07, "loss": 0.0037, "ppl": 0.46875, "reward": 0.9832829535007477, "reward_std": 0.012184732127934694, "rewards/single_object_detection_bbox_reward": 0.9832830429077148, "step": 210, "temperature": 0.9 }, { "advantages": -8.282651265290042e-06, "completion_length": 57.90625, "delta_ref_entropy_loss": 0.2197265625, "delta_ref_ppl": -0.22021484375, "entropy_loss": -0.53515625, "epoch": 0.29531140657802657, "grad_norm": 5.044536583361623, "k1_kl": 0.22021484375, "k3_kl": 0.109619140625, "kimi_kl": 0.2314453125, "learning_rate": 7.044817927170869e-07, "loss": 0.0044, "ppl": 0.4697265625, "reward": 0.952402800321579, "reward_std": 0.03314939793199301, "rewards/single_object_detection_bbox_reward": 0.9524028301239014, "step": 211, "temperature": 0.9 }, { "advantages": -1.8206023923994508e-06, "completion_length": 87.03125, "delta_ref_entropy_loss": 0.24658203125, "delta_ref_ppl": -0.22900390625, "entropy_loss": -0.568359375, "epoch": 0.2967109867039888, "grad_norm": 3.017113458528322, "k1_kl": 0.22998046875, "k3_kl": 0.11181640625, "kimi_kl": 0.22216796875, "learning_rate": 7.030812324929971e-07, "loss": 0.0045, "ppl": 0.5029296875, "reward": 0.9583162665367126, "reward_std": 0.029238158836960793, "rewards/single_object_detection_bbox_reward": 0.9583163261413574, "step": 212, "temperature": 0.9 }, { "advantages": -5.994923881758041e-06, "completion_length": 69.03125, "delta_ref_entropy_loss": 0.2275390625, "delta_ref_ppl": -0.2529296875, "entropy_loss": -0.5166015625, "epoch": 0.298110566829951, "grad_norm": 3.1415418284548617, "k1_kl": 0.25244140625, "k3_kl": 0.13427734375, "kimi_kl": 0.3046875, "learning_rate": 7.016806722689075e-07, "loss": 0.0054, "ppl": 0.451171875, "reward": 0.9768148064613342, "reward_std": 0.017831498757004738, "rewards/single_object_detection_bbox_reward": 0.976814866065979, "step": 213, "temperature": 0.9 }, { "advantages": -2.474124812579248e-06, "completion_length": 109.8125, "delta_ref_entropy_loss": 0.20068359375, "delta_ref_ppl": -0.1591796875, "entropy_loss": -0.521484375, "epoch": 0.2995101469559132, "grad_norm": 5.033721951705269, "k1_kl": 0.15869140625, "k3_kl": 0.0684814453125, "kimi_kl": 0.117919921875, "learning_rate": 7.002801120448178e-07, "loss": 0.0027, "ppl": 0.455078125, "reward": 0.9122512936592102, "reward_std": 0.04681274387985468, "rewards/single_object_detection_bbox_reward": 0.9122513830661774, "step": 214, "temperature": 0.9 }, { "advantages": 1.936419312187354e-05, "completion_length": 69.0, "delta_ref_entropy_loss": 0.22607421875, "delta_ref_ppl": -0.2138671875, "entropy_loss": -0.5029296875, "epoch": 0.3009097270818754, "grad_norm": 2.21908711454936, "k1_kl": 0.2138671875, "k3_kl": 0.1064453125, "kimi_kl": 0.22265625, "learning_rate": 6.988795518207283e-07, "loss": 0.0042, "ppl": 0.443359375, "reward": 0.9404851496219635, "reward_std": 0.013676028233021498, "rewards/single_object_detection_bbox_reward": 0.9404852390289307, "step": 215, "temperature": 0.9 }, { "advantages": -3.833855771517847e-06, "completion_length": 98.25, "delta_ref_entropy_loss": 0.21630859375, "delta_ref_ppl": -0.22802734375, "entropy_loss": -0.533203125, "epoch": 0.30230930720783766, "grad_norm": 5.130392507951878, "k1_kl": 0.22802734375, "k3_kl": 0.115234375, "kimi_kl": 0.23291015625, "learning_rate": 6.974789915966386e-07, "loss": 0.0046, "ppl": 0.4677734375, "reward": 0.9216841161251068, "reward_std": 0.04520721361041069, "rewards/single_object_detection_bbox_reward": 0.921684205532074, "step": 216, "temperature": 0.9 }, { "advantages": 1.0101923294314474e-05, "completion_length": 52.59375, "delta_ref_entropy_loss": 0.24560546875, "delta_ref_ppl": -0.24267578125, "entropy_loss": -0.5205078125, "epoch": 0.30370888733379986, "grad_norm": 2.574237807574603, "k1_kl": 0.24267578125, "k3_kl": 0.116455078125, "kimi_kl": 0.2197265625, "learning_rate": 6.960784313725489e-07, "loss": 0.0047, "ppl": 0.455078125, "reward": 0.9644257426261902, "reward_std": 0.03963810205459595, "rewards/single_object_detection_bbox_reward": 0.9644257724285126, "step": 217, "temperature": 0.9 }, { "advantages": 1.6311446415784303e-07, "completion_length": 21.0, "delta_ref_entropy_loss": 0.23828125, "delta_ref_ppl": -0.189453125, "entropy_loss": -0.55078125, "epoch": 0.30510846745976206, "grad_norm": 2.7809366999101583, "k1_kl": 0.18994140625, "k3_kl": 0.081787109375, "kimi_kl": 0.1298828125, "learning_rate": 6.946778711484593e-07, "loss": 0.0033, "ppl": 0.482421875, "reward": 0.9405972361564636, "reward_std": 0.03303752979263663, "rewards/single_object_detection_bbox_reward": 0.940597265958786, "step": 218, "temperature": 0.9 }, { "advantages": -1.5917101791274035e-05, "completion_length": 39.5, "delta_ref_entropy_loss": 0.1865234375, "delta_ref_ppl": -0.1611328125, "entropy_loss": -0.5546875, "epoch": 0.3065080475857243, "grad_norm": 2.6494537209052544, "k1_kl": 0.1611328125, "k3_kl": 0.067626953125, "kimi_kl": 0.115478515625, "learning_rate": 6.932773109243697e-07, "loss": 0.0027, "ppl": 0.490234375, "reward": 0.9285969138145447, "reward_std": 0.04202980920672417, "rewards/single_object_detection_bbox_reward": 0.9285970032215118, "step": 219, "temperature": 0.9 }, { "advantages": 1.221456255962039e-05, "completion_length": 52.0, "delta_ref_entropy_loss": 0.24853515625, "delta_ref_ppl": -0.22216796875, "entropy_loss": -0.4765625, "epoch": 0.3079076277116865, "grad_norm": 2.869506273353324, "k1_kl": 0.22216796875, "k3_kl": 0.109130859375, "kimi_kl": 0.20751953125, "learning_rate": 6.918767507002801e-07, "loss": 0.0044, "ppl": 0.427734375, "reward": 0.9693328142166138, "reward_std": 0.03380634821951389, "rewards/single_object_detection_bbox_reward": 0.9693328738212585, "step": 220, "temperature": 0.9 }, { "advantages": 1.376042553147272e-05, "completion_length": 68.3125, "delta_ref_entropy_loss": 0.22216796875, "delta_ref_ppl": -0.2197265625, "entropy_loss": -0.5087890625, "epoch": 0.3093072078376487, "grad_norm": 2.54554565296058, "k1_kl": 0.2197265625, "k3_kl": 0.111083984375, "kimi_kl": 0.23974609375, "learning_rate": 6.904761904761904e-07, "loss": 0.0044, "ppl": 0.44921875, "reward": 0.9312529861927032, "reward_std": 0.0035680003638844937, "rewards/single_object_detection_bbox_reward": 0.9312530755996704, "step": 221, "temperature": 0.9 }, { "advantages": -3.916477908205707e-06, "completion_length": 61.5, "delta_ref_entropy_loss": 0.25, "delta_ref_ppl": -0.23486328125, "entropy_loss": -0.4912109375, "epoch": 0.3107067879636109, "grad_norm": 2.2965539154849264, "k1_kl": 0.2353515625, "k3_kl": 0.117919921875, "kimi_kl": 0.2490234375, "learning_rate": 6.890756302521008e-07, "loss": 0.0047, "ppl": 0.43359375, "reward": 0.9617359936237335, "reward_std": 0.027538195718079805, "rewards/single_object_detection_bbox_reward": 0.9617360532283783, "step": 222, "temperature": 0.9 }, { "advantages": -7.3839248670992674e-06, "completion_length": 49.53125, "delta_ref_entropy_loss": 0.2177734375, "delta_ref_ppl": -0.20263671875, "entropy_loss": -0.541015625, "epoch": 0.31210636808957315, "grad_norm": 2.625685477368835, "k1_kl": 0.20263671875, "k3_kl": 0.091796875, "kimi_kl": 0.15673828125, "learning_rate": 6.876750700280112e-07, "loss": 0.0037, "ppl": 0.4765625, "reward": 0.9047788977622986, "reward_std": 0.04448643885552883, "rewards/single_object_detection_bbox_reward": 0.904778927564621, "step": 223, "temperature": 0.9 }, { "advantages": -5.01716806411423e-06, "completion_length": 65.25, "delta_ref_entropy_loss": 0.22216796875, "delta_ref_ppl": -0.19677734375, "entropy_loss": -0.5703125, "epoch": 0.31350594821553535, "grad_norm": 3.1347214963596643, "k1_kl": 0.19775390625, "k3_kl": 0.09423828125, "kimi_kl": 0.1865234375, "learning_rate": 6.862745098039216e-07, "loss": 0.0038, "ppl": 0.5, "reward": 0.9361460208892822, "reward_std": 0.03709167242050171, "rewards/single_object_detection_bbox_reward": 0.9361461400985718, "step": 224, "temperature": 0.9 }, { "advantages": -6.000910616421606e-06, "completion_length": 89.90625, "delta_ref_entropy_loss": 0.251953125, "delta_ref_ppl": -0.201171875, "entropy_loss": -0.4970703125, "epoch": 0.31490552834149754, "grad_norm": 3.935703502328405, "k1_kl": 0.201171875, "k3_kl": 0.087646484375, "kimi_kl": 0.140869140625, "learning_rate": 6.848739495798319e-07, "loss": 0.0035, "ppl": 0.431640625, "reward": 0.9764271080493927, "reward_std": 0.026549814268946648, "rewards/single_object_detection_bbox_reward": 0.9764271080493927, "step": 225, "temperature": 0.9 }, { "advantages": -8.90038427314721e-06, "completion_length": 159.875, "delta_ref_entropy_loss": 0.19580078125, "delta_ref_ppl": -0.17431640625, "entropy_loss": -0.583984375, "epoch": 0.31630510846745974, "grad_norm": 1.8327450415611386, "k1_kl": 0.17529296875, "k3_kl": 0.079833984375, "kimi_kl": 0.14501953125, "learning_rate": 6.834733893557423e-07, "loss": 0.0032, "ppl": 0.5146484375, "reward": 0.9410186111927032, "reward_std": 0.030814019963145256, "rewards/single_object_detection_bbox_reward": 0.9410187005996704, "step": 226, "temperature": 0.9 }, { "advantages": -1.970412540686084e-06, "completion_length": 49.0625, "delta_ref_entropy_loss": 0.22509765625, "delta_ref_ppl": -0.20166015625, "entropy_loss": -0.53515625, "epoch": 0.317704688593422, "grad_norm": 4.244764181614656, "k1_kl": 0.2021484375, "k3_kl": 0.102783203125, "kimi_kl": 0.16845703125, "learning_rate": 6.820728291316526e-07, "loss": 0.0041, "ppl": 0.46875, "reward": 0.9215983748435974, "reward_std": 0.026116439141333103, "rewards/single_object_detection_bbox_reward": 0.921598494052887, "step": 227, "temperature": 0.9 }, { "advantages": 6.7181620977407874e-06, "completion_length": 108.0, "delta_ref_entropy_loss": 0.208984375, "delta_ref_ppl": -0.22509765625, "entropy_loss": -0.5322265625, "epoch": 0.3191042687193842, "grad_norm": 2.868993618486571, "k1_kl": 0.22509765625, "k3_kl": 0.1171875, "kimi_kl": 0.265625, "learning_rate": 6.80672268907563e-07, "loss": 0.0047, "ppl": 0.4716796875, "reward": 0.951713889837265, "reward_std": 0.041456589475274086, "rewards/single_object_detection_bbox_reward": 0.9517139196395874, "step": 228, "temperature": 0.9 }, { "advantages": 3.061656570935156e-06, "completion_length": 110.4375, "delta_ref_entropy_loss": 0.20849609375, "delta_ref_ppl": -0.1943359375, "entropy_loss": -0.5322265625, "epoch": 0.3205038488453464, "grad_norm": 8.406508043320535, "k1_kl": 0.1943359375, "k3_kl": 0.100341796875, "kimi_kl": 0.18798828125, "learning_rate": 6.792717086834734e-07, "loss": 0.004, "ppl": 0.474609375, "reward": 0.9637416005134583, "reward_std": 0.024823994375765324, "rewards/single_object_detection_bbox_reward": 0.9637415707111359, "step": 229, "temperature": 0.9 }, { "advantages": 8.000593766155362e-06, "completion_length": 42.5, "delta_ref_entropy_loss": 0.2275390625, "delta_ref_ppl": -0.24365234375, "entropy_loss": -0.4619140625, "epoch": 0.3219034289713086, "grad_norm": 2.709141887057067, "k1_kl": 0.24365234375, "k3_kl": 0.124267578125, "kimi_kl": 0.25634765625, "learning_rate": 6.778711484593837e-07, "loss": 0.005, "ppl": 0.4052734375, "reward": 0.9414671361446381, "reward_std": 0.030582991428673267, "rewards/single_object_detection_bbox_reward": 0.9414672553539276, "step": 230, "temperature": 0.9 }, { "advantages": 1.7937273355528305e-06, "completion_length": 79.125, "delta_ref_entropy_loss": 0.2392578125, "delta_ref_ppl": -0.22265625, "entropy_loss": -0.5390625, "epoch": 0.32330300909727083, "grad_norm": 4.085923289274958, "k1_kl": 0.22314453125, "k3_kl": 0.106689453125, "kimi_kl": 0.20166015625, "learning_rate": 6.764705882352941e-07, "loss": 0.0043, "ppl": 0.4794921875, "reward": 0.9497236609458923, "reward_std": 0.008686837740242481, "rewards/single_object_detection_bbox_reward": 0.9497237503528595, "step": 231, "temperature": 0.9 }, { "advantages": 2.787581706797937e-06, "completion_length": 99.40625, "delta_ref_entropy_loss": 0.1982421875, "delta_ref_ppl": -0.20068359375, "entropy_loss": -0.564453125, "epoch": 0.32470258922323303, "grad_norm": 3.7456177536999884, "k1_kl": 0.2001953125, "k3_kl": 0.0966796875, "kimi_kl": 0.19482421875, "learning_rate": 6.750700280112045e-07, "loss": 0.0039, "ppl": 0.4921875, "reward": 0.9225129783153534, "reward_std": 0.031601326540112495, "rewards/single_object_detection_bbox_reward": 0.9225129783153534, "step": 232, "temperature": 0.9 }, { "advantages": -5.401671387517126e-06, "completion_length": 69.0, "delta_ref_entropy_loss": 0.23046875, "delta_ref_ppl": -0.23388671875, "entropy_loss": -0.55859375, "epoch": 0.3261021693491952, "grad_norm": 2.4462913071130417, "k1_kl": 0.234375, "k3_kl": 0.120849609375, "kimi_kl": 0.22412109375, "learning_rate": 6.736694677871149e-07, "loss": 0.0048, "ppl": 0.4892578125, "reward": 0.901746392250061, "reward_std": 0.03766851965337992, "rewards/single_object_detection_bbox_reward": 0.9017464816570282, "step": 233, "temperature": 0.9 }, { "advantages": -6.18877197666734e-06, "completion_length": 112.84375, "delta_ref_entropy_loss": 0.20703125, "delta_ref_ppl": -0.2080078125, "entropy_loss": -0.53515625, "epoch": 0.3275017494751575, "grad_norm": 1.9921861525453985, "k1_kl": 0.20849609375, "k3_kl": 0.1031494140625, "kimi_kl": 0.224609375, "learning_rate": 6.722689075630252e-07, "loss": 0.0041, "ppl": 0.4677734375, "reward": 0.9163267314434052, "reward_std": 0.02091753063723445, "rewards/single_object_detection_bbox_reward": 0.9163267612457275, "step": 234, "temperature": 0.9 }, { "advantages": 1.633326996852702e-05, "completion_length": 79.34375, "delta_ref_entropy_loss": 0.20556640625, "delta_ref_ppl": -0.228515625, "entropy_loss": -0.537109375, "epoch": 0.3289013296011197, "grad_norm": 2.520561360284241, "k1_kl": 0.22900390625, "k3_kl": 0.123291015625, "kimi_kl": 0.3232421875, "learning_rate": 6.708683473389355e-07, "loss": 0.0049, "ppl": 0.4658203125, "reward": 0.893390417098999, "reward_std": 0.02796479407697916, "rewards/single_object_detection_bbox_reward": 0.8933904767036438, "step": 235, "temperature": 0.9 }, { "advantages": 6.612390421878445e-06, "completion_length": 52.34375, "delta_ref_entropy_loss": 0.21044921875, "delta_ref_ppl": -0.20166015625, "entropy_loss": -0.55078125, "epoch": 0.33030090972708187, "grad_norm": 4.405418035583237, "k1_kl": 0.20166015625, "k3_kl": 0.101318359375, "kimi_kl": 0.22216796875, "learning_rate": 6.694677871148459e-07, "loss": 0.004, "ppl": 0.4892578125, "reward": 0.9005250334739685, "reward_std": 0.0181113351136446, "rewards/single_object_detection_bbox_reward": 0.9005251228809357, "step": 236, "temperature": 0.9 }, { "advantages": -1.1378633644198999e-05, "completion_length": 99.40625, "delta_ref_entropy_loss": 0.20361328125, "delta_ref_ppl": -0.18701171875, "entropy_loss": -0.587890625, "epoch": 0.33170048985304407, "grad_norm": 4.247833673466306, "k1_kl": 0.1875, "k3_kl": 0.0849609375, "kimi_kl": 0.143310546875, "learning_rate": 6.680672268907563e-07, "loss": 0.0034, "ppl": 0.517578125, "reward": 0.925936222076416, "reward_std": 0.036400677636265755, "rewards/single_object_detection_bbox_reward": 0.9259363412857056, "step": 237, "temperature": 0.9 }, { "advantages": 1.6710600903024897e-07, "completion_length": 67.96875, "delta_ref_entropy_loss": 0.2314453125, "delta_ref_ppl": -0.1923828125, "entropy_loss": -0.5341796875, "epoch": 0.3331000699790063, "grad_norm": 2.715001618989585, "k1_kl": 0.19189453125, "k3_kl": 0.080078125, "kimi_kl": 0.1328125, "learning_rate": 6.666666666666666e-07, "loss": 0.0032, "ppl": 0.47265625, "reward": 0.8929937481880188, "reward_std": 0.013484003327903338, "rewards/single_object_detection_bbox_reward": 0.8929937779903412, "step": 238, "temperature": 0.9 }, { "advantages": -1.688408065092517e-05, "completion_length": 59.9375, "delta_ref_entropy_loss": 0.21875, "delta_ref_ppl": -0.22900390625, "entropy_loss": -0.5390625, "epoch": 0.3344996501049685, "grad_norm": 3.9344806031196624, "k1_kl": 0.22900390625, "k3_kl": 0.115966796875, "kimi_kl": 0.23876953125, "learning_rate": 6.652661064425769e-07, "loss": 0.0047, "ppl": 0.466796875, "reward": 0.9462940394878387, "reward_std": 0.04158920422196388, "rewards/single_object_detection_bbox_reward": 0.9462940692901611, "step": 239, "temperature": 0.9 }, { "advantages": -1.731248903524829e-05, "completion_length": 59.59375, "delta_ref_entropy_loss": 0.2109375, "delta_ref_ppl": -0.1611328125, "entropy_loss": -0.548828125, "epoch": 0.3358992302309307, "grad_norm": 1.895190331837322, "k1_kl": 0.16162109375, "k3_kl": 0.0657958984375, "kimi_kl": 0.109619140625, "learning_rate": 6.638655462184873e-07, "loss": 0.0026, "ppl": 0.4853515625, "reward": 0.9830936789512634, "reward_std": 0.011895484756678343, "rewards/single_object_detection_bbox_reward": 0.9830937683582306, "step": 240, "temperature": 0.9 }, { "advantages": -1.0175630904996069e-05, "completion_length": 147.25, "delta_ref_entropy_loss": 0.24462890625, "delta_ref_ppl": -0.22216796875, "entropy_loss": -0.525390625, "epoch": 0.3372988103568929, "grad_norm": 3.7646735549566994, "k1_kl": 0.22265625, "k3_kl": 0.107666015625, "kimi_kl": 0.234375, "learning_rate": 6.624649859943977e-07, "loss": 0.0043, "ppl": 0.4599609375, "reward": 0.951490044593811, "reward_std": 0.018330988939851522, "rewards/single_object_detection_bbox_reward": 0.9514901340007782, "step": 241, "temperature": 0.9 }, { "advantages": -1.2609043551492505e-05, "completion_length": 119.25, "delta_ref_entropy_loss": 0.20556640625, "delta_ref_ppl": -0.20068359375, "entropy_loss": -0.525390625, "epoch": 0.33869839048285516, "grad_norm": 2.3123401074954675, "k1_kl": 0.20068359375, "k3_kl": 0.093994140625, "kimi_kl": 0.173828125, "learning_rate": 6.610644257703081e-07, "loss": 0.0038, "ppl": 0.4658203125, "reward": 0.9197300672531128, "reward_std": 0.040204837918281555, "rewards/single_object_detection_bbox_reward": 0.9197301864624023, "step": 242, "temperature": 0.9 }, { "advantages": 7.269372417795239e-06, "completion_length": 78.84375, "delta_ref_entropy_loss": 0.20556640625, "delta_ref_ppl": -0.181640625, "entropy_loss": -0.623046875, "epoch": 0.34009797060881736, "grad_norm": 2.4430888598483746, "k1_kl": 0.181640625, "k3_kl": 0.087646484375, "kimi_kl": 0.176025390625, "learning_rate": 6.596638655462184e-07, "loss": 0.0035, "ppl": 0.55078125, "reward": 0.9311226010322571, "reward_std": 0.026651649735867977, "rewards/single_object_detection_bbox_reward": 0.9311227202415466, "step": 243, "temperature": 0.9 }, { "advantages": -1.0470992947375635e-05, "completion_length": 76.1875, "delta_ref_entropy_loss": 0.23291015625, "delta_ref_ppl": -0.23828125, "entropy_loss": -0.556640625, "epoch": 0.34149755073477955, "grad_norm": 8.376885246053318, "k1_kl": 0.23974609375, "k3_kl": 0.118896484375, "kimi_kl": 0.2333984375, "learning_rate": 6.582633053221288e-07, "loss": 0.0048, "ppl": 0.4912109375, "reward": 0.9345467984676361, "reward_std": 0.02757244842359796, "rewards/single_object_detection_bbox_reward": 0.9345468878746033, "step": 244, "temperature": 0.9 }, { "advantages": -5.068523478257703e-06, "completion_length": 117.8125, "delta_ref_entropy_loss": 0.23583984375, "delta_ref_ppl": -0.1845703125, "entropy_loss": -0.509765625, "epoch": 0.34289713086074175, "grad_norm": 2.6072419511063254, "k1_kl": 0.18505859375, "k3_kl": 0.081298828125, "kimi_kl": 0.14208984375, "learning_rate": 6.568627450980392e-07, "loss": 0.0033, "ppl": 0.4482421875, "reward": 0.9712219536304474, "reward_std": 0.014597766334190965, "rewards/single_object_detection_bbox_reward": 0.9712220430374146, "step": 245, "temperature": 0.9 }, { "advantages": 9.027443411468994e-06, "completion_length": 49.5, "delta_ref_entropy_loss": 0.24267578125, "delta_ref_ppl": -0.24755859375, "entropy_loss": -0.53125, "epoch": 0.344296710986704, "grad_norm": 5.277640584463494, "k1_kl": 0.2490234375, "k3_kl": 0.1318359375, "kimi_kl": 0.296875, "learning_rate": 6.554621848739496e-07, "loss": 0.0053, "ppl": 0.4677734375, "reward": 0.9361321032047272, "reward_std": 0.019467670004814863, "rewards/single_object_detection_bbox_reward": 0.936132162809372, "step": 246, "temperature": 0.9 }, { "advantages": -1.779930516931927e-05, "completion_length": 78.5, "delta_ref_entropy_loss": 0.208984375, "delta_ref_ppl": -0.20556640625, "entropy_loss": -0.521484375, "epoch": 0.3456962911126662, "grad_norm": 3.1268953864551463, "k1_kl": 0.20556640625, "k3_kl": 0.099853515625, "kimi_kl": 0.2265625, "learning_rate": 6.540616246498599e-07, "loss": 0.004, "ppl": 0.4609375, "reward": 0.9652242660522461, "reward_std": 0.008787496946752071, "rewards/single_object_detection_bbox_reward": 0.9652242660522461, "step": 247, "temperature": 0.9 }, { "advantages": -7.131004281291098e-06, "completion_length": 42.0, "delta_ref_entropy_loss": 0.22509765625, "delta_ref_ppl": -0.1904296875, "entropy_loss": -0.509765625, "epoch": 0.3470958712386284, "grad_norm": 2.3542243216644656, "k1_kl": 0.19091796875, "k3_kl": 0.089599609375, "kimi_kl": 0.20654296875, "learning_rate": 6.526610644257703e-07, "loss": 0.0036, "ppl": 0.4501953125, "reward": 0.8743930757045746, "reward_std": 0.0013537309132516384, "rewards/single_object_detection_bbox_reward": 0.8743931353092194, "step": 248, "temperature": 0.9 }, { "advantages": -1.463799640077923e-05, "completion_length": 80.03125, "delta_ref_entropy_loss": 0.201171875, "delta_ref_ppl": -0.17578125, "entropy_loss": -0.552734375, "epoch": 0.34849545136459065, "grad_norm": 1.8991182514870477, "k1_kl": 0.17578125, "k3_kl": 0.076416015625, "kimi_kl": 0.141845703125, "learning_rate": 6.512605042016807e-07, "loss": 0.0031, "ppl": 0.4814453125, "reward": 0.9188582599163055, "reward_std": 0.03262439277023077, "rewards/single_object_detection_bbox_reward": 0.9188583493232727, "step": 249, "temperature": 0.9 }, { "advantages": 3.2312904068021453e-06, "completion_length": 58.4375, "delta_ref_entropy_loss": 0.2705078125, "delta_ref_ppl": -0.25537109375, "entropy_loss": -0.49609375, "epoch": 0.34989503149055284, "grad_norm": 3.2607765263354556, "k1_kl": 0.2548828125, "k3_kl": 0.121826171875, "kimi_kl": 0.2587890625, "learning_rate": 6.49859943977591e-07, "loss": 0.0049, "ppl": 0.4384765625, "reward": 0.9573101699352264, "reward_std": 0.009956127731129527, "rewards/single_object_detection_bbox_reward": 0.9573102295398712, "step": 250, "temperature": 0.9 }, { "advantages": 7.036275519567425e-06, "completion_length": 86.5, "delta_ref_entropy_loss": 0.21923828125, "delta_ref_ppl": -0.16943359375, "entropy_loss": -0.53515625, "epoch": 0.35129461161651504, "grad_norm": 4.687122888015562, "k1_kl": 0.169921875, "k3_kl": 0.0732421875, "kimi_kl": 0.12255859375, "learning_rate": 6.484593837535014e-07, "loss": 0.0029, "ppl": 0.46875, "reward": 0.897723913192749, "reward_std": 0.04349006898701191, "rewards/single_object_detection_bbox_reward": 0.8977240025997162, "step": 251, "temperature": 0.9 }, { "advantages": 1.0482967127245502e-05, "completion_length": 113.875, "delta_ref_entropy_loss": 0.22900390625, "delta_ref_ppl": -0.2041015625, "entropy_loss": -0.521484375, "epoch": 0.35269419174247724, "grad_norm": 3.916880560536632, "k1_kl": 0.20458984375, "k3_kl": 0.103271484375, "kimi_kl": 0.197265625, "learning_rate": 6.470588235294117e-07, "loss": 0.0041, "ppl": 0.4619140625, "reward": 0.9860041737556458, "reward_std": 0.023415246047079563, "rewards/single_object_detection_bbox_reward": 0.9860042631626129, "step": 252, "temperature": 0.9 }, { "advantages": -4.012669897690557e-06, "completion_length": 96.28125, "delta_ref_entropy_loss": 0.208984375, "delta_ref_ppl": -0.20556640625, "entropy_loss": -0.53515625, "epoch": 0.3540937718684395, "grad_norm": 2.339368195376113, "k1_kl": 0.2060546875, "k3_kl": 0.1005859375, "kimi_kl": 0.20166015625, "learning_rate": 6.456582633053222e-07, "loss": 0.004, "ppl": 0.4716796875, "reward": 0.9405384957790375, "reward_std": 0.033499219454824924, "rewards/single_object_detection_bbox_reward": 0.940538614988327, "step": 253, "temperature": 0.9 }, { "advantages": -1.6388617609663925e-06, "completion_length": 33.3125, "delta_ref_entropy_loss": 0.22998046875, "delta_ref_ppl": -0.21875, "entropy_loss": -0.53515625, "epoch": 0.3554933519944017, "grad_norm": 3.301940726282743, "k1_kl": 0.21875, "k3_kl": 0.108154296875, "kimi_kl": 0.23388671875, "learning_rate": 6.442577030812325e-07, "loss": 0.0043, "ppl": 0.4677734375, "reward": 0.9453755617141724, "reward_std": 0.04087735339999199, "rewards/single_object_detection_bbox_reward": 0.9453756511211395, "step": 254, "temperature": 0.9 }, { "advantages": 3.1342996749117447e-06, "completion_length": 106.59375, "delta_ref_entropy_loss": 0.2099609375, "delta_ref_ppl": -0.201171875, "entropy_loss": -0.5634765625, "epoch": 0.3568929321203639, "grad_norm": 3.0220180393222025, "k1_kl": 0.201171875, "k3_kl": 0.099609375, "kimi_kl": 0.19287109375, "learning_rate": 6.428571428571429e-07, "loss": 0.004, "ppl": 0.50390625, "reward": 0.9372033774852753, "reward_std": 0.02774013252928853, "rewards/single_object_detection_bbox_reward": 0.9372034668922424, "step": 255, "temperature": 0.9 }, { "advantages": 2.2625813471677247e-06, "completion_length": 140.3125, "delta_ref_entropy_loss": 0.2080078125, "delta_ref_ppl": -0.203125, "entropy_loss": -0.509765625, "epoch": 0.3582925122463261, "grad_norm": 2.535992322458734, "k1_kl": 0.20263671875, "k3_kl": 0.094970703125, "kimi_kl": 0.25146484375, "learning_rate": 6.414565826330532e-07, "loss": 0.0038, "ppl": 0.447265625, "reward": 0.9898190498352051, "reward_std": 0.005494278622791171, "rewards/single_object_detection_bbox_reward": 0.9898191392421722, "step": 256, "temperature": 0.9 }, { "advantages": 7.606244594171585e-06, "completion_length": 103.96875, "delta_ref_entropy_loss": 0.21142578125, "delta_ref_ppl": -0.197265625, "entropy_loss": -0.54296875, "epoch": 0.35969209237228833, "grad_norm": 3.1391085028737544, "k1_kl": 0.1982421875, "k3_kl": 0.0908203125, "kimi_kl": 0.16845703125, "learning_rate": 6.400560224089635e-07, "loss": 0.0036, "ppl": 0.47265625, "reward": 0.9715672135353088, "reward_std": 0.020164311863482, "rewards/single_object_detection_bbox_reward": 0.971567302942276, "step": 257, "temperature": 0.9 }, { "advantages": 1.8815377700320823e-06, "completion_length": 76.75, "delta_ref_entropy_loss": 0.2529296875, "delta_ref_ppl": -0.232421875, "entropy_loss": -0.501953125, "epoch": 0.3610916724982505, "grad_norm": 3.2069320883989776, "k1_kl": 0.23291015625, "k3_kl": 0.112060546875, "kimi_kl": 0.2265625, "learning_rate": 6.386554621848739e-07, "loss": 0.0045, "ppl": 0.4345703125, "reward": 0.9196077287197113, "reward_std": 0.044558814726769924, "rewards/single_object_detection_bbox_reward": 0.9196078181266785, "step": 258, "temperature": 0.9 }, { "advantages": 1.3643876627611462e-05, "completion_length": 31.0625, "delta_ref_entropy_loss": 0.2373046875, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.54296875, "epoch": 0.3624912526242127, "grad_norm": 5.402527765589349, "k1_kl": 0.20849609375, "k3_kl": 0.101318359375, "kimi_kl": 0.1865234375, "learning_rate": 6.372549019607843e-07, "loss": 0.004, "ppl": 0.4736328125, "reward": 0.9301213622093201, "reward_std": 0.0224040811881423, "rewards/single_object_detection_bbox_reward": 0.9301214218139648, "step": 259, "temperature": 0.9 }, { "advantages": -4.105802247522661e-06, "completion_length": 110.375, "delta_ref_entropy_loss": 0.21533203125, "delta_ref_ppl": -0.1796875, "entropy_loss": -0.52734375, "epoch": 0.363890832750175, "grad_norm": 3.0830261437579316, "k1_kl": 0.1796875, "k3_kl": 0.081298828125, "kimi_kl": 0.1513671875, "learning_rate": 6.358543417366946e-07, "loss": 0.0033, "ppl": 0.462890625, "reward": 0.9371642172336578, "reward_std": 0.019147627986967564, "rewards/single_object_detection_bbox_reward": 0.9371642470359802, "step": 260, "temperature": 0.9 }, { "advantages": 8.61739522406424e-06, "completion_length": 97.625, "delta_ref_entropy_loss": 0.21435546875, "delta_ref_ppl": -0.19873046875, "entropy_loss": -0.5625, "epoch": 0.36529041287613717, "grad_norm": 2.8951235259405044, "k1_kl": 0.19873046875, "k3_kl": 0.09423828125, "kimi_kl": 0.17626953125, "learning_rate": 6.344537815126049e-07, "loss": 0.0038, "ppl": 0.4931640625, "reward": 0.8827849328517914, "reward_std": 0.044321103021502495, "rewards/single_object_detection_bbox_reward": 0.8827850222587585, "step": 261, "temperature": 0.9 }, { "advantages": 1.6628100638627075e-06, "completion_length": 50.0, "delta_ref_entropy_loss": 0.25244140625, "delta_ref_ppl": -0.22412109375, "entropy_loss": -0.521484375, "epoch": 0.36668999300209937, "grad_norm": 5.707423626543018, "k1_kl": 0.22509765625, "k3_kl": 0.1220703125, "kimi_kl": 0.23046875, "learning_rate": 6.330532212885154e-07, "loss": 0.0049, "ppl": 0.4638671875, "reward": 0.9852575659751892, "reward_std": 0.022137894295156002, "rewards/single_object_detection_bbox_reward": 0.985257625579834, "step": 262, "temperature": 0.9 }, { "advantages": 1.3605294952867553e-06, "completion_length": 60.5, "delta_ref_entropy_loss": 0.2314453125, "delta_ref_ppl": -0.1748046875, "entropy_loss": -0.5087890625, "epoch": 0.36808957312806156, "grad_norm": 2.5899167570424253, "k1_kl": 0.17529296875, "k3_kl": 0.0712890625, "kimi_kl": 0.121826171875, "learning_rate": 6.316526610644257e-07, "loss": 0.0029, "ppl": 0.4443359375, "reward": 0.9562506973743439, "reward_std": 0.015076665789820254, "rewards/single_object_detection_bbox_reward": 0.956250786781311, "step": 263, "temperature": 0.9 }, { "advantages": 9.891710305964807e-06, "completion_length": 40.5, "delta_ref_entropy_loss": 0.220703125, "delta_ref_ppl": -0.21923828125, "entropy_loss": -0.544921875, "epoch": 0.3694891532540238, "grad_norm": 4.897855516285246, "k1_kl": 0.21923828125, "k3_kl": 0.121337890625, "kimi_kl": 0.2548828125, "learning_rate": 6.302521008403361e-07, "loss": 0.0048, "ppl": 0.486328125, "reward": 0.9715021550655365, "reward_std": 0.03283619321882725, "rewards/single_object_detection_bbox_reward": 0.9715022444725037, "step": 264, "temperature": 0.9 }, { "advantages": -1.1079280284320703e-05, "completion_length": 78.90625, "delta_ref_entropy_loss": 0.2373046875, "delta_ref_ppl": -0.2001953125, "entropy_loss": -0.486328125, "epoch": 0.370888733379986, "grad_norm": 2.994902314710207, "k1_kl": 0.19921875, "k3_kl": 0.090087890625, "kimi_kl": 0.1611328125, "learning_rate": 6.288515406162464e-07, "loss": 0.0036, "ppl": 0.4306640625, "reward": 0.9236345887184143, "reward_std": 0.020767325651831925, "rewards/single_object_detection_bbox_reward": 0.9236345887184143, "step": 265, "temperature": 0.9 }, { "advantages": -5.772337544840411e-06, "completion_length": 78.5, "delta_ref_entropy_loss": 0.2294921875, "delta_ref_ppl": -0.17822265625, "entropy_loss": -0.544921875, "epoch": 0.3722883135059482, "grad_norm": 3.208512058442257, "k1_kl": 0.1787109375, "k3_kl": 0.0748291015625, "kimi_kl": 0.1171875, "learning_rate": 6.274509803921569e-07, "loss": 0.003, "ppl": 0.478515625, "reward": 0.9288034737110138, "reward_std": 0.01181774353608489, "rewards/single_object_detection_bbox_reward": 0.9288035035133362, "step": 266, "temperature": 0.9 }, { "advantages": 9.104875971388537e-06, "completion_length": 73.09375, "delta_ref_entropy_loss": 0.2294921875, "delta_ref_ppl": -0.20751953125, "entropy_loss": -0.546875, "epoch": 0.3736878936319104, "grad_norm": 3.8953878626693927, "k1_kl": 0.20751953125, "k3_kl": 0.093994140625, "kimi_kl": 0.220458984375, "learning_rate": 6.260504201680672e-07, "loss": 0.0038, "ppl": 0.4775390625, "reward": 0.9541618525981903, "reward_std": 0.027050886303186417, "rewards/single_object_detection_bbox_reward": 0.9541619420051575, "step": 267, "temperature": 0.9 }, { "advantages": -1.5703429028235405e-05, "completion_length": 38.53125, "delta_ref_entropy_loss": 0.25830078125, "delta_ref_ppl": -0.24365234375, "entropy_loss": -0.4970703125, "epoch": 0.37508747375787266, "grad_norm": 3.4294251192877794, "k1_kl": 0.244140625, "k3_kl": 0.1240234375, "kimi_kl": 0.2412109375, "learning_rate": 6.246498599439776e-07, "loss": 0.005, "ppl": 0.4384765625, "reward": 0.9074057638645172, "reward_std": 0.028064946498489007, "rewards/single_object_detection_bbox_reward": 0.9074057638645172, "step": 268, "temperature": 0.9 }, { "advantages": -6.379560090863379e-06, "completion_length": 51.25, "delta_ref_entropy_loss": 0.21728515625, "delta_ref_ppl": -0.22314453125, "entropy_loss": -0.5341796875, "epoch": 0.37648705388383485, "grad_norm": 2.91111073618343, "k1_kl": 0.22412109375, "k3_kl": 0.116943359375, "kimi_kl": 0.2578125, "learning_rate": 6.232492997198879e-07, "loss": 0.0047, "ppl": 0.4775390625, "reward": 0.9146747589111328, "reward_std": 0.04659312218427658, "rewards/single_object_detection_bbox_reward": 0.9146748483181, "step": 269, "temperature": 0.9 }, { "advantages": 8.408512996993522e-08, "completion_length": 112.9375, "delta_ref_entropy_loss": 0.2197265625, "delta_ref_ppl": -0.21337890625, "entropy_loss": -0.611328125, "epoch": 0.37788663400979705, "grad_norm": 3.0063737009591494, "k1_kl": 0.2138671875, "k3_kl": 0.102783203125, "kimi_kl": 0.1884765625, "learning_rate": 6.218487394957984e-07, "loss": 0.0041, "ppl": 0.5439453125, "reward": 0.9427981078624725, "reward_std": 0.04551912844181061, "rewards/single_object_detection_bbox_reward": 0.9427982270717621, "step": 270, "temperature": 0.9 }, { "advantages": -8.33813146527973e-06, "completion_length": 45.1875, "delta_ref_entropy_loss": 0.23291015625, "delta_ref_ppl": -0.2587890625, "entropy_loss": -0.51171875, "epoch": 0.37928621413575925, "grad_norm": 4.105899858084896, "k1_kl": 0.25927734375, "k3_kl": 0.14404296875, "kimi_kl": 0.4150390625, "learning_rate": 6.204481792717087e-07, "loss": 0.0058, "ppl": 0.44921875, "reward": 0.888546347618103, "reward_std": 0.04352429648861289, "rewards/single_object_detection_bbox_reward": 0.888546347618103, "step": 271, "temperature": 0.9 }, { "advantages": -6.773376526325592e-06, "completion_length": 127.5, "delta_ref_entropy_loss": 0.2138671875, "delta_ref_ppl": -0.1884765625, "entropy_loss": -0.546875, "epoch": 0.3806857942617215, "grad_norm": 2.2014579272734007, "k1_kl": 0.189453125, "k3_kl": 0.088134765625, "kimi_kl": 0.1787109375, "learning_rate": 6.19047619047619e-07, "loss": 0.0035, "ppl": 0.4853515625, "reward": 0.9423486888408661, "reward_std": 0.016046041157096624, "rewards/single_object_detection_bbox_reward": 0.9423487186431885, "step": 272, "temperature": 0.9 }, { "advantages": -8.531181265425403e-06, "completion_length": 41.5, "delta_ref_entropy_loss": 0.2724609375, "delta_ref_ppl": -0.24560546875, "entropy_loss": -0.5068359375, "epoch": 0.3820853743876837, "grad_norm": 3.3521144703127703, "k1_kl": 0.24560546875, "k3_kl": 0.1123046875, "kimi_kl": 0.20263671875, "learning_rate": 6.176470588235294e-07, "loss": 0.0045, "ppl": 0.4443359375, "reward": 0.9186044037342072, "reward_std": 0.021679881028831005, "rewards/single_object_detection_bbox_reward": 0.9186044037342072, "step": 273, "temperature": 0.9 }, { "advantages": -7.861959488764114e-06, "completion_length": 59.75, "delta_ref_entropy_loss": 0.22216796875, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.521484375, "epoch": 0.3834849545136459, "grad_norm": 3.1561407801190495, "k1_kl": 0.20947265625, "k3_kl": 0.103759765625, "kimi_kl": 0.2314453125, "learning_rate": 6.162464985994397e-07, "loss": 0.0042, "ppl": 0.4599609375, "reward": 0.9521549046039581, "reward_std": 0.03050260804593563, "rewards/single_object_detection_bbox_reward": 0.9521550238132477, "step": 274, "temperature": 0.9 }, { "advantages": -8.843041541695129e-06, "completion_length": 88.5, "delta_ref_entropy_loss": 0.23291015625, "delta_ref_ppl": -0.2119140625, "entropy_loss": -0.5078125, "epoch": 0.38488453463960814, "grad_norm": 3.7186419569288205, "k1_kl": 0.2119140625, "k3_kl": 0.1025390625, "kimi_kl": 0.198974609375, "learning_rate": 6.148459383753502e-07, "loss": 0.0041, "ppl": 0.4462890625, "reward": 0.9472053349018097, "reward_std": 0.007422563387081027, "rewards/single_object_detection_bbox_reward": 0.9472053945064545, "step": 275, "temperature": 0.9 }, { "advantages": 5.489482646225952e-07, "completion_length": 99.46875, "delta_ref_entropy_loss": 0.20654296875, "delta_ref_ppl": -0.18408203125, "entropy_loss": -0.5703125, "epoch": 0.38628411476557034, "grad_norm": 2.385675420814399, "k1_kl": 0.1845703125, "k3_kl": 0.089599609375, "kimi_kl": 0.16455078125, "learning_rate": 6.134453781512605e-07, "loss": 0.0036, "ppl": 0.5087890625, "reward": 0.9157361388206482, "reward_std": 0.019428445026278496, "rewards/single_object_detection_bbox_reward": 0.9157362580299377, "step": 276, "temperature": 0.9 }, { "advantages": 2.7301075533614494e-07, "completion_length": 92.78125, "delta_ref_entropy_loss": 0.19580078125, "delta_ref_ppl": -0.17236328125, "entropy_loss": -0.564453125, "epoch": 0.38768369489153254, "grad_norm": 2.3133127608712702, "k1_kl": 0.17333984375, "k3_kl": 0.08154296875, "kimi_kl": 0.16162109375, "learning_rate": 6.120448179271709e-07, "loss": 0.0033, "ppl": 0.5029296875, "reward": 0.9256820678710938, "reward_std": 0.035167557187378407, "rewards/single_object_detection_bbox_reward": 0.9256820976734161, "step": 277, "temperature": 0.9 }, { "advantages": -2.8634178192987747e-06, "completion_length": 38.5, "delta_ref_entropy_loss": 0.24462890625, "delta_ref_ppl": -0.19873046875, "entropy_loss": -0.501953125, "epoch": 0.38908327501749473, "grad_norm": 2.053384596989561, "k1_kl": 0.19873046875, "k3_kl": 0.08837890625, "kimi_kl": 0.17529296875, "learning_rate": 6.106442577030812e-07, "loss": 0.0035, "ppl": 0.443359375, "reward": 0.9497108459472656, "reward_std": 0.014795406721532345, "rewards/single_object_detection_bbox_reward": 0.949710875749588, "step": 278, "temperature": 0.9 }, { "advantages": 2.2973998511588434e-05, "completion_length": 98.46875, "delta_ref_entropy_loss": 0.20751953125, "delta_ref_ppl": -0.1806640625, "entropy_loss": -0.517578125, "epoch": 0.390482855143457, "grad_norm": 1.7498207714003868, "k1_kl": 0.18017578125, "k3_kl": 0.089111328125, "kimi_kl": 0.18115234375, "learning_rate": 6.092436974789916e-07, "loss": 0.0035, "ppl": 0.45703125, "reward": 0.975766509771347, "reward_std": 0.011310051428154111, "rewards/single_object_detection_bbox_reward": 0.9757665693759918, "step": 279, "temperature": 0.9 }, { "advantages": 7.586421816085931e-06, "completion_length": 108.5, "delta_ref_entropy_loss": 0.24755859375, "delta_ref_ppl": -0.21826171875, "entropy_loss": -0.521484375, "epoch": 0.3918824352694192, "grad_norm": 2.821372133111805, "k1_kl": 0.2177734375, "k3_kl": 0.09619140625, "kimi_kl": 0.177734375, "learning_rate": 6.078431372549019e-07, "loss": 0.0038, "ppl": 0.4560546875, "reward": 0.9578020572662354, "reward_std": 0.011875429656356573, "rewards/single_object_detection_bbox_reward": 0.9578020870685577, "step": 280, "temperature": 0.9 }, { "advantages": 7.679153895878699e-06, "completion_length": 98.5, "delta_ref_entropy_loss": 0.23779296875, "delta_ref_ppl": -0.224609375, "entropy_loss": -0.513671875, "epoch": 0.3932820153953814, "grad_norm": 3.9548035313546706, "k1_kl": 0.22509765625, "k3_kl": 0.11376953125, "kimi_kl": 0.25390625, "learning_rate": 6.064425770308122e-07, "loss": 0.0045, "ppl": 0.4560546875, "reward": 0.973365306854248, "reward_std": 0.025510717649012804, "rewards/single_object_detection_bbox_reward": 0.9733653366565704, "step": 281, "temperature": 0.9 }, { "advantages": -9.554239113640506e-06, "completion_length": 33.6875, "delta_ref_entropy_loss": 0.28564453125, "delta_ref_ppl": -0.26220703125, "entropy_loss": -0.4833984375, "epoch": 0.3946815955213436, "grad_norm": 2.913781612778585, "k1_kl": 0.2626953125, "k3_kl": 0.1259765625, "kimi_kl": 0.2265625, "learning_rate": 6.050420168067226e-07, "loss": 0.0051, "ppl": 0.423828125, "reward": 0.9695053994655609, "reward_std": 0.02117696776986122, "rewards/single_object_detection_bbox_reward": 0.9695054590702057, "step": 282, "temperature": 0.9 }, { "advantages": 1.344177928785939e-05, "completion_length": 115.75, "delta_ref_entropy_loss": 0.2021484375, "delta_ref_ppl": -0.18310546875, "entropy_loss": -0.603515625, "epoch": 0.3960811756473058, "grad_norm": 4.421507442851522, "k1_kl": 0.18310546875, "k3_kl": 0.085205078125, "kimi_kl": 0.16650390625, "learning_rate": 6.03641456582633e-07, "loss": 0.0034, "ppl": 0.5234375, "reward": 0.8756513297557831, "reward_std": 0.042633384466171265, "rewards/single_object_detection_bbox_reward": 0.8756514191627502, "step": 283, "temperature": 0.9 }, { "advantages": -2.0436411432456225e-05, "completion_length": 88.3125, "delta_ref_entropy_loss": 0.2177734375, "delta_ref_ppl": -0.197265625, "entropy_loss": -0.55859375, "epoch": 0.397480755773268, "grad_norm": 2.612317947822165, "k1_kl": 0.197265625, "k3_kl": 0.086181640625, "kimi_kl": 0.1689453125, "learning_rate": 6.022408963585434e-07, "loss": 0.0035, "ppl": 0.4814453125, "reward": 0.9793879389762878, "reward_std": 0.018848412670195103, "rewards/single_object_detection_bbox_reward": 0.9793879985809326, "step": 284, "temperature": 0.9 }, { "advantages": -2.9326018875508453e-06, "completion_length": 96.21875, "delta_ref_entropy_loss": 0.2060546875, "delta_ref_ppl": -0.1669921875, "entropy_loss": -0.5400390625, "epoch": 0.3988803358992302, "grad_norm": 2.4189899115076843, "k1_kl": 0.16650390625, "k3_kl": 0.07763671875, "kimi_kl": 0.16015625, "learning_rate": 6.008403361344537e-07, "loss": 0.0031, "ppl": 0.4736328125, "reward": 0.9680605530738831, "reward_std": 0.021286440081894398, "rewards/single_object_detection_bbox_reward": 0.9680605828762054, "step": 285, "temperature": 0.9 }, { "advantages": 1.453102748882884e-05, "completion_length": 67.09375, "delta_ref_entropy_loss": 0.23779296875, "delta_ref_ppl": -0.18212890625, "entropy_loss": -0.525390625, "epoch": 0.4002799160251924, "grad_norm": 2.505410757443771, "k1_kl": 0.1826171875, "k3_kl": 0.081298828125, "kimi_kl": 0.141845703125, "learning_rate": 5.994397759103641e-07, "loss": 0.0032, "ppl": 0.4619140625, "reward": 0.9630335569381714, "reward_std": 0.01959157269448042, "rewards/single_object_detection_bbox_reward": 0.9630336165428162, "step": 286, "temperature": 0.9 }, { "advantages": -1.049866659741383e-05, "completion_length": 98.3125, "delta_ref_entropy_loss": 0.185546875, "delta_ref_ppl": -0.17431640625, "entropy_loss": -0.611328125, "epoch": 0.40167949615115467, "grad_norm": 9.04136000592905, "k1_kl": 0.17529296875, "k3_kl": 0.0885009765625, "kimi_kl": 0.2255859375, "learning_rate": 5.980392156862744e-07, "loss": 0.0035, "ppl": 0.546875, "reward": 0.9604036211967468, "reward_std": 0.022346808575093746, "rewards/single_object_detection_bbox_reward": 0.960403710603714, "step": 287, "temperature": 0.9 }, { "advantages": 3.84875716008537e-06, "completion_length": 30.28125, "delta_ref_entropy_loss": 0.25634765625, "delta_ref_ppl": -0.240234375, "entropy_loss": -0.509765625, "epoch": 0.40307907627711687, "grad_norm": 4.791730362372276, "k1_kl": 0.24072265625, "k3_kl": 0.121826171875, "kimi_kl": 0.2451171875, "learning_rate": 5.966386554621849e-07, "loss": 0.0049, "ppl": 0.451171875, "reward": 0.954839825630188, "reward_std": 0.01959882862865925, "rewards/single_object_detection_bbox_reward": 0.9548398554325104, "step": 288, "temperature": 0.9 }, { "advantages": -9.216368326292468e-06, "completion_length": 98.5, "delta_ref_entropy_loss": 0.2294921875, "delta_ref_ppl": -0.171875, "entropy_loss": -0.5556640625, "epoch": 0.40447865640307906, "grad_norm": 2.7129501119471007, "k1_kl": 0.17236328125, "k3_kl": 0.08203125, "kimi_kl": 0.116943359375, "learning_rate": 5.952380952380952e-07, "loss": 0.0033, "ppl": 0.50390625, "reward": 0.9731113612651825, "reward_std": 0.026238794205710292, "rewards/single_object_detection_bbox_reward": 0.9731114506721497, "step": 289, "temperature": 0.9 }, { "advantages": 2.3522534320363775e-07, "completion_length": 111.21875, "delta_ref_entropy_loss": 0.20947265625, "delta_ref_ppl": -0.20458984375, "entropy_loss": -0.5625, "epoch": 0.4058782365290413, "grad_norm": 2.843267846112108, "k1_kl": 0.2041015625, "k3_kl": 0.10693359375, "kimi_kl": 0.20947265625, "learning_rate": 5.938375350140056e-07, "loss": 0.0043, "ppl": 0.49609375, "reward": 0.935889482498169, "reward_std": 0.018346023745834827, "rewards/single_object_detection_bbox_reward": 0.9358895123004913, "step": 290, "temperature": 0.9 }, { "advantages": -2.3937654987093993e-06, "completion_length": 32.5, "delta_ref_entropy_loss": 0.24072265625, "delta_ref_ppl": -0.20458984375, "entropy_loss": -0.4833984375, "epoch": 0.4072778166550035, "grad_norm": 5.327292556103761, "k1_kl": 0.205078125, "k3_kl": 0.09521484375, "kimi_kl": 0.177001953125, "learning_rate": 5.924369747899159e-07, "loss": 0.0038, "ppl": 0.419921875, "reward": 0.9875791668891907, "reward_std": 0.00946454907534644, "rewards/single_object_detection_bbox_reward": 0.9875792264938354, "step": 291, "temperature": 0.9 }, { "advantages": -7.164664793890552e-06, "completion_length": 141.4375, "delta_ref_entropy_loss": 0.2275390625, "delta_ref_ppl": -0.1796875, "entropy_loss": -0.5224609375, "epoch": 0.4086773967809657, "grad_norm": 2.1894450905143152, "k1_kl": 0.1796875, "k3_kl": 0.075927734375, "kimi_kl": 0.125732421875, "learning_rate": 5.910364145658264e-07, "loss": 0.003, "ppl": 0.4541015625, "reward": 0.9592124223709106, "reward_std": 0.03403776837512851, "rewards/single_object_detection_bbox_reward": 0.9592124819755554, "step": 292, "temperature": 0.9 }, { "advantages": -3.6828486145168426e-06, "completion_length": 59.0, "delta_ref_entropy_loss": 0.21875, "delta_ref_ppl": -0.2001953125, "entropy_loss": -0.5, "epoch": 0.4100769769069279, "grad_norm": 2.9962362387518806, "k1_kl": 0.19970703125, "k3_kl": 0.09033203125, "kimi_kl": 0.16455078125, "learning_rate": 5.896358543417367e-07, "loss": 0.0036, "ppl": 0.4404296875, "reward": 0.9818171560764313, "reward_std": 0.019433083944022655, "rewards/single_object_detection_bbox_reward": 0.9818172454833984, "step": 293, "temperature": 0.9 }, { "advantages": 7.3955000061687315e-06, "completion_length": 61.0, "delta_ref_entropy_loss": 0.2392578125, "delta_ref_ppl": -0.25634765625, "entropy_loss": -0.560546875, "epoch": 0.41147655703289016, "grad_norm": 3.740274549388804, "k1_kl": 0.25732421875, "k3_kl": 0.1298828125, "kimi_kl": 0.298828125, "learning_rate": 5.88235294117647e-07, "loss": 0.0052, "ppl": 0.4833984375, "reward": 0.9092407822608948, "reward_std": 0.03649666719138622, "rewards/single_object_detection_bbox_reward": 0.9092408418655396, "step": 294, "temperature": 0.9 }, { "advantages": 1.046168017637683e-05, "completion_length": 24.6875, "delta_ref_entropy_loss": 0.234375, "delta_ref_ppl": -0.1923828125, "entropy_loss": -0.572265625, "epoch": 0.41287613715885235, "grad_norm": 7.167143827342189, "k1_kl": 0.1923828125, "k3_kl": 0.08251953125, "kimi_kl": 0.15478515625, "learning_rate": 5.868347338935574e-07, "loss": 0.0033, "ppl": 0.5009765625, "reward": 0.9310817122459412, "reward_std": 0.03462263010442257, "rewards/single_object_detection_bbox_reward": 0.9310817420482635, "step": 295, "temperature": 0.9 }, { "advantages": -3.2330194699170534e-07, "completion_length": 62.6875, "delta_ref_entropy_loss": 0.2236328125, "delta_ref_ppl": -0.23388671875, "entropy_loss": -0.626953125, "epoch": 0.41427571728481455, "grad_norm": 4.190797981606727, "k1_kl": 0.23388671875, "k3_kl": 0.118896484375, "kimi_kl": 0.22021484375, "learning_rate": 5.854341736694678e-07, "loss": 0.0048, "ppl": 0.564453125, "reward": 0.8423499464988708, "reward_std": 0.051704888232052326, "rewards/single_object_detection_bbox_reward": 0.8423499763011932, "step": 296, "temperature": 0.9 }, { "advantages": 2.61331792899e-05, "completion_length": 49.03125, "delta_ref_entropy_loss": 0.2578125, "delta_ref_ppl": -0.23193359375, "entropy_loss": -0.4921875, "epoch": 0.41567529741077675, "grad_norm": 2.1941987811770853, "k1_kl": 0.23193359375, "k3_kl": 0.106689453125, "kimi_kl": 0.20556640625, "learning_rate": 5.840336134453782e-07, "loss": 0.0042, "ppl": 0.4296875, "reward": 0.9757990539073944, "reward_std": 0.009776606049854308, "rewards/single_object_detection_bbox_reward": 0.9757991433143616, "step": 297, "temperature": 0.9 }, { "advantages": -6.278578155161085e-06, "completion_length": 103.78125, "delta_ref_entropy_loss": 0.21337890625, "delta_ref_ppl": -0.22802734375, "entropy_loss": -0.5546875, "epoch": 0.417074877536739, "grad_norm": 4.213448942661426, "k1_kl": 0.228515625, "k3_kl": 0.118896484375, "kimi_kl": 0.27783203125, "learning_rate": 5.826330532212885e-07, "loss": 0.0047, "ppl": 0.4912109375, "reward": 0.8917255997657776, "reward_std": 0.046947063878178596, "rewards/single_object_detection_bbox_reward": 0.8917255997657776, "step": 298, "temperature": 0.9 }, { "advantages": 6.161098667689657e-06, "completion_length": 96.5625, "delta_ref_entropy_loss": 0.2646484375, "delta_ref_ppl": -0.2041015625, "entropy_loss": -0.470703125, "epoch": 0.4184744576627012, "grad_norm": 3.2580486298232403, "k1_kl": 0.20458984375, "k3_kl": 0.08203125, "kimi_kl": 0.126708984375, "learning_rate": 5.812324929971989e-07, "loss": 0.0033, "ppl": 0.4150390625, "reward": 0.9935216009616852, "reward_std": 0.006115466938354075, "rewards/single_object_detection_bbox_reward": 0.9935217201709747, "step": 299, "temperature": 0.9 }, { "advantages": -1.370081986351579e-05, "completion_length": 101.40625, "delta_ref_entropy_loss": 0.23291015625, "delta_ref_ppl": -0.22509765625, "entropy_loss": -0.4716796875, "epoch": 0.4198740377886634, "grad_norm": 2.9724260200484163, "k1_kl": 0.22509765625, "k3_kl": 0.114501953125, "kimi_kl": 0.21044921875, "learning_rate": 5.798319327731093e-07, "loss": 0.0046, "ppl": 0.41796875, "reward": 0.8909415602684021, "reward_std": 0.04394917003810406, "rewards/single_object_detection_bbox_reward": 0.8909416198730469, "step": 300, "temperature": 0.9 }, { "advantages": 1.956575715666986e-06, "completion_length": 40.96875, "delta_ref_entropy_loss": 0.2734375, "delta_ref_ppl": -0.24169921875, "entropy_loss": -0.4951171875, "epoch": 0.4212736179146256, "grad_norm": 1.8770058255809323, "k1_kl": 0.2421875, "k3_kl": 0.111572265625, "kimi_kl": 0.20458984375, "learning_rate": 5.784313725490197e-07, "loss": 0.0045, "ppl": 0.439453125, "reward": 0.9806126952171326, "reward_std": 0.01614095619879663, "rewards/single_object_detection_bbox_reward": 0.9806127548217773, "step": 301, "temperature": 0.9 }, { "advantages": -1.045715646341705e-05, "completion_length": 135.03125, "delta_ref_entropy_loss": 0.25634765625, "delta_ref_ppl": -0.189453125, "entropy_loss": -0.51953125, "epoch": 0.42267319804058784, "grad_norm": 3.6381558934592313, "k1_kl": 0.189453125, "k3_kl": 0.079833984375, "kimi_kl": 0.124267578125, "learning_rate": 5.770308123249299e-07, "loss": 0.0032, "ppl": 0.4638671875, "reward": 0.8788131475448608, "reward_std": 0.014921332709491253, "rewards/single_object_detection_bbox_reward": 0.8788132071495056, "step": 302, "temperature": 0.9 }, { "advantages": -1.7302909895988705e-05, "completion_length": 89.0, "delta_ref_entropy_loss": 0.2197265625, "delta_ref_ppl": -0.18798828125, "entropy_loss": -0.521484375, "epoch": 0.42407277816655004, "grad_norm": 2.4189814166025334, "k1_kl": 0.1884765625, "k3_kl": 0.084716796875, "kimi_kl": 0.15380859375, "learning_rate": 5.756302521008402e-07, "loss": 0.0034, "ppl": 0.4619140625, "reward": 0.9571860730648041, "reward_std": 0.014205945655703545, "rewards/single_object_detection_bbox_reward": 0.9571860730648041, "step": 303, "temperature": 0.9 }, { "advantages": -2.517232019272342e-06, "completion_length": 107.0, "delta_ref_entropy_loss": 0.2099609375, "delta_ref_ppl": -0.18115234375, "entropy_loss": -0.521484375, "epoch": 0.42547235829251223, "grad_norm": 3.052762950989389, "k1_kl": 0.1806640625, "k3_kl": 0.084228515625, "kimi_kl": 0.13916015625, "learning_rate": 5.742296918767506e-07, "loss": 0.0034, "ppl": 0.4560546875, "reward": 0.9256999790668488, "reward_std": 0.031873770989477634, "rewards/single_object_detection_bbox_reward": 0.9257000386714935, "step": 304, "temperature": 0.9 }, { "advantages": 1.6407243492722046e-06, "completion_length": 52.875, "delta_ref_entropy_loss": 0.236328125, "delta_ref_ppl": -0.2177734375, "entropy_loss": -0.537109375, "epoch": 0.4268719384184745, "grad_norm": 5.285369791227334, "k1_kl": 0.21923828125, "k3_kl": 0.101318359375, "kimi_kl": 0.176025390625, "learning_rate": 5.72829131652661e-07, "loss": 0.0041, "ppl": 0.4755859375, "reward": 0.9396786391735077, "reward_std": 0.034118957817554474, "rewards/single_object_detection_bbox_reward": 0.9396786987781525, "step": 305, "temperature": 0.9 }, { "advantages": 1.6357883851014776e-05, "completion_length": 70.8125, "delta_ref_entropy_loss": 0.216796875, "delta_ref_ppl": -0.18115234375, "entropy_loss": -0.57421875, "epoch": 0.4282715185444367, "grad_norm": 98.94763259213256, "k1_kl": 0.181640625, "k3_kl": 0.0830078125, "kimi_kl": 0.151123046875, "learning_rate": 5.714285714285714e-07, "loss": 0.0033, "ppl": 0.51171875, "reward": 0.8662272989749908, "reward_std": 0.030772211961448193, "rewards/single_object_detection_bbox_reward": 0.8662273585796356, "step": 306, "temperature": 0.9 }, { "advantages": 3.966236818087054e-06, "completion_length": 74.40625, "delta_ref_entropy_loss": 0.2265625, "delta_ref_ppl": -0.21142578125, "entropy_loss": -0.5078125, "epoch": 0.4296710986703989, "grad_norm": 4.609280718235254, "k1_kl": 0.2109375, "k3_kl": 0.096435546875, "kimi_kl": 0.1884765625, "learning_rate": 5.700280112044817e-07, "loss": 0.0039, "ppl": 0.4462890625, "reward": 0.9002973139286041, "reward_std": 0.053025033324956894, "rewards/single_object_detection_bbox_reward": 0.9002973437309265, "step": 307, "temperature": 0.9 }, { "advantages": 6.176664555823663e-06, "completion_length": 131.84375, "delta_ref_entropy_loss": 0.24365234375, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.5380859375, "epoch": 0.4310706787963611, "grad_norm": 2.25296937507277, "k1_kl": 0.208984375, "k3_kl": 0.08984375, "kimi_kl": 0.1845703125, "learning_rate": 5.686274509803921e-07, "loss": 0.0036, "ppl": 0.46875, "reward": 0.9779910445213318, "reward_std": 0.015721157396910712, "rewards/single_object_detection_bbox_reward": 0.977991133928299, "step": 308, "temperature": 0.9 }, { "advantages": -1.082409744412871e-05, "completion_length": 78.9375, "delta_ref_entropy_loss": 0.2685546875, "delta_ref_ppl": -0.20947265625, "entropy_loss": -0.5009765625, "epoch": 0.4324702589223233, "grad_norm": 3.069023489185527, "k1_kl": 0.2099609375, "k3_kl": 0.08984375, "kimi_kl": 0.1513671875, "learning_rate": 5.672268907563025e-07, "loss": 0.0036, "ppl": 0.4462890625, "reward": 0.9729272127151489, "reward_std": 0.00982456048950553, "rewards/single_object_detection_bbox_reward": 0.9729272723197937, "step": 309, "temperature": 0.9 }, { "advantages": -9.734716286402545e-06, "completion_length": 88.15625, "delta_ref_entropy_loss": 0.2197265625, "delta_ref_ppl": -0.19287109375, "entropy_loss": -0.513671875, "epoch": 0.4338698390482855, "grad_norm": 6.183430213388067, "k1_kl": 0.19287109375, "k3_kl": 0.09326171875, "kimi_kl": 0.1533203125, "learning_rate": 5.658263305322129e-07, "loss": 0.0037, "ppl": 0.4560546875, "reward": 0.9278159439563751, "reward_std": 0.037833163514733315, "rewards/single_object_detection_bbox_reward": 0.9278160333633423, "step": 310, "temperature": 0.9 }, { "advantages": 8.229166610362881e-06, "completion_length": 31.90625, "delta_ref_entropy_loss": 0.2578125, "delta_ref_ppl": -0.21630859375, "entropy_loss": -0.490234375, "epoch": 0.4352694191742477, "grad_norm": 6.278705445132566, "k1_kl": 0.216796875, "k3_kl": 0.107666015625, "kimi_kl": 0.194580078125, "learning_rate": 5.644257703081232e-07, "loss": 0.0043, "ppl": 0.4345703125, "reward": 0.9414488971233368, "reward_std": 0.028156538493931293, "rewards/single_object_detection_bbox_reward": 0.9414489567279816, "step": 311, "temperature": 0.9 }, { "advantages": -2.180625301662076e-06, "completion_length": 33.71875, "delta_ref_entropy_loss": 0.24560546875, "delta_ref_ppl": -0.24169921875, "entropy_loss": -0.49609375, "epoch": 0.4366689993002099, "grad_norm": 3.3035922471076318, "k1_kl": 0.2412109375, "k3_kl": 0.123291015625, "kimi_kl": 0.29296875, "learning_rate": 5.630252100840336e-07, "loss": 0.0049, "ppl": 0.4345703125, "reward": 0.9259284138679504, "reward_std": 0.019886877853423357, "rewards/single_object_detection_bbox_reward": 0.9259284734725952, "step": 312, "temperature": 0.9 }, { "advantages": -4.661934895011655e-06, "completion_length": 102.0625, "delta_ref_entropy_loss": 0.244140625, "delta_ref_ppl": -0.2607421875, "entropy_loss": -0.529296875, "epoch": 0.43806857942617217, "grad_norm": 10.50355578940028, "k1_kl": 0.2607421875, "k3_kl": 0.13818359375, "kimi_kl": 0.3046875, "learning_rate": 5.61624649859944e-07, "loss": 0.0055, "ppl": 0.462890625, "reward": 0.9062386453151703, "reward_std": 0.07126427628099918, "rewards/single_object_detection_bbox_reward": 0.9062386751174927, "step": 313, "temperature": 0.9 }, { "advantages": -1.7958029275177978e-05, "completion_length": 37.9375, "delta_ref_entropy_loss": 0.24462890625, "delta_ref_ppl": -0.18359375, "entropy_loss": -0.501953125, "epoch": 0.43946815955213436, "grad_norm": 2.14556809915832, "k1_kl": 0.18408203125, "k3_kl": 0.083251953125, "kimi_kl": 0.1494140625, "learning_rate": 5.602240896358543e-07, "loss": 0.0033, "ppl": 0.447265625, "reward": 0.9552726447582245, "reward_std": 0.031941117718815804, "rewards/single_object_detection_bbox_reward": 0.9552727341651917, "step": 314, "temperature": 0.9 }, { "advantages": -8.028533329706988e-06, "completion_length": 30.75, "delta_ref_entropy_loss": 0.24365234375, "delta_ref_ppl": -0.2412109375, "entropy_loss": -0.55078125, "epoch": 0.44086773967809656, "grad_norm": 3.0152715334178204, "k1_kl": 0.23974609375, "k3_kl": 0.12158203125, "kimi_kl": 0.2421875, "learning_rate": 5.588235294117647e-07, "loss": 0.0049, "ppl": 0.4912109375, "reward": 0.966973066329956, "reward_std": 0.015436515212059021, "rewards/single_object_detection_bbox_reward": 0.9669731557369232, "step": 315, "temperature": 0.9 }, { "advantages": -2.6090337996720336e-06, "completion_length": 92.1875, "delta_ref_entropy_loss": 0.2255859375, "delta_ref_ppl": -0.185546875, "entropy_loss": -0.5703125, "epoch": 0.44226731980405876, "grad_norm": 3.4355260879976073, "k1_kl": 0.18603515625, "k3_kl": 0.0789794921875, "kimi_kl": 0.12646484375, "learning_rate": 5.57422969187675e-07, "loss": 0.0032, "ppl": 0.5068359375, "reward": 0.9159234762191772, "reward_std": 0.047792719677090645, "rewards/single_object_detection_bbox_reward": 0.915923535823822, "step": 316, "temperature": 0.9 }, { "advantages": 2.090712860081112e-05, "completion_length": 60.53125, "delta_ref_entropy_loss": 0.24560546875, "delta_ref_ppl": -0.23193359375, "entropy_loss": -0.466796875, "epoch": 0.443666899930021, "grad_norm": 3.2766107109390457, "k1_kl": 0.23193359375, "k3_kl": 0.11376953125, "kimi_kl": 0.2421875, "learning_rate": 5.560224089635854e-07, "loss": 0.0045, "ppl": 0.41015625, "reward": 0.9607036113739014, "reward_std": 0.0020465206762310117, "rewards/single_object_detection_bbox_reward": 0.9607036411762238, "step": 317, "temperature": 0.9 }, { "advantages": 3.1324370866059326e-06, "completion_length": 68.75, "delta_ref_entropy_loss": 0.208984375, "delta_ref_ppl": -0.201171875, "entropy_loss": -0.5, "epoch": 0.4450664800559832, "grad_norm": 3.8464472760105073, "k1_kl": 0.201171875, "k3_kl": 0.099365234375, "kimi_kl": 0.21044921875, "learning_rate": 5.546218487394958e-07, "loss": 0.004, "ppl": 0.435546875, "reward": 0.9755175113677979, "reward_std": 0.024029545485973358, "rewards/single_object_detection_bbox_reward": 0.975517600774765, "step": 318, "temperature": 0.9 }, { "advantages": 6.326341690510162e-06, "completion_length": 84.125, "delta_ref_entropy_loss": 0.26513671875, "delta_ref_ppl": -0.2451171875, "entropy_loss": -0.525390625, "epoch": 0.4464660601819454, "grad_norm": 5.243755935075001, "k1_kl": 0.2451171875, "k3_kl": 0.117919921875, "kimi_kl": 0.224609375, "learning_rate": 5.532212885154062e-07, "loss": 0.0047, "ppl": 0.4599609375, "reward": 0.9724006950855255, "reward_std": 0.028944194316864014, "rewards/single_object_detection_bbox_reward": 0.9724007546901703, "step": 319, "temperature": 0.9 }, { "advantages": -4.934945479817543e-06, "completion_length": 100.09375, "delta_ref_entropy_loss": 0.2275390625, "delta_ref_ppl": -0.20458984375, "entropy_loss": -0.5263671875, "epoch": 0.44786564030790765, "grad_norm": 2.7015096716402085, "k1_kl": 0.20361328125, "k3_kl": 0.1064453125, "kimi_kl": 0.193359375, "learning_rate": 5.518207282913165e-07, "loss": 0.0043, "ppl": 0.4619140625, "reward": 0.9037177264690399, "reward_std": 0.029502329067327082, "rewards/single_object_detection_bbox_reward": 0.9037178158760071, "step": 320, "temperature": 0.9 }, { "advantages": 1.5910449292277917e-05, "completion_length": 78.65625, "delta_ref_entropy_loss": 0.22607421875, "delta_ref_ppl": -0.18994140625, "entropy_loss": -0.533203125, "epoch": 0.44926522043386985, "grad_norm": 4.49421597230567, "k1_kl": 0.1904296875, "k3_kl": 0.093505859375, "kimi_kl": 0.162109375, "learning_rate": 5.504201680672269e-07, "loss": 0.0037, "ppl": 0.47265625, "reward": 0.9406421780586243, "reward_std": 0.027724554762244225, "rewards/single_object_detection_bbox_reward": 0.9406421780586243, "step": 321, "temperature": 0.9 }, { "advantages": 4.306834739509213e-06, "completion_length": 107.3125, "delta_ref_entropy_loss": 0.21435546875, "delta_ref_ppl": -0.16796875, "entropy_loss": -0.5029296875, "epoch": 0.45066480055983205, "grad_norm": 5.87689924625296, "k1_kl": 0.16748046875, "k3_kl": 0.072509765625, "kimi_kl": 0.1337890625, "learning_rate": 5.490196078431373e-07, "loss": 0.0029, "ppl": 0.4365234375, "reward": 0.9618799984455109, "reward_std": 0.043286049738526344, "rewards/single_object_detection_bbox_reward": 0.9618800580501556, "step": 322, "temperature": 0.9 }, { "advantages": -8.075897312664893e-06, "completion_length": 68.84375, "delta_ref_entropy_loss": 0.2041015625, "delta_ref_ppl": -0.1708984375, "entropy_loss": -0.533203125, "epoch": 0.45206438068579424, "grad_norm": 1.9209433616430913, "k1_kl": 0.171875, "k3_kl": 0.0751953125, "kimi_kl": 0.126708984375, "learning_rate": 5.476190476190477e-07, "loss": 0.003, "ppl": 0.474609375, "reward": 0.9709484875202179, "reward_std": 0.015531521406956017, "rewards/single_object_detection_bbox_reward": 0.9709485471248627, "step": 323, "temperature": 0.9 }, { "advantages": 1.2543318689495209e-05, "completion_length": 60.5, "delta_ref_entropy_loss": 0.2666015625, "delta_ref_ppl": -0.2333984375, "entropy_loss": -0.4775390625, "epoch": 0.4534639608117565, "grad_norm": 4.344017797422033, "k1_kl": 0.23291015625, "k3_kl": 0.115478515625, "kimi_kl": 0.2314453125, "learning_rate": 5.462184873949579e-07, "loss": 0.0046, "ppl": 0.4228515625, "reward": 0.9840381145477295, "reward_std": 0.011431663297116756, "rewards/single_object_detection_bbox_reward": 0.9840381443500519, "step": 324, "temperature": 0.9 }, { "advantages": -1.4760134945390746e-06, "completion_length": 58.0, "delta_ref_entropy_loss": 0.234375, "delta_ref_ppl": -0.2119140625, "entropy_loss": -0.50390625, "epoch": 0.4548635409377187, "grad_norm": 2.344538144779874, "k1_kl": 0.21240234375, "k3_kl": 0.09716796875, "kimi_kl": 0.1982421875, "learning_rate": 5.448179271708682e-07, "loss": 0.0039, "ppl": 0.435546875, "reward": 0.9797380566596985, "reward_std": 0.011795940226875246, "rewards/single_object_detection_bbox_reward": 0.979738175868988, "step": 325, "temperature": 0.9 }, { "advantages": 2.1625312456308166e-06, "completion_length": 81.96875, "delta_ref_entropy_loss": 0.23779296875, "delta_ref_ppl": -0.17626953125, "entropy_loss": -0.51953125, "epoch": 0.4562631210636809, "grad_norm": 9.877077884483358, "k1_kl": 0.1767578125, "k3_kl": 0.0712890625, "kimi_kl": 0.116455078125, "learning_rate": 5.434173669467787e-07, "loss": 0.0028, "ppl": 0.4541015625, "reward": 0.973078191280365, "reward_std": 0.018040773458778858, "rewards/single_object_detection_bbox_reward": 0.9730782508850098, "step": 326, "temperature": 0.9 }, { "advantages": 2.0416321149241412e-05, "completion_length": 50.5, "delta_ref_entropy_loss": 0.24072265625, "delta_ref_ppl": -0.20361328125, "entropy_loss": -0.50390625, "epoch": 0.4576627011896431, "grad_norm": 2.7911228665353476, "k1_kl": 0.203125, "k3_kl": 0.093994140625, "kimi_kl": 0.16015625, "learning_rate": 5.42016806722689e-07, "loss": 0.0037, "ppl": 0.44921875, "reward": 0.968260794878006, "reward_std": 0.005829062778502703, "rewards/single_object_detection_bbox_reward": 0.9682608544826508, "step": 327, "temperature": 0.9 }, { "advantages": 8.545816207572443e-06, "completion_length": 34.0, "delta_ref_entropy_loss": 0.271484375, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.5078125, "epoch": 0.45906228131560534, "grad_norm": 4.323352543615033, "k1_kl": 0.21533203125, "k3_kl": 0.0947265625, "kimi_kl": 0.18115234375, "learning_rate": 5.406162464985994e-07, "loss": 0.0038, "ppl": 0.4443359375, "reward": 0.9225798845291138, "reward_std": 0.017170635052025318, "rewards/single_object_detection_bbox_reward": 0.9225799441337585, "step": 328, "temperature": 0.9 }, { "advantages": -8.084412684183917e-06, "completion_length": 41.3125, "delta_ref_entropy_loss": 0.244140625, "delta_ref_ppl": -0.2119140625, "entropy_loss": -0.5107421875, "epoch": 0.46046186144156753, "grad_norm": 2.9264965256697155, "k1_kl": 0.21240234375, "k3_kl": 0.101318359375, "kimi_kl": 0.18359375, "learning_rate": 5.392156862745097e-07, "loss": 0.0041, "ppl": 0.45703125, "reward": 0.9046731889247894, "reward_std": 0.023774588480591774, "rewards/single_object_detection_bbox_reward": 0.9046732187271118, "step": 329, "temperature": 0.9 }, { "advantages": 3.2809180083859246e-07, "completion_length": 50.96875, "delta_ref_entropy_loss": 0.2333984375, "delta_ref_ppl": -0.21923828125, "entropy_loss": -0.560546875, "epoch": 0.46186144156752973, "grad_norm": 3.1517903976822863, "k1_kl": 0.22021484375, "k3_kl": 0.101806640625, "kimi_kl": 0.18408203125, "learning_rate": 5.378151260504201e-07, "loss": 0.0041, "ppl": 0.49609375, "reward": 0.9211283326148987, "reward_std": 0.03778224904090166, "rewards/single_object_detection_bbox_reward": 0.9211283922195435, "step": 330, "temperature": 0.9 }, { "advantages": -6.84788233229483e-07, "completion_length": 74.1875, "delta_ref_entropy_loss": 0.25048828125, "delta_ref_ppl": -0.232421875, "entropy_loss": -0.5283203125, "epoch": 0.4632610216934919, "grad_norm": 2.2565570963759183, "k1_kl": 0.232421875, "k3_kl": 0.108154296875, "kimi_kl": 0.217041015625, "learning_rate": 5.364145658263305e-07, "loss": 0.0043, "ppl": 0.4697265625, "reward": 0.9755015969276428, "reward_std": 0.017212437000125647, "rewards/single_object_detection_bbox_reward": 0.97550168633461, "step": 331, "temperature": 0.9 }, { "advantages": -2.3221865319555945e-06, "completion_length": 105.65625, "delta_ref_entropy_loss": 0.2685546875, "delta_ref_ppl": -0.265625, "entropy_loss": -0.513671875, "epoch": 0.4646606018194542, "grad_norm": 3.646739128847412, "k1_kl": 0.265625, "k3_kl": 0.130615234375, "kimi_kl": 0.294921875, "learning_rate": 5.350140056022409e-07, "loss": 0.0052, "ppl": 0.4521484375, "reward": 0.9770630598068237, "reward_std": 0.020807857625186443, "rewards/single_object_detection_bbox_reward": 0.9770631194114685, "step": 332, "temperature": 0.9 }, { "advantages": 6.523250704049133e-07, "completion_length": 186.21875, "delta_ref_entropy_loss": 0.212890625, "delta_ref_ppl": -0.17333984375, "entropy_loss": -0.54296875, "epoch": 0.4660601819454164, "grad_norm": 3.426883398760363, "k1_kl": 0.173828125, "k3_kl": 0.0755615234375, "kimi_kl": 0.121826171875, "learning_rate": 5.336134453781512e-07, "loss": 0.003, "ppl": 0.4833984375, "reward": 0.9436999559402466, "reward_std": 0.025191535241901875, "rewards/single_object_detection_bbox_reward": 0.9437000155448914, "step": 333, "temperature": 0.9 }, { "advantages": 3.929915465050726e-06, "completion_length": 60.75, "delta_ref_entropy_loss": 0.228515625, "delta_ref_ppl": -0.1923828125, "entropy_loss": -0.5234375, "epoch": 0.46745976207137857, "grad_norm": 2.336374361638177, "k1_kl": 0.1923828125, "k3_kl": 0.084716796875, "kimi_kl": 0.1611328125, "learning_rate": 5.322128851540616e-07, "loss": 0.0034, "ppl": 0.462890625, "reward": 0.9627728462219238, "reward_std": 0.02263063471764326, "rewards/single_object_detection_bbox_reward": 0.962772935628891, "step": 334, "temperature": 0.9 }, { "advantages": -2.5930682454600174e-06, "completion_length": 32.34375, "delta_ref_entropy_loss": 0.25244140625, "delta_ref_ppl": -0.19921875, "entropy_loss": -0.505859375, "epoch": 0.4688593421973408, "grad_norm": 3.5225902803253377, "k1_kl": 0.19970703125, "k3_kl": 0.086669921875, "kimi_kl": 0.14306640625, "learning_rate": 5.30812324929972e-07, "loss": 0.0035, "ppl": 0.4453125, "reward": 0.9288876056671143, "reward_std": 0.024847839202266186, "rewards/single_object_detection_bbox_reward": 0.928887665271759, "step": 335, "temperature": 0.9 }, { "advantages": 9.036091057623707e-06, "completion_length": 107.375, "delta_ref_entropy_loss": 0.2763671875, "delta_ref_ppl": -0.22021484375, "entropy_loss": -0.4794921875, "epoch": 0.470258922323303, "grad_norm": 3.091715815068366, "k1_kl": 0.22021484375, "k3_kl": 0.09619140625, "kimi_kl": 0.16552734375, "learning_rate": 5.294117647058823e-07, "loss": 0.0038, "ppl": 0.421875, "reward": 0.9368462562561035, "reward_std": 0.0068434185814112425, "rewards/single_object_detection_bbox_reward": 0.9368463158607483, "step": 336, "temperature": 0.9 }, { "advantages": 1.2516975402832031e-06, "completion_length": 61.0, "delta_ref_entropy_loss": 0.26025390625, "delta_ref_ppl": -0.205078125, "entropy_loss": -0.4755859375, "epoch": 0.4716585024492652, "grad_norm": 4.177106500384255, "k1_kl": 0.20556640625, "k3_kl": 0.091064453125, "kimi_kl": 0.15869140625, "learning_rate": 5.280112044817927e-07, "loss": 0.0036, "ppl": 0.41796875, "reward": 0.9915417730808258, "reward_std": 0.011192043893970549, "rewards/single_object_detection_bbox_reward": 0.9915418326854706, "step": 337, "temperature": 0.9 }, { "advantages": 1.778467003532569e-05, "completion_length": 101.1875, "delta_ref_entropy_loss": 0.234375, "delta_ref_ppl": -0.251953125, "entropy_loss": -0.544921875, "epoch": 0.4730580825752274, "grad_norm": 3.564612038072974, "k1_kl": 0.25244140625, "k3_kl": 0.1318359375, "kimi_kl": 0.349609375, "learning_rate": 5.26610644257703e-07, "loss": 0.0053, "ppl": 0.48046875, "reward": 0.9257388412952423, "reward_std": 0.041208211332559586, "rewards/single_object_detection_bbox_reward": 0.9257389307022095, "step": 338, "temperature": 0.9 }, { "advantages": 1.3704279353987658e-05, "completion_length": 37.34375, "delta_ref_entropy_loss": 0.23583984375, "delta_ref_ppl": -0.18603515625, "entropy_loss": -0.52734375, "epoch": 0.47445766270118966, "grad_norm": 11.572313556272892, "k1_kl": 0.18603515625, "k3_kl": 0.079833984375, "kimi_kl": 0.14306640625, "learning_rate": 5.252100840336135e-07, "loss": 0.0032, "ppl": 0.46484375, "reward": 0.9186000525951385, "reward_std": 0.03974991664290428, "rewards/single_object_detection_bbox_reward": 0.9186000823974609, "step": 339, "temperature": 0.9 }, { "advantages": 5.74133798636467e-06, "completion_length": 90.0, "delta_ref_entropy_loss": 0.25048828125, "delta_ref_ppl": -0.2080078125, "entropy_loss": -0.529296875, "epoch": 0.47585724282715186, "grad_norm": 12.518152681061396, "k1_kl": 0.20849609375, "k3_kl": 0.09326171875, "kimi_kl": 0.1669921875, "learning_rate": 5.238095238095238e-07, "loss": 0.0037, "ppl": 0.470703125, "reward": 0.9595731198787689, "reward_std": 0.0037227653665468097, "rewards/single_object_detection_bbox_reward": 0.9595730900764465, "step": 340, "temperature": 0.9 }, { "advantages": 7.397375156870112e-08, "completion_length": 89.21875, "delta_ref_entropy_loss": 0.2119140625, "delta_ref_ppl": -0.1865234375, "entropy_loss": -0.556640625, "epoch": 0.47725682295311406, "grad_norm": 5.44674251479826, "k1_kl": 0.18701171875, "k3_kl": 0.086181640625, "kimi_kl": 0.15576171875, "learning_rate": 5.224089635854342e-07, "loss": 0.0034, "ppl": 0.490234375, "reward": 0.9535040259361267, "reward_std": 0.027135197073221207, "rewards/single_object_detection_bbox_reward": 0.9535041153430939, "step": 341, "temperature": 0.9 }, { "advantages": 8.375517154490808e-06, "completion_length": 49.0, "delta_ref_entropy_loss": 0.21240234375, "delta_ref_ppl": -0.20263671875, "entropy_loss": -0.556640625, "epoch": 0.47865640307907625, "grad_norm": 2.9136228362969443, "k1_kl": 0.20263671875, "k3_kl": 0.095947265625, "kimi_kl": 0.19775390625, "learning_rate": 5.210084033613445e-07, "loss": 0.0038, "ppl": 0.48828125, "reward": 0.9595920741558075, "reward_std": 0.031308059580624104, "rewards/single_object_detection_bbox_reward": 0.9595920741558075, "step": 342, "temperature": 0.9 }, { "advantages": 4.532879756879993e-06, "completion_length": 113.09375, "delta_ref_entropy_loss": 0.21337890625, "delta_ref_ppl": -0.17236328125, "entropy_loss": -0.5166015625, "epoch": 0.4800559832050385, "grad_norm": 1.9928369312730896, "k1_kl": 0.171875, "k3_kl": 0.072509765625, "kimi_kl": 0.12060546875, "learning_rate": 5.19607843137255e-07, "loss": 0.0029, "ppl": 0.45703125, "reward": 0.9664286077022552, "reward_std": 0.029988369904458523, "rewards/single_object_detection_bbox_reward": 0.9664286375045776, "step": 343, "temperature": 0.9 }, { "advantages": 5.119879745052458e-06, "completion_length": 61.0, "delta_ref_entropy_loss": 0.25341796875, "delta_ref_ppl": -0.2353515625, "entropy_loss": -0.478515625, "epoch": 0.4814555633310007, "grad_norm": 3.004268155270481, "k1_kl": 0.236328125, "k3_kl": 0.115966796875, "kimi_kl": 0.23046875, "learning_rate": 5.182072829131653e-07, "loss": 0.0046, "ppl": 0.421875, "reward": 0.9091679155826569, "reward_std": 0.02766579296439886, "rewards/single_object_detection_bbox_reward": 0.9091679751873016, "step": 344, "temperature": 0.9 }, { "advantages": -1.1212858225917444e-05, "completion_length": 84.8125, "delta_ref_entropy_loss": 0.2763671875, "delta_ref_ppl": -0.23828125, "entropy_loss": -0.4970703125, "epoch": 0.4828551434569629, "grad_norm": 3.1340456570025568, "k1_kl": 0.2373046875, "k3_kl": 0.110595703125, "kimi_kl": 0.19580078125, "learning_rate": 5.168067226890757e-07, "loss": 0.0044, "ppl": 0.44140625, "reward": 0.9434846937656403, "reward_std": 0.013218006119132042, "rewards/single_object_detection_bbox_reward": 0.9434846937656403, "step": 345, "temperature": 0.9 }, { "advantages": -5.101253009343054e-06, "completion_length": 119.1875, "delta_ref_entropy_loss": 0.2255859375, "delta_ref_ppl": -0.19775390625, "entropy_loss": -0.486328125, "epoch": 0.4842547235829251, "grad_norm": 2.556834419015536, "k1_kl": 0.19775390625, "k3_kl": 0.091796875, "kimi_kl": 0.1689453125, "learning_rate": 5.154061624649859e-07, "loss": 0.0037, "ppl": 0.4267578125, "reward": 0.8800197541713715, "reward_std": 0.056839196011424065, "rewards/single_object_detection_bbox_reward": 0.8800196945667267, "step": 346, "temperature": 0.9 }, { "advantages": 1.3259370916784974e-06, "completion_length": 69.53125, "delta_ref_entropy_loss": 0.24169921875, "delta_ref_ppl": -0.1953125, "entropy_loss": -0.5205078125, "epoch": 0.48565430370888735, "grad_norm": 2.4938616922825725, "k1_kl": 0.19482421875, "k3_kl": 0.08154296875, "kimi_kl": 0.12939453125, "learning_rate": 5.140056022408962e-07, "loss": 0.0033, "ppl": 0.4599609375, "reward": 0.9785158336162567, "reward_std": 0.00604225869756192, "rewards/single_object_detection_bbox_reward": 0.9785158932209015, "step": 347, "temperature": 0.9 }, { "advantages": 9.340766609966522e-06, "completion_length": 72.5625, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.21240234375, "entropy_loss": -0.517578125, "epoch": 0.48705388383484954, "grad_norm": 8.634628800593006, "k1_kl": 0.2119140625, "k3_kl": 0.134765625, "kimi_kl": 0.18408203125, "learning_rate": 5.126050420168067e-07, "loss": 0.0054, "ppl": 0.4619140625, "reward": 0.9323742389678955, "reward_std": 0.028458310291171074, "rewards/single_object_detection_bbox_reward": 0.9323743581771851, "step": 348, "temperature": 0.9 }, { "advantages": 1.4693077901029028e-05, "completion_length": 136.125, "delta_ref_entropy_loss": 0.26611328125, "delta_ref_ppl": -0.2099609375, "entropy_loss": -0.490234375, "epoch": 0.48845346396081174, "grad_norm": 4.2518872225838935, "k1_kl": 0.2099609375, "k3_kl": 0.092529296875, "kimi_kl": 0.162109375, "learning_rate": 5.11204481792717e-07, "loss": 0.0037, "ppl": 0.4306640625, "reward": 0.9446817338466644, "reward_std": 0.02779541350901127, "rewards/single_object_detection_bbox_reward": 0.9446818232536316, "step": 349, "temperature": 0.9 }, { "advantages": -7.332170071094879e-06, "completion_length": 113.84375, "delta_ref_entropy_loss": 0.2255859375, "delta_ref_ppl": -0.17724609375, "entropy_loss": -0.55078125, "epoch": 0.489853044086774, "grad_norm": 2.591095492319605, "k1_kl": 0.17724609375, "k3_kl": 0.07421875, "kimi_kl": 0.118896484375, "learning_rate": 5.098039215686274e-07, "loss": 0.003, "ppl": 0.4892578125, "reward": 0.9599020183086395, "reward_std": 0.021290190517902374, "rewards/single_object_detection_bbox_reward": 0.9599021077156067, "step": 350, "temperature": 0.9 }, { "advantages": -7.000885489105713e-07, "completion_length": 51.5, "delta_ref_entropy_loss": 0.2490234375, "delta_ref_ppl": -0.20263671875, "entropy_loss": -0.5029296875, "epoch": 0.4912526242127362, "grad_norm": 48.24960673512405, "k1_kl": 0.20263671875, "k3_kl": 0.0914306640625, "kimi_kl": 0.180908203125, "learning_rate": 5.084033613445377e-07, "loss": 0.0037, "ppl": 0.443359375, "reward": 0.960275411605835, "reward_std": 0.031381309032440186, "rewards/single_object_detection_bbox_reward": 0.9602755010128021, "step": 351, "temperature": 0.9 }, { "advantages": 2.5637973521952517e-07, "completion_length": 87.375, "delta_ref_entropy_loss": 0.27294921875, "delta_ref_ppl": -0.2109375, "entropy_loss": -0.5078125, "epoch": 0.4926522043386984, "grad_norm": 2.695569774195829, "k1_kl": 0.2119140625, "k3_kl": 0.08984375, "kimi_kl": 0.169677734375, "learning_rate": 5.070028011204482e-07, "loss": 0.0036, "ppl": 0.44921875, "reward": 0.9543321132659912, "reward_std": 0.022325478494167328, "rewards/single_object_detection_bbox_reward": 0.9543321132659912, "step": 352, "temperature": 0.9 }, { "advantages": -5.39861093784566e-06, "completion_length": 59.5, "delta_ref_entropy_loss": 0.2109375, "delta_ref_ppl": -0.171875, "entropy_loss": -0.513671875, "epoch": 0.4940517844646606, "grad_norm": 3.2319220847119583, "k1_kl": 0.171875, "k3_kl": 0.0733642578125, "kimi_kl": 0.1171875, "learning_rate": 5.056022408963585e-07, "loss": 0.0029, "ppl": 0.4541015625, "reward": 0.9819219708442688, "reward_std": 0.01785277295857668, "rewards/single_object_detection_bbox_reward": 0.9819220304489136, "step": 353, "temperature": 0.9 }, { "advantages": 6.381555749612744e-06, "completion_length": 77.5, "delta_ref_entropy_loss": 0.2333984375, "delta_ref_ppl": -0.21630859375, "entropy_loss": -0.4697265625, "epoch": 0.49545136459062283, "grad_norm": 3.9150607954693117, "k1_kl": 0.21630859375, "k3_kl": 0.099609375, "kimi_kl": 0.18896484375, "learning_rate": 5.042016806722689e-07, "loss": 0.004, "ppl": 0.412109375, "reward": 0.9630095064640045, "reward_std": 0.013039001496508718, "rewards/single_object_detection_bbox_reward": 0.9630096256732941, "step": 354, "temperature": 0.9 }, { "advantages": 1.915996654133778e-05, "completion_length": 78.375, "delta_ref_entropy_loss": 0.21533203125, "delta_ref_ppl": -0.1748046875, "entropy_loss": -0.5546875, "epoch": 0.49685094471658503, "grad_norm": 3.101422761415341, "k1_kl": 0.17431640625, "k3_kl": 0.080078125, "kimi_kl": 0.12841796875, "learning_rate": 5.028011204481792e-07, "loss": 0.0032, "ppl": 0.4951171875, "reward": 0.9482844471931458, "reward_std": 0.020748308568727225, "rewards/single_object_detection_bbox_reward": 0.9482845664024353, "step": 355, "temperature": 0.9 }, { "advantages": -2.978901818551094e-06, "completion_length": 116.84375, "delta_ref_entropy_loss": 0.2265625, "delta_ref_ppl": -0.1962890625, "entropy_loss": -0.53515625, "epoch": 0.4982505248425472, "grad_norm": 3.7363777885625837, "k1_kl": 0.19677734375, "k3_kl": 0.086669921875, "kimi_kl": 0.156494140625, "learning_rate": 5.014005602240897e-07, "loss": 0.0035, "ppl": 0.4697265625, "reward": 0.9773686826229095, "reward_std": 0.022551425732672215, "rewards/single_object_detection_bbox_reward": 0.9773687422275543, "step": 356, "temperature": 0.9 }, { "advantages": 4.20265982370438e-06, "completion_length": 60.5, "delta_ref_entropy_loss": 0.2841796875, "delta_ref_ppl": -0.20654296875, "entropy_loss": -0.4775390625, "epoch": 0.4996501049685094, "grad_norm": 3.7722632770028928, "k1_kl": 0.20654296875, "k3_kl": 0.082275390625, "kimi_kl": 0.122314453125, "learning_rate": 5e-07, "loss": 0.0033, "ppl": 0.419921875, "reward": 0.9489797055721283, "reward_std": 0.016071557765826583, "rewards/single_object_detection_bbox_reward": 0.9489797353744507, "step": 357, "temperature": 0.9 }, { "advantages": -9.481396716637391e-06, "completion_length": 124.875, "delta_ref_entropy_loss": 0.23974609375, "delta_ref_ppl": -0.20703125, "entropy_loss": -0.525390625, "epoch": 0.5010496850944717, "grad_norm": 2.4135400995470904, "k1_kl": 0.20703125, "k3_kl": 0.08984375, "kimi_kl": 0.151611328125, "learning_rate": 4.985994397759103e-07, "loss": 0.0036, "ppl": 0.4619140625, "reward": 0.9778860211372375, "reward_std": 0.010445498250192031, "rewards/single_object_detection_bbox_reward": 0.9778860509395599, "step": 358, "temperature": 0.9 }, { "advantages": -6.5932318875638884e-06, "completion_length": 108.53125, "delta_ref_entropy_loss": 0.22900390625, "delta_ref_ppl": -0.1875, "entropy_loss": -0.5166015625, "epoch": 0.5024492652204339, "grad_norm": 1.8712302697363854, "k1_kl": 0.18798828125, "k3_kl": 0.076904296875, "kimi_kl": 0.123291015625, "learning_rate": 4.971988795518207e-07, "loss": 0.0031, "ppl": 0.4521484375, "reward": 0.974878191947937, "reward_std": 0.013164728705305606, "rewards/single_object_detection_bbox_reward": 0.9748782515525818, "step": 359, "temperature": 0.9 }, { "advantages": -9.775827720659436e-06, "completion_length": 77.375, "delta_ref_entropy_loss": 0.22021484375, "delta_ref_ppl": -0.1689453125, "entropy_loss": -0.509765625, "epoch": 0.5038488453463961, "grad_norm": 2.96706204961209, "k1_kl": 0.17041015625, "k3_kl": 0.068359375, "kimi_kl": 0.116455078125, "learning_rate": 4.95798319327731e-07, "loss": 0.0027, "ppl": 0.447265625, "reward": 0.9523955285549164, "reward_std": 0.004917952581308782, "rewards/single_object_detection_bbox_reward": 0.9523955285549164, "step": 360, "temperature": 0.9 }, { "advantages": 1.7563014807819854e-05, "completion_length": 23.03125, "delta_ref_entropy_loss": 0.26953125, "delta_ref_ppl": -0.22412109375, "entropy_loss": -0.4453125, "epoch": 0.5052484254723583, "grad_norm": 3.6657063917982504, "k1_kl": 0.224609375, "k3_kl": 0.099365234375, "kimi_kl": 0.2099609375, "learning_rate": 4.943977591036415e-07, "loss": 0.004, "ppl": 0.39453125, "reward": 0.9790475368499756, "reward_std": 0.01465154392644763, "rewards/single_object_detection_bbox_reward": 0.9790475070476532, "step": 361, "temperature": 0.9 }, { "advantages": 4.910730979190703e-06, "completion_length": 116.15625, "delta_ref_entropy_loss": 0.22216796875, "delta_ref_ppl": -0.1787109375, "entropy_loss": -0.4716796875, "epoch": 0.5066480055983205, "grad_norm": 3.9757996892606116, "k1_kl": 0.17822265625, "k3_kl": 0.08056640625, "kimi_kl": 0.1474609375, "learning_rate": 4.929971988795518e-07, "loss": 0.0032, "ppl": 0.4189453125, "reward": 0.971221536397934, "reward_std": 0.025658381171524525, "rewards/single_object_detection_bbox_reward": 0.9712216854095459, "step": 362, "temperature": 0.9 }, { "advantages": 2.855169213944464e-06, "completion_length": 68.96875, "delta_ref_entropy_loss": 0.2607421875, "delta_ref_ppl": -0.232421875, "entropy_loss": -0.490234375, "epoch": 0.5080475857242828, "grad_norm": 2.5091569127141153, "k1_kl": 0.232421875, "k3_kl": 0.108154296875, "kimi_kl": 0.1953125, "learning_rate": 4.915966386554621e-07, "loss": 0.0043, "ppl": 0.431640625, "reward": 0.988466203212738, "reward_std": 0.017904572188854218, "rewards/single_object_detection_bbox_reward": 0.9884662330150604, "step": 363, "temperature": 0.9 }, { "advantages": -8.27839357953053e-06, "completion_length": 97.125, "delta_ref_entropy_loss": 0.24755859375, "delta_ref_ppl": -0.2119140625, "entropy_loss": -0.552734375, "epoch": 0.509447165850245, "grad_norm": 2.4321192755152268, "k1_kl": 0.21240234375, "k3_kl": 0.10400390625, "kimi_kl": 0.21044921875, "learning_rate": 4.901960784313725e-07, "loss": 0.0042, "ppl": 0.498046875, "reward": 0.9063223600387573, "reward_std": 0.032984958961606026, "rewards/single_object_detection_bbox_reward": 0.9063223898410797, "step": 364, "temperature": 0.9 }, { "advantages": -1.58096001996455e-05, "completion_length": 97.5, "delta_ref_entropy_loss": 0.22314453125, "delta_ref_ppl": -0.197265625, "entropy_loss": -0.59765625, "epoch": 0.5108467459762072, "grad_norm": 2.46778085583447, "k1_kl": 0.197265625, "k3_kl": 0.091796875, "kimi_kl": 0.181640625, "learning_rate": 4.887955182072829e-07, "loss": 0.0037, "ppl": 0.53125, "reward": 0.9113706648349762, "reward_std": 0.042930612340569496, "rewards/single_object_detection_bbox_reward": 0.911370724439621, "step": 365, "temperature": 0.9 }, { "advantages": -1.4378024843608728e-05, "completion_length": 79.0, "delta_ref_entropy_loss": 0.22900390625, "delta_ref_ppl": -0.18798828125, "entropy_loss": -0.521484375, "epoch": 0.5122463261021694, "grad_norm": 2.3017959282270954, "k1_kl": 0.1875, "k3_kl": 0.082275390625, "kimi_kl": 0.13818359375, "learning_rate": 4.873949579831933e-07, "loss": 0.0033, "ppl": 0.4619140625, "reward": 0.964756429195404, "reward_std": 0.014536892529577017, "rewards/single_object_detection_bbox_reward": 0.9647564589977264, "step": 366, "temperature": 0.9 }, { "advantages": -5.885692985430069e-06, "completion_length": 140.53125, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.2216796875, "entropy_loss": -0.501953125, "epoch": 0.5136459062281316, "grad_norm": 2.4753437019920863, "k1_kl": 0.2216796875, "k3_kl": 0.1123046875, "kimi_kl": 0.25439453125, "learning_rate": 4.859943977591036e-07, "loss": 0.0045, "ppl": 0.4453125, "reward": 0.9094062149524689, "reward_std": 0.02764401864260435, "rewards/single_object_detection_bbox_reward": 0.9094062149524689, "step": 367, "temperature": 0.9 }, { "advantages": 2.9292759791132994e-06, "completion_length": 127.4375, "delta_ref_entropy_loss": 0.240234375, "delta_ref_ppl": -0.24072265625, "entropy_loss": -0.490234375, "epoch": 0.5150454863540938, "grad_norm": 4.152678967050506, "k1_kl": 0.24072265625, "k3_kl": 0.126953125, "kimi_kl": 0.26708984375, "learning_rate": 4.845938375350139e-07, "loss": 0.0051, "ppl": 0.4384765625, "reward": 0.9744552671909332, "reward_std": 0.02817428018897772, "rewards/single_object_detection_bbox_reward": 0.974455326795578, "step": 368, "temperature": 0.9 }, { "advantages": -1.3501783016067748e-05, "completion_length": 61.5, "delta_ref_entropy_loss": 0.24169921875, "delta_ref_ppl": -0.1943359375, "entropy_loss": -0.525390625, "epoch": 0.516445066480056, "grad_norm": 2.0757698378506473, "k1_kl": 0.19384765625, "k3_kl": 0.0810546875, "kimi_kl": 0.131591796875, "learning_rate": 4.831932773109244e-07, "loss": 0.0033, "ppl": 0.4599609375, "reward": 0.8988297581672668, "reward_std": 0.020148981362581253, "rewards/single_object_detection_bbox_reward": 0.8988298773765564, "step": 369, "temperature": 0.9 }, { "advantages": 4.6672573716932675e-07, "completion_length": 114.28125, "delta_ref_entropy_loss": 0.2236328125, "delta_ref_ppl": -0.1845703125, "entropy_loss": -0.5107421875, "epoch": 0.5178446466060181, "grad_norm": 2.0752063170899695, "k1_kl": 0.18408203125, "k3_kl": 0.078125, "kimi_kl": 0.118896484375, "learning_rate": 4.817927170868347e-07, "loss": 0.0031, "ppl": 0.451171875, "reward": 0.9752184152603149, "reward_std": 0.018497971817851067, "rewards/single_object_detection_bbox_reward": 0.9752184748649597, "step": 370, "temperature": 0.9 }, { "advantages": -7.128343298745676e-06, "completion_length": 70.0, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.18603515625, "entropy_loss": -0.474609375, "epoch": 0.5192442267319805, "grad_norm": 2.491389667422787, "k1_kl": 0.18505859375, "k3_kl": 0.084228515625, "kimi_kl": 0.15771484375, "learning_rate": 4.803921568627451e-07, "loss": 0.0034, "ppl": 0.419921875, "reward": 0.931580513715744, "reward_std": 0.009801158681511879, "rewards/single_object_detection_bbox_reward": 0.9315806031227112, "step": 371, "temperature": 0.9 }, { "advantages": 5.034729838371277e-06, "completion_length": 42.5625, "delta_ref_entropy_loss": 0.23974609375, "delta_ref_ppl": -0.22509765625, "entropy_loss": -0.505859375, "epoch": 0.5206438068579426, "grad_norm": 7.831105010258818, "k1_kl": 0.22509765625, "k3_kl": 0.1005859375, "kimi_kl": 0.18798828125, "learning_rate": 4.789915966386554e-07, "loss": 0.004, "ppl": 0.4453125, "reward": 0.962115079164505, "reward_std": 0.01821177313104272, "rewards/single_object_detection_bbox_reward": 0.962115079164505, "step": 372, "temperature": 0.9 }, { "advantages": -2.940984359156573e-06, "completion_length": 49.84375, "delta_ref_entropy_loss": 0.2587890625, "delta_ref_ppl": -0.197265625, "entropy_loss": -0.4951171875, "epoch": 0.5220433869839048, "grad_norm": 4.111047182848426, "k1_kl": 0.19580078125, "k3_kl": 0.0849609375, "kimi_kl": 0.1455078125, "learning_rate": 4.775910364145659e-07, "loss": 0.0034, "ppl": 0.4375, "reward": 0.985161155462265, "reward_std": 0.007429227000102401, "rewards/single_object_detection_bbox_reward": 0.9851612746715546, "step": 373, "temperature": 0.9 }, { "advantages": 8.81124350371465e-06, "completion_length": 126.65625, "delta_ref_entropy_loss": 0.2373046875, "delta_ref_ppl": -0.18310546875, "entropy_loss": -0.4736328125, "epoch": 0.523442967109867, "grad_norm": 4.6172191179325655, "k1_kl": 0.18310546875, "k3_kl": 0.078857421875, "kimi_kl": 0.1611328125, "learning_rate": 4.761904761904761e-07, "loss": 0.0031, "ppl": 0.421875, "reward": 0.9607781171798706, "reward_std": 0.01061315496917814, "rewards/single_object_detection_bbox_reward": 0.960778146982193, "step": 374, "temperature": 0.9 }, { "advantages": 1.3004723143694719e-05, "completion_length": 39.53125, "delta_ref_entropy_loss": 0.24951171875, "delta_ref_ppl": -0.228515625, "entropy_loss": -0.544921875, "epoch": 0.5248425472358292, "grad_norm": 2.7992509824026883, "k1_kl": 0.22802734375, "k3_kl": 0.115234375, "kimi_kl": 0.22802734375, "learning_rate": 4.747899159663865e-07, "loss": 0.0046, "ppl": 0.4833984375, "reward": 0.8776675164699554, "reward_std": 0.0533825634047389, "rewards/single_object_detection_bbox_reward": 0.8776675760746002, "step": 375, "temperature": 0.9 }, { "advantages": -8.417027629548102e-06, "completion_length": 119.53125, "delta_ref_entropy_loss": 0.26904296875, "delta_ref_ppl": -0.23974609375, "entropy_loss": -0.548828125, "epoch": 0.5262421273617914, "grad_norm": 2.3472914383653616, "k1_kl": 0.2392578125, "k3_kl": 0.1103515625, "kimi_kl": 0.2001953125, "learning_rate": 4.7338935574229687e-07, "loss": 0.0044, "ppl": 0.48046875, "reward": 0.971864253282547, "reward_std": 0.005370217142626643, "rewards/single_object_detection_bbox_reward": 0.971864253282547, "step": 376, "temperature": 0.9 }, { "advantages": -1.2645232459362887e-05, "completion_length": 51.6875, "delta_ref_entropy_loss": 0.23681640625, "delta_ref_ppl": -0.2177734375, "entropy_loss": -0.5146484375, "epoch": 0.5276417074877536, "grad_norm": 5.7867241906448905, "k1_kl": 0.2177734375, "k3_kl": 0.10205078125, "kimi_kl": 0.200927734375, "learning_rate": 4.7198879551820724e-07, "loss": 0.0041, "ppl": 0.44921875, "reward": 0.928299069404602, "reward_std": 0.03335620567668229, "rewards/single_object_detection_bbox_reward": 0.9282991290092468, "step": 377, "temperature": 0.9 }, { "advantages": 4.645570378158936e-06, "completion_length": 118.25, "delta_ref_entropy_loss": 0.26953125, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.4697265625, "epoch": 0.5290412876137159, "grad_norm": 3.1639945972516013, "k1_kl": 0.21484375, "k3_kl": 0.110107421875, "kimi_kl": 0.1796875, "learning_rate": 4.705882352941176e-07, "loss": 0.0044, "ppl": 0.4208984375, "reward": 0.969267725944519, "reward_std": 0.013709565391764045, "rewards/single_object_detection_bbox_reward": 0.9692677557468414, "step": 378, "temperature": 0.9 }, { "advantages": -5.835667479914264e-06, "completion_length": 131.84375, "delta_ref_entropy_loss": 0.24365234375, "delta_ref_ppl": -0.21337890625, "entropy_loss": -0.509765625, "epoch": 0.5304408677396781, "grad_norm": 3.1391289177206816, "k1_kl": 0.21240234375, "k3_kl": 0.111572265625, "kimi_kl": 0.23095703125, "learning_rate": 4.69187675070028e-07, "loss": 0.0045, "ppl": 0.453125, "reward": 0.9121111631393433, "reward_std": 0.04706569388508797, "rewards/single_object_detection_bbox_reward": 0.912111222743988, "step": 379, "temperature": 0.9 }, { "advantages": -9.05192359823559e-06, "completion_length": 87.0, "delta_ref_entropy_loss": 0.212890625, "delta_ref_ppl": -0.18359375, "entropy_loss": -0.521484375, "epoch": 0.5318404478656403, "grad_norm": 3.0522370948993625, "k1_kl": 0.18408203125, "k3_kl": 0.087890625, "kimi_kl": 0.16015625, "learning_rate": 4.6778711484593836e-07, "loss": 0.0035, "ppl": 0.462890625, "reward": 0.9536957442760468, "reward_std": 0.029008996672928333, "rewards/single_object_detection_bbox_reward": 0.9536958038806915, "step": 380, "temperature": 0.9 }, { "advantages": -1.039675339598034e-05, "completion_length": 82.375, "delta_ref_entropy_loss": 0.24951171875, "delta_ref_ppl": -0.21728515625, "entropy_loss": -0.4736328125, "epoch": 0.5332400279916025, "grad_norm": 2.4690533079202313, "k1_kl": 0.21728515625, "k3_kl": 0.100830078125, "kimi_kl": 0.17529296875, "learning_rate": 4.6638655462184874e-07, "loss": 0.004, "ppl": 0.41796875, "reward": 0.9548930525779724, "reward_std": 0.02416069246828556, "rewards/single_object_detection_bbox_reward": 0.9548931121826172, "step": 381, "temperature": 0.9 }, { "advantages": 8.439774319413118e-07, "completion_length": 92.65625, "delta_ref_entropy_loss": 0.2138671875, "delta_ref_ppl": -0.17333984375, "entropy_loss": -0.5146484375, "epoch": 0.5346396081175647, "grad_norm": 1.6807130739027358, "k1_kl": 0.1728515625, "k3_kl": 0.074462890625, "kimi_kl": 0.12939453125, "learning_rate": 4.649859943977591e-07, "loss": 0.003, "ppl": 0.4521484375, "reward": 0.9560263752937317, "reward_std": 0.017774673178792, "rewards/single_object_detection_bbox_reward": 0.9560264647006989, "step": 382, "temperature": 0.9 }, { "advantages": 5.870126642548712e-06, "completion_length": 49.0, "delta_ref_entropy_loss": 0.2490234375, "delta_ref_ppl": -0.2294921875, "entropy_loss": -0.517578125, "epoch": 0.5360391882435269, "grad_norm": 2.2294465892934943, "k1_kl": 0.22900390625, "k3_kl": 0.108154296875, "kimi_kl": 0.23779296875, "learning_rate": 4.635854341736695e-07, "loss": 0.0043, "ppl": 0.451171875, "reward": 0.9898313879966736, "reward_std": 0.014531050343066454, "rewards/single_object_detection_bbox_reward": 0.9898315072059631, "step": 383, "temperature": 0.9 }, { "advantages": 1.97706477678139e-06, "completion_length": 20.53125, "delta_ref_entropy_loss": 0.248046875, "delta_ref_ppl": -0.18603515625, "entropy_loss": -0.5224609375, "epoch": 0.5374387683694891, "grad_norm": 3.766002625522663, "k1_kl": 0.185546875, "k3_kl": 0.078125, "kimi_kl": 0.134521484375, "learning_rate": 4.6218487394957986e-07, "loss": 0.0031, "ppl": 0.458984375, "reward": 0.8788594305515289, "reward_std": 0.02395795052871108, "rewards/single_object_detection_bbox_reward": 0.8788594305515289, "step": 384, "temperature": 0.9 }, { "advantages": 1.0467001914093998e-05, "completion_length": 90.875, "delta_ref_entropy_loss": 0.28515625, "delta_ref_ppl": -0.244140625, "entropy_loss": -0.46484375, "epoch": 0.5388383484954513, "grad_norm": 2.6966794529204505, "k1_kl": 0.24365234375, "k3_kl": 0.114501953125, "kimi_kl": 0.21728515625, "learning_rate": 4.6078431372549013e-07, "loss": 0.0046, "ppl": 0.4140625, "reward": 0.9668363928794861, "reward_std": 0.018805421888828278, "rewards/single_object_detection_bbox_reward": 0.9668365120887756, "step": 385, "temperature": 0.9 }, { "advantages": -1.3932852198195178e-05, "completion_length": 77.15625, "delta_ref_entropy_loss": 0.25732421875, "delta_ref_ppl": -0.22412109375, "entropy_loss": -0.537109375, "epoch": 0.5402379286214136, "grad_norm": 3.1236425474718916, "k1_kl": 0.22412109375, "k3_kl": 0.10400390625, "kimi_kl": 0.2138671875, "learning_rate": 4.593837535014005e-07, "loss": 0.0042, "ppl": 0.4775390625, "reward": 0.929926723241806, "reward_std": 0.019312336575239897, "rewards/single_object_detection_bbox_reward": 0.9299267828464508, "step": 386, "temperature": 0.9 }, { "advantages": 1.918285079227644e-05, "completion_length": 57.125, "delta_ref_entropy_loss": 0.2734375, "delta_ref_ppl": -0.22119140625, "entropy_loss": -0.4873046875, "epoch": 0.5416375087473758, "grad_norm": 4.2603989456690625, "k1_kl": 0.22119140625, "k3_kl": 0.103515625, "kimi_kl": 0.18017578125, "learning_rate": 4.579831932773109e-07, "loss": 0.0041, "ppl": 0.4306640625, "reward": 0.9407818019390106, "reward_std": 0.030741693219169974, "rewards/single_object_detection_bbox_reward": 0.9407818019390106, "step": 387, "temperature": 0.9 }, { "advantages": -8.322031135321595e-07, "completion_length": 61.0, "delta_ref_entropy_loss": 0.232421875, "delta_ref_ppl": -0.16796875, "entropy_loss": -0.4736328125, "epoch": 0.543037088873338, "grad_norm": 2.7008188431998787, "k1_kl": 0.1689453125, "k3_kl": 0.08203125, "kimi_kl": 0.114501953125, "learning_rate": 4.5658263305322125e-07, "loss": 0.0033, "ppl": 0.4267578125, "reward": 0.9755133986473083, "reward_std": 0.02047595870681107, "rewards/single_object_detection_bbox_reward": 0.9755134582519531, "step": 388, "temperature": 0.9 }, { "advantages": -3.3966666279638957e-07, "completion_length": 31.0, "delta_ref_entropy_loss": 0.2724609375, "delta_ref_ppl": -0.220703125, "entropy_loss": -0.51171875, "epoch": 0.5444366689993002, "grad_norm": 4.705344417120617, "k1_kl": 0.220703125, "k3_kl": 0.093017578125, "kimi_kl": 0.16064453125, "learning_rate": 4.551820728291316e-07, "loss": 0.0037, "ppl": 0.4541015625, "reward": 0.9467070996761322, "reward_std": 0.018499745521694422, "rewards/single_object_detection_bbox_reward": 0.9467071294784546, "step": 389, "temperature": 0.9 }, { "advantages": 9.540069640934234e-06, "completion_length": 114.25, "delta_ref_entropy_loss": 0.24072265625, "delta_ref_ppl": -0.1923828125, "entropy_loss": -0.5068359375, "epoch": 0.5458362491252624, "grad_norm": 2.5880021014148844, "k1_kl": 0.1923828125, "k3_kl": 0.08544921875, "kimi_kl": 0.13232421875, "learning_rate": 4.53781512605042e-07, "loss": 0.0034, "ppl": 0.439453125, "reward": 0.9920886754989624, "reward_std": 0.01198854693211615, "rewards/single_object_detection_bbox_reward": 0.9920887649059296, "step": 390, "temperature": 0.9 }, { "advantages": -6.232410896700458e-06, "completion_length": 124.5, "delta_ref_entropy_loss": 0.24560546875, "delta_ref_ppl": -0.17431640625, "entropy_loss": -0.509765625, "epoch": 0.5472358292512246, "grad_norm": 2.342854049304154, "k1_kl": 0.1748046875, "k3_kl": 0.076904296875, "kimi_kl": 0.117919921875, "learning_rate": 4.5238095238095237e-07, "loss": 0.0031, "ppl": 0.4521484375, "reward": 0.9753295481204987, "reward_std": 0.013924327678978443, "rewards/single_object_detection_bbox_reward": 0.9753296375274658, "step": 391, "temperature": 0.9 }, { "advantages": 1.2837750546168536e-05, "completion_length": 70.5, "delta_ref_entropy_loss": 0.2255859375, "delta_ref_ppl": -0.17236328125, "entropy_loss": -0.490234375, "epoch": 0.5486354093771868, "grad_norm": 2.4473324657392155, "k1_kl": 0.17236328125, "k3_kl": 0.0721435546875, "kimi_kl": 0.13037109375, "learning_rate": 4.5098039215686274e-07, "loss": 0.0029, "ppl": 0.4345703125, "reward": 0.9754654169082642, "reward_std": 0.009021535748615861, "rewards/single_object_detection_bbox_reward": 0.9754654765129089, "step": 392, "temperature": 0.9 }, { "advantages": -6.643789674853906e-06, "completion_length": 60.5, "delta_ref_entropy_loss": 0.236328125, "delta_ref_ppl": -0.19873046875, "entropy_loss": -0.52734375, "epoch": 0.5500349895031491, "grad_norm": 2.360732169576065, "k1_kl": 0.19970703125, "k3_kl": 0.097412109375, "kimi_kl": 0.1806640625, "learning_rate": 4.495798319327731e-07, "loss": 0.0039, "ppl": 0.4716796875, "reward": 0.9757494330406189, "reward_std": 0.008015302708372474, "rewards/single_object_detection_bbox_reward": 0.9757495224475861, "step": 393, "temperature": 0.9 }, { "advantages": 1.8261906461702893e-05, "completion_length": 69.0625, "delta_ref_entropy_loss": 0.2314453125, "delta_ref_ppl": -0.20166015625, "entropy_loss": -0.5205078125, "epoch": 0.5514345696291113, "grad_norm": 1.8532547121259595, "k1_kl": 0.20166015625, "k3_kl": 0.094482421875, "kimi_kl": 0.1669921875, "learning_rate": 4.481792717086835e-07, "loss": 0.0038, "ppl": 0.455078125, "reward": 0.9834299385547638, "reward_std": 0.015274374280124903, "rewards/single_object_detection_bbox_reward": 0.9834299683570862, "step": 394, "temperature": 0.9 }, { "advantages": 4.399834061530328e-06, "completion_length": 143.65625, "delta_ref_entropy_loss": 0.2138671875, "delta_ref_ppl": -0.18359375, "entropy_loss": -0.5625, "epoch": 0.5528341497550735, "grad_norm": 2.3193805310300757, "k1_kl": 0.18408203125, "k3_kl": 0.078125, "kimi_kl": 0.128173828125, "learning_rate": 4.4677871148459387e-07, "loss": 0.0031, "ppl": 0.4931640625, "reward": 0.9223261773586273, "reward_std": 0.03393973130732775, "rewards/single_object_detection_bbox_reward": 0.9223261773586273, "step": 395, "temperature": 0.9 }, { "advantages": 4.269183136784704e-06, "completion_length": 75.75, "delta_ref_entropy_loss": 0.2216796875, "delta_ref_ppl": -0.20458984375, "entropy_loss": -0.56640625, "epoch": 0.5542337298810357, "grad_norm": 2.352681147279962, "k1_kl": 0.2041015625, "k3_kl": 0.103515625, "kimi_kl": 0.1943359375, "learning_rate": 4.453781512605042e-07, "loss": 0.0041, "ppl": 0.50390625, "reward": 0.969637542963028, "reward_std": 0.02086068969219923, "rewards/single_object_detection_bbox_reward": 0.9696376323699951, "step": 396, "temperature": 0.9 }, { "advantages": -1.0380521871411474e-05, "completion_length": 68.53125, "delta_ref_entropy_loss": 0.24462890625, "delta_ref_ppl": -0.22705078125, "entropy_loss": -0.544921875, "epoch": 0.5556333100069979, "grad_norm": 6.458236085200537, "k1_kl": 0.22705078125, "k3_kl": 0.113037109375, "kimi_kl": 0.21728515625, "learning_rate": 4.439775910364145e-07, "loss": 0.0045, "ppl": 0.4765625, "reward": 0.9286729395389557, "reward_std": 0.044537849724292755, "rewards/single_object_detection_bbox_reward": 0.9286729395389557, "step": 397, "temperature": 0.9 }, { "advantages": -1.0602974725770764e-05, "completion_length": 40.53125, "delta_ref_entropy_loss": 0.22900390625, "delta_ref_ppl": -0.17822265625, "entropy_loss": -0.505859375, "epoch": 0.5570328901329601, "grad_norm": 2.148532064941402, "k1_kl": 0.177734375, "k3_kl": 0.077880859375, "kimi_kl": 0.130615234375, "learning_rate": 4.425770308123249e-07, "loss": 0.0031, "ppl": 0.4521484375, "reward": 0.9548574686050415, "reward_std": 0.006401536869816482, "rewards/single_object_detection_bbox_reward": 0.9548575282096863, "step": 398, "temperature": 0.9 }, { "advantages": 4.883390147369937e-06, "completion_length": 81.34375, "delta_ref_entropy_loss": 0.22216796875, "delta_ref_ppl": -0.20068359375, "entropy_loss": -0.541015625, "epoch": 0.5584324702589223, "grad_norm": 15.515017422492416, "k1_kl": 0.201171875, "k3_kl": 0.09765625, "kimi_kl": 0.16552734375, "learning_rate": 4.4117647058823526e-07, "loss": 0.0039, "ppl": 0.484375, "reward": 0.9220120906829834, "reward_std": 0.03675997816026211, "rewards/single_object_detection_bbox_reward": 0.922012209892273, "step": 399, "temperature": 0.9 }, { "advantages": 5.990533281874377e-06, "completion_length": 116.1875, "delta_ref_entropy_loss": 0.22607421875, "delta_ref_ppl": -0.185546875, "entropy_loss": -0.529296875, "epoch": 0.5598320503848845, "grad_norm": 3.74607107885756, "k1_kl": 0.185546875, "k3_kl": 0.089111328125, "kimi_kl": 0.15234375, "learning_rate": 4.3977591036414563e-07, "loss": 0.0035, "ppl": 0.4697265625, "reward": 0.9451086819171906, "reward_std": 0.03822888806462288, "rewards/single_object_detection_bbox_reward": 0.9451086819171906, "step": 400, "temperature": 0.9 }, { "advantages": 3.636016572272638e-06, "completion_length": 60.0, "delta_ref_entropy_loss": 0.2373046875, "delta_ref_ppl": -0.2314453125, "entropy_loss": -0.541015625, "epoch": 0.5612316305108468, "grad_norm": 2.3159862458433182, "k1_kl": 0.2314453125, "k3_kl": 0.116455078125, "kimi_kl": 0.2216796875, "learning_rate": 4.38375350140056e-07, "loss": 0.0047, "ppl": 0.4794921875, "reward": 0.9398345947265625, "reward_std": 0.0276358500123024, "rewards/single_object_detection_bbox_reward": 0.9398346841335297, "step": 401, "temperature": 0.9 }, { "advantages": -1.7379277778672986e-05, "completion_length": 40.5, "delta_ref_entropy_loss": 0.224609375, "delta_ref_ppl": -0.1787109375, "entropy_loss": -0.5146484375, "epoch": 0.562631210636809, "grad_norm": 2.827112217832387, "k1_kl": 0.17822265625, "k3_kl": 0.083251953125, "kimi_kl": 0.155517578125, "learning_rate": 4.369747899159664e-07, "loss": 0.0033, "ppl": 0.462890625, "reward": 0.9614855349063873, "reward_std": 0.01616683229804039, "rewards/single_object_detection_bbox_reward": 0.9614856541156769, "step": 402, "temperature": 0.9 }, { "advantages": 1.5872664789640112e-05, "completion_length": 39.96875, "delta_ref_entropy_loss": 0.27734375, "delta_ref_ppl": -0.29736328125, "entropy_loss": -0.4990234375, "epoch": 0.5640307907627712, "grad_norm": 2.889413751071054, "k1_kl": 0.29736328125, "k3_kl": 0.158935546875, "kimi_kl": 0.369140625, "learning_rate": 4.3557422969187675e-07, "loss": 0.0063, "ppl": 0.4365234375, "reward": 0.828760027885437, "reward_std": 0.0244603231549263, "rewards/single_object_detection_bbox_reward": 0.8287600576877594, "step": 403, "temperature": 0.9 }, { "advantages": 6.959906755810152e-06, "completion_length": 79.125, "delta_ref_entropy_loss": 0.24609375, "delta_ref_ppl": -0.1904296875, "entropy_loss": -0.4892578125, "epoch": 0.5654303708887334, "grad_norm": 2.978120256074333, "k1_kl": 0.19091796875, "k3_kl": 0.080322265625, "kimi_kl": 0.141357421875, "learning_rate": 4.341736694677871e-07, "loss": 0.0032, "ppl": 0.43359375, "reward": 0.9806303381919861, "reward_std": 0.02760624559596181, "rewards/single_object_detection_bbox_reward": 0.9806303977966309, "step": 404, "temperature": 0.9 }, { "advantages": 5.399808614470203e-06, "completion_length": 67.65625, "delta_ref_entropy_loss": 0.2548828125, "delta_ref_ppl": -0.2119140625, "entropy_loss": -0.498046875, "epoch": 0.5668299510146956, "grad_norm": 4.1327160009725095, "k1_kl": 0.21240234375, "k3_kl": 0.0966796875, "kimi_kl": 0.175048828125, "learning_rate": 4.327731092436975e-07, "loss": 0.0039, "ppl": 0.447265625, "reward": 0.9546349942684174, "reward_std": 0.016212548594921827, "rewards/single_object_detection_bbox_reward": 0.9546350240707397, "step": 405, "temperature": 0.9 }, { "advantages": -1.4600544545828598e-05, "completion_length": 77.9375, "delta_ref_entropy_loss": 0.23193359375, "delta_ref_ppl": -0.1806640625, "entropy_loss": -0.53125, "epoch": 0.5682295311406578, "grad_norm": 2.0534360782878642, "k1_kl": 0.18017578125, "k3_kl": 0.075439453125, "kimi_kl": 0.1298828125, "learning_rate": 4.313725490196078e-07, "loss": 0.003, "ppl": 0.46484375, "reward": 0.9640625417232513, "reward_std": 0.0055642700172029436, "rewards/single_object_detection_bbox_reward": 0.9640625715255737, "step": 406, "temperature": 0.9 }, { "advantages": 7.114373602234991e-06, "completion_length": 172.09375, "delta_ref_entropy_loss": 0.24462890625, "delta_ref_ppl": -0.1875, "entropy_loss": -0.5283203125, "epoch": 0.56962911126662, "grad_norm": 3.207719230190128, "k1_kl": 0.1884765625, "k3_kl": 0.09033203125, "kimi_kl": 0.1533203125, "learning_rate": 4.299719887955182e-07, "loss": 0.0036, "ppl": 0.46875, "reward": 0.9674240052700043, "reward_std": 0.020460971165448427, "rewards/single_object_detection_bbox_reward": 0.9674240946769714, "step": 407, "temperature": 0.9 }, { "advantages": -4.257476575730834e-07, "completion_length": 50.53125, "delta_ref_entropy_loss": 0.25244140625, "delta_ref_ppl": -0.21630859375, "entropy_loss": -0.50390625, "epoch": 0.5710286913925823, "grad_norm": 2.358151063110923, "k1_kl": 0.21630859375, "k3_kl": 0.0927734375, "kimi_kl": 0.1650390625, "learning_rate": 4.285714285714285e-07, "loss": 0.0037, "ppl": 0.4443359375, "reward": 0.9814941883087158, "reward_std": 0.007477845065295696, "rewards/single_object_detection_bbox_reward": 0.9814942479133606, "step": 408, "temperature": 0.9 }, { "advantages": -1.6174279608094366e-05, "completion_length": 87.875, "delta_ref_entropy_loss": 0.2392578125, "delta_ref_ppl": -0.19677734375, "entropy_loss": -0.52734375, "epoch": 0.5724282715185445, "grad_norm": 2.8524618008796945, "k1_kl": 0.19677734375, "k3_kl": 0.08837890625, "kimi_kl": 0.14453125, "learning_rate": 4.271708683473389e-07, "loss": 0.0035, "ppl": 0.46875, "reward": 0.9569520652294159, "reward_std": 0.011668029241263866, "rewards/single_object_detection_bbox_reward": 0.9569520950317383, "step": 409, "temperature": 0.9 }, { "advantages": -1.265960145246936e-05, "completion_length": 49.34375, "delta_ref_entropy_loss": 0.2412109375, "delta_ref_ppl": -0.21484375, "entropy_loss": -0.5419921875, "epoch": 0.5738278516445067, "grad_norm": 3.7585098793713803, "k1_kl": 0.21337890625, "k3_kl": 0.10107421875, "kimi_kl": 0.19775390625, "learning_rate": 4.2577030812324926e-07, "loss": 0.0041, "ppl": 0.478515625, "reward": 0.9222940504550934, "reward_std": 0.05158570781350136, "rewards/single_object_detection_bbox_reward": 0.9222941100597382, "step": 410, "temperature": 0.9 }, { "advantages": 8.63894865688053e-06, "completion_length": 60.0, "delta_ref_entropy_loss": 0.24462890625, "delta_ref_ppl": -0.2265625, "entropy_loss": -0.5185546875, "epoch": 0.5752274317704689, "grad_norm": 2.272392124997542, "k1_kl": 0.2275390625, "k3_kl": 0.1083984375, "kimi_kl": 0.20458984375, "learning_rate": 4.2436974789915964e-07, "loss": 0.0043, "ppl": 0.4619140625, "reward": 0.9241535663604736, "reward_std": 0.03428897354751825, "rewards/single_object_detection_bbox_reward": 0.9241536855697632, "step": 411, "temperature": 0.9 }, { "advantages": -1.1313574304949725e-05, "completion_length": 68.78125, "delta_ref_entropy_loss": 0.21630859375, "delta_ref_ppl": -0.17822265625, "entropy_loss": -0.501953125, "epoch": 0.5766270118964311, "grad_norm": 2.329609212346474, "k1_kl": 0.177734375, "k3_kl": 0.079345703125, "kimi_kl": 0.1494140625, "learning_rate": 4.2296918767507e-07, "loss": 0.0032, "ppl": 0.4384765625, "reward": 0.9561617076396942, "reward_std": 0.017164418939501047, "rewards/single_object_detection_bbox_reward": 0.956161767244339, "step": 412, "temperature": 0.9 }, { "advantages": -1.7844141666500946e-06, "completion_length": 71.21875, "delta_ref_entropy_loss": 0.2314453125, "delta_ref_ppl": -0.2119140625, "entropy_loss": -0.5458984375, "epoch": 0.5780265920223933, "grad_norm": 2.7412425224620227, "k1_kl": 0.21240234375, "k3_kl": 0.098388671875, "kimi_kl": 0.1884765625, "learning_rate": 4.215686274509804e-07, "loss": 0.0039, "ppl": 0.482421875, "reward": 0.9186808168888092, "reward_std": 0.039630543906241655, "rewards/single_object_detection_bbox_reward": 0.9186809062957764, "step": 413, "temperature": 0.9 }, { "advantages": 4.808817720913794e-06, "completion_length": 40.71875, "delta_ref_entropy_loss": 0.2314453125, "delta_ref_ppl": -0.17578125, "entropy_loss": -0.4814453125, "epoch": 0.5794261721483555, "grad_norm": 4.4818140964160875, "k1_kl": 0.17578125, "k3_kl": 0.076171875, "kimi_kl": 0.131591796875, "learning_rate": 4.2016806722689076e-07, "loss": 0.003, "ppl": 0.423828125, "reward": 0.9718904793262482, "reward_std": 0.008187896572053432, "rewards/single_object_detection_bbox_reward": 0.9718905389308929, "step": 414, "temperature": 0.9 }, { "advantages": 1.1883144004798396e-05, "completion_length": 107.40625, "delta_ref_entropy_loss": 0.2373046875, "delta_ref_ppl": -0.17431640625, "entropy_loss": -0.5546875, "epoch": 0.5808257522743177, "grad_norm": 2.6638221311573, "k1_kl": 0.17431640625, "k3_kl": 0.0693359375, "kimi_kl": 0.099853515625, "learning_rate": 4.1876750700280113e-07, "loss": 0.0028, "ppl": 0.486328125, "reward": 0.9428456425666809, "reward_std": 0.029612469486892223, "rewards/single_object_detection_bbox_reward": 0.9428457617759705, "step": 415, "temperature": 0.9 }, { "advantages": 5.776594946382829e-06, "completion_length": 62.03125, "delta_ref_entropy_loss": 0.26611328125, "delta_ref_ppl": -0.22021484375, "entropy_loss": -0.525390625, "epoch": 0.58222533240028, "grad_norm": 5.449360492802425, "k1_kl": 0.220703125, "k3_kl": 0.1015625, "kimi_kl": 0.17626953125, "learning_rate": 4.173669467787115e-07, "loss": 0.0041, "ppl": 0.46484375, "reward": 0.9115927517414093, "reward_std": 0.03178727254271507, "rewards/single_object_detection_bbox_reward": 0.9115928113460541, "step": 416, "temperature": 0.9 }, { "advantages": -4.6033947000978515e-06, "completion_length": 164.28125, "delta_ref_entropy_loss": 0.25, "delta_ref_ppl": -0.224609375, "entropy_loss": -0.515625, "epoch": 0.5836249125262422, "grad_norm": 2.6879073918210166, "k1_kl": 0.224609375, "k3_kl": 0.10400390625, "kimi_kl": 0.19140625, "learning_rate": 4.159663865546218e-07, "loss": 0.0042, "ppl": 0.453125, "reward": 0.9622102081775665, "reward_std": 0.03627282194793224, "rewards/single_object_detection_bbox_reward": 0.9622102677822113, "step": 417, "temperature": 0.9 }, { "advantages": 6.332993507385254e-07, "completion_length": 69.21875, "delta_ref_entropy_loss": 0.25146484375, "delta_ref_ppl": -0.2412109375, "entropy_loss": -0.544921875, "epoch": 0.5850244926522044, "grad_norm": 2.555994846659239, "k1_kl": 0.24072265625, "k3_kl": 0.118408203125, "kimi_kl": 0.2568359375, "learning_rate": 4.145658263305322e-07, "loss": 0.0047, "ppl": 0.484375, "reward": 0.949439287185669, "reward_std": 0.016261025797575712, "rewards/single_object_detection_bbox_reward": 0.9494393467903137, "step": 418, "temperature": 0.9 }, { "advantages": -6.471095275628613e-06, "completion_length": 96.90625, "delta_ref_entropy_loss": 0.26806640625, "delta_ref_ppl": -0.23095703125, "entropy_loss": -0.4833984375, "epoch": 0.5864240727781665, "grad_norm": 4.690156100392576, "k1_kl": 0.23095703125, "k3_kl": 0.114501953125, "kimi_kl": 0.26171875, "learning_rate": 4.131652661064425e-07, "loss": 0.0046, "ppl": 0.4296875, "reward": 0.995433121919632, "reward_std": 0.0009971547115128487, "rewards/single_object_detection_bbox_reward": 0.9954332113265991, "step": 419, "temperature": 0.9 }, { "advantages": 5.432138777905493e-06, "completion_length": 50.5, "delta_ref_entropy_loss": 0.24169921875, "delta_ref_ppl": -0.19384765625, "entropy_loss": -0.5068359375, "epoch": 0.5878236529041287, "grad_norm": 12.711270867841808, "k1_kl": 0.1943359375, "k3_kl": 0.08837890625, "kimi_kl": 0.15673828125, "learning_rate": 4.117647058823529e-07, "loss": 0.0035, "ppl": 0.4501953125, "reward": 0.9668359160423279, "reward_std": 0.010438218712806702, "rewards/single_object_detection_bbox_reward": 0.9668359756469727, "step": 420, "temperature": 0.9 }, { "advantages": -1.9872029952239245e-05, "completion_length": 70.90625, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.18408203125, "entropy_loss": -0.505859375, "epoch": 0.5892232330300909, "grad_norm": 2.762836945710889, "k1_kl": 0.1845703125, "k3_kl": 0.07568359375, "kimi_kl": 0.12158203125, "learning_rate": 4.1036414565826327e-07, "loss": 0.003, "ppl": 0.4443359375, "reward": 0.9512334167957306, "reward_std": 0.022379704751074314, "rewards/single_object_detection_bbox_reward": 0.9512334764003754, "step": 421, "temperature": 0.9 }, { "advantages": 7.835882342988043e-06, "completion_length": 71.0, "delta_ref_entropy_loss": 0.2607421875, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.4638671875, "epoch": 0.5906228131560531, "grad_norm": 2.356274506357668, "k1_kl": 0.208984375, "k3_kl": 0.088134765625, "kimi_kl": 0.14794921875, "learning_rate": 4.0896358543417364e-07, "loss": 0.0035, "ppl": 0.40625, "reward": 0.9652251303195953, "reward_std": 0.0006361566775012761, "rewards/single_object_detection_bbox_reward": 0.9652252197265625, "step": 422, "temperature": 0.9 }, { "advantages": -1.1693154817749019e-05, "completion_length": 70.5, "delta_ref_entropy_loss": 0.24072265625, "delta_ref_ppl": -0.201171875, "entropy_loss": -0.5048828125, "epoch": 0.5920223932820154, "grad_norm": 4.360958652515911, "k1_kl": 0.20068359375, "k3_kl": 0.088134765625, "kimi_kl": 0.15380859375, "learning_rate": 4.07563025210084e-07, "loss": 0.0035, "ppl": 0.4443359375, "reward": 0.9669423401355743, "reward_std": 0.011636595707386732, "rewards/single_object_detection_bbox_reward": 0.9669423401355743, "step": 423, "temperature": 0.9 }, { "advantages": -1.9450006902843597e-06, "completion_length": 68.65625, "delta_ref_entropy_loss": 0.25830078125, "delta_ref_ppl": -0.2314453125, "entropy_loss": -0.4609375, "epoch": 0.5934219734079776, "grad_norm": 3.7868893298157023, "k1_kl": 0.2314453125, "k3_kl": 0.111572265625, "kimi_kl": 0.2158203125, "learning_rate": 4.061624649859944e-07, "loss": 0.0045, "ppl": 0.4130859375, "reward": 0.9046172201633453, "reward_std": 0.029195432551205158, "rewards/single_object_detection_bbox_reward": 0.9046173691749573, "step": 424, "temperature": 0.9 }, { "advantages": 4.152634346610284e-06, "completion_length": 140.21875, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.2001953125, "entropy_loss": -0.541015625, "epoch": 0.5948215535339398, "grad_norm": 2.4050804108099197, "k1_kl": 0.2001953125, "k3_kl": 0.089599609375, "kimi_kl": 0.15625, "learning_rate": 4.0476190476190476e-07, "loss": 0.0036, "ppl": 0.4765625, "reward": 0.9280202686786652, "reward_std": 0.04146337741985917, "rewards/single_object_detection_bbox_reward": 0.9280203580856323, "step": 425, "temperature": 0.9 }, { "advantages": -2.6444241143508407e-06, "completion_length": 99.15625, "delta_ref_entropy_loss": 0.2568359375, "delta_ref_ppl": -0.244140625, "entropy_loss": -0.5546875, "epoch": 0.596221133659902, "grad_norm": 4.273698927408721, "k1_kl": 0.24365234375, "k3_kl": 0.1220703125, "kimi_kl": 0.29443359375, "learning_rate": 4.0336134453781514e-07, "loss": 0.0049, "ppl": 0.4892578125, "reward": 0.9303644597530365, "reward_std": 0.027086918242275715, "rewards/single_object_detection_bbox_reward": 0.9303645193576813, "step": 426, "temperature": 0.9 }, { "advantages": -2.3228582449519308e-05, "completion_length": 72.0, "delta_ref_entropy_loss": 0.22509765625, "delta_ref_ppl": -0.19140625, "entropy_loss": -0.5263671875, "epoch": 0.5976207137858642, "grad_norm": 2.200820625373534, "k1_kl": 0.19287109375, "k3_kl": 0.0908203125, "kimi_kl": 0.16845703125, "learning_rate": 4.019607843137255e-07, "loss": 0.0037, "ppl": 0.466796875, "reward": 0.9488346874713898, "reward_std": 0.02691508736461401, "rewards/single_object_detection_bbox_reward": 0.9488346874713898, "step": 427, "temperature": 0.9 }, { "advantages": 8.154394890880212e-06, "completion_length": 109.28125, "delta_ref_entropy_loss": 0.2431640625, "delta_ref_ppl": -0.18603515625, "entropy_loss": -0.5263671875, "epoch": 0.5990202939118264, "grad_norm": 2.745310315518311, "k1_kl": 0.18603515625, "k3_kl": 0.07763671875, "kimi_kl": 0.1181640625, "learning_rate": 4.0056022408963583e-07, "loss": 0.0031, "ppl": 0.466796875, "reward": 0.9447384774684906, "reward_std": 0.01741216378286481, "rewards/single_object_detection_bbox_reward": 0.9447385668754578, "step": 428, "temperature": 0.9 }, { "advantages": -1.4049533490378963e-05, "completion_length": 128.6875, "delta_ref_entropy_loss": 0.2138671875, "delta_ref_ppl": -0.20458984375, "entropy_loss": -0.576171875, "epoch": 0.6004198740377886, "grad_norm": 3.9786010352146763, "k1_kl": 0.20458984375, "k3_kl": 0.1005859375, "kimi_kl": 0.20703125, "learning_rate": 3.991596638655462e-07, "loss": 0.004, "ppl": 0.513671875, "reward": 0.947374165058136, "reward_std": 0.022336089052259922, "rewards/single_object_detection_bbox_reward": 0.9473741948604584, "step": 429, "temperature": 0.9 }, { "advantages": -2.73542764261947e-06, "completion_length": 88.5, "delta_ref_entropy_loss": 0.216796875, "delta_ref_ppl": -0.17919921875, "entropy_loss": -0.51953125, "epoch": 0.6018194541637508, "grad_norm": 5.194646259534791, "k1_kl": 0.1787109375, "k3_kl": 0.08154296875, "kimi_kl": 0.156982421875, "learning_rate": 3.9775910364145653e-07, "loss": 0.0033, "ppl": 0.458984375, "reward": 0.947076141834259, "reward_std": 0.01125338557176292, "rewards/single_object_detection_bbox_reward": 0.9470761716365814, "step": 430, "temperature": 0.9 }, { "advantages": 5.3504484185395995e-06, "completion_length": 48.5, "delta_ref_entropy_loss": 0.27294921875, "delta_ref_ppl": -0.197265625, "entropy_loss": -0.470703125, "epoch": 0.6032190342897131, "grad_norm": 3.6660818352749516, "k1_kl": 0.19677734375, "k3_kl": 0.07568359375, "kimi_kl": 0.112548828125, "learning_rate": 3.963585434173669e-07, "loss": 0.003, "ppl": 0.4140625, "reward": 0.9882084429264069, "reward_std": 0.010249376995489001, "rewards/single_object_detection_bbox_reward": 0.9882084727287292, "step": 431, "temperature": 0.9 }, { "advantages": -9.81294715529657e-06, "completion_length": 106.5, "delta_ref_entropy_loss": 0.23388671875, "delta_ref_ppl": -0.21923828125, "entropy_loss": -0.53125, "epoch": 0.6046186144156753, "grad_norm": 2.530043835559063, "k1_kl": 0.21923828125, "k3_kl": 0.112060546875, "kimi_kl": 0.24560546875, "learning_rate": 3.949579831932773e-07, "loss": 0.0045, "ppl": 0.4716796875, "reward": 0.9323261678218842, "reward_std": 0.02201015129685402, "rewards/single_object_detection_bbox_reward": 0.9323262572288513, "step": 432, "temperature": 0.9 }, { "advantages": -3.4816831657735747e-06, "completion_length": 69.0625, "delta_ref_entropy_loss": 0.251953125, "delta_ref_ppl": -0.21533203125, "entropy_loss": -0.5302734375, "epoch": 0.6060181945416375, "grad_norm": 2.6031387751223494, "k1_kl": 0.2158203125, "k3_kl": 0.099365234375, "kimi_kl": 0.19580078125, "learning_rate": 3.9355742296918765e-07, "loss": 0.004, "ppl": 0.4638671875, "reward": 0.8801971971988678, "reward_std": 0.032250299118459225, "rewards/single_object_detection_bbox_reward": 0.880197286605835, "step": 433, "temperature": 0.9 }, { "advantages": 1.581385959070758e-06, "completion_length": 102.1875, "delta_ref_entropy_loss": 0.24267578125, "delta_ref_ppl": -0.2080078125, "entropy_loss": -0.533203125, "epoch": 0.6074177746675997, "grad_norm": 3.0315648546197993, "k1_kl": 0.2080078125, "k3_kl": 0.096435546875, "kimi_kl": 0.21337890625, "learning_rate": 3.92156862745098e-07, "loss": 0.0039, "ppl": 0.46875, "reward": 0.9720211923122406, "reward_std": 0.010062003042548895, "rewards/single_object_detection_bbox_reward": 0.9720212519168854, "step": 434, "temperature": 0.9 }, { "advantages": 4.4623675421462394e-07, "completion_length": 78.0, "delta_ref_entropy_loss": 0.24267578125, "delta_ref_ppl": -0.19140625, "entropy_loss": -0.521484375, "epoch": 0.6088173547935619, "grad_norm": 2.2217638149389045, "k1_kl": 0.19189453125, "k3_kl": 0.083740234375, "kimi_kl": 0.165771484375, "learning_rate": 3.907563025210084e-07, "loss": 0.0033, "ppl": 0.4599609375, "reward": 0.9849400520324707, "reward_std": 0.011558742495253682, "rewards/single_object_detection_bbox_reward": 0.9849401414394379, "step": 435, "temperature": 0.9 }, { "advantages": 4.779281496780641e-06, "completion_length": 26.96875, "delta_ref_entropy_loss": 0.24169921875, "delta_ref_ppl": -0.18603515625, "entropy_loss": -0.501953125, "epoch": 0.6102169349195241, "grad_norm": 3.9786574518890037, "k1_kl": 0.18603515625, "k3_kl": 0.083984375, "kimi_kl": 0.114013671875, "learning_rate": 3.8935574229691877e-07, "loss": 0.0033, "ppl": 0.4462890625, "reward": 0.8993108868598938, "reward_std": 0.02591607451904565, "rewards/single_object_detection_bbox_reward": 0.8993108868598938, "step": 436, "temperature": 0.9 }, { "advantages": -2.332031726837158e-06, "completion_length": 94.4375, "delta_ref_entropy_loss": 0.2216796875, "delta_ref_ppl": -0.19384765625, "entropy_loss": -0.51171875, "epoch": 0.6116165150454863, "grad_norm": 2.837060609234072, "k1_kl": 0.1943359375, "k3_kl": 0.0927734375, "kimi_kl": 0.201171875, "learning_rate": 3.8795518207282914e-07, "loss": 0.0037, "ppl": 0.4521484375, "reward": 0.9463383555412292, "reward_std": 0.029582262970507145, "rewards/single_object_detection_bbox_reward": 0.9463384449481964, "step": 437, "temperature": 0.9 }, { "advantages": -1.4678973911941284e-06, "completion_length": 117.03125, "delta_ref_entropy_loss": 0.2587890625, "delta_ref_ppl": -0.21533203125, "entropy_loss": -0.50390625, "epoch": 0.6130160951714486, "grad_norm": 4.124084253388043, "k1_kl": 0.21533203125, "k3_kl": 0.099853515625, "kimi_kl": 0.17724609375, "learning_rate": 3.865546218487395e-07, "loss": 0.004, "ppl": 0.4453125, "reward": 0.9435870051383972, "reward_std": 0.031720384024083614, "rewards/single_object_detection_bbox_reward": 0.9435870945453644, "step": 438, "temperature": 0.9 }, { "advantages": 1.2906535175716272e-05, "completion_length": 38.5625, "delta_ref_entropy_loss": 0.24658203125, "delta_ref_ppl": -0.20849609375, "entropy_loss": -0.4716796875, "epoch": 0.6144156752974108, "grad_norm": 3.0971251833454176, "k1_kl": 0.20849609375, "k3_kl": 0.100830078125, "kimi_kl": 0.18115234375, "learning_rate": 3.8515406162464984e-07, "loss": 0.004, "ppl": 0.4140625, "reward": 0.9689906239509583, "reward_std": 0.022334317211061716, "rewards/single_object_detection_bbox_reward": 0.9689907133579254, "step": 439, "temperature": 0.9 }, { "advantages": -1.2007409736725094e-05, "completion_length": 89.3125, "delta_ref_entropy_loss": 0.20703125, "delta_ref_ppl": -0.185546875, "entropy_loss": -0.5625, "epoch": 0.615815255423373, "grad_norm": 3.028178101936149, "k1_kl": 0.1845703125, "k3_kl": 0.08349609375, "kimi_kl": 0.13671875, "learning_rate": 3.837535014005602e-07, "loss": 0.0034, "ppl": 0.4990234375, "reward": 0.9491295218467712, "reward_std": 0.021570760756731033, "rewards/single_object_detection_bbox_reward": 0.9491295516490936, "step": 440, "temperature": 0.9 }, { "advantages": -2.001731627387926e-05, "completion_length": 74.75, "delta_ref_entropy_loss": 0.2353515625, "delta_ref_ppl": -0.2041015625, "entropy_loss": -0.513671875, "epoch": 0.6172148355493352, "grad_norm": 3.323140368182458, "k1_kl": 0.20361328125, "k3_kl": 0.09716796875, "kimi_kl": 0.197998046875, "learning_rate": 3.8235294117647053e-07, "loss": 0.0039, "ppl": 0.4501953125, "reward": 0.9485780894756317, "reward_std": 0.0248686196282506, "rewards/single_object_detection_bbox_reward": 0.9485781192779541, "step": 441, "temperature": 0.9 }, { "advantages": -4.287543561076745e-06, "completion_length": 97.90625, "delta_ref_entropy_loss": 0.20849609375, "delta_ref_ppl": -0.18212890625, "entropy_loss": -0.5546875, "epoch": 0.6186144156752974, "grad_norm": 4.191340662938949, "k1_kl": 0.18212890625, "k3_kl": 0.0849609375, "kimi_kl": 0.144775390625, "learning_rate": 3.809523809523809e-07, "loss": 0.0034, "ppl": 0.4931640625, "reward": 0.9314520061016083, "reward_std": 0.04806480277329683, "rewards/single_object_detection_bbox_reward": 0.9314520955085754, "step": 442, "temperature": 0.9 }, { "advantages": 1.1104026725661242e-05, "completion_length": 90.15625, "delta_ref_entropy_loss": 0.26171875, "delta_ref_ppl": -0.21484375, "entropy_loss": -0.5107421875, "epoch": 0.6200139958012596, "grad_norm": 2.444113381854868, "k1_kl": 0.21484375, "k3_kl": 0.093505859375, "kimi_kl": 0.15380859375, "learning_rate": 3.795518207282913e-07, "loss": 0.0037, "ppl": 0.4541015625, "reward": 0.9501246213912964, "reward_std": 0.00986072700470686, "rewards/single_object_detection_bbox_reward": 0.9501247107982635, "step": 443, "temperature": 0.9 }, { "advantages": -2.3529199779659393e-06, "completion_length": 135.25, "delta_ref_entropy_loss": 0.224609375, "delta_ref_ppl": -0.1796875, "entropy_loss": -0.509765625, "epoch": 0.6214135759272218, "grad_norm": 2.525132673692066, "k1_kl": 0.1796875, "k3_kl": 0.07568359375, "kimi_kl": 0.11865234375, "learning_rate": 3.7815126050420166e-07, "loss": 0.003, "ppl": 0.453125, "reward": 0.9426948130130768, "reward_std": 0.026395166292786598, "rewards/single_object_detection_bbox_reward": 0.942694902420044, "step": 444, "temperature": 0.9 }, { "advantages": 2.3096799850463867e-06, "completion_length": 144.96875, "delta_ref_entropy_loss": 0.21484375, "delta_ref_ppl": -0.18017578125, "entropy_loss": -0.537109375, "epoch": 0.622813156053184, "grad_norm": 4.107596913870407, "k1_kl": 0.18017578125, "k3_kl": 0.084716796875, "kimi_kl": 0.1435546875, "learning_rate": 3.7675070028011203e-07, "loss": 0.0034, "ppl": 0.4775390625, "reward": 0.9638459384441376, "reward_std": 0.017667249776422977, "rewards/single_object_detection_bbox_reward": 0.9638459980487823, "step": 445, "temperature": 0.9 }, { "advantages": 4.031296413131713e-06, "completion_length": 89.65625, "delta_ref_entropy_loss": 0.2470703125, "delta_ref_ppl": -0.19677734375, "entropy_loss": -0.552734375, "epoch": 0.6242127361791463, "grad_norm": 1.928052562230989, "k1_kl": 0.1962890625, "k3_kl": 0.090087890625, "kimi_kl": 0.16259765625, "learning_rate": 3.753501400560224e-07, "loss": 0.0036, "ppl": 0.4990234375, "reward": 0.9164307117462158, "reward_std": 0.02317541465163231, "rewards/single_object_detection_bbox_reward": 0.916430801153183, "step": 446, "temperature": 0.9 }, { "advantages": -9.121107495957403e-06, "completion_length": 51.0, "delta_ref_entropy_loss": 0.23486328125, "delta_ref_ppl": -0.18359375, "entropy_loss": -0.5068359375, "epoch": 0.6256123163051085, "grad_norm": 2.8873910180480276, "k1_kl": 0.18408203125, "k3_kl": 0.07763671875, "kimi_kl": 0.12744140625, "learning_rate": 3.739495798319328e-07, "loss": 0.0031, "ppl": 0.4482421875, "reward": 0.9329797327518463, "reward_std": 0.023955040145665407, "rewards/single_object_detection_bbox_reward": 0.9329798221588135, "step": 447, "temperature": 0.9 }, { "advantages": -4.906739832222229e-06, "completion_length": 79.5, "delta_ref_entropy_loss": 0.27587890625, "delta_ref_ppl": -0.259765625, "entropy_loss": -0.4736328125, "epoch": 0.6270118964310707, "grad_norm": 4.395564423035803, "k1_kl": 0.259765625, "k3_kl": 0.123291015625, "kimi_kl": 0.240234375, "learning_rate": 3.7254901960784315e-07, "loss": 0.0049, "ppl": 0.4111328125, "reward": 0.9004068672657013, "reward_std": 0.014965365640819073, "rewards/single_object_detection_bbox_reward": 0.9004068970680237, "step": 448, "temperature": 0.9 }, { "advantages": -2.5902742493144615e-06, "completion_length": 153.75, "delta_ref_entropy_loss": 0.2734375, "delta_ref_ppl": -0.21728515625, "entropy_loss": -0.4951171875, "epoch": 0.6284114765570329, "grad_norm": 4.109142911711747, "k1_kl": 0.21728515625, "k3_kl": 0.09375, "kimi_kl": 0.14990234375, "learning_rate": 3.711484593837535e-07, "loss": 0.0038, "ppl": 0.4326171875, "reward": 0.9655456244945526, "reward_std": 0.02484460803680122, "rewards/single_object_detection_bbox_reward": 0.965545654296875, "step": 449, "temperature": 0.9 }, { "advantages": -1.6011564184736926e-05, "completion_length": 88.5625, "delta_ref_entropy_loss": 0.18798828125, "delta_ref_ppl": -0.17431640625, "entropy_loss": -0.60546875, "epoch": 0.6298110566829951, "grad_norm": 3.5851921069804944, "k1_kl": 0.17529296875, "k3_kl": 0.084228515625, "kimi_kl": 0.16552734375, "learning_rate": 3.6974789915966385e-07, "loss": 0.0034, "ppl": 0.5341796875, "reward": 0.8961506187915802, "reward_std": 0.03661548253148794, "rewards/single_object_detection_bbox_reward": 0.8961505889892578, "step": 450, "temperature": 0.9 }, { "advantages": 4.437885081642889e-06, "completion_length": 292.34375, "delta_ref_entropy_loss": 0.25830078125, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.53125, "epoch": 0.6312106368089573, "grad_norm": 8.64889226311734, "k1_kl": 0.21484375, "k3_kl": 0.104248046875, "kimi_kl": 0.175048828125, "learning_rate": 3.683473389355742e-07, "loss": 0.0042, "ppl": 0.4736328125, "reward": 0.9766197800636292, "reward_std": 0.016513065434992313, "rewards/single_object_detection_bbox_reward": 0.9766198396682739, "step": 451, "temperature": 0.9 }, { "advantages": -1.3171296359359985e-05, "completion_length": 86.125, "delta_ref_entropy_loss": 0.240234375, "delta_ref_ppl": -0.19482421875, "entropy_loss": -0.505859375, "epoch": 0.6326102169349195, "grad_norm": 2.472448564309226, "k1_kl": 0.19482421875, "k3_kl": 0.08544921875, "kimi_kl": 0.1513671875, "learning_rate": 3.6694677871148454e-07, "loss": 0.0034, "ppl": 0.4462890625, "reward": 0.9484961926937103, "reward_std": 0.02957653161138296, "rewards/single_object_detection_bbox_reward": 0.9484962522983551, "step": 452, "temperature": 0.9 }, { "advantages": 1.0899136668740539e-06, "completion_length": 87.1875, "delta_ref_entropy_loss": 0.21728515625, "delta_ref_ppl": -0.193359375, "entropy_loss": -0.5625, "epoch": 0.6340097970608818, "grad_norm": 5.328169443795491, "k1_kl": 0.193359375, "k3_kl": 0.10009765625, "kimi_kl": 0.17626953125, "learning_rate": 3.655462184873949e-07, "loss": 0.004, "ppl": 0.50390625, "reward": 0.8540197610855103, "reward_std": 0.0522612389177084, "rewards/single_object_detection_bbox_reward": 0.8540197908878326, "step": 453, "temperature": 0.9 }, { "advantages": -2.995931765781279e-06, "completion_length": 222.0, "delta_ref_entropy_loss": 0.24609375, "delta_ref_ppl": -0.263671875, "entropy_loss": -0.537109375, "epoch": 0.635409377186844, "grad_norm": 6.376648329032202, "k1_kl": 0.263671875, "k3_kl": 0.142578125, "kimi_kl": 0.373046875, "learning_rate": 3.641456582633053e-07, "loss": 0.0057, "ppl": 0.4794921875, "reward": 0.9281862378120422, "reward_std": 0.0150738496449776, "rewards/single_object_detection_bbox_reward": 0.9281863272190094, "step": 454, "temperature": 0.9 }, { "advantages": 1.9426060333671558e-06, "completion_length": 86.5, "delta_ref_entropy_loss": 0.2412109375, "delta_ref_ppl": -0.19677734375, "entropy_loss": -0.537109375, "epoch": 0.6368089573128062, "grad_norm": 2.582638030691802, "k1_kl": 0.197265625, "k3_kl": 0.08447265625, "kimi_kl": 0.15283203125, "learning_rate": 3.6274509803921566e-07, "loss": 0.0034, "ppl": 0.4697265625, "reward": 0.942881166934967, "reward_std": 0.02140963915735483, "rewards/single_object_detection_bbox_reward": 0.9428811967372894, "step": 455, "temperature": 0.9 }, { "advantages": -3.187784386682324e-06, "completion_length": 91.5625, "delta_ref_entropy_loss": 0.27001953125, "delta_ref_ppl": -0.2275390625, "entropy_loss": -0.478515625, "epoch": 0.6382085374387684, "grad_norm": 4.2848837344276, "k1_kl": 0.2275390625, "k3_kl": 0.098388671875, "kimi_kl": 0.17138671875, "learning_rate": 3.6134453781512604e-07, "loss": 0.0039, "ppl": 0.416015625, "reward": 0.9794356524944305, "reward_std": 0.009035879076691344, "rewards/single_object_detection_bbox_reward": 0.9794357717037201, "step": 456, "temperature": 0.9 }, { "advantages": -6.847350391581131e-06, "completion_length": 31.84375, "delta_ref_entropy_loss": 0.2529296875, "delta_ref_ppl": -0.224609375, "entropy_loss": -0.5078125, "epoch": 0.6396081175647306, "grad_norm": 3.5909127416003415, "k1_kl": 0.224609375, "k3_kl": 0.10498046875, "kimi_kl": 0.19189453125, "learning_rate": 3.599439775910364e-07, "loss": 0.0042, "ppl": 0.4462890625, "reward": 0.9747715294361115, "reward_std": 0.015090668573975563, "rewards/single_object_detection_bbox_reward": 0.9747715592384338, "step": 457, "temperature": 0.9 }, { "advantages": -3.329611274693889e-06, "completion_length": 56.25, "delta_ref_entropy_loss": 0.23486328125, "delta_ref_ppl": -0.20947265625, "entropy_loss": -0.505859375, "epoch": 0.6410076976906928, "grad_norm": 3.8975487179559605, "k1_kl": 0.2099609375, "k3_kl": 0.10498046875, "kimi_kl": 0.185546875, "learning_rate": 3.585434173669468e-07, "loss": 0.0042, "ppl": 0.4443359375, "reward": 0.9335418939590454, "reward_std": 0.02424075547605753, "rewards/single_object_detection_bbox_reward": 0.9335419237613678, "step": 458, "temperature": 0.9 }, { "advantages": -6.686896085739136e-07, "completion_length": 58.8125, "delta_ref_entropy_loss": 0.2216796875, "delta_ref_ppl": -0.185546875, "entropy_loss": -0.55078125, "epoch": 0.642407277816655, "grad_norm": 2.345042031547222, "k1_kl": 0.185546875, "k3_kl": 0.089599609375, "kimi_kl": 0.151123046875, "learning_rate": 3.5714285714285716e-07, "loss": 0.0036, "ppl": 0.48828125, "reward": 0.9331086575984955, "reward_std": 0.0429189233109355, "rewards/single_object_detection_bbox_reward": 0.9331087172031403, "step": 459, "temperature": 0.9 }, { "advantages": 1.642733377593686e-05, "completion_length": 119.375, "delta_ref_entropy_loss": 0.24951171875, "delta_ref_ppl": -0.1904296875, "entropy_loss": -0.486328125, "epoch": 0.6438068579426172, "grad_norm": 2.1225799239200964, "k1_kl": 0.1904296875, "k3_kl": 0.076416015625, "kimi_kl": 0.11669921875, "learning_rate": 3.557422969187675e-07, "loss": 0.003, "ppl": 0.4296875, "reward": 0.9866345226764679, "reward_std": 0.01074646320194006, "rewards/single_object_detection_bbox_reward": 0.9866345822811127, "step": 460, "temperature": 0.9 }, { "advantages": 1.0663876764738234e-05, "completion_length": 108.5, "delta_ref_entropy_loss": 0.22412109375, "delta_ref_ppl": -0.16357421875, "entropy_loss": -0.498046875, "epoch": 0.6452064380685795, "grad_norm": 1.848384127194242, "k1_kl": 0.16357421875, "k3_kl": 0.06982421875, "kimi_kl": 0.11376953125, "learning_rate": 3.5434173669467785e-07, "loss": 0.0028, "ppl": 0.4462890625, "reward": 0.9329527318477631, "reward_std": 0.013764975650701672, "rewards/single_object_detection_bbox_reward": 0.9329527914524078, "step": 461, "temperature": 0.9 }, { "advantages": -5.046904334449209e-06, "completion_length": 119.21875, "delta_ref_entropy_loss": 0.24853515625, "delta_ref_ppl": -0.23046875, "entropy_loss": -0.521484375, "epoch": 0.6466060181945417, "grad_norm": 2.6830668282133456, "k1_kl": 0.2314453125, "k3_kl": 0.105224609375, "kimi_kl": 0.193359375, "learning_rate": 3.529411764705882e-07, "loss": 0.0042, "ppl": 0.4619140625, "reward": 0.9895251095294952, "reward_std": 0.011169482109835371, "rewards/single_object_detection_bbox_reward": 0.9895251393318176, "step": 462, "temperature": 0.9 }, { "advantages": 1.2027362572553102e-07, "completion_length": 59.0, "delta_ref_entropy_loss": 0.25244140625, "delta_ref_ppl": -0.2236328125, "entropy_loss": -0.548828125, "epoch": 0.6480055983205039, "grad_norm": 4.195508716417835, "k1_kl": 0.22412109375, "k3_kl": 0.11962890625, "kimi_kl": 0.20751953125, "learning_rate": 3.5154061624649855e-07, "loss": 0.0048, "ppl": 0.4814453125, "reward": 0.8801693022251129, "reward_std": 0.021025836933404207, "rewards/single_object_detection_bbox_reward": 0.8801693916320801, "step": 463, "temperature": 0.9 }, { "advantages": -8.9863316645733e-06, "completion_length": 106.5, "delta_ref_entropy_loss": 0.248046875, "delta_ref_ppl": -0.20751953125, "entropy_loss": -0.5341796875, "epoch": 0.6494051784464661, "grad_norm": 3.2235722459394913, "k1_kl": 0.2080078125, "k3_kl": 0.09423828125, "kimi_kl": 0.19921875, "learning_rate": 3.501400560224089e-07, "loss": 0.0038, "ppl": 0.466796875, "reward": 0.9507101476192474, "reward_std": 0.028966802172362804, "rewards/single_object_detection_bbox_reward": 0.9507101774215698, "step": 464, "temperature": 0.9 }, { "advantages": 1.8778125195240136e-05, "completion_length": 50.40625, "delta_ref_entropy_loss": 0.2646484375, "delta_ref_ppl": -0.23974609375, "entropy_loss": -0.55078125, "epoch": 0.6508047585724283, "grad_norm": 5.962076850999804, "k1_kl": 0.23974609375, "k3_kl": 0.106201171875, "kimi_kl": 0.17626953125, "learning_rate": 3.487394957983193e-07, "loss": 0.0042, "ppl": 0.4873046875, "reward": 0.9820957779884338, "reward_std": 0.02125309221446514, "rewards/single_object_detection_bbox_reward": 0.9820958971977234, "step": 465, "temperature": 0.9 }, { "advantages": -4.799238354280533e-06, "completion_length": 117.28125, "delta_ref_entropy_loss": 0.24462890625, "delta_ref_ppl": -0.21533203125, "entropy_loss": -0.517578125, "epoch": 0.6522043386983905, "grad_norm": 2.531207152529061, "k1_kl": 0.21533203125, "k3_kl": 0.100341796875, "kimi_kl": 0.19970703125, "learning_rate": 3.4733893557422967e-07, "loss": 0.004, "ppl": 0.451171875, "reward": 0.9673513770103455, "reward_std": 0.030848581343889236, "rewards/single_object_detection_bbox_reward": 0.9673514366149902, "step": 466, "temperature": 0.9 }, { "advantages": -1.6810640090625384e-05, "completion_length": 40.5, "delta_ref_entropy_loss": 0.2373046875, "delta_ref_ppl": -0.19482421875, "entropy_loss": -0.4912109375, "epoch": 0.6536039188243526, "grad_norm": 2.249568754845452, "k1_kl": 0.19482421875, "k3_kl": 0.082275390625, "kimi_kl": 0.152587890625, "learning_rate": 3.4593837535014004e-07, "loss": 0.0033, "ppl": 0.4345703125, "reward": 0.9624067842960358, "reward_std": 0.01862771948799491, "rewards/single_object_detection_bbox_reward": 0.9624069035053253, "step": 467, "temperature": 0.9 }, { "advantages": -7.569790113848285e-06, "completion_length": 79.5, "delta_ref_entropy_loss": 0.2294921875, "delta_ref_ppl": -0.212890625, "entropy_loss": -0.494140625, "epoch": 0.655003498950315, "grad_norm": 7.8168038826779345, "k1_kl": 0.212890625, "k3_kl": 0.107177734375, "kimi_kl": 0.224609375, "learning_rate": 3.445378151260504e-07, "loss": 0.0043, "ppl": 0.431640625, "reward": 0.9463571310043335, "reward_std": 0.03169342828914523, "rewards/single_object_detection_bbox_reward": 0.946357250213623, "step": 468, "temperature": 0.9 }, { "advantages": -9.510400559520349e-06, "completion_length": 168.84375, "delta_ref_entropy_loss": 0.224609375, "delta_ref_ppl": -0.2041015625, "entropy_loss": -0.525390625, "epoch": 0.6564030790762772, "grad_norm": 2.7559041444802572, "k1_kl": 0.20458984375, "k3_kl": 0.0908203125, "kimi_kl": 0.17431640625, "learning_rate": 3.431372549019608e-07, "loss": 0.0036, "ppl": 0.458984375, "reward": 0.9808916449546814, "reward_std": 0.024939969647675753, "rewards/single_object_detection_bbox_reward": 0.9808917045593262, "step": 469, "temperature": 0.9 }, { "advantages": 1.7556362763571087e-05, "completion_length": 69.5, "delta_ref_entropy_loss": 0.228515625, "delta_ref_ppl": -0.17578125, "entropy_loss": -0.52734375, "epoch": 0.6578026592022393, "grad_norm": 2.947768545134393, "k1_kl": 0.17578125, "k3_kl": 0.068359375, "kimi_kl": 0.102783203125, "learning_rate": 3.4173669467787116e-07, "loss": 0.0027, "ppl": 0.46875, "reward": 0.991355836391449, "reward_std": 0.007342934608459473, "rewards/single_object_detection_bbox_reward": 0.9913558661937714, "step": 470, "temperature": 0.9 }, { "advantages": -8.798870112514123e-06, "completion_length": 104.0, "delta_ref_entropy_loss": 0.21484375, "delta_ref_ppl": -0.18603515625, "entropy_loss": -0.544921875, "epoch": 0.6592022393282015, "grad_norm": 2.7450617929240355, "k1_kl": 0.18603515625, "k3_kl": 0.082275390625, "kimi_kl": 0.1416015625, "learning_rate": 3.403361344537815e-07, "loss": 0.0033, "ppl": 0.4755859375, "reward": 0.9368952214717865, "reward_std": 0.04111677315086126, "rewards/single_object_detection_bbox_reward": 0.9368953108787537, "step": 471, "temperature": 0.9 }, { "advantages": 9.368307019030908e-06, "completion_length": 119.1875, "delta_ref_entropy_loss": 0.23486328125, "delta_ref_ppl": -0.22900390625, "entropy_loss": -0.4853515625, "epoch": 0.6606018194541637, "grad_norm": 4.138365395455088, "k1_kl": 0.22802734375, "k3_kl": 0.107666015625, "kimi_kl": 0.2138671875, "learning_rate": 3.3893557422969186e-07, "loss": 0.0043, "ppl": 0.423828125, "reward": 0.9823789596557617, "reward_std": 0.011574403935810551, "rewards/single_object_detection_bbox_reward": 0.9823790490627289, "step": 472, "temperature": 0.9 }, { "advantages": 9.402101113664685e-06, "completion_length": 103.75, "delta_ref_entropy_loss": 0.21533203125, "delta_ref_ppl": -0.17626953125, "entropy_loss": -0.529296875, "epoch": 0.6620013995801259, "grad_norm": 2.3457911606739263, "k1_kl": 0.17529296875, "k3_kl": 0.071044921875, "kimi_kl": 0.106201171875, "learning_rate": 3.3753501400560223e-07, "loss": 0.0028, "ppl": 0.4638671875, "reward": 0.9664918184280396, "reward_std": 0.02527605462819338, "rewards/single_object_detection_bbox_reward": 0.9664919078350067, "step": 473, "temperature": 0.9 }, { "advantages": -9.856586700607295e-06, "completion_length": 49.0, "delta_ref_entropy_loss": 0.24951171875, "delta_ref_ppl": -0.216796875, "entropy_loss": -0.5234375, "epoch": 0.6634009797060881, "grad_norm": 2.7275826342145324, "k1_kl": 0.21630859375, "k3_kl": 0.098388671875, "kimi_kl": 0.18359375, "learning_rate": 3.361344537815126e-07, "loss": 0.0039, "ppl": 0.4560546875, "reward": 0.9824515283107758, "reward_std": 0.011108050122857094, "rewards/single_object_detection_bbox_reward": 0.9824515283107758, "step": 474, "temperature": 0.9 }, { "advantages": -4.96661039051105e-06, "completion_length": 69.15625, "delta_ref_entropy_loss": 0.2744140625, "delta_ref_ppl": -0.24169921875, "entropy_loss": -0.4677734375, "epoch": 0.6648005598320503, "grad_norm": 5.619229866058614, "k1_kl": 0.24267578125, "k3_kl": 0.116943359375, "kimi_kl": 0.25390625, "learning_rate": 3.3473389355742293e-07, "loss": 0.0047, "ppl": 0.41015625, "reward": 0.9847809374332428, "reward_std": 0.0172685282304883, "rewards/single_object_detection_bbox_reward": 0.9847810566425323, "step": 475, "temperature": 0.9 }, { "advantages": -2.191455405409215e-05, "completion_length": 114.125, "delta_ref_entropy_loss": 0.287109375, "delta_ref_ppl": -0.2568359375, "entropy_loss": -0.4853515625, "epoch": 0.6662001399580126, "grad_norm": 2.358371206511877, "k1_kl": 0.2568359375, "k3_kl": 0.12255859375, "kimi_kl": 0.25537109375, "learning_rate": 3.333333333333333e-07, "loss": 0.0049, "ppl": 0.4306640625, "reward": 0.9800456762313843, "reward_std": 0.013826898764818907, "rewards/single_object_detection_bbox_reward": 0.9800457060337067, "step": 476, "temperature": 0.9 }, { "advantages": -9.501087106400519e-06, "completion_length": 49.5, "delta_ref_entropy_loss": 0.232421875, "delta_ref_ppl": -0.24609375, "entropy_loss": -0.5087890625, "epoch": 0.6675997200839748, "grad_norm": 2.926073690391723, "k1_kl": 0.24755859375, "k3_kl": 0.130126953125, "kimi_kl": 0.28271484375, "learning_rate": 3.319327731092437e-07, "loss": 0.0052, "ppl": 0.4443359375, "reward": 0.9236551225185394, "reward_std": 0.033331695944070816, "rewards/single_object_detection_bbox_reward": 0.9236551821231842, "step": 477, "temperature": 0.9 }, { "advantages": 3.0826779493509093e-06, "completion_length": 51.5, "delta_ref_entropy_loss": 0.248046875, "delta_ref_ppl": -0.24072265625, "entropy_loss": -0.51171875, "epoch": 0.668999300209937, "grad_norm": 3.274591863161451, "k1_kl": 0.2412109375, "k3_kl": 0.125732421875, "kimi_kl": 0.2978515625, "learning_rate": 3.3053221288515405e-07, "loss": 0.005, "ppl": 0.451171875, "reward": 0.8938290476799011, "reward_std": 0.028214437421411276, "rewards/single_object_detection_bbox_reward": 0.8938290774822235, "step": 478, "temperature": 0.9 }, { "advantages": -1.1865049600601196e-06, "completion_length": 104.34375, "delta_ref_entropy_loss": 0.23486328125, "delta_ref_ppl": -0.22216796875, "entropy_loss": -0.509765625, "epoch": 0.6703988803358992, "grad_norm": 3.183708092747103, "k1_kl": 0.2236328125, "k3_kl": 0.111572265625, "kimi_kl": 0.2255859375, "learning_rate": 3.291316526610644e-07, "loss": 0.0045, "ppl": 0.4453125, "reward": 0.9311900734901428, "reward_std": 0.0651018749922514, "rewards/single_object_detection_bbox_reward": 0.9311901926994324, "step": 479, "temperature": 0.9 }, { "advantages": 1.530482404632494e-05, "completion_length": 182.96875, "delta_ref_entropy_loss": 0.21923828125, "delta_ref_ppl": -0.18017578125, "entropy_loss": -0.5234375, "epoch": 0.6717984604618614, "grad_norm": 2.7477239229146244, "k1_kl": 0.1806640625, "k3_kl": 0.081787109375, "kimi_kl": 0.14697265625, "learning_rate": 3.277310924369748e-07, "loss": 0.0033, "ppl": 0.4638671875, "reward": 0.9149506092071533, "reward_std": 0.03926351945847273, "rewards/single_object_detection_bbox_reward": 0.9149506986141205, "step": 480, "temperature": 0.9 }, { "advantages": -6.822869409006671e-06, "completion_length": 79.8125, "delta_ref_entropy_loss": 0.24169921875, "delta_ref_ppl": -0.2177734375, "entropy_loss": -0.509765625, "epoch": 0.6731980405878236, "grad_norm": 2.5260449729764636, "k1_kl": 0.21826171875, "k3_kl": 0.106201171875, "kimi_kl": 0.20556640625, "learning_rate": 3.2633053221288517e-07, "loss": 0.0043, "ppl": 0.4560546875, "reward": 0.9288871884346008, "reward_std": 0.009775050915777683, "rewards/single_object_detection_bbox_reward": 0.9288873374462128, "step": 481, "temperature": 0.9 }, { "advantages": -6.171475888550049e-06, "completion_length": 39.1875, "delta_ref_entropy_loss": 0.25439453125, "delta_ref_ppl": -0.248046875, "entropy_loss": -0.4970703125, "epoch": 0.6745976207137858, "grad_norm": 4.290182532559296, "k1_kl": 0.248046875, "k3_kl": 0.126220703125, "kimi_kl": 0.294921875, "learning_rate": 3.249299719887955e-07, "loss": 0.0051, "ppl": 0.435546875, "reward": 0.9414491057395935, "reward_std": 0.0481327585875988, "rewards/single_object_detection_bbox_reward": 0.9414491653442383, "step": 482, "temperature": 0.9 }, { "advantages": -1.72739050015025e-05, "completion_length": 178.5625, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.1767578125, "entropy_loss": -0.498046875, "epoch": 0.6759972008397481, "grad_norm": 2.140456597661662, "k1_kl": 0.17724609375, "k3_kl": 0.069580078125, "kimi_kl": 0.107421875, "learning_rate": 3.2352941176470586e-07, "loss": 0.0028, "ppl": 0.4375, "reward": 0.973800778388977, "reward_std": 0.009186149196466431, "rewards/single_object_detection_bbox_reward": 0.9738008677959442, "step": 483, "temperature": 0.9 }, { "advantages": 3.4091464385710424e-05, "completion_length": 79.34375, "delta_ref_entropy_loss": 0.26123046875, "delta_ref_ppl": -0.19482421875, "entropy_loss": -0.470703125, "epoch": 0.6773967809657103, "grad_norm": 2.0516570056068204, "k1_kl": 0.1943359375, "k3_kl": 0.080810546875, "kimi_kl": 0.125, "learning_rate": 3.2212885154061624e-07, "loss": 0.0032, "ppl": 0.416015625, "reward": 0.9785560965538025, "reward_std": 0.007851515896618366, "rewards/single_object_detection_bbox_reward": 0.978556215763092, "step": 484, "temperature": 0.9 }, { "advantages": 7.729711796855554e-06, "completion_length": 98.15625, "delta_ref_entropy_loss": 0.2265625, "delta_ref_ppl": -0.1787109375, "entropy_loss": -0.5078125, "epoch": 0.6787963610916725, "grad_norm": 2.5138257330761022, "k1_kl": 0.17822265625, "k3_kl": 0.076904296875, "kimi_kl": 0.135009765625, "learning_rate": 3.207282913165266e-07, "loss": 0.0031, "ppl": 0.44921875, "reward": 0.9743340909481049, "reward_std": 0.017951673828065395, "rewards/single_object_detection_bbox_reward": 0.974334180355072, "step": 485, "temperature": 0.9 }, { "advantages": 6.665608850653371e-06, "completion_length": 101.125, "delta_ref_entropy_loss": 0.21875, "delta_ref_ppl": -0.18310546875, "entropy_loss": -0.515625, "epoch": 0.6801959412176347, "grad_norm": 2.293517901015557, "k1_kl": 0.18212890625, "k3_kl": 0.0772705078125, "kimi_kl": 0.132080078125, "learning_rate": 3.1932773109243693e-07, "loss": 0.0031, "ppl": 0.4541015625, "reward": 0.9765308499336243, "reward_std": 0.02433514967560768, "rewards/single_object_detection_bbox_reward": 0.9765308797359467, "step": 486, "temperature": 0.9 }, { "advantages": -3.901177365150943e-06, "completion_length": 147.1875, "delta_ref_entropy_loss": 0.2470703125, "delta_ref_ppl": -0.20849609375, "entropy_loss": -0.4775390625, "epoch": 0.6815955213435969, "grad_norm": 3.2210919161207756, "k1_kl": 0.2080078125, "k3_kl": 0.09228515625, "kimi_kl": 0.1611328125, "learning_rate": 3.179271708683473e-07, "loss": 0.0037, "ppl": 0.421875, "reward": 0.9742518961429596, "reward_std": 0.015110593347344548, "rewards/single_object_detection_bbox_reward": 0.9742520153522491, "step": 487, "temperature": 0.9 }, { "advantages": -1.4374565125763183e-05, "completion_length": 58.5, "delta_ref_entropy_loss": 0.255859375, "delta_ref_ppl": -0.22802734375, "entropy_loss": -0.5126953125, "epoch": 0.6829951014695591, "grad_norm": 3.341950405457795, "k1_kl": 0.2275390625, "k3_kl": 0.103759765625, "kimi_kl": 0.17578125, "learning_rate": 3.165266106442577e-07, "loss": 0.0042, "ppl": 0.455078125, "reward": 0.9207509160041809, "reward_std": 0.026488052681088448, "rewards/single_object_detection_bbox_reward": 0.9207509160041809, "step": 488, "temperature": 0.9 }, { "advantages": -3.968764758610632e-06, "completion_length": 140.9375, "delta_ref_entropy_loss": 0.27685546875, "delta_ref_ppl": -0.25634765625, "entropy_loss": -0.5087890625, "epoch": 0.6843946815955213, "grad_norm": 3.0346634618199397, "k1_kl": 0.25634765625, "k3_kl": 0.136474609375, "kimi_kl": 0.3984375, "learning_rate": 3.1512605042016805e-07, "loss": 0.0055, "ppl": 0.44921875, "reward": 0.9354612827301025, "reward_std": 0.02443452924489975, "rewards/single_object_detection_bbox_reward": 0.9354613721370697, "step": 489, "temperature": 0.9 }, { "advantages": 2.702165829759906e-06, "completion_length": 39.0625, "delta_ref_entropy_loss": 0.21728515625, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.537109375, "epoch": 0.6857942617214835, "grad_norm": 3.1338940228356007, "k1_kl": 0.2138671875, "k3_kl": 0.101318359375, "kimi_kl": 0.19970703125, "learning_rate": 3.1372549019607843e-07, "loss": 0.0041, "ppl": 0.4716796875, "reward": 0.9599028527736664, "reward_std": 0.02639697026461363, "rewards/single_object_detection_bbox_reward": 0.9599029421806335, "step": 490, "temperature": 0.9 }, { "advantages": -7.58322789806698e-06, "completion_length": 104.84375, "delta_ref_entropy_loss": 0.2177734375, "delta_ref_ppl": -0.181640625, "entropy_loss": -0.517578125, "epoch": 0.6871938418474458, "grad_norm": 3.258859148987948, "k1_kl": 0.18212890625, "k3_kl": 0.083984375, "kimi_kl": 0.15087890625, "learning_rate": 3.123249299719888e-07, "loss": 0.0034, "ppl": 0.462890625, "reward": 0.9124480485916138, "reward_std": 0.03715316765010357, "rewards/single_object_detection_bbox_reward": 0.9124481081962585, "step": 491, "temperature": 0.9 }, { "advantages": 9.87774069471925e-06, "completion_length": 49.96875, "delta_ref_entropy_loss": 0.265625, "delta_ref_ppl": -0.2529296875, "entropy_loss": -0.4921875, "epoch": 0.688593421973408, "grad_norm": 6.79549021392104, "k1_kl": 0.25146484375, "k3_kl": 0.130615234375, "kimi_kl": 0.31103515625, "learning_rate": 3.109243697478992e-07, "loss": 0.0052, "ppl": 0.43359375, "reward": 0.9328238666057587, "reward_std": 0.05156426504254341, "rewards/single_object_detection_bbox_reward": 0.932823896408081, "step": 492, "temperature": 0.9 }, { "advantages": -3.6914966585754883e-06, "completion_length": 86.375, "delta_ref_entropy_loss": 0.23779296875, "delta_ref_ppl": -0.19970703125, "entropy_loss": -0.51171875, "epoch": 0.6899930020993702, "grad_norm": 3.9499523574886055, "k1_kl": 0.19970703125, "k3_kl": 0.08642578125, "kimi_kl": 0.141845703125, "learning_rate": 3.095238095238095e-07, "loss": 0.0035, "ppl": 0.451171875, "reward": 0.9578167498111725, "reward_std": 0.031024867668747902, "rewards/single_object_detection_bbox_reward": 0.9578168392181396, "step": 493, "temperature": 0.9 }, { "advantages": -6.6868961994259735e-06, "completion_length": 97.09375, "delta_ref_entropy_loss": 0.23828125, "delta_ref_ppl": -0.20361328125, "entropy_loss": -0.5390625, "epoch": 0.6913925822253324, "grad_norm": 4.672648153784681, "k1_kl": 0.205078125, "k3_kl": 0.097412109375, "kimi_kl": 0.1796875, "learning_rate": 3.0812324929971987e-07, "loss": 0.0039, "ppl": 0.4833984375, "reward": 0.9447441697120667, "reward_std": 0.04893511347472668, "rewards/single_object_detection_bbox_reward": 0.9447442591190338, "step": 494, "temperature": 0.9 }, { "advantages": -1.648440957069397e-06, "completion_length": 117.5, "delta_ref_entropy_loss": 0.236328125, "delta_ref_ppl": -0.21484375, "entropy_loss": -0.513671875, "epoch": 0.6927921623512946, "grad_norm": 3.229611917440449, "k1_kl": 0.2138671875, "k3_kl": 0.10302734375, "kimi_kl": 0.199462890625, "learning_rate": 3.0672268907563024e-07, "loss": 0.0041, "ppl": 0.451171875, "reward": 0.9430533647537231, "reward_std": 0.014978291583247483, "rewards/single_object_detection_bbox_reward": 0.9430534243583679, "step": 495, "temperature": 0.9 }, { "advantages": -2.178762699145409e-06, "completion_length": 141.8125, "delta_ref_entropy_loss": 0.2431640625, "delta_ref_ppl": -0.205078125, "entropy_loss": -0.55859375, "epoch": 0.6941917424772568, "grad_norm": 2.8233415841542158, "k1_kl": 0.20556640625, "k3_kl": 0.095947265625, "kimi_kl": 0.1884765625, "learning_rate": 3.053221288515406e-07, "loss": 0.0038, "ppl": 0.490234375, "reward": 0.8707749843597412, "reward_std": 0.04641125397756696, "rewards/single_object_detection_bbox_reward": 0.8707750141620636, "step": 496, "temperature": 0.9 }, { "advantages": 1.8061673472402617e-06, "completion_length": 67.3125, "delta_ref_entropy_loss": 0.23681640625, "delta_ref_ppl": -0.2099609375, "entropy_loss": -0.546875, "epoch": 0.695591322603219, "grad_norm": 4.528511201927004, "k1_kl": 0.21044921875, "k3_kl": 0.098876953125, "kimi_kl": 0.1767578125, "learning_rate": 3.0392156862745094e-07, "loss": 0.0039, "ppl": 0.4853515625, "reward": 0.9683829247951508, "reward_std": 0.04159888997673988, "rewards/single_object_detection_bbox_reward": 0.9683830142021179, "step": 497, "temperature": 0.9 }, { "advantages": -1.844377993620583e-05, "completion_length": 38.53125, "delta_ref_entropy_loss": 0.24365234375, "delta_ref_ppl": -0.22265625, "entropy_loss": -0.5234375, "epoch": 0.6969909027291813, "grad_norm": 2.4884993274880896, "k1_kl": 0.22314453125, "k3_kl": 0.10107421875, "kimi_kl": 0.196533203125, "learning_rate": 3.025210084033613e-07, "loss": 0.0041, "ppl": 0.4609375, "reward": 0.9369125366210938, "reward_std": 0.023292699828743935, "rewards/single_object_detection_bbox_reward": 0.9369126558303833, "step": 498, "temperature": 0.9 }, { "advantages": -1.8080963855027221e-06, "completion_length": 80.5, "delta_ref_entropy_loss": 0.2490234375, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.4921875, "epoch": 0.6983904828551435, "grad_norm": 2.801997789788218, "k1_kl": 0.21484375, "k3_kl": 0.09814453125, "kimi_kl": 0.1806640625, "learning_rate": 3.011204481792717e-07, "loss": 0.0039, "ppl": 0.4365234375, "reward": 0.9786340892314911, "reward_std": 0.008986112661659718, "rewards/single_object_detection_bbox_reward": 0.9786341190338135, "step": 499, "temperature": 0.9 }, { "advantages": -1.5883042351561016e-05, "completion_length": 20.5, "delta_ref_entropy_loss": 0.2578125, "delta_ref_ppl": -0.205078125, "entropy_loss": -0.5322265625, "epoch": 0.6997900629811057, "grad_norm": 4.414271899000515, "k1_kl": 0.205078125, "k3_kl": 0.093994140625, "kimi_kl": 0.1572265625, "learning_rate": 2.9971988795518206e-07, "loss": 0.0038, "ppl": 0.4716796875, "reward": 0.9094416201114655, "reward_std": 0.027493876172229648, "rewards/single_object_detection_bbox_reward": 0.9094416499137878, "step": 500, "temperature": 0.9 }, { "advantages": 1.2748478184221312e-06, "completion_length": 51.09375, "delta_ref_entropy_loss": 0.26025390625, "delta_ref_ppl": -0.232421875, "entropy_loss": -0.580078125, "epoch": 0.7011896431070679, "grad_norm": 2.3503824180462147, "k1_kl": 0.23193359375, "k3_kl": 0.109619140625, "kimi_kl": 0.220703125, "learning_rate": 2.9831932773109244e-07, "loss": 0.0044, "ppl": 0.513671875, "reward": 0.9015368819236755, "reward_std": 0.03137982916086912, "rewards/single_object_detection_bbox_reward": 0.9015369415283203, "step": 501, "temperature": 0.9 }, { "advantages": -1.25310791645461e-05, "completion_length": 86.65625, "delta_ref_entropy_loss": 0.2822265625, "delta_ref_ppl": -0.23291015625, "entropy_loss": -0.4609375, "epoch": 0.7025892232330301, "grad_norm": 2.999865332345087, "k1_kl": 0.23291015625, "k3_kl": 0.10107421875, "kimi_kl": 0.185546875, "learning_rate": 2.969187675070028e-07, "loss": 0.0041, "ppl": 0.4013671875, "reward": 0.9703725874423981, "reward_std": 0.005859955097548664, "rewards/single_object_detection_bbox_reward": 0.9703726470470428, "step": 502, "temperature": 0.9 }, { "advantages": -7.820582140993793e-06, "completion_length": 94.3125, "delta_ref_entropy_loss": 0.23486328125, "delta_ref_ppl": -0.20703125, "entropy_loss": -0.49609375, "epoch": 0.7039888033589923, "grad_norm": 4.991449155063765, "k1_kl": 0.2080078125, "k3_kl": 0.110107421875, "kimi_kl": 0.17822265625, "learning_rate": 2.955182072829132e-07, "loss": 0.0044, "ppl": 0.443359375, "reward": 0.9628324508666992, "reward_std": 0.028652008302742615, "rewards/single_object_detection_bbox_reward": 0.9628325402736664, "step": 503, "temperature": 0.9 }, { "advantages": 5.667364007422293e-06, "completion_length": 51.5, "delta_ref_entropy_loss": 0.2236328125, "delta_ref_ppl": -0.1787109375, "entropy_loss": -0.556640625, "epoch": 0.7053883834849545, "grad_norm": 20.23404993308242, "k1_kl": 0.17919921875, "k3_kl": 0.07666015625, "kimi_kl": 0.136474609375, "learning_rate": 2.941176470588235e-07, "loss": 0.0031, "ppl": 0.4990234375, "reward": 0.9552784860134125, "reward_std": 0.025514032458886504, "rewards/single_object_detection_bbox_reward": 0.9552785754203796, "step": 504, "temperature": 0.9 }, { "advantages": 5.9433018577692565e-06, "completion_length": 42.15625, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.18896484375, "entropy_loss": -0.517578125, "epoch": 0.7067879636109167, "grad_norm": 2.715757094391061, "k1_kl": 0.1884765625, "k3_kl": 0.08056640625, "kimi_kl": 0.15478515625, "learning_rate": 2.927170868347339e-07, "loss": 0.0032, "ppl": 0.447265625, "reward": 0.9357222318649292, "reward_std": 0.014273387845605612, "rewards/single_object_detection_bbox_reward": 0.935722291469574, "step": 505, "temperature": 0.9 }, { "advantages": 9.577721812092932e-06, "completion_length": 107.5, "delta_ref_entropy_loss": 0.2431640625, "delta_ref_ppl": -0.20654296875, "entropy_loss": -0.501953125, "epoch": 0.708187543736879, "grad_norm": 3.372006518168941, "k1_kl": 0.20654296875, "k3_kl": 0.0869140625, "kimi_kl": 0.14111328125, "learning_rate": 2.9131652661064425e-07, "loss": 0.0035, "ppl": 0.4462890625, "reward": 0.9257471263408661, "reward_std": 0.0278551890514791, "rewards/single_object_detection_bbox_reward": 0.9257472455501556, "step": 506, "temperature": 0.9 }, { "advantages": -4.894100356978015e-06, "completion_length": 42.9375, "delta_ref_entropy_loss": 0.29296875, "delta_ref_ppl": -0.2578125, "entropy_loss": -0.4619140625, "epoch": 0.7095871238628412, "grad_norm": 5.493602815410964, "k1_kl": 0.25732421875, "k3_kl": 0.1220703125, "kimi_kl": 0.23828125, "learning_rate": 2.899159663865546e-07, "loss": 0.0049, "ppl": 0.4072265625, "reward": 0.9890325963497162, "reward_std": 0.011267795169260353, "rewards/single_object_detection_bbox_reward": 0.989032655954361, "step": 507, "temperature": 0.9 }, { "advantages": -9.601669262337964e-06, "completion_length": 67.78125, "delta_ref_entropy_loss": 0.24365234375, "delta_ref_ppl": -0.1787109375, "entropy_loss": -0.486328125, "epoch": 0.7109867039888034, "grad_norm": 2.632295191689284, "k1_kl": 0.17919921875, "k3_kl": 0.080322265625, "kimi_kl": 0.135498046875, "learning_rate": 2.8851540616246495e-07, "loss": 0.0032, "ppl": 0.4296875, "reward": 0.9517145752906799, "reward_std": 0.009870658745057881, "rewards/single_object_detection_bbox_reward": 0.9517146944999695, "step": 508, "temperature": 0.9 }, { "advantages": -7.504597760998877e-06, "completion_length": 68.0, "delta_ref_entropy_loss": 0.25390625, "delta_ref_ppl": -0.2294921875, "entropy_loss": -0.568359375, "epoch": 0.7123862841147656, "grad_norm": 3.026289210857421, "k1_kl": 0.23046875, "k3_kl": 0.114990234375, "kimi_kl": 0.2158203125, "learning_rate": 2.871148459383753e-07, "loss": 0.0046, "ppl": 0.51171875, "reward": 0.9303008019924164, "reward_std": 0.023328804410994053, "rewards/single_object_detection_bbox_reward": 0.9303008615970612, "step": 509, "temperature": 0.9 }, { "advantages": -8.246196898653579e-07, "completion_length": 96.53125, "delta_ref_entropy_loss": 0.255859375, "delta_ref_ppl": -0.1865234375, "entropy_loss": -0.4853515625, "epoch": 0.7137858642407278, "grad_norm": 2.665540418599622, "k1_kl": 0.18701171875, "k3_kl": 0.0804443359375, "kimi_kl": 0.116455078125, "learning_rate": 2.857142857142857e-07, "loss": 0.0032, "ppl": 0.4306640625, "reward": 0.940784215927124, "reward_std": 0.01171724230516702, "rewards/single_object_detection_bbox_reward": 0.9407842755317688, "step": 510, "temperature": 0.9 }, { "advantages": 5.347654507659172e-06, "completion_length": 79.5, "delta_ref_entropy_loss": 0.25244140625, "delta_ref_ppl": -0.1826171875, "entropy_loss": -0.482421875, "epoch": 0.71518544436669, "grad_norm": 3.197509413428893, "k1_kl": 0.18310546875, "k3_kl": 0.076416015625, "kimi_kl": 0.12158203125, "learning_rate": 2.8431372549019607e-07, "loss": 0.003, "ppl": 0.423828125, "reward": 0.9646530747413635, "reward_std": 0.00417703902348876, "rewards/single_object_detection_bbox_reward": 0.9646531939506531, "step": 511, "temperature": 0.9 }, { "advantages": -1.1865848364323028e-05, "completion_length": 259.0625, "delta_ref_entropy_loss": 0.25, "delta_ref_ppl": -0.21826171875, "entropy_loss": -0.53515625, "epoch": 0.7165850244926522, "grad_norm": 4.354042906215967, "k1_kl": 0.2177734375, "k3_kl": 0.10107421875, "kimi_kl": 0.17822265625, "learning_rate": 2.8291316526610644e-07, "loss": 0.0041, "ppl": 0.4765625, "reward": 0.9640821814537048, "reward_std": 0.022318328730762005, "rewards/single_object_detection_bbox_reward": 0.9640822112560272, "step": 512, "temperature": 0.9 }, { "advantages": 8.01469650468789e-06, "completion_length": 77.0625, "delta_ref_entropy_loss": 0.251953125, "delta_ref_ppl": -0.2041015625, "entropy_loss": -0.552734375, "epoch": 0.7179846046186145, "grad_norm": 4.705897937213368, "k1_kl": 0.2041015625, "k3_kl": 0.0869140625, "kimi_kl": 0.15576171875, "learning_rate": 2.815126050420168e-07, "loss": 0.0035, "ppl": 0.4794921875, "reward": 0.9070480465888977, "reward_std": 0.029827729798853397, "rewards/single_object_detection_bbox_reward": 0.9070481061935425, "step": 513, "temperature": 0.9 }, { "advantages": 9.66247216638294e-06, "completion_length": 126.5, "delta_ref_entropy_loss": 0.2548828125, "delta_ref_ppl": -0.220703125, "entropy_loss": -0.533203125, "epoch": 0.7193841847445767, "grad_norm": 3.4988019415248015, "k1_kl": 0.22119140625, "k3_kl": 0.112548828125, "kimi_kl": 0.2763671875, "learning_rate": 2.8011204481792714e-07, "loss": 0.0045, "ppl": 0.4755859375, "reward": 0.9428652226924896, "reward_std": 0.015568329486995935, "rewards/single_object_detection_bbox_reward": 0.9428653120994568, "step": 514, "temperature": 0.9 }, { "advantages": 1.6772055460023694e-05, "completion_length": 77.3125, "delta_ref_entropy_loss": 0.24169921875, "delta_ref_ppl": -0.1826171875, "entropy_loss": -0.50390625, "epoch": 0.7207837648705389, "grad_norm": 2.375738073026284, "k1_kl": 0.181640625, "k3_kl": 0.070556640625, "kimi_kl": 0.099853515625, "learning_rate": 2.787114845938375e-07, "loss": 0.0028, "ppl": 0.4384765625, "reward": 0.9948497116565704, "reward_std": 0.002642260689754039, "rewards/single_object_detection_bbox_reward": 0.9948498010635376, "step": 515, "temperature": 0.9 }, { "advantages": -1.1027259006368695e-05, "completion_length": 98.375, "delta_ref_entropy_loss": 0.2607421875, "delta_ref_ppl": -0.2294921875, "entropy_loss": -0.509765625, "epoch": 0.722183344996501, "grad_norm": 2.7095009198658246, "k1_kl": 0.2294921875, "k3_kl": 0.10498046875, "kimi_kl": 0.212158203125, "learning_rate": 2.773109243697479e-07, "loss": 0.0042, "ppl": 0.451171875, "reward": 0.94158935546875, "reward_std": 0.0216144984588027, "rewards/single_object_detection_bbox_reward": 0.9415894150733948, "step": 516, "temperature": 0.9 }, { "advantages": -7.568725976625501e-06, "completion_length": 66.34375, "delta_ref_entropy_loss": 0.25, "delta_ref_ppl": -0.2275390625, "entropy_loss": -0.501953125, "epoch": 0.7235829251224632, "grad_norm": 4.019308791192452, "k1_kl": 0.22705078125, "k3_kl": 0.111083984375, "kimi_kl": 0.22021484375, "learning_rate": 2.7591036414565826e-07, "loss": 0.0045, "ppl": 0.4384765625, "reward": 0.9397666156291962, "reward_std": 0.04423794709146023, "rewards/single_object_detection_bbox_reward": 0.9397666752338409, "step": 517, "temperature": 0.9 }, { "advantages": 3.563839186426776e-06, "completion_length": 91.375, "delta_ref_entropy_loss": 0.3037109375, "delta_ref_ppl": -0.25830078125, "entropy_loss": -0.4970703125, "epoch": 0.7249825052484254, "grad_norm": 5.3613783171457525, "k1_kl": 0.25830078125, "k3_kl": 0.116455078125, "kimi_kl": 0.19775390625, "learning_rate": 2.7450980392156863e-07, "loss": 0.0047, "ppl": 0.4375, "reward": 0.9436845183372498, "reward_std": 0.012410969939082861, "rewards/single_object_detection_bbox_reward": 0.9436845779418945, "step": 518, "temperature": 0.9 }, { "advantages": 1.9104886519016873e-05, "completion_length": 118.5, "delta_ref_entropy_loss": 0.23193359375, "delta_ref_ppl": -0.17822265625, "entropy_loss": -0.4931640625, "epoch": 0.7263820853743876, "grad_norm": 2.3907765580419915, "k1_kl": 0.1787109375, "k3_kl": 0.076904296875, "kimi_kl": 0.126708984375, "learning_rate": 2.7310924369747895e-07, "loss": 0.003, "ppl": 0.4384765625, "reward": 0.9865018427371979, "reward_std": 0.010554697219049558, "rewards/single_object_detection_bbox_reward": 0.986501932144165, "step": 519, "temperature": 0.9 }, { "advantages": -1.2804355264961487e-06, "completion_length": 51.5, "delta_ref_entropy_loss": 0.263671875, "delta_ref_ppl": -0.2509765625, "entropy_loss": -0.509765625, "epoch": 0.72778166550035, "grad_norm": 3.529475428666957, "k1_kl": 0.2509765625, "k3_kl": 0.11376953125, "kimi_kl": 0.205078125, "learning_rate": 2.717086834733893e-07, "loss": 0.0045, "ppl": 0.4443359375, "reward": 0.9160755574703217, "reward_std": 0.03673379868268967, "rewards/single_object_detection_bbox_reward": 0.9160755574703217, "step": 520, "temperature": 0.9 }, { "advantages": 1.9915669327019714e-06, "completion_length": 32.5625, "delta_ref_entropy_loss": 0.2958984375, "delta_ref_ppl": -0.294921875, "entropy_loss": -0.474609375, "epoch": 0.7291812456263121, "grad_norm": 2.6748463162829648, "k1_kl": 0.294921875, "k3_kl": 0.1455078125, "kimi_kl": 0.3291015625, "learning_rate": 2.703081232492997e-07, "loss": 0.0058, "ppl": 0.4208984375, "reward": 0.94392529129982, "reward_std": 0.005689587153028697, "rewards/single_object_detection_bbox_reward": 0.9439253807067871, "step": 521, "temperature": 0.9 }, { "advantages": 3.130973595943942e-06, "completion_length": 260.75, "delta_ref_entropy_loss": 0.24853515625, "delta_ref_ppl": -0.216796875, "entropy_loss": -0.5166015625, "epoch": 0.7305808257522743, "grad_norm": 3.4458379978382907, "k1_kl": 0.21630859375, "k3_kl": 0.1064453125, "kimi_kl": 0.22216796875, "learning_rate": 2.689075630252101e-07, "loss": 0.0042, "ppl": 0.455078125, "reward": 0.9510157704353333, "reward_std": 0.028237802907824516, "rewards/single_object_detection_bbox_reward": 0.9510157704353333, "step": 522, "temperature": 0.9 }, { "advantages": -6.83138455315202e-06, "completion_length": 60.0, "delta_ref_entropy_loss": 0.2890625, "delta_ref_ppl": -0.25244140625, "entropy_loss": -0.484375, "epoch": 0.7319804058782365, "grad_norm": 2.9637651266634526, "k1_kl": 0.25244140625, "k3_kl": 0.118896484375, "kimi_kl": 0.2158203125, "learning_rate": 2.6750700280112045e-07, "loss": 0.0048, "ppl": 0.4287109375, "reward": 0.9483222961425781, "reward_std": 0.009438143286388367, "rewards/single_object_detection_bbox_reward": 0.9483223557472229, "step": 523, "temperature": 0.9 }, { "advantages": -5.713264727091882e-06, "completion_length": 20.34375, "delta_ref_entropy_loss": 0.2587890625, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.490234375, "epoch": 0.7333799860041987, "grad_norm": 2.825554663548167, "k1_kl": 0.20947265625, "k3_kl": 0.09912109375, "kimi_kl": 0.1640625, "learning_rate": 2.661064425770308e-07, "loss": 0.004, "ppl": 0.4326171875, "reward": 0.914613664150238, "reward_std": 0.015509944409132004, "rewards/single_object_detection_bbox_reward": 0.9146136939525604, "step": 524, "temperature": 0.9 }, { "advantages": -7.64083665671933e-06, "completion_length": 32.0, "delta_ref_entropy_loss": 0.287109375, "delta_ref_ppl": -0.2392578125, "entropy_loss": -0.482421875, "epoch": 0.7347795661301609, "grad_norm": 2.71513434847886, "k1_kl": 0.2392578125, "k3_kl": 0.10595703125, "kimi_kl": 0.17724609375, "learning_rate": 2.6470588235294114e-07, "loss": 0.0042, "ppl": 0.4267578125, "reward": 0.9081939458847046, "reward_std": 0.016164749395102262, "rewards/single_object_detection_bbox_reward": 0.9081940352916718, "step": 525, "temperature": 0.9 }, { "advantages": 3.599561864575662e-06, "completion_length": 96.9375, "delta_ref_entropy_loss": 0.22412109375, "delta_ref_ppl": -0.17724609375, "entropy_loss": -0.474609375, "epoch": 0.7361791462561231, "grad_norm": 3.133910985551437, "k1_kl": 0.17724609375, "k3_kl": 0.07470703125, "kimi_kl": 0.119873046875, "learning_rate": 2.633053221288515e-07, "loss": 0.003, "ppl": 0.416015625, "reward": 0.973834902048111, "reward_std": 0.0040297937812283635, "rewards/single_object_detection_bbox_reward": 0.9738349616527557, "step": 526, "temperature": 0.9 }, { "advantages": 4.403818820719607e-07, "completion_length": 92.0625, "delta_ref_entropy_loss": 0.23779296875, "delta_ref_ppl": -0.181640625, "entropy_loss": -0.4853515625, "epoch": 0.7375787263820853, "grad_norm": 2.33648586270087, "k1_kl": 0.181640625, "k3_kl": 0.08154296875, "kimi_kl": 0.12744140625, "learning_rate": 2.619047619047619e-07, "loss": 0.0033, "ppl": 0.427734375, "reward": 0.9760933816432953, "reward_std": 0.019098144373856485, "rewards/single_object_detection_bbox_reward": 0.9760934114456177, "step": 527, "temperature": 0.9 }, { "advantages": 1.5679880561947357e-05, "completion_length": 79.0, "delta_ref_entropy_loss": 0.23095703125, "delta_ref_ppl": -0.17236328125, "entropy_loss": -0.4931640625, "epoch": 0.7389783065080476, "grad_norm": 4.1853928107071825, "k1_kl": 0.17236328125, "k3_kl": 0.076416015625, "kimi_kl": 0.107421875, "learning_rate": 2.6050420168067226e-07, "loss": 0.003, "ppl": 0.4384765625, "reward": 0.9843293726444244, "reward_std": 0.012592675804626197, "rewards/single_object_detection_bbox_reward": 0.9843294322490692, "step": 528, "temperature": 0.9 }, { "advantages": 5.402735496318201e-06, "completion_length": 59.0625, "delta_ref_entropy_loss": 0.24658203125, "delta_ref_ppl": -0.2431640625, "entropy_loss": -0.513671875, "epoch": 0.7403778866340098, "grad_norm": 3.730073742361059, "k1_kl": 0.24365234375, "k3_kl": 0.126708984375, "kimi_kl": 0.2724609375, "learning_rate": 2.5910364145658264e-07, "loss": 0.0051, "ppl": 0.44921875, "reward": 0.9494643211364746, "reward_std": 0.017102777492254972, "rewards/single_object_detection_bbox_reward": 0.9494643807411194, "step": 529, "temperature": 0.9 }, { "advantages": -3.562975052773254e-07, "completion_length": 85.1875, "delta_ref_entropy_loss": 0.23583984375, "delta_ref_ppl": -0.201171875, "entropy_loss": -0.4912109375, "epoch": 0.741777466759972, "grad_norm": 3.117527896814717, "k1_kl": 0.20166015625, "k3_kl": 0.092529296875, "kimi_kl": 0.1689453125, "learning_rate": 2.5770308123249296e-07, "loss": 0.0037, "ppl": 0.4365234375, "reward": 0.9727593958377838, "reward_std": 0.02688107592985034, "rewards/single_object_detection_bbox_reward": 0.9727595150470734, "step": 530, "temperature": 0.9 }, { "advantages": -5.595652510237414e-06, "completion_length": 96.0, "delta_ref_entropy_loss": 0.24169921875, "delta_ref_ppl": -0.193359375, "entropy_loss": -0.5263671875, "epoch": 0.7431770468859342, "grad_norm": 3.063477149069003, "k1_kl": 0.193359375, "k3_kl": 0.0849609375, "kimi_kl": 0.145751953125, "learning_rate": 2.5630252100840333e-07, "loss": 0.0034, "ppl": 0.4677734375, "reward": 0.9719859659671783, "reward_std": 0.015615738928318024, "rewards/single_object_detection_bbox_reward": 0.9719860553741455, "step": 531, "temperature": 0.9 }, { "advantages": -2.2101615968495025e-06, "completion_length": 95.78125, "delta_ref_entropy_loss": 0.26171875, "delta_ref_ppl": -0.2626953125, "entropy_loss": -0.5546875, "epoch": 0.7445766270118964, "grad_norm": 5.018632443003686, "k1_kl": 0.2626953125, "k3_kl": 0.13330078125, "kimi_kl": 0.26318359375, "learning_rate": 2.549019607843137e-07, "loss": 0.0053, "ppl": 0.4931640625, "reward": 0.8695653975009918, "reward_std": 0.04888521507382393, "rewards/single_object_detection_bbox_reward": 0.8695654571056366, "step": 532, "temperature": 0.9 }, { "advantages": 3.1223255518852966e-06, "completion_length": 221.125, "delta_ref_entropy_loss": 0.2890625, "delta_ref_ppl": -0.255859375, "entropy_loss": -0.490234375, "epoch": 0.7459762071378586, "grad_norm": 3.408037540716906, "k1_kl": 0.25537109375, "k3_kl": 0.118408203125, "kimi_kl": 0.2294921875, "learning_rate": 2.535014005602241e-07, "loss": 0.0047, "ppl": 0.4306640625, "reward": 0.9400731027126312, "reward_std": 0.03097077552229166, "rewards/single_object_detection_bbox_reward": 0.9400731325149536, "step": 533, "temperature": 0.9 }, { "advantages": -1.486138080508681e-05, "completion_length": 97.59375, "delta_ref_entropy_loss": 0.23828125, "delta_ref_ppl": -0.21484375, "entropy_loss": -0.513671875, "epoch": 0.7473757872638208, "grad_norm": 3.20105020143961, "k1_kl": 0.2138671875, "k3_kl": 0.096435546875, "kimi_kl": 0.18798828125, "learning_rate": 2.5210084033613445e-07, "loss": 0.0039, "ppl": 0.4501953125, "reward": 0.9414955377578735, "reward_std": 0.022967428900301456, "rewards/single_object_detection_bbox_reward": 0.9414955973625183, "step": 534, "temperature": 0.9 }, { "advantages": -1.2308359600865515e-05, "completion_length": 51.125, "delta_ref_entropy_loss": 0.2421875, "delta_ref_ppl": -0.23193359375, "entropy_loss": -0.572265625, "epoch": 0.7487753673897831, "grad_norm": 4.470761150662017, "k1_kl": 0.23291015625, "k3_kl": 0.12646484375, "kimi_kl": 0.248046875, "learning_rate": 2.5070028011204483e-07, "loss": 0.0051, "ppl": 0.5146484375, "reward": 0.8988785743713379, "reward_std": 0.03255656827241182, "rewards/single_object_detection_bbox_reward": 0.898878663778305, "step": 535, "temperature": 0.9 }, { "advantages": 2.232819315395318e-05, "completion_length": 74.8125, "delta_ref_entropy_loss": 0.23193359375, "delta_ref_ppl": -0.205078125, "entropy_loss": -0.533203125, "epoch": 0.7501749475157453, "grad_norm": 2.49529623399798, "k1_kl": 0.20458984375, "k3_kl": 0.092529296875, "kimi_kl": 0.16845703125, "learning_rate": 2.4929971988795515e-07, "loss": 0.0037, "ppl": 0.470703125, "reward": 0.9255039393901825, "reward_std": 0.04619070328772068, "rewards/single_object_detection_bbox_reward": 0.9255039691925049, "step": 536, "temperature": 0.9 }, { "advantages": -6.83244888932677e-06, "completion_length": 79.5, "delta_ref_entropy_loss": 0.2392578125, "delta_ref_ppl": -0.20751953125, "entropy_loss": -0.51171875, "epoch": 0.7515745276417075, "grad_norm": 8.372187852262718, "k1_kl": 0.2080078125, "k3_kl": 0.092529296875, "kimi_kl": 0.1728515625, "learning_rate": 2.478991596638655e-07, "loss": 0.0037, "ppl": 0.44921875, "reward": 0.9141355156898499, "reward_std": 0.029993483796715736, "rewards/single_object_detection_bbox_reward": 0.9141355752944946, "step": 537, "temperature": 0.9 }, { "advantages": -4.6691195620951476e-06, "completion_length": 116.65625, "delta_ref_entropy_loss": 0.23388671875, "delta_ref_ppl": -0.212890625, "entropy_loss": -0.5263671875, "epoch": 0.7529741077676697, "grad_norm": 3.8943822370512886, "k1_kl": 0.2138671875, "k3_kl": 0.108642578125, "kimi_kl": 0.234375, "learning_rate": 2.464985994397759e-07, "loss": 0.0044, "ppl": 0.47265625, "reward": 0.9843419790267944, "reward_std": 0.018917036708444357, "rewards/single_object_detection_bbox_reward": 0.9843420088291168, "step": 538, "temperature": 0.9 }, { "advantages": 1.4928702967154095e-05, "completion_length": 71.0, "delta_ref_entropy_loss": 0.22509765625, "delta_ref_ppl": -0.197265625, "entropy_loss": -0.521484375, "epoch": 0.7543736878936319, "grad_norm": 2.7699966633582074, "k1_kl": 0.197265625, "k3_kl": 0.0894775390625, "kimi_kl": 0.15966796875, "learning_rate": 2.4509803921568627e-07, "loss": 0.0036, "ppl": 0.4609375, "reward": 0.9384076297283173, "reward_std": 0.01750162587268278, "rewards/single_object_detection_bbox_reward": 0.9384077191352844, "step": 539, "temperature": 0.9 }, { "advantages": -1.1457928849267773e-05, "completion_length": 41.90625, "delta_ref_entropy_loss": 0.23193359375, "delta_ref_ppl": -0.18896484375, "entropy_loss": -0.5009765625, "epoch": 0.7557732680195941, "grad_norm": 2.338541633507878, "k1_kl": 0.18896484375, "k3_kl": 0.0859375, "kimi_kl": 0.1416015625, "learning_rate": 2.4369747899159664e-07, "loss": 0.0034, "ppl": 0.4462890625, "reward": 0.9176111221313477, "reward_std": 0.03050532005727291, "rewards/single_object_detection_bbox_reward": 0.9176112115383148, "step": 540, "temperature": 0.9 }, { "advantages": -4.444271553438739e-06, "completion_length": 40.5, "delta_ref_entropy_loss": 0.23388671875, "delta_ref_ppl": -0.20947265625, "entropy_loss": -0.515625, "epoch": 0.7571728481455563, "grad_norm": 2.405145535614079, "k1_kl": 0.208984375, "k3_kl": 0.10009765625, "kimi_kl": 0.193359375, "learning_rate": 2.4229691876750697e-07, "loss": 0.004, "ppl": 0.458984375, "reward": 0.9670611917972565, "reward_std": 0.010946843773126602, "rewards/single_object_detection_bbox_reward": 0.9670612514019012, "step": 541, "temperature": 0.9 }, { "advantages": 1.754877814619249e-06, "completion_length": 87.5, "delta_ref_entropy_loss": 0.2578125, "delta_ref_ppl": -0.2119140625, "entropy_loss": -0.4716796875, "epoch": 0.7585724282715185, "grad_norm": 4.4640553808100805, "k1_kl": 0.21142578125, "k3_kl": 0.096923828125, "kimi_kl": 0.1796875, "learning_rate": 2.4089635854341734e-07, "loss": 0.0039, "ppl": 0.4150390625, "reward": 0.9882616400718689, "reward_std": 0.01023115191492252, "rewards/single_object_detection_bbox_reward": 0.9882616996765137, "step": 542, "temperature": 0.9 }, { "advantages": -1.976798785108258e-06, "completion_length": 51.0, "delta_ref_entropy_loss": 0.2568359375, "delta_ref_ppl": -0.21875, "entropy_loss": -0.5234375, "epoch": 0.7599720083974808, "grad_norm": 2.896046051987217, "k1_kl": 0.21875, "k3_kl": 0.09228515625, "kimi_kl": 0.16259765625, "learning_rate": 2.394957983193277e-07, "loss": 0.0037, "ppl": 0.455078125, "reward": 0.9539303481578827, "reward_std": 0.009948395192623138, "rewards/single_object_detection_bbox_reward": 0.9539304375648499, "step": 543, "temperature": 0.9 }, { "advantages": 1.1120524050056702e-05, "completion_length": 32.5, "delta_ref_entropy_loss": 0.2548828125, "delta_ref_ppl": -0.1953125, "entropy_loss": -0.50390625, "epoch": 0.761371588523443, "grad_norm": 2.5270121120270073, "k1_kl": 0.1953125, "k3_kl": 0.0869140625, "kimi_kl": 0.1396484375, "learning_rate": 2.3809523809523806e-07, "loss": 0.0035, "ppl": 0.4462890625, "reward": 0.9429798722267151, "reward_std": 0.02575243916362524, "rewards/single_object_detection_bbox_reward": 0.9429799020290375, "step": 544, "temperature": 0.9 }, { "advantages": 2.158805841645517e-06, "completion_length": 40.96875, "delta_ref_entropy_loss": 0.2451171875, "delta_ref_ppl": -0.20703125, "entropy_loss": -0.49609375, "epoch": 0.7627711686494052, "grad_norm": 4.738421043870149, "k1_kl": 0.20751953125, "k3_kl": 0.0888671875, "kimi_kl": 0.16552734375, "learning_rate": 2.3669467787114843e-07, "loss": 0.0035, "ppl": 0.43359375, "reward": 0.9813269972801208, "reward_std": 0.009153669234365225, "rewards/single_object_detection_bbox_reward": 0.981327086687088, "step": 545, "temperature": 0.9 }, { "advantages": 1.1200085964446771e-05, "completion_length": 95.3125, "delta_ref_entropy_loss": 0.2421875, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.51171875, "epoch": 0.7641707487753674, "grad_norm": 2.2887370965254847, "k1_kl": 0.21435546875, "k3_kl": 0.10791015625, "kimi_kl": 0.21435546875, "learning_rate": 2.352941176470588e-07, "loss": 0.0043, "ppl": 0.455078125, "reward": 0.9816718399524689, "reward_std": 0.015022790990769863, "rewards/single_object_detection_bbox_reward": 0.9816718995571136, "step": 546, "temperature": 0.9 }, { "advantages": 6.426392246794421e-06, "completion_length": 46.90625, "delta_ref_entropy_loss": 0.23828125, "delta_ref_ppl": -0.197265625, "entropy_loss": -0.5126953125, "epoch": 0.7655703289013296, "grad_norm": 3.48414401978313, "k1_kl": 0.197265625, "k3_kl": 0.090087890625, "kimi_kl": 0.16259765625, "learning_rate": 2.3389355742296918e-07, "loss": 0.0036, "ppl": 0.4521484375, "reward": 0.9557445049285889, "reward_std": 0.015028075780719519, "rewards/single_object_detection_bbox_reward": 0.955744594335556, "step": 547, "temperature": 0.9 }, { "advantages": -2.908041642513126e-05, "completion_length": 101.65625, "delta_ref_entropy_loss": 0.251953125, "delta_ref_ppl": -0.2021484375, "entropy_loss": -0.4873046875, "epoch": 0.7669699090272918, "grad_norm": 2.7904078211972507, "k1_kl": 0.20263671875, "k3_kl": 0.08935546875, "kimi_kl": 0.1689453125, "learning_rate": 2.3249299719887956e-07, "loss": 0.0036, "ppl": 0.4306640625, "reward": 0.9867091476917267, "reward_std": 0.010436342156026512, "rewards/single_object_detection_bbox_reward": 0.9867092072963715, "step": 548, "temperature": 0.9 }, { "advantages": -1.7136335372924805e-06, "completion_length": 128.5625, "delta_ref_entropy_loss": 0.228515625, "delta_ref_ppl": -0.189453125, "entropy_loss": -0.517578125, "epoch": 0.768369489153254, "grad_norm": 4.226981766910935, "k1_kl": 0.18994140625, "k3_kl": 0.07861328125, "kimi_kl": 0.1376953125, "learning_rate": 2.3109243697478993e-07, "loss": 0.0032, "ppl": 0.45703125, "reward": 0.9553475677967072, "reward_std": 0.015084690880030394, "rewards/single_object_detection_bbox_reward": 0.9553475975990295, "step": 549, "temperature": 0.9 }, { "advantages": -6.1238456510182004e-06, "completion_length": 110.28125, "delta_ref_entropy_loss": 0.2177734375, "delta_ref_ppl": -0.21484375, "entropy_loss": -0.53125, "epoch": 0.7697690692792163, "grad_norm": 4.431276677658758, "k1_kl": 0.21484375, "k3_kl": 0.103271484375, "kimi_kl": 0.22509765625, "learning_rate": 2.2969187675070025e-07, "loss": 0.0041, "ppl": 0.46484375, "reward": 0.9400520026683807, "reward_std": 0.024467698764055967, "rewards/single_object_detection_bbox_reward": 0.9400520324707031, "step": 550, "temperature": 0.9 }, { "advantages": -1.2839080795856717e-05, "completion_length": 105.15625, "delta_ref_entropy_loss": 0.234375, "delta_ref_ppl": -0.22021484375, "entropy_loss": -0.521484375, "epoch": 0.7711686494051785, "grad_norm": 24.696846786760528, "k1_kl": 0.22119140625, "k3_kl": 0.106201171875, "kimi_kl": 0.17236328125, "learning_rate": 2.2829131652661062e-07, "loss": 0.0043, "ppl": 0.462890625, "reward": 0.9541815519332886, "reward_std": 0.013795677572488785, "rewards/single_object_detection_bbox_reward": 0.9541816115379333, "step": 551, "temperature": 0.9 }, { "advantages": -1.7415732145309448e-06, "completion_length": 32.5, "delta_ref_entropy_loss": 0.21484375, "delta_ref_ppl": -0.18359375, "entropy_loss": -0.580078125, "epoch": 0.7725682295311407, "grad_norm": 1.8616734178525522, "k1_kl": 0.18408203125, "k3_kl": 0.07763671875, "kimi_kl": 0.1318359375, "learning_rate": 2.26890756302521e-07, "loss": 0.0031, "ppl": 0.513671875, "reward": 0.897930234670639, "reward_std": 0.024612360633909702, "rewards/single_object_detection_bbox_reward": 0.897930234670639, "step": 552, "temperature": 0.9 }, { "advantages": -5.263037337499554e-06, "completion_length": 179.375, "delta_ref_entropy_loss": 0.23486328125, "delta_ref_ppl": -0.20068359375, "entropy_loss": -0.54296875, "epoch": 0.7739678096571029, "grad_norm": 3.037048510252865, "k1_kl": 0.201171875, "k3_kl": 0.089599609375, "kimi_kl": 0.14453125, "learning_rate": 2.2549019607843137e-07, "loss": 0.0036, "ppl": 0.48046875, "reward": 0.9510514438152313, "reward_std": 0.021968796849250793, "rewards/single_object_detection_bbox_reward": 0.9510514140129089, "step": 553, "temperature": 0.9 }, { "advantages": -3.5938414839620236e-06, "completion_length": 60.125, "delta_ref_entropy_loss": 0.24267578125, "delta_ref_ppl": -0.21533203125, "entropy_loss": -0.5439453125, "epoch": 0.7753673897830651, "grad_norm": 2.7203532941501205, "k1_kl": 0.21533203125, "k3_kl": 0.09912109375, "kimi_kl": 0.18603515625, "learning_rate": 2.2408963585434175e-07, "loss": 0.004, "ppl": 0.4765625, "reward": 0.934317022562027, "reward_std": 0.012474853545427322, "rewards/single_object_detection_bbox_reward": 0.9343171417713165, "step": 554, "temperature": 0.9 }, { "advantages": 3.2237066989182495e-06, "completion_length": 157.71875, "delta_ref_entropy_loss": 0.2861328125, "delta_ref_ppl": -0.2080078125, "entropy_loss": -0.47265625, "epoch": 0.7767669699090273, "grad_norm": 4.727228107602286, "k1_kl": 0.20751953125, "k3_kl": 0.09375, "kimi_kl": 0.1572265625, "learning_rate": 2.226890756302521e-07, "loss": 0.0037, "ppl": 0.4228515625, "reward": 0.9786800146102905, "reward_std": 0.014301196672022343, "rewards/single_object_detection_bbox_reward": 0.9786801040172577, "step": 555, "temperature": 0.9 }, { "advantages": -9.390791888108652e-06, "completion_length": 51.5, "delta_ref_entropy_loss": 0.2822265625, "delta_ref_ppl": -0.236328125, "entropy_loss": -0.47265625, "epoch": 0.7781665500349895, "grad_norm": 3.5317242194339924, "k1_kl": 0.236328125, "k3_kl": 0.104248046875, "kimi_kl": 0.169921875, "learning_rate": 2.2128851540616244e-07, "loss": 0.0042, "ppl": 0.41796875, "reward": 0.896818995475769, "reward_std": 0.03771821688860655, "rewards/single_object_detection_bbox_reward": 0.8968190252780914, "step": 556, "temperature": 0.9 }, { "advantages": 1.253028131031897e-06, "completion_length": 56.5, "delta_ref_entropy_loss": 0.27587890625, "delta_ref_ppl": -0.24560546875, "entropy_loss": -0.49609375, "epoch": 0.7795661301609517, "grad_norm": 5.55842908430905, "k1_kl": 0.24658203125, "k3_kl": 0.11328125, "kimi_kl": 0.20361328125, "learning_rate": 2.1988795518207281e-07, "loss": 0.0045, "ppl": 0.4453125, "reward": 0.9706622958183289, "reward_std": 0.011216156417503953, "rewards/single_object_detection_bbox_reward": 0.9706623256206512, "step": 557, "temperature": 0.9 }, { "advantages": 7.979505556754418e-06, "completion_length": 19.21875, "delta_ref_entropy_loss": 0.2666015625, "delta_ref_ppl": -0.28564453125, "entropy_loss": -0.52734375, "epoch": 0.780965710286914, "grad_norm": 6.761880302824008, "k1_kl": 0.2861328125, "k3_kl": 0.155029296875, "kimi_kl": 0.359375, "learning_rate": 2.184873949579832e-07, "loss": 0.0062, "ppl": 0.4658203125, "reward": 0.9256075024604797, "reward_std": 0.03561596479266882, "rewards/single_object_detection_bbox_reward": 0.9256075620651245, "step": 558, "temperature": 0.9 }, { "advantages": 5.2598441016016295e-06, "completion_length": 61.6875, "delta_ref_entropy_loss": 0.2685546875, "delta_ref_ppl": -0.25830078125, "entropy_loss": -0.5322265625, "epoch": 0.7823652904128762, "grad_norm": 3.559649153560117, "k1_kl": 0.2587890625, "k3_kl": 0.125244140625, "kimi_kl": 0.25439453125, "learning_rate": 2.1708683473389356e-07, "loss": 0.005, "ppl": 0.47265625, "reward": 0.9488549530506134, "reward_std": 0.017436034977436066, "rewards/single_object_detection_bbox_reward": 0.9488550126552582, "step": 559, "temperature": 0.9 }, { "advantages": -3.6973507349102874e-06, "completion_length": 67.90625, "delta_ref_entropy_loss": 0.21435546875, "delta_ref_ppl": -0.185546875, "entropy_loss": -0.53125, "epoch": 0.7837648705388384, "grad_norm": 1.9118918597575971, "k1_kl": 0.185546875, "k3_kl": 0.08203125, "kimi_kl": 0.13818359375, "learning_rate": 2.156862745098039e-07, "loss": 0.0033, "ppl": 0.470703125, "reward": 0.9457289278507233, "reward_std": 0.01863897603470832, "rewards/single_object_detection_bbox_reward": 0.9457290172576904, "step": 560, "temperature": 0.9 }, { "advantages": 6.241192295419751e-06, "completion_length": 61.0, "delta_ref_entropy_loss": 0.2353515625, "delta_ref_ppl": -0.193359375, "entropy_loss": -0.509765625, "epoch": 0.7851644506648006, "grad_norm": 4.144760059313693, "k1_kl": 0.193359375, "k3_kl": 0.084716796875, "kimi_kl": 0.158447265625, "learning_rate": 2.1428571428571426e-07, "loss": 0.0034, "ppl": 0.4453125, "reward": 0.9039289951324463, "reward_std": 0.023800316266715527, "rewards/single_object_detection_bbox_reward": 0.9039290547370911, "step": 561, "temperature": 0.9 }, { "advantages": 1.1208600881218445e-05, "completion_length": 127.4375, "delta_ref_entropy_loss": 0.24169921875, "delta_ref_ppl": -0.16943359375, "entropy_loss": -0.478515625, "epoch": 0.7865640307907628, "grad_norm": 2.5229976739780735, "k1_kl": 0.169921875, "k3_kl": 0.068115234375, "kimi_kl": 0.100830078125, "learning_rate": 2.1288515406162463e-07, "loss": 0.0027, "ppl": 0.4248046875, "reward": 0.9238364696502686, "reward_std": 0.028763308189809322, "rewards/single_object_detection_bbox_reward": 0.9238365292549133, "step": 562, "temperature": 0.9 }, { "advantages": 9.80975414677232e-06, "completion_length": 52.46875, "delta_ref_entropy_loss": 0.2646484375, "delta_ref_ppl": -0.2294921875, "entropy_loss": -0.5283203125, "epoch": 0.787963610916725, "grad_norm": 4.162427808144148, "k1_kl": 0.2294921875, "k3_kl": 0.10107421875, "kimi_kl": 0.17431640625, "learning_rate": 2.11484593837535e-07, "loss": 0.004, "ppl": 0.4638671875, "reward": 0.9592917859554291, "reward_std": 0.047293209470808506, "rewards/single_object_detection_bbox_reward": 0.9592918157577515, "step": 563, "temperature": 0.9 }, { "advantages": 2.119158125424292e-05, "completion_length": 80.5, "delta_ref_entropy_loss": 0.25390625, "delta_ref_ppl": -0.19091796875, "entropy_loss": -0.48046875, "epoch": 0.7893631910426872, "grad_norm": 2.242670852863059, "k1_kl": 0.19091796875, "k3_kl": 0.07958984375, "kimi_kl": 0.140380859375, "learning_rate": 2.1008403361344538e-07, "loss": 0.0032, "ppl": 0.419921875, "reward": 0.948549747467041, "reward_std": 0.016563981771469116, "rewards/single_object_detection_bbox_reward": 0.9485498666763306, "step": 564, "temperature": 0.9 }, { "advantages": 1.2729450531878683e-05, "completion_length": 60.0, "delta_ref_entropy_loss": 0.2333984375, "delta_ref_ppl": -0.22802734375, "entropy_loss": -0.533203125, "epoch": 0.7907627711686495, "grad_norm": 3.83224904528937, "k1_kl": 0.2275390625, "k3_kl": 0.116455078125, "kimi_kl": 0.28271484375, "learning_rate": 2.0868347338935575e-07, "loss": 0.0046, "ppl": 0.4677734375, "reward": 0.9439437091350555, "reward_std": 0.04245711676776409, "rewards/single_object_detection_bbox_reward": 0.9439438581466675, "step": 565, "temperature": 0.9 }, { "advantages": 2.2180115593073424e-06, "completion_length": 132.625, "delta_ref_entropy_loss": 0.228515625, "delta_ref_ppl": -0.1796875, "entropy_loss": -0.517578125, "epoch": 0.7921623512946117, "grad_norm": 4.662879326740191, "k1_kl": 0.1796875, "k3_kl": 0.07568359375, "kimi_kl": 0.1337890625, "learning_rate": 2.072829131652661e-07, "loss": 0.003, "ppl": 0.4541015625, "reward": 0.9321891665458679, "reward_std": 0.016116457991302013, "rewards/single_object_detection_bbox_reward": 0.9321892559528351, "step": 566, "temperature": 0.9 }, { "advantages": -1.2574185802805005e-05, "completion_length": 118.875, "delta_ref_entropy_loss": 0.236328125, "delta_ref_ppl": -0.21240234375, "entropy_loss": -0.5185546875, "epoch": 0.7935619314205739, "grad_norm": 2.001085369324126, "k1_kl": 0.21240234375, "k3_kl": 0.095458984375, "kimi_kl": 0.185302734375, "learning_rate": 2.0588235294117645e-07, "loss": 0.0038, "ppl": 0.4541015625, "reward": 0.9581134915351868, "reward_std": 0.013860221486538649, "rewards/single_object_detection_bbox_reward": 0.9581136107444763, "step": 567, "temperature": 0.9 }, { "advantages": -1.6369059721910162e-05, "completion_length": 50.34375, "delta_ref_entropy_loss": 0.2509765625, "delta_ref_ppl": -0.23095703125, "entropy_loss": -0.48828125, "epoch": 0.794961511546536, "grad_norm": 2.5019948971628345, "k1_kl": 0.23046875, "k3_kl": 0.1123046875, "kimi_kl": 0.2216796875, "learning_rate": 2.0448179271708682e-07, "loss": 0.0045, "ppl": 0.4287109375, "reward": 0.9561145305633545, "reward_std": 0.028590275906026363, "rewards/single_object_detection_bbox_reward": 0.9561146199703217, "step": 568, "temperature": 0.9 }, { "advantages": 1.8093204516844708e-05, "completion_length": 126.5, "delta_ref_entropy_loss": 0.2607421875, "delta_ref_ppl": -0.20166015625, "entropy_loss": -0.4931640625, "epoch": 0.7963610916724982, "grad_norm": 12.735261884996765, "k1_kl": 0.20263671875, "k3_kl": 0.08154296875, "kimi_kl": 0.125244140625, "learning_rate": 2.030812324929972e-07, "loss": 0.0032, "ppl": 0.4365234375, "reward": 0.9697598218917847, "reward_std": 0.014018226531334221, "rewards/single_object_detection_bbox_reward": 0.9697599112987518, "step": 569, "temperature": 0.9 }, { "advantages": -8.763479883100445e-06, "completion_length": 47.125, "delta_ref_entropy_loss": 0.2470703125, "delta_ref_ppl": -0.23681640625, "entropy_loss": -0.505859375, "epoch": 0.7977606717984604, "grad_norm": 2.8915843485923545, "k1_kl": 0.23681640625, "k3_kl": 0.11376953125, "kimi_kl": 0.22802734375, "learning_rate": 2.0168067226890757e-07, "loss": 0.0046, "ppl": 0.44921875, "reward": 0.9447811543941498, "reward_std": 0.0275185639038682, "rewards/single_object_detection_bbox_reward": 0.9447812438011169, "step": 570, "temperature": 0.9 }, { "advantages": 1.145872738561593e-05, "completion_length": 22.0, "delta_ref_entropy_loss": 0.2626953125, "delta_ref_ppl": -0.20751953125, "entropy_loss": -0.4716796875, "epoch": 0.7991602519244226, "grad_norm": 3.819498301040871, "k1_kl": 0.2080078125, "k3_kl": 0.0927734375, "kimi_kl": 0.16015625, "learning_rate": 2.0028011204481792e-07, "loss": 0.0037, "ppl": 0.4189453125, "reward": 0.960919976234436, "reward_std": 0.016081460285931826, "rewards/single_object_detection_bbox_reward": 0.9609200656414032, "step": 571, "temperature": 0.9 }, { "advantages": -3.1158062938629882e-06, "completion_length": 61.28125, "delta_ref_entropy_loss": 0.2783203125, "delta_ref_ppl": -0.25537109375, "entropy_loss": -0.5029296875, "epoch": 0.8005598320503848, "grad_norm": 1.8317824132743954, "k1_kl": 0.25634765625, "k3_kl": 0.12353515625, "kimi_kl": 0.2607421875, "learning_rate": 1.9887955182072826e-07, "loss": 0.0049, "ppl": 0.4453125, "reward": 0.967374712228775, "reward_std": 0.013198774307966232, "rewards/single_object_detection_bbox_reward": 0.9673747420310974, "step": 572, "temperature": 0.9 }, { "advantages": 7.469473530363757e-06, "completion_length": 143.375, "delta_ref_entropy_loss": 0.25634765625, "delta_ref_ppl": -0.1953125, "entropy_loss": -0.5009765625, "epoch": 0.8019594121763471, "grad_norm": 5.098403381049385, "k1_kl": 0.1962890625, "k3_kl": 0.081787109375, "kimi_kl": 0.13037109375, "learning_rate": 1.9747899159663864e-07, "loss": 0.0033, "ppl": 0.4423828125, "reward": 0.9748967587947845, "reward_std": 0.01367028197273612, "rewards/single_object_detection_bbox_reward": 0.9748967587947845, "step": 573, "temperature": 0.9 }, { "advantages": -3.5512661042957916e-06, "completion_length": 79.5, "delta_ref_entropy_loss": 0.2333984375, "delta_ref_ppl": -0.21337890625, "entropy_loss": -0.578125, "epoch": 0.8033589923023093, "grad_norm": 2.7074256705554505, "k1_kl": 0.21337890625, "k3_kl": 0.1005859375, "kimi_kl": 0.189453125, "learning_rate": 1.96078431372549e-07, "loss": 0.004, "ppl": 0.5126953125, "reward": 0.9315529465675354, "reward_std": 0.027511716820299625, "rewards/single_object_detection_bbox_reward": 0.9315530061721802, "step": 574, "temperature": 0.9 }, { "advantages": 6.3399120335816406e-06, "completion_length": 32.0625, "delta_ref_entropy_loss": 0.23291015625, "delta_ref_ppl": -0.20849609375, "entropy_loss": -0.515625, "epoch": 0.8047585724282715, "grad_norm": 3.0269218700394003, "k1_kl": 0.20947265625, "k3_kl": 0.09716796875, "kimi_kl": 0.19189453125, "learning_rate": 1.9467787114845939e-07, "loss": 0.0039, "ppl": 0.4599609375, "reward": 0.9557640552520752, "reward_std": 0.014628566335886717, "rewards/single_object_detection_bbox_reward": 0.9557640552520752, "step": 575, "temperature": 0.9 }, { "advantages": 3.986859155702405e-06, "completion_length": 79.1875, "delta_ref_entropy_loss": 0.2255859375, "delta_ref_ppl": -0.18798828125, "entropy_loss": -0.5048828125, "epoch": 0.8061581525542337, "grad_norm": 2.3670974175092523, "k1_kl": 0.1884765625, "k3_kl": 0.082275390625, "kimi_kl": 0.14697265625, "learning_rate": 1.9327731092436976e-07, "loss": 0.0033, "ppl": 0.4462890625, "reward": 0.9552747905254364, "reward_std": 0.016116377897560596, "rewards/single_object_detection_bbox_reward": 0.9552748799324036, "step": 576, "temperature": 0.9 }, { "advantages": -1.8683130292629357e-05, "completion_length": 79.65625, "delta_ref_entropy_loss": 0.1962890625, "delta_ref_ppl": -0.173828125, "entropy_loss": -0.5, "epoch": 0.8075577326801959, "grad_norm": 3.0868786923047904, "k1_kl": 0.173828125, "k3_kl": 0.091552734375, "kimi_kl": 0.173095703125, "learning_rate": 1.918767507002801e-07, "loss": 0.0037, "ppl": 0.44921875, "reward": 0.9541601538658142, "reward_std": 0.026406260207295418, "rewards/single_object_detection_bbox_reward": 0.9541602730751038, "step": 577, "temperature": 0.9 }, { "advantages": -6.945671202629455e-06, "completion_length": 50.5625, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.18505859375, "entropy_loss": -0.53515625, "epoch": 0.8089573128061581, "grad_norm": 2.712062925841223, "k1_kl": 0.185546875, "k3_kl": 0.0771484375, "kimi_kl": 0.129638671875, "learning_rate": 1.9047619047619045e-07, "loss": 0.0031, "ppl": 0.4736328125, "reward": 0.9288170337677002, "reward_std": 0.026968171820044518, "rewards/single_object_detection_bbox_reward": 0.9288170337677002, "step": 578, "temperature": 0.9 }, { "advantages": -9.950250390744486e-06, "completion_length": 52.5625, "delta_ref_entropy_loss": 0.25341796875, "delta_ref_ppl": -0.20263671875, "entropy_loss": -0.4931640625, "epoch": 0.8103568929321203, "grad_norm": 3.3794315103782533, "k1_kl": 0.20263671875, "k3_kl": 0.086181640625, "kimi_kl": 0.135986328125, "learning_rate": 1.8907563025210083e-07, "loss": 0.0035, "ppl": 0.4287109375, "reward": 0.9336036741733551, "reward_std": 0.021561854518949986, "rewards/single_object_detection_bbox_reward": 0.9336037039756775, "step": 579, "temperature": 0.9 }, { "advantages": 2.4099970232782653e-06, "completion_length": 40.5, "delta_ref_entropy_loss": 0.23876953125, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.53125, "epoch": 0.8117564730580826, "grad_norm": 2.729551940876617, "k1_kl": 0.20947265625, "k3_kl": 0.092529296875, "kimi_kl": 0.15625, "learning_rate": 1.876750700280112e-07, "loss": 0.0037, "ppl": 0.47265625, "reward": 0.9173483848571777, "reward_std": 0.029529315419495106, "rewards/single_object_detection_bbox_reward": 0.9173484444618225, "step": 580, "temperature": 0.9 }, { "advantages": 3.35648678628786e-06, "completion_length": 77.65625, "delta_ref_entropy_loss": 0.271484375, "delta_ref_ppl": -0.24755859375, "entropy_loss": -0.54296875, "epoch": 0.8131560531840448, "grad_norm": 6.210664374781261, "k1_kl": 0.24658203125, "k3_kl": 0.124267578125, "kimi_kl": 0.24462890625, "learning_rate": 1.8627450980392158e-07, "loss": 0.005, "ppl": 0.4853515625, "reward": 0.8949756026268005, "reward_std": 0.03771005664020777, "rewards/single_object_detection_bbox_reward": 0.8949756622314453, "step": 581, "temperature": 0.9 }, { "advantages": 1.3397209272625332e-05, "completion_length": 85.46875, "delta_ref_entropy_loss": 0.22509765625, "delta_ref_ppl": -0.1689453125, "entropy_loss": -0.486328125, "epoch": 0.814555633310007, "grad_norm": 3.3562630295426414, "k1_kl": 0.1689453125, "k3_kl": 0.0670166015625, "kimi_kl": 0.095947265625, "learning_rate": 1.8487394957983192e-07, "loss": 0.0027, "ppl": 0.431640625, "reward": 0.9909517467021942, "reward_std": 0.011659625160973519, "rewards/single_object_detection_bbox_reward": 0.990951806306839, "step": 582, "temperature": 0.9 }, { "advantages": 1.0373603629432182e-06, "completion_length": 102.03125, "delta_ref_entropy_loss": 0.2412109375, "delta_ref_ppl": -0.2265625, "entropy_loss": -0.53125, "epoch": 0.8159552134359692, "grad_norm": 2.163125225569061, "k1_kl": 0.2265625, "k3_kl": 0.1142578125, "kimi_kl": 0.265625, "learning_rate": 1.8347338935574227e-07, "loss": 0.0046, "ppl": 0.470703125, "reward": 0.8314545750617981, "reward_std": 0.022414604667574167, "rewards/single_object_detection_bbox_reward": 0.8314545452594757, "step": 583, "temperature": 0.9 }, { "advantages": 6.407898808902246e-06, "completion_length": 57.25, "delta_ref_entropy_loss": 0.22802734375, "delta_ref_ppl": -0.1875, "entropy_loss": -0.5, "epoch": 0.8173547935619314, "grad_norm": 3.071288196055964, "k1_kl": 0.18798828125, "k3_kl": 0.08154296875, "kimi_kl": 0.150634765625, "learning_rate": 1.8207282913165264e-07, "loss": 0.0033, "ppl": 0.4384765625, "reward": 0.9796570539474487, "reward_std": 0.011975394561886787, "rewards/single_object_detection_bbox_reward": 0.9796571135520935, "step": 584, "temperature": 0.9 }, { "advantages": -3.736998223757837e-06, "completion_length": 97.5, "delta_ref_entropy_loss": 0.220703125, "delta_ref_ppl": -0.1611328125, "entropy_loss": -0.517578125, "epoch": 0.8187543736878936, "grad_norm": 9.545290321228459, "k1_kl": 0.16162109375, "k3_kl": 0.062744140625, "kimi_kl": 0.09814453125, "learning_rate": 1.8067226890756302e-07, "loss": 0.0025, "ppl": 0.451171875, "reward": 0.9439457058906555, "reward_std": 0.02769452054053545, "rewards/single_object_detection_bbox_reward": 0.9439457654953003, "step": 585, "temperature": 0.9 }, { "advantages": -3.870044793075067e-06, "completion_length": 116.40625, "delta_ref_entropy_loss": 0.24609375, "delta_ref_ppl": -0.19287109375, "entropy_loss": -0.505859375, "epoch": 0.8201539538138558, "grad_norm": 3.287762601503045, "k1_kl": 0.1923828125, "k3_kl": 0.081787109375, "kimi_kl": 0.12890625, "learning_rate": 1.792717086834734e-07, "loss": 0.0033, "ppl": 0.4443359375, "reward": 0.9497695863246918, "reward_std": 0.025954113341867924, "rewards/single_object_detection_bbox_reward": 0.9497696757316589, "step": 586, "temperature": 0.9 }, { "advantages": -3.694157726386038e-06, "completion_length": 22.0, "delta_ref_entropy_loss": 0.2626953125, "delta_ref_ppl": -0.24560546875, "entropy_loss": -0.57421875, "epoch": 0.821553533939818, "grad_norm": 3.1269395540485587, "k1_kl": 0.24560546875, "k3_kl": 0.1171875, "kimi_kl": 0.22802734375, "learning_rate": 1.7787114845938374e-07, "loss": 0.0047, "ppl": 0.515625, "reward": 0.9124336838722229, "reward_std": 0.031125076115131378, "rewards/single_object_detection_bbox_reward": 0.9124337136745453, "step": 587, "temperature": 0.9 }, { "advantages": -2.3562462843074172e-06, "completion_length": 84.09375, "delta_ref_entropy_loss": 0.236328125, "delta_ref_ppl": -0.2216796875, "entropy_loss": -0.5078125, "epoch": 0.8229531140657803, "grad_norm": 2.176012607605028, "k1_kl": 0.2216796875, "k3_kl": 0.10302734375, "kimi_kl": 0.1904296875, "learning_rate": 1.764705882352941e-07, "loss": 0.0041, "ppl": 0.4462890625, "reward": 0.946224719285965, "reward_std": 0.02864605188369751, "rewards/single_object_detection_bbox_reward": 0.9462247490882874, "step": 588, "temperature": 0.9 }, { "advantages": -1.4414013747909848e-05, "completion_length": 58.96875, "delta_ref_entropy_loss": 0.25048828125, "delta_ref_ppl": -0.23779296875, "entropy_loss": -0.513671875, "epoch": 0.8243526941917425, "grad_norm": 3.6523515590512994, "k1_kl": 0.23779296875, "k3_kl": 0.110595703125, "kimi_kl": 0.20166015625, "learning_rate": 1.7507002801120446e-07, "loss": 0.0044, "ppl": 0.4521484375, "reward": 0.9416790306568146, "reward_std": 0.036864062771201134, "rewards/single_object_detection_bbox_reward": 0.9416790306568146, "step": 589, "temperature": 0.9 }, { "advantages": 4.001760316896252e-06, "completion_length": 40.59375, "delta_ref_entropy_loss": 0.23828125, "delta_ref_ppl": -0.18701171875, "entropy_loss": -0.5458984375, "epoch": 0.8257522743177047, "grad_norm": 3.6594010091772287, "k1_kl": 0.1875, "k3_kl": 0.080810546875, "kimi_kl": 0.13720703125, "learning_rate": 1.7366946778711483e-07, "loss": 0.0032, "ppl": 0.48046875, "reward": 0.9772962331771851, "reward_std": 0.016429536743089557, "rewards/single_object_detection_bbox_reward": 0.9772962629795074, "step": 590, "temperature": 0.9 }, { "advantages": -1.3608353128802264e-05, "completion_length": 90.6875, "delta_ref_entropy_loss": 0.24658203125, "delta_ref_ppl": -0.23583984375, "entropy_loss": -0.53515625, "epoch": 0.8271518544436669, "grad_norm": 3.5464503329843886, "k1_kl": 0.2353515625, "k3_kl": 0.115966796875, "kimi_kl": 0.216796875, "learning_rate": 1.722689075630252e-07, "loss": 0.0046, "ppl": 0.4765625, "reward": 0.9433789253234863, "reward_std": 0.03176425117999315, "rewards/single_object_detection_bbox_reward": 0.9433789849281311, "step": 591, "temperature": 0.9 }, { "advantages": 9.409020549355773e-07, "completion_length": 99.09375, "delta_ref_entropy_loss": 0.2451171875, "delta_ref_ppl": -0.201171875, "entropy_loss": -0.541015625, "epoch": 0.8285514345696291, "grad_norm": 2.840489468726723, "k1_kl": 0.20166015625, "k3_kl": 0.088134765625, "kimi_kl": 0.14404296875, "learning_rate": 1.7086834733893558e-07, "loss": 0.0035, "ppl": 0.4833984375, "reward": 0.9083276689052582, "reward_std": 0.005336775211617351, "rewards/single_object_detection_bbox_reward": 0.9083277583122253, "step": 592, "temperature": 0.9 }, { "advantages": 5.728961696149781e-07, "completion_length": 152.1875, "delta_ref_entropy_loss": 0.27001953125, "delta_ref_ppl": -0.19677734375, "entropy_loss": -0.482421875, "epoch": 0.8299510146955913, "grad_norm": 2.3839411704027826, "k1_kl": 0.19677734375, "k3_kl": 0.080322265625, "kimi_kl": 0.125, "learning_rate": 1.6946778711484593e-07, "loss": 0.0032, "ppl": 0.431640625, "reward": 0.9789069592952728, "reward_std": 0.007569470908492804, "rewards/single_object_detection_bbox_reward": 0.9789070188999176, "step": 593, "temperature": 0.9 }, { "advantages": 1.7323532119917218e-05, "completion_length": 51.03125, "delta_ref_entropy_loss": 0.28125, "delta_ref_ppl": -0.26171875, "entropy_loss": -0.470703125, "epoch": 0.8313505948215535, "grad_norm": 2.2697345790074657, "k1_kl": 0.26171875, "k3_kl": 0.134033203125, "kimi_kl": 0.29541015625, "learning_rate": 1.680672268907563e-07, "loss": 0.0053, "ppl": 0.4208984375, "reward": 0.9833567440509796, "reward_std": 0.007292708847671747, "rewards/single_object_detection_bbox_reward": 0.983356773853302, "step": 594, "temperature": 0.9 }, { "advantages": 5.946361983255599e-06, "completion_length": 31.0, "delta_ref_entropy_loss": 0.275390625, "delta_ref_ppl": -0.236328125, "entropy_loss": -0.5029296875, "epoch": 0.8327501749475158, "grad_norm": 7.794295788664227, "k1_kl": 0.23583984375, "k3_kl": 0.113037109375, "kimi_kl": 0.20166015625, "learning_rate": 1.6666666666666665e-07, "loss": 0.0045, "ppl": 0.4443359375, "reward": 0.9339107871055603, "reward_std": 0.017770810052752495, "rewards/single_object_detection_bbox_reward": 0.9339108467102051, "step": 595, "temperature": 0.9 }, { "advantages": -5.63902560202223e-06, "completion_length": 33.0, "delta_ref_entropy_loss": 0.25537109375, "delta_ref_ppl": -0.21630859375, "entropy_loss": -0.5390625, "epoch": 0.834149755073478, "grad_norm": 6.215048699598077, "k1_kl": 0.21630859375, "k3_kl": 0.1044921875, "kimi_kl": 0.2021484375, "learning_rate": 1.6526610644257702e-07, "loss": 0.0042, "ppl": 0.474609375, "reward": 0.9404098093509674, "reward_std": 0.0384714612737298, "rewards/single_object_detection_bbox_reward": 0.9404098391532898, "step": 596, "temperature": 0.9 }, { "advantages": -2.0568904801621102e-07, "completion_length": 91.46875, "delta_ref_entropy_loss": 0.21826171875, "delta_ref_ppl": -0.2158203125, "entropy_loss": -0.521484375, "epoch": 0.8355493351994402, "grad_norm": 2.538218302499054, "k1_kl": 0.21533203125, "k3_kl": 0.104736328125, "kimi_kl": 0.2021484375, "learning_rate": 1.638655462184874e-07, "loss": 0.0042, "ppl": 0.462890625, "reward": 0.9304710626602173, "reward_std": 0.027257113717496395, "rewards/single_object_detection_bbox_reward": 0.9304711818695068, "step": 597, "temperature": 0.9 }, { "advantages": 9.784941084944876e-06, "completion_length": 58.0, "delta_ref_entropy_loss": 0.2265625, "delta_ref_ppl": -0.1796875, "entropy_loss": -0.5078125, "epoch": 0.8369489153254024, "grad_norm": 2.599202083592559, "k1_kl": 0.1796875, "k3_kl": 0.07177734375, "kimi_kl": 0.1142578125, "learning_rate": 1.6246498599439775e-07, "loss": 0.0029, "ppl": 0.4443359375, "reward": 0.9565943479537964, "reward_std": 0.014507518615573645, "rewards/single_object_detection_bbox_reward": 0.9565943777561188, "step": 598, "temperature": 0.9 }, { "advantages": -1.2938997770106653e-05, "completion_length": 88.03125, "delta_ref_entropy_loss": 0.22216796875, "delta_ref_ppl": -0.193359375, "entropy_loss": -0.51171875, "epoch": 0.8383484954513646, "grad_norm": 2.6376049903105794, "k1_kl": 0.19384765625, "k3_kl": 0.0869140625, "kimi_kl": 0.153076171875, "learning_rate": 1.6106442577030812e-07, "loss": 0.0035, "ppl": 0.4541015625, "reward": 0.9588789641857147, "reward_std": 0.005142119596712291, "rewards/single_object_detection_bbox_reward": 0.9588789343833923, "step": 599, "temperature": 0.9 }, { "advantages": -1.1739055025827838e-05, "completion_length": 79.5, "delta_ref_entropy_loss": 0.2333984375, "delta_ref_ppl": -0.19873046875, "entropy_loss": -0.4873046875, "epoch": 0.8397480755773268, "grad_norm": 1.8207952041286068, "k1_kl": 0.19873046875, "k3_kl": 0.091796875, "kimi_kl": 0.1728515625, "learning_rate": 1.5966386554621847e-07, "loss": 0.0037, "ppl": 0.4326171875, "reward": 0.9843710660934448, "reward_std": 0.009820314313401468, "rewards/single_object_detection_bbox_reward": 0.9843711256980896, "step": 600, "temperature": 0.9 }, { "advantages": 1.2179043551441282e-06, "completion_length": 96.40625, "delta_ref_entropy_loss": 0.25732421875, "delta_ref_ppl": -0.2109375, "entropy_loss": -0.482421875, "epoch": 0.841147655703289, "grad_norm": 4.990479635758079, "k1_kl": 0.2109375, "k3_kl": 0.094482421875, "kimi_kl": 0.1728515625, "learning_rate": 1.5826330532212884e-07, "loss": 0.0038, "ppl": 0.427734375, "reward": 0.9696933031082153, "reward_std": 0.01357987814117223, "rewards/single_object_detection_bbox_reward": 0.9696934521198273, "step": 601, "temperature": 0.9 }, { "advantages": -3.7617451198457275e-06, "completion_length": 71.78125, "delta_ref_entropy_loss": 0.25537109375, "delta_ref_ppl": -0.22265625, "entropy_loss": -0.5087890625, "epoch": 0.8425472358292512, "grad_norm": 3.3374104608469306, "k1_kl": 0.22265625, "k3_kl": 0.102294921875, "kimi_kl": 0.1787109375, "learning_rate": 1.5686274509803921e-07, "loss": 0.0041, "ppl": 0.44921875, "reward": 0.9572148621082306, "reward_std": 0.02833010978065431, "rewards/single_object_detection_bbox_reward": 0.957214891910553, "step": 602, "temperature": 0.9 }, { "advantages": 2.1128050775587326e-05, "completion_length": 71.0, "delta_ref_entropy_loss": 0.24951171875, "delta_ref_ppl": -0.19580078125, "entropy_loss": -0.50390625, "epoch": 0.8439468159552135, "grad_norm": 2.964673585187811, "k1_kl": 0.1962890625, "k3_kl": 0.08447265625, "kimi_kl": 0.1455078125, "learning_rate": 1.554621848739496e-07, "loss": 0.0034, "ppl": 0.443359375, "reward": 0.9376108944416046, "reward_std": 0.010302081936970353, "rewards/single_object_detection_bbox_reward": 0.9376110136508942, "step": 603, "temperature": 0.9 }, { "advantages": -8.183930731320288e-06, "completion_length": 54.8125, "delta_ref_entropy_loss": 0.25390625, "delta_ref_ppl": -0.2255859375, "entropy_loss": -0.458984375, "epoch": 0.8453463960811757, "grad_norm": 3.534347258162109, "k1_kl": 0.22607421875, "k3_kl": 0.114501953125, "kimi_kl": 0.224609375, "learning_rate": 1.5406162464985994e-07, "loss": 0.0046, "ppl": 0.412109375, "reward": 0.9416474103927612, "reward_std": 0.020443211600650102, "rewards/single_object_detection_bbox_reward": 0.941647469997406, "step": 604, "temperature": 0.9 }, { "advantages": -1.2539726640170556e-05, "completion_length": 96.875, "delta_ref_entropy_loss": 0.23095703125, "delta_ref_ppl": -0.17431640625, "entropy_loss": -0.5078125, "epoch": 0.8467459762071379, "grad_norm": 1.7910711454819435, "k1_kl": 0.17431640625, "k3_kl": 0.069580078125, "kimi_kl": 0.11083984375, "learning_rate": 1.526610644257703e-07, "loss": 0.0028, "ppl": 0.4423828125, "reward": 0.9682802855968475, "reward_std": 0.014098657760769129, "rewards/single_object_detection_bbox_reward": 0.9682802855968475, "step": 605, "temperature": 0.9 }, { "advantages": -4.387861736177001e-07, "completion_length": 41.0625, "delta_ref_entropy_loss": 0.2529296875, "delta_ref_ppl": -0.1865234375, "entropy_loss": -0.4853515625, "epoch": 0.8481455563331001, "grad_norm": 2.5572893688944403, "k1_kl": 0.1865234375, "k3_kl": 0.076904296875, "kimi_kl": 0.1279296875, "learning_rate": 1.5126050420168066e-07, "loss": 0.0031, "ppl": 0.4306640625, "reward": 0.9002394676208496, "reward_std": 0.02508911583572626, "rewards/single_object_detection_bbox_reward": 0.900239497423172, "step": 606, "temperature": 0.9 }, { "advantages": -2.976706809931784e-06, "completion_length": 19.75, "delta_ref_entropy_loss": 0.26953125, "delta_ref_ppl": -0.216796875, "entropy_loss": -0.51171875, "epoch": 0.8495451364590623, "grad_norm": 4.947432273381903, "k1_kl": 0.216796875, "k3_kl": 0.09423828125, "kimi_kl": 0.15966796875, "learning_rate": 1.4985994397759103e-07, "loss": 0.0038, "ppl": 0.451171875, "reward": 0.9946250915527344, "reward_std": 0.0016107793780975044, "rewards/single_object_detection_bbox_reward": 0.9946251213550568, "step": 607, "temperature": 0.9 }, { "advantages": 1.1510881222420721e-05, "completion_length": 123.0625, "delta_ref_entropy_loss": 0.25146484375, "delta_ref_ppl": -0.17724609375, "entropy_loss": -0.5078125, "epoch": 0.8509447165850245, "grad_norm": 1.9910981981061147, "k1_kl": 0.177734375, "k3_kl": 0.0712890625, "kimi_kl": 0.10791015625, "learning_rate": 1.484593837535014e-07, "loss": 0.0028, "ppl": 0.447265625, "reward": 0.9821947515010834, "reward_std": 0.012353504193015397, "rewards/single_object_detection_bbox_reward": 0.9821949005126953, "step": 608, "temperature": 0.9 }, { "advantages": -1.2035082818329101e-05, "completion_length": 119.125, "delta_ref_entropy_loss": 0.259765625, "delta_ref_ppl": -0.22607421875, "entropy_loss": -0.525390625, "epoch": 0.8523442967109867, "grad_norm": 4.136108861922265, "k1_kl": 0.2255859375, "k3_kl": 0.1055908203125, "kimi_kl": 0.222900390625, "learning_rate": 1.4705882352941175e-07, "loss": 0.0042, "ppl": 0.46484375, "reward": 0.9712217748165131, "reward_std": 0.025916220620274544, "rewards/single_object_detection_bbox_reward": 0.9712218642234802, "step": 609, "temperature": 0.9 }, { "advantages": -7.835616088414099e-06, "completion_length": 48.5, "delta_ref_entropy_loss": 0.3095703125, "delta_ref_ppl": -0.2568359375, "entropy_loss": -0.46484375, "epoch": 0.853743876836949, "grad_norm": 3.985933178582841, "k1_kl": 0.2568359375, "k3_kl": 0.11669921875, "kimi_kl": 0.21337890625, "learning_rate": 1.4565826330532213e-07, "loss": 0.0047, "ppl": 0.40625, "reward": 0.9803968369960785, "reward_std": 0.018368086777627468, "rewards/single_object_detection_bbox_reward": 0.9803968966007233, "step": 610, "temperature": 0.9 }, { "advantages": -5.327165126800537e-07, "completion_length": 93.1875, "delta_ref_entropy_loss": 0.2333984375, "delta_ref_ppl": -0.17822265625, "entropy_loss": -0.498046875, "epoch": 0.8551434569629112, "grad_norm": 2.840557219669283, "k1_kl": 0.177734375, "k3_kl": 0.075439453125, "kimi_kl": 0.127685546875, "learning_rate": 1.4425770308123247e-07, "loss": 0.003, "ppl": 0.4375, "reward": 0.9557763934135437, "reward_std": 0.022086257115006447, "rewards/single_object_detection_bbox_reward": 0.9557764530181885, "step": 611, "temperature": 0.9 }, { "advantages": 1.3428343663690612e-06, "completion_length": 166.09375, "delta_ref_entropy_loss": 0.24658203125, "delta_ref_ppl": -0.20849609375, "entropy_loss": -0.521484375, "epoch": 0.8565430370888734, "grad_norm": 2.4392899765514953, "k1_kl": 0.2080078125, "k3_kl": 0.0947265625, "kimi_kl": 0.197265625, "learning_rate": 1.4285714285714285e-07, "loss": 0.0038, "ppl": 0.4619140625, "reward": 0.9729045331478119, "reward_std": 0.009351676912046969, "rewards/single_object_detection_bbox_reward": 0.972904622554779, "step": 612, "temperature": 0.9 }, { "advantages": 4.890774221166794e-07, "completion_length": 121.9375, "delta_ref_entropy_loss": 0.23974609375, "delta_ref_ppl": -0.232421875, "entropy_loss": -0.5029296875, "epoch": 0.8579426172148356, "grad_norm": 7.299490022910222, "k1_kl": 0.23193359375, "k3_kl": 0.111083984375, "kimi_kl": 0.26025390625, "learning_rate": 1.4145658263305322e-07, "loss": 0.0044, "ppl": 0.4404296875, "reward": 0.9462699592113495, "reward_std": 0.035854121670126915, "rewards/single_object_detection_bbox_reward": 0.9462700486183167, "step": 613, "temperature": 0.9 }, { "advantages": -3.177672851961688e-06, "completion_length": 190.53125, "delta_ref_entropy_loss": 0.2470703125, "delta_ref_ppl": -0.19140625, "entropy_loss": -0.51171875, "epoch": 0.8593421973407978, "grad_norm": 3.1585265964945966, "k1_kl": 0.19189453125, "k3_kl": 0.0830078125, "kimi_kl": 0.141357421875, "learning_rate": 1.4005602240896357e-07, "loss": 0.0033, "ppl": 0.451171875, "reward": 0.9647945761680603, "reward_std": 0.02092398004606366, "rewards/single_object_detection_bbox_reward": 0.9647946059703827, "step": 614, "temperature": 0.9 }, { "advantages": 5.732956196879968e-06, "completion_length": 95.5625, "delta_ref_entropy_loss": 0.23681640625, "delta_ref_ppl": -0.2001953125, "entropy_loss": -0.515625, "epoch": 0.86074177746676, "grad_norm": 2.7602455024391075, "k1_kl": 0.2001953125, "k3_kl": 0.092529296875, "kimi_kl": 0.16650390625, "learning_rate": 1.3865546218487394e-07, "loss": 0.0037, "ppl": 0.4560546875, "reward": 0.9883618056774139, "reward_std": 0.010570151964202523, "rewards/single_object_detection_bbox_reward": 0.9883618950843811, "step": 615, "temperature": 0.9 }, { "advantages": -2.0941454749845434e-05, "completion_length": 41.5, "delta_ref_entropy_loss": 0.22607421875, "delta_ref_ppl": -0.17333984375, "entropy_loss": -0.4931640625, "epoch": 0.8621413575927221, "grad_norm": 3.640861500739778, "k1_kl": 0.17333984375, "k3_kl": 0.073974609375, "kimi_kl": 0.132080078125, "learning_rate": 1.3725490196078432e-07, "loss": 0.003, "ppl": 0.43359375, "reward": 0.9702033996582031, "reward_std": 0.012837899848818779, "rewards/single_object_detection_bbox_reward": 0.9702033996582031, "step": 616, "temperature": 0.9 }, { "advantages": 6.936889576536487e-06, "completion_length": 50.40625, "delta_ref_entropy_loss": 0.259765625, "delta_ref_ppl": -0.22509765625, "entropy_loss": -0.490234375, "epoch": 0.8635409377186843, "grad_norm": 3.727717888801859, "k1_kl": 0.22509765625, "k3_kl": 0.1044921875, "kimi_kl": 0.23095703125, "learning_rate": 1.3585434173669466e-07, "loss": 0.0042, "ppl": 0.4248046875, "reward": 0.966868132352829, "reward_std": 0.017175441898871213, "rewards/single_object_detection_bbox_reward": 0.9668681621551514, "step": 617, "temperature": 0.9 }, { "advantages": 2.1824880604981445e-06, "completion_length": 211.125, "delta_ref_entropy_loss": 0.240234375, "delta_ref_ppl": -0.22021484375, "entropy_loss": -0.5078125, "epoch": 0.8649405178446467, "grad_norm": 2.135156830709413, "k1_kl": 0.22119140625, "k3_kl": 0.102294921875, "kimi_kl": 0.20556640625, "learning_rate": 1.3445378151260504e-07, "loss": 0.0041, "ppl": 0.4443359375, "reward": 0.974092960357666, "reward_std": 0.009475498693063855, "rewards/single_object_detection_bbox_reward": 0.9740930199623108, "step": 618, "temperature": 0.9 }, { "advantages": -7.689930953347357e-06, "completion_length": 157.34375, "delta_ref_entropy_loss": 0.228515625, "delta_ref_ppl": -0.1943359375, "entropy_loss": -0.501953125, "epoch": 0.8663400979706088, "grad_norm": 2.7757234966375393, "k1_kl": 0.1943359375, "k3_kl": 0.0849609375, "kimi_kl": 0.138427734375, "learning_rate": 1.330532212885154e-07, "loss": 0.0034, "ppl": 0.4365234375, "reward": 0.9509028196334839, "reward_std": 0.019201734103262424, "rewards/single_object_detection_bbox_reward": 0.9509028792381287, "step": 619, "temperature": 0.9 }, { "advantages": 7.059425342959003e-06, "completion_length": 31.0, "delta_ref_entropy_loss": 0.25146484375, "delta_ref_ppl": -0.244140625, "entropy_loss": -0.5087890625, "epoch": 0.867739678096571, "grad_norm": 3.326254678822802, "k1_kl": 0.2451171875, "k3_kl": 0.118408203125, "kimi_kl": 0.2451171875, "learning_rate": 1.3165266106442576e-07, "loss": 0.0047, "ppl": 0.4560546875, "reward": 0.9725320041179657, "reward_std": 0.029741398058831692, "rewards/single_object_detection_bbox_reward": 0.9725321233272552, "step": 620, "temperature": 0.9 }, { "advantages": -8.09239520549454e-06, "completion_length": 31.5, "delta_ref_entropy_loss": 0.2578125, "delta_ref_ppl": -0.2216796875, "entropy_loss": -0.4970703125, "epoch": 0.8691392582225332, "grad_norm": 3.9500813953841187, "k1_kl": 0.2216796875, "k3_kl": 0.1015625, "kimi_kl": 0.1845703125, "learning_rate": 1.3025210084033613e-07, "loss": 0.0041, "ppl": 0.44140625, "reward": 0.9503155946731567, "reward_std": 0.01212736638262868, "rewards/single_object_detection_bbox_reward": 0.9503156542778015, "step": 621, "temperature": 0.9 }, { "advantages": 4.6542184009013e-06, "completion_length": 127.53125, "delta_ref_entropy_loss": 0.23193359375, "delta_ref_ppl": -0.20654296875, "entropy_loss": -0.51171875, "epoch": 0.8705388383484954, "grad_norm": 3.9812937065163063, "k1_kl": 0.20654296875, "k3_kl": 0.093017578125, "kimi_kl": 0.1572265625, "learning_rate": 1.2885154061624648e-07, "loss": 0.0037, "ppl": 0.455078125, "reward": 0.8804501295089722, "reward_std": 0.010767200030386448, "rewards/single_object_detection_bbox_reward": 0.8804501295089722, "step": 622, "temperature": 0.9 }, { "advantages": 4.642509793484351e-06, "completion_length": 117.9375, "delta_ref_entropy_loss": 0.2431640625, "delta_ref_ppl": -0.185546875, "entropy_loss": -0.4990234375, "epoch": 0.8719384184744576, "grad_norm": 4.127188276432474, "k1_kl": 0.18603515625, "k3_kl": 0.0771484375, "kimi_kl": 0.118408203125, "learning_rate": 1.2745098039215685e-07, "loss": 0.0031, "ppl": 0.4453125, "reward": 0.9678501784801483, "reward_std": 0.010978656355291605, "rewards/single_object_detection_bbox_reward": 0.9678502678871155, "step": 623, "temperature": 0.9 }, { "advantages": -1.6553726709389593e-05, "completion_length": 116.5, "delta_ref_entropy_loss": 0.2177734375, "delta_ref_ppl": -0.2080078125, "entropy_loss": -0.513671875, "epoch": 0.8733379986004198, "grad_norm": 2.5157117685979653, "k1_kl": 0.20751953125, "k3_kl": 0.10498046875, "kimi_kl": 0.26611328125, "learning_rate": 1.2605042016806723e-07, "loss": 0.0042, "ppl": 0.4521484375, "reward": 0.9617179334163666, "reward_std": 0.019399647135287523, "rewards/single_object_detection_bbox_reward": 0.9617180228233337, "step": 624, "temperature": 0.9 }, { "advantages": 7.0304213295457885e-06, "completion_length": 130.84375, "delta_ref_entropy_loss": 0.2490234375, "delta_ref_ppl": -0.21484375, "entropy_loss": -0.484375, "epoch": 0.8747375787263821, "grad_norm": 3.075827522417874, "k1_kl": 0.21484375, "k3_kl": 0.100830078125, "kimi_kl": 0.18505859375, "learning_rate": 1.2464985994397757e-07, "loss": 0.004, "ppl": 0.4228515625, "reward": 0.9385563433170319, "reward_std": 0.020460776053369045, "rewards/single_object_detection_bbox_reward": 0.9385563731193542, "step": 625, "temperature": 0.9 }, { "advantages": 1.1059855523853912e-05, "completion_length": 79.25, "delta_ref_entropy_loss": 0.24609375, "delta_ref_ppl": -0.205078125, "entropy_loss": -0.548828125, "epoch": 0.8761371588523443, "grad_norm": 2.546608247301829, "k1_kl": 0.205078125, "k3_kl": 0.08642578125, "kimi_kl": 0.1416015625, "learning_rate": 1.2324929971988795e-07, "loss": 0.0034, "ppl": 0.4765625, "reward": 0.9171332716941833, "reward_std": 0.032836779952049255, "rewards/single_object_detection_bbox_reward": 0.9171333611011505, "step": 626, "temperature": 0.9 }, { "advantages": -3.179798113706056e-08, "completion_length": 89.0, "delta_ref_entropy_loss": 0.2578125, "delta_ref_ppl": -0.19873046875, "entropy_loss": -0.490234375, "epoch": 0.8775367389783065, "grad_norm": 2.535288688782528, "k1_kl": 0.19921875, "k3_kl": 0.083740234375, "kimi_kl": 0.14794921875, "learning_rate": 1.2184873949579832e-07, "loss": 0.0034, "ppl": 0.4365234375, "reward": 0.9949473142623901, "reward_std": 0.0023992698406800628, "rewards/single_object_detection_bbox_reward": 0.9949473440647125, "step": 627, "temperature": 0.9 }, { "advantages": 1.84268628800055e-07, "completion_length": 66.53125, "delta_ref_entropy_loss": 0.2431640625, "delta_ref_ppl": -0.19921875, "entropy_loss": -0.5107421875, "epoch": 0.8789363191042687, "grad_norm": 2.1069472995016105, "k1_kl": 0.19873046875, "k3_kl": 0.08642578125, "kimi_kl": 0.17626953125, "learning_rate": 1.2044817927170867e-07, "loss": 0.0035, "ppl": 0.451171875, "reward": 0.9893092513084412, "reward_std": 0.01240507559850812, "rewards/single_object_detection_bbox_reward": 0.9893092513084412, "step": 628, "temperature": 0.9 }, { "advantages": 1.1640468130735826e-05, "completion_length": 68.4375, "delta_ref_entropy_loss": 0.2548828125, "delta_ref_ppl": -0.21484375, "entropy_loss": -0.4638671875, "epoch": 0.8803358992302309, "grad_norm": 2.8308997523832775, "k1_kl": 0.21533203125, "k3_kl": 0.099609375, "kimi_kl": 0.17626953125, "learning_rate": 1.1904761904761903e-07, "loss": 0.004, "ppl": 0.4072265625, "reward": 0.9074708521366119, "reward_std": 0.0206131546292454, "rewards/single_object_detection_bbox_reward": 0.9074708819389343, "step": 629, "temperature": 0.9 }, { "advantages": 1.8387435744671166e-05, "completion_length": 131.9375, "delta_ref_entropy_loss": 0.22314453125, "delta_ref_ppl": -0.18359375, "entropy_loss": -0.548828125, "epoch": 0.8817354793561931, "grad_norm": 4.49454646257809, "k1_kl": 0.18408203125, "k3_kl": 0.077392578125, "kimi_kl": 0.121826171875, "learning_rate": 1.176470588235294e-07, "loss": 0.0031, "ppl": 0.4853515625, "reward": 0.9736821055412292, "reward_std": 0.015642256010323763, "rewards/single_object_detection_bbox_reward": 0.9736821949481964, "step": 630, "temperature": 0.9 }, { "advantages": 5.6060298447846435e-06, "completion_length": 119.96875, "delta_ref_entropy_loss": 0.23974609375, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.529296875, "epoch": 0.8831350594821553, "grad_norm": 2.6938731225025614, "k1_kl": 0.21337890625, "k3_kl": 0.092529296875, "kimi_kl": 0.177734375, "learning_rate": 1.1624649859943978e-07, "loss": 0.0037, "ppl": 0.4658203125, "reward": 0.9610936939716339, "reward_std": 0.01892547495663166, "rewards/single_object_detection_bbox_reward": 0.9610937535762787, "step": 631, "temperature": 0.9 }, { "advantages": -7.041064691293286e-06, "completion_length": 49.5, "delta_ref_entropy_loss": 0.26953125, "delta_ref_ppl": -0.22265625, "entropy_loss": -0.486328125, "epoch": 0.8845346396081175, "grad_norm": 5.050456041967616, "k1_kl": 0.22314453125, "k3_kl": 0.09912109375, "kimi_kl": 0.17578125, "learning_rate": 1.1484593837535013e-07, "loss": 0.004, "ppl": 0.431640625, "reward": 0.9132378399372101, "reward_std": 0.005649450409691781, "rewards/single_object_detection_bbox_reward": 0.9132379293441772, "step": 632, "temperature": 0.9 }, { "advantages": -3.924726570403436e-06, "completion_length": 29.03125, "delta_ref_entropy_loss": 0.2763671875, "delta_ref_ppl": -0.23291015625, "entropy_loss": -0.5390625, "epoch": 0.8859342197340798, "grad_norm": 3.0826616053587514, "k1_kl": 0.23193359375, "k3_kl": 0.111083984375, "kimi_kl": 0.19873046875, "learning_rate": 1.134453781512605e-07, "loss": 0.0045, "ppl": 0.4765625, "reward": 0.96446293592453, "reward_std": 0.037932541221380234, "rewards/single_object_detection_bbox_reward": 0.9644629955291748, "step": 633, "temperature": 0.9 }, { "advantages": 2.784681328193983e-05, "completion_length": 42.0, "delta_ref_entropy_loss": 0.24560546875, "delta_ref_ppl": -0.216796875, "entropy_loss": -0.5341796875, "epoch": 0.887333799860042, "grad_norm": 2.658248194307422, "k1_kl": 0.216796875, "k3_kl": 0.10693359375, "kimi_kl": 0.23681640625, "learning_rate": 1.1204481792717087e-07, "loss": 0.0043, "ppl": 0.482421875, "reward": 0.9631878435611725, "reward_std": 0.026503021828830242, "rewards/single_object_detection_bbox_reward": 0.9631878733634949, "step": 634, "temperature": 0.9 }, { "advantages": -6.768852699678973e-06, "completion_length": 91.375, "delta_ref_entropy_loss": 0.25732421875, "delta_ref_ppl": -0.228515625, "entropy_loss": -0.55859375, "epoch": 0.8887333799860042, "grad_norm": 2.784548062214601, "k1_kl": 0.2275390625, "k3_kl": 0.10400390625, "kimi_kl": 0.18310546875, "learning_rate": 1.1064425770308122e-07, "loss": 0.0042, "ppl": 0.4951171875, "reward": 0.9250619113445282, "reward_std": 0.029732096940279007, "rewards/single_object_detection_bbox_reward": 0.925061970949173, "step": 635, "temperature": 0.9 }, { "advantages": 1.5170979622780578e-05, "completion_length": 116.5, "delta_ref_entropy_loss": 0.22509765625, "delta_ref_ppl": -0.20654296875, "entropy_loss": -0.55078125, "epoch": 0.8901329601119664, "grad_norm": 2.3901643015397074, "k1_kl": 0.20654296875, "k3_kl": 0.092529296875, "kimi_kl": 0.16015625, "learning_rate": 1.092436974789916e-07, "loss": 0.0037, "ppl": 0.4873046875, "reward": 0.9556330740451813, "reward_std": 0.015285622794181108, "rewards/single_object_detection_bbox_reward": 0.9556331038475037, "step": 636, "temperature": 0.9 }, { "advantages": -9.90953873269973e-06, "completion_length": 61.0, "delta_ref_entropy_loss": 0.26123046875, "delta_ref_ppl": -0.2275390625, "entropy_loss": -0.580078125, "epoch": 0.8915325402379286, "grad_norm": 2.5674381801397304, "k1_kl": 0.22705078125, "k3_kl": 0.106689453125, "kimi_kl": 0.22119140625, "learning_rate": 1.0784313725490195e-07, "loss": 0.0043, "ppl": 0.51953125, "reward": 0.9507614970207214, "reward_std": 0.031488736160099506, "rewards/single_object_detection_bbox_reward": 0.9507615566253662, "step": 637, "temperature": 0.9 }, { "advantages": -4.331448508310132e-06, "completion_length": 160.34375, "delta_ref_entropy_loss": 0.2275390625, "delta_ref_ppl": -0.18701171875, "entropy_loss": -0.5029296875, "epoch": 0.8929321203638908, "grad_norm": 3.1053238621214754, "k1_kl": 0.18701171875, "k3_kl": 0.0830078125, "kimi_kl": 0.154296875, "learning_rate": 1.0644257703081232e-07, "loss": 0.0033, "ppl": 0.443359375, "reward": 0.9856216907501221, "reward_std": 0.012891150312498212, "rewards/single_object_detection_bbox_reward": 0.9856217205524445, "step": 638, "temperature": 0.9 }, { "advantages": 3.8246760425408866e-06, "completion_length": 101.03125, "delta_ref_entropy_loss": 0.27978515625, "delta_ref_ppl": -0.236328125, "entropy_loss": -0.51953125, "epoch": 0.894331700489853, "grad_norm": 2.828552518949469, "k1_kl": 0.236328125, "k3_kl": 0.115966796875, "kimi_kl": 0.21337890625, "learning_rate": 1.0504201680672269e-07, "loss": 0.0046, "ppl": 0.4658203125, "reward": 0.9780307412147522, "reward_std": 0.018437882885336876, "rewards/single_object_detection_bbox_reward": 0.9780308306217194, "step": 639, "temperature": 0.9 }, { "advantages": 8.02055052417927e-06, "completion_length": 47.4375, "delta_ref_entropy_loss": 0.24951171875, "delta_ref_ppl": -0.26904296875, "entropy_loss": -0.5712890625, "epoch": 0.8957312806158153, "grad_norm": 3.373242518885796, "k1_kl": 0.27001953125, "k3_kl": 0.147705078125, "kimi_kl": 0.32373046875, "learning_rate": 1.0364145658263305e-07, "loss": 0.0059, "ppl": 0.5068359375, "reward": 0.8784225583076477, "reward_std": 0.022197901271283627, "rewards/single_object_detection_bbox_reward": 0.8784226477146149, "step": 640, "temperature": 0.9 }, { "advantages": 1.6090328244899865e-05, "completion_length": 196.5625, "delta_ref_entropy_loss": 0.25146484375, "delta_ref_ppl": -0.21337890625, "entropy_loss": -0.5107421875, "epoch": 0.8971308607417775, "grad_norm": 4.239942293534294, "k1_kl": 0.21337890625, "k3_kl": 0.097412109375, "kimi_kl": 0.16162109375, "learning_rate": 1.0224089635854341e-07, "loss": 0.0039, "ppl": 0.4609375, "reward": 0.9726508557796478, "reward_std": 0.016038164030760527, "rewards/single_object_detection_bbox_reward": 0.9726508855819702, "step": 641, "temperature": 0.9 }, { "advantages": -1.4855792869639117e-05, "completion_length": 59.875, "delta_ref_entropy_loss": 0.2353515625, "delta_ref_ppl": -0.21923828125, "entropy_loss": -0.498046875, "epoch": 0.8985304408677397, "grad_norm": 2.934833711369186, "k1_kl": 0.21826171875, "k3_kl": 0.097900390625, "kimi_kl": 0.17919921875, "learning_rate": 1.0084033613445378e-07, "loss": 0.0039, "ppl": 0.4375, "reward": 0.9835740327835083, "reward_std": 0.005171915981918573, "rewards/single_object_detection_bbox_reward": 0.9835741221904755, "step": 642, "temperature": 0.9 }, { "advantages": -4.153432882958441e-06, "completion_length": 67.78125, "delta_ref_entropy_loss": 0.2734375, "delta_ref_ppl": -0.2294921875, "entropy_loss": -0.4912109375, "epoch": 0.8999300209937019, "grad_norm": 2.2849298066928485, "k1_kl": 0.22900390625, "k3_kl": 0.103515625, "kimi_kl": 0.189697265625, "learning_rate": 9.943977591036413e-08, "loss": 0.0041, "ppl": 0.439453125, "reward": 0.9816772043704987, "reward_std": 0.001784885535016656, "rewards/single_object_detection_bbox_reward": 0.981677234172821, "step": 643, "temperature": 0.9 }, { "advantages": -1.0577296961855609e-05, "completion_length": 64.0625, "delta_ref_entropy_loss": 0.24853515625, "delta_ref_ppl": -0.2080078125, "entropy_loss": -0.5126953125, "epoch": 0.9013296011196641, "grad_norm": 3.5894477252714885, "k1_kl": 0.2080078125, "k3_kl": 0.0908203125, "kimi_kl": 0.15966796875, "learning_rate": 9.80392156862745e-08, "loss": 0.0036, "ppl": 0.458984375, "reward": 0.9378240406513214, "reward_std": 0.016320132737746462, "rewards/single_object_detection_bbox_reward": 0.9378240704536438, "step": 644, "temperature": 0.9 }, { "advantages": 4.641977852770651e-06, "completion_length": 48.5625, "delta_ref_entropy_loss": 0.25537109375, "delta_ref_ppl": -0.21435546875, "entropy_loss": -0.4921875, "epoch": 0.9027291812456263, "grad_norm": 2.898578298632838, "k1_kl": 0.21484375, "k3_kl": 0.098388671875, "kimi_kl": 0.1904296875, "learning_rate": 9.663865546218488e-08, "loss": 0.0039, "ppl": 0.4345703125, "reward": 0.9828623831272125, "reward_std": 0.009379560477100313, "rewards/single_object_detection_bbox_reward": 0.9828624427318573, "step": 645, "temperature": 0.9 }, { "advantages": -5.151012373971753e-06, "completion_length": 51.84375, "delta_ref_entropy_loss": 0.24853515625, "delta_ref_ppl": -0.19580078125, "entropy_loss": -0.51953125, "epoch": 0.9041287613715885, "grad_norm": 3.758329369518207, "k1_kl": 0.19580078125, "k3_kl": 0.084228515625, "kimi_kl": 0.1240234375, "learning_rate": 9.523809523809523e-08, "loss": 0.0034, "ppl": 0.4658203125, "reward": 0.9640673398971558, "reward_std": 0.008158909156918526, "rewards/single_object_detection_bbox_reward": 0.9640673696994781, "step": 646, "temperature": 0.9 }, { "advantages": -1.3291305549500976e-06, "completion_length": 66.125, "delta_ref_entropy_loss": 0.2548828125, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.4990234375, "epoch": 0.9055283414975507, "grad_norm": 3.536962582031652, "k1_kl": 0.20849609375, "k3_kl": 0.091552734375, "kimi_kl": 0.1748046875, "learning_rate": 9.38375350140056e-08, "loss": 0.0037, "ppl": 0.4365234375, "reward": 0.9682480692863464, "reward_std": 0.02745172753930092, "rewards/single_object_detection_bbox_reward": 0.9682481288909912, "step": 647, "temperature": 0.9 }, { "advantages": -2.3575762497785036e-07, "completion_length": 69.0, "delta_ref_entropy_loss": 0.23583984375, "delta_ref_ppl": -0.1650390625, "entropy_loss": -0.501953125, "epoch": 0.906927921623513, "grad_norm": 1.8404681853052045, "k1_kl": 0.1650390625, "k3_kl": 0.063720703125, "kimi_kl": 0.09033203125, "learning_rate": 9.243697478991596e-08, "loss": 0.0025, "ppl": 0.4482421875, "reward": 0.9943543672561646, "reward_std": 0.0019288809271529317, "rewards/single_object_detection_bbox_reward": 0.9943543970584869, "step": 648, "temperature": 0.9 }, { "advantages": -1.446995929654804e-05, "completion_length": 60.1875, "delta_ref_entropy_loss": 0.2421875, "delta_ref_ppl": -0.21875, "entropy_loss": -0.53515625, "epoch": 0.9083275017494752, "grad_norm": 2.324966501784279, "k1_kl": 0.21826171875, "k3_kl": 0.102294921875, "kimi_kl": 0.18017578125, "learning_rate": 9.103641456582632e-08, "loss": 0.0041, "ppl": 0.47265625, "reward": 0.9684039354324341, "reward_std": 0.024052956141531467, "rewards/single_object_detection_bbox_reward": 0.9684039950370789, "step": 649, "temperature": 0.9 }, { "advantages": -1.941142272698926e-06, "completion_length": 32.53125, "delta_ref_entropy_loss": 0.24072265625, "delta_ref_ppl": -0.22021484375, "entropy_loss": -0.587890625, "epoch": 0.9097270818754374, "grad_norm": 2.8654718988709376, "k1_kl": 0.2197265625, "k3_kl": 0.108154296875, "kimi_kl": 0.1943359375, "learning_rate": 8.96358543417367e-08, "loss": 0.0043, "ppl": 0.52734375, "reward": 0.8845174610614777, "reward_std": 0.03726256871595979, "rewards/single_object_detection_bbox_reward": 0.8845175206661224, "step": 650, "temperature": 0.9 }, { "advantages": 1.7793849451663846e-05, "completion_length": 89.84375, "delta_ref_entropy_loss": 0.23779296875, "delta_ref_ppl": -0.2080078125, "entropy_loss": -0.486328125, "epoch": 0.9111266620013996, "grad_norm": 3.1154664708196007, "k1_kl": 0.20703125, "k3_kl": 0.0966796875, "kimi_kl": 0.19189453125, "learning_rate": 8.823529411764706e-08, "loss": 0.0039, "ppl": 0.4248046875, "reward": 0.9524444937705994, "reward_std": 0.021070044487714767, "rewards/single_object_detection_bbox_reward": 0.9524445831775665, "step": 651, "temperature": 0.9 }, { "advantages": -1.202417365675501e-05, "completion_length": 102.59375, "delta_ref_entropy_loss": 0.2373046875, "delta_ref_ppl": -0.19580078125, "entropy_loss": -0.494140625, "epoch": 0.9125262421273618, "grad_norm": 3.2878478493096255, "k1_kl": 0.1962890625, "k3_kl": 0.090087890625, "kimi_kl": 0.14404296875, "learning_rate": 8.683473389355742e-08, "loss": 0.0036, "ppl": 0.4345703125, "reward": 0.9435420632362366, "reward_std": 0.028482663445174694, "rewards/single_object_detection_bbox_reward": 0.943542093038559, "step": 652, "temperature": 0.9 }, { "advantages": -3.5278499126434326e-06, "completion_length": 78.5625, "delta_ref_entropy_loss": 0.2275390625, "delta_ref_ppl": -0.20556640625, "entropy_loss": -0.48828125, "epoch": 0.913925822253324, "grad_norm": 2.359247909541508, "k1_kl": 0.205078125, "k3_kl": 0.099609375, "kimi_kl": 0.19287109375, "learning_rate": 8.543417366946779e-08, "loss": 0.004, "ppl": 0.4365234375, "reward": 0.9939088821411133, "reward_std": 0.005418892775196582, "rewards/single_object_detection_bbox_reward": 0.9939089119434357, "step": 653, "temperature": 0.9 }, { "advantages": -1.880473462279042e-06, "completion_length": 30.1875, "delta_ref_entropy_loss": 0.232421875, "delta_ref_ppl": -0.20263671875, "entropy_loss": -0.521484375, "epoch": 0.9153254023792862, "grad_norm": 7.715517248486637, "k1_kl": 0.20263671875, "k3_kl": 0.09814453125, "kimi_kl": 0.16650390625, "learning_rate": 8.403361344537815e-08, "loss": 0.0039, "ppl": 0.462890625, "reward": 0.9581893086433411, "reward_std": 0.04257629066705704, "rewards/single_object_detection_bbox_reward": 0.9581893384456635, "step": 654, "temperature": 0.9 }, { "advantages": -2.2149511664792954e-06, "completion_length": 68.5, "delta_ref_entropy_loss": 0.25537109375, "delta_ref_ppl": -0.20166015625, "entropy_loss": -0.4853515625, "epoch": 0.9167249825052485, "grad_norm": 2.095845919715165, "k1_kl": 0.20166015625, "k3_kl": 0.092529296875, "kimi_kl": 0.1611328125, "learning_rate": 8.263305322128851e-08, "loss": 0.0037, "ppl": 0.4306640625, "reward": 0.9908121824264526, "reward_std": 0.01083851302973926, "rewards/single_object_detection_bbox_reward": 0.9908121824264526, "step": 655, "temperature": 0.9 }, { "advantages": -1.1833119060611352e-05, "completion_length": 80.0, "delta_ref_entropy_loss": 0.26123046875, "delta_ref_ppl": -0.24658203125, "entropy_loss": -0.5107421875, "epoch": 0.9181245626312107, "grad_norm": 4.731161795507954, "k1_kl": 0.2470703125, "k3_kl": 0.1181640625, "kimi_kl": 0.240234375, "learning_rate": 8.123249299719887e-08, "loss": 0.0047, "ppl": 0.451171875, "reward": 0.8914787471294403, "reward_std": 0.012521081138402224, "rewards/single_object_detection_bbox_reward": 0.8914787769317627, "step": 656, "temperature": 0.9 }, { "advantages": -9.298458394368936e-06, "completion_length": 108.0, "delta_ref_entropy_loss": 0.25048828125, "delta_ref_ppl": -0.24267578125, "entropy_loss": -0.5029296875, "epoch": 0.9195241427571729, "grad_norm": 2.090354292589232, "k1_kl": 0.2431640625, "k3_kl": 0.116943359375, "kimi_kl": 0.234130859375, "learning_rate": 7.983193277310923e-08, "loss": 0.0047, "ppl": 0.44140625, "reward": 0.9449507892131805, "reward_std": 0.016058937646448612, "rewards/single_object_detection_bbox_reward": 0.9449508190155029, "step": 657, "temperature": 0.9 }, { "advantages": 8.114746378851123e-06, "completion_length": 42.5, "delta_ref_entropy_loss": 0.25048828125, "delta_ref_ppl": -0.22802734375, "entropy_loss": -0.5009765625, "epoch": 0.9209237228831351, "grad_norm": 5.1480057460480095, "k1_kl": 0.228515625, "k3_kl": 0.1083984375, "kimi_kl": 0.208984375, "learning_rate": 7.843137254901961e-08, "loss": 0.0043, "ppl": 0.4443359375, "reward": 0.930351048707962, "reward_std": 0.016970000928267837, "rewards/single_object_detection_bbox_reward": 0.9303511083126068, "step": 658, "temperature": 0.9 }, { "advantages": -1.1976809048519499e-05, "completion_length": 108.5, "delta_ref_entropy_loss": 0.21484375, "delta_ref_ppl": -0.1904296875, "entropy_loss": -0.525390625, "epoch": 0.9223233030090973, "grad_norm": 2.202399248561852, "k1_kl": 0.1904296875, "k3_kl": 0.089599609375, "kimi_kl": 0.16796875, "learning_rate": 7.703081232492997e-08, "loss": 0.0036, "ppl": 0.4638671875, "reward": 0.9661420285701752, "reward_std": 0.01763994013890624, "rewards/single_object_detection_bbox_reward": 0.9661421775817871, "step": 659, "temperature": 0.9 }, { "advantages": 2.5926425223588012e-05, "completion_length": 42.5, "delta_ref_entropy_loss": 0.2421875, "delta_ref_ppl": -0.22412109375, "entropy_loss": -0.4794921875, "epoch": 0.9237228831350595, "grad_norm": 2.798752830229502, "k1_kl": 0.224609375, "k3_kl": 0.1025390625, "kimi_kl": 0.1904296875, "learning_rate": 7.563025210084033e-08, "loss": 0.0041, "ppl": 0.41796875, "reward": 0.9483462572097778, "reward_std": 0.017956340219825506, "rewards/single_object_detection_bbox_reward": 0.9483463168144226, "step": 660, "temperature": 0.9 }, { "advantages": 1.2897487522423035e-06, "completion_length": 21.6875, "delta_ref_entropy_loss": 0.23583984375, "delta_ref_ppl": -0.2578125, "entropy_loss": -0.517578125, "epoch": 0.9251224632610217, "grad_norm": 4.024354949267309, "k1_kl": 0.25732421875, "k3_kl": 0.135009765625, "kimi_kl": 0.275390625, "learning_rate": 7.42296918767507e-08, "loss": 0.0054, "ppl": 0.4638671875, "reward": 0.9084001183509827, "reward_std": 0.025626478251069784, "rewards/single_object_detection_bbox_reward": 0.9084002077579498, "step": 661, "temperature": 0.9 }, { "advantages": 4.550176981865661e-07, "completion_length": 29.5, "delta_ref_entropy_loss": 0.251953125, "delta_ref_ppl": -0.2001953125, "entropy_loss": -0.4794921875, "epoch": 0.9265220433869839, "grad_norm": 1.9718585192900782, "k1_kl": 0.2001953125, "k3_kl": 0.0811767578125, "kimi_kl": 0.131103515625, "learning_rate": 7.282913165266106e-08, "loss": 0.0032, "ppl": 0.4248046875, "reward": 0.9557444453239441, "reward_std": 0.014413381577469409, "rewards/single_object_detection_bbox_reward": 0.955744594335556, "step": 662, "temperature": 0.9 }, { "advantages": 2.70961641035683e-06, "completion_length": 93.84375, "delta_ref_entropy_loss": 0.2548828125, "delta_ref_ppl": -0.24755859375, "entropy_loss": -0.4970703125, "epoch": 0.9279216235129462, "grad_norm": 4.815175050639766, "k1_kl": 0.248046875, "k3_kl": 0.123291015625, "kimi_kl": 0.2763671875, "learning_rate": 7.142857142857142e-08, "loss": 0.0049, "ppl": 0.4365234375, "reward": 0.9333947002887726, "reward_std": 0.039073542691767216, "rewards/single_object_detection_bbox_reward": 0.9333947896957397, "step": 663, "temperature": 0.9 }, { "advantages": -7.056764843582641e-06, "completion_length": 109.625, "delta_ref_entropy_loss": 0.21630859375, "delta_ref_ppl": -0.17724609375, "entropy_loss": -0.521484375, "epoch": 0.9293212036389084, "grad_norm": 3.769784795236152, "k1_kl": 0.17822265625, "k3_kl": 0.078369140625, "kimi_kl": 0.130126953125, "learning_rate": 7.002801120448178e-08, "loss": 0.0031, "ppl": 0.4580078125, "reward": 0.9765639901161194, "reward_std": 0.02352647390216589, "rewards/single_object_detection_bbox_reward": 0.9765641093254089, "step": 664, "temperature": 0.9 }, { "advantages": 1.4568558981409296e-07, "completion_length": 132.875, "delta_ref_entropy_loss": 0.26318359375, "delta_ref_ppl": -0.2294921875, "entropy_loss": -0.4912109375, "epoch": 0.9307207837648706, "grad_norm": 3.015032992020425, "k1_kl": 0.2294921875, "k3_kl": 0.106201171875, "kimi_kl": 0.2041015625, "learning_rate": 6.862745098039216e-08, "loss": 0.0042, "ppl": 0.4384765625, "reward": 0.9420503675937653, "reward_std": 0.023565257899463177, "rewards/single_object_detection_bbox_reward": 0.9420505464076996, "step": 665, "temperature": 0.9 }, { "advantages": -7.160141251461027e-06, "completion_length": 77.5, "delta_ref_entropy_loss": 0.24462890625, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.55859375, "epoch": 0.9321203638908327, "grad_norm": 2.277352725574313, "k1_kl": 0.20947265625, "k3_kl": 0.090087890625, "kimi_kl": 0.15576171875, "learning_rate": 6.722689075630252e-08, "loss": 0.0036, "ppl": 0.4921875, "reward": 0.9017703533172607, "reward_std": 0.030246201902627945, "rewards/single_object_detection_bbox_reward": 0.9017704129219055, "step": 666, "temperature": 0.9 }, { "advantages": -3.060059839299356e-06, "completion_length": 184.59375, "delta_ref_entropy_loss": 0.24951171875, "delta_ref_ppl": -0.22314453125, "entropy_loss": -0.50390625, "epoch": 0.933519944016795, "grad_norm": 2.338238705655818, "k1_kl": 0.2236328125, "k3_kl": 0.11376953125, "kimi_kl": 0.20556640625, "learning_rate": 6.582633053221288e-08, "loss": 0.0046, "ppl": 0.4453125, "reward": 0.9578758478164673, "reward_std": 0.015592347364872694, "rewards/single_object_detection_bbox_reward": 0.9578759074211121, "step": 667, "temperature": 0.9 }, { "advantages": 7.629394417563162e-06, "completion_length": 88.1875, "delta_ref_entropy_loss": 0.275390625, "delta_ref_ppl": -0.21875, "entropy_loss": -0.4873046875, "epoch": 0.9349195241427571, "grad_norm": 2.5989509129222887, "k1_kl": 0.21875, "k3_kl": 0.10107421875, "kimi_kl": 0.181640625, "learning_rate": 6.442577030812324e-08, "loss": 0.004, "ppl": 0.435546875, "reward": 0.963430792093277, "reward_std": 0.006758018833352253, "rewards/single_object_detection_bbox_reward": 0.9634308516979218, "step": 668, "temperature": 0.9 }, { "advantages": -1.7125825252151117e-05, "completion_length": 66.84375, "delta_ref_entropy_loss": 0.2490234375, "delta_ref_ppl": -0.2333984375, "entropy_loss": -0.544921875, "epoch": 0.9363191042687193, "grad_norm": 2.679490187381738, "k1_kl": 0.23388671875, "k3_kl": 0.1181640625, "kimi_kl": 0.23388671875, "learning_rate": 6.302521008403361e-08, "loss": 0.0047, "ppl": 0.484375, "reward": 0.9693574011325836, "reward_std": 0.029099813662469387, "rewards/single_object_detection_bbox_reward": 0.9693574607372284, "step": 669, "temperature": 0.9 }, { "advantages": 8.70839866706774e-06, "completion_length": 82.90625, "delta_ref_entropy_loss": 0.24853515625, "delta_ref_ppl": -0.21044921875, "entropy_loss": -0.5390625, "epoch": 0.9377186843946816, "grad_norm": 3.6005623060812644, "k1_kl": 0.20947265625, "k3_kl": 0.09765625, "kimi_kl": 0.180908203125, "learning_rate": 6.162464985994397e-08, "loss": 0.0039, "ppl": 0.4775390625, "reward": 0.9369010329246521, "reward_std": 0.027223931858316064, "rewards/single_object_detection_bbox_reward": 0.9369011521339417, "step": 670, "temperature": 0.9 }, { "advantages": -7.635514748471905e-06, "completion_length": 144.75, "delta_ref_entropy_loss": 0.23095703125, "delta_ref_ppl": -0.208984375, "entropy_loss": -0.513671875, "epoch": 0.9391182645206438, "grad_norm": 8.109196504547498, "k1_kl": 0.20947265625, "k3_kl": 0.096923828125, "kimi_kl": 0.17919921875, "learning_rate": 6.022408963585433e-08, "loss": 0.0039, "ppl": 0.4580078125, "reward": 0.9437495470046997, "reward_std": 0.040582881309092045, "rewards/single_object_detection_bbox_reward": 0.9437495768070221, "step": 671, "temperature": 0.9 }, { "advantages": -1.8066062239086023e-05, "completion_length": 106.40625, "delta_ref_entropy_loss": 0.2333984375, "delta_ref_ppl": -0.19873046875, "entropy_loss": -0.544921875, "epoch": 0.940517844646606, "grad_norm": 2.0147621388944867, "k1_kl": 0.19921875, "k3_kl": 0.082275390625, "kimi_kl": 0.138916015625, "learning_rate": 5.88235294117647e-08, "loss": 0.0033, "ppl": 0.4765625, "reward": 0.9516896307468414, "reward_std": 0.01665685698390007, "rewards/single_object_detection_bbox_reward": 0.9516896903514862, "step": 672, "temperature": 0.9 }, { "advantages": 3.4733011489151977e-06, "completion_length": 42.0, "delta_ref_entropy_loss": 0.2490234375, "delta_ref_ppl": -0.19921875, "entropy_loss": -0.4970703125, "epoch": 0.9419174247725682, "grad_norm": 2.212199332544914, "k1_kl": 0.19921875, "k3_kl": 0.082763671875, "kimi_kl": 0.13623046875, "learning_rate": 5.742296918767506e-08, "loss": 0.0033, "ppl": 0.4423828125, "reward": 0.9723545908927917, "reward_std": 0.014758489094674587, "rewards/single_object_detection_bbox_reward": 0.9723546504974365, "step": 673, "temperature": 0.9 }, { "advantages": -2.3420107027050108e-06, "completion_length": 88.5625, "delta_ref_entropy_loss": 0.25634765625, "delta_ref_ppl": -0.2236328125, "entropy_loss": -0.5009765625, "epoch": 0.9433170048985304, "grad_norm": 3.2057585658279435, "k1_kl": 0.22314453125, "k3_kl": 0.100341796875, "kimi_kl": 0.18310546875, "learning_rate": 5.6022408963585437e-08, "loss": 0.004, "ppl": 0.4375, "reward": 0.9543332755565643, "reward_std": 0.019772722385823727, "rewards/single_object_detection_bbox_reward": 0.9543333351612091, "step": 674, "temperature": 0.9 }, { "advantages": 3.06059155263938e-06, "completion_length": 77.0, "delta_ref_entropy_loss": 0.248046875, "delta_ref_ppl": -0.1962890625, "entropy_loss": -0.494140625, "epoch": 0.9447165850244926, "grad_norm": 3.5674026705901443, "k1_kl": 0.19677734375, "k3_kl": 0.07958984375, "kimi_kl": 0.1318359375, "learning_rate": 5.46218487394958e-08, "loss": 0.0032, "ppl": 0.435546875, "reward": 0.9887556731700897, "reward_std": 0.010000569716794416, "rewards/single_object_detection_bbox_reward": 0.9887557327747345, "step": 675, "temperature": 0.9 }, { "advantages": 1.0518623184907483e-05, "completion_length": 62.78125, "delta_ref_entropy_loss": 0.23486328125, "delta_ref_ppl": -0.19921875, "entropy_loss": -0.501953125, "epoch": 0.9461161651504548, "grad_norm": 3.085186176537179, "k1_kl": 0.19921875, "k3_kl": 0.08544921875, "kimi_kl": 0.14501953125, "learning_rate": 5.322128851540616e-08, "loss": 0.0034, "ppl": 0.439453125, "reward": 0.9825602769851685, "reward_std": 0.021125817380379885, "rewards/single_object_detection_bbox_reward": 0.9825603663921356, "step": 676, "temperature": 0.9 }, { "advantages": -1.0874122494897165e-05, "completion_length": 114.6875, "delta_ref_entropy_loss": 0.26611328125, "delta_ref_ppl": -0.19580078125, "entropy_loss": -0.5068359375, "epoch": 0.947515745276417, "grad_norm": 4.174050317105251, "k1_kl": 0.19580078125, "k3_kl": 0.08251953125, "kimi_kl": 0.133056640625, "learning_rate": 5.1820728291316525e-08, "loss": 0.0033, "ppl": 0.4462890625, "reward": 0.9507706165313721, "reward_std": 0.016026973724365234, "rewards/single_object_detection_bbox_reward": 0.950770765542984, "step": 677, "temperature": 0.9 }, { "advantages": 1.4508410004054895e-05, "completion_length": 75.59375, "delta_ref_entropy_loss": 0.234375, "delta_ref_ppl": -0.20654296875, "entropy_loss": -0.5546875, "epoch": 0.9489153254023793, "grad_norm": 2.7310399281770543, "k1_kl": 0.20703125, "k3_kl": 0.097900390625, "kimi_kl": 0.17529296875, "learning_rate": 5.042016806722689e-08, "loss": 0.0039, "ppl": 0.4853515625, "reward": 0.9281339049339294, "reward_std": 0.04499443620443344, "rewards/single_object_detection_bbox_reward": 0.9281339347362518, "step": 678, "temperature": 0.9 }, { "advantages": 7.69804671563179e-07, "completion_length": 54.28125, "delta_ref_entropy_loss": 0.248046875, "delta_ref_ppl": -0.212890625, "entropy_loss": -0.521484375, "epoch": 0.9503149055283415, "grad_norm": 3.954678142339731, "k1_kl": 0.21337890625, "k3_kl": 0.10205078125, "kimi_kl": 0.1845703125, "learning_rate": 4.901960784313725e-08, "loss": 0.0041, "ppl": 0.4658203125, "reward": 0.9706000685691833, "reward_std": 0.03335990197956562, "rewards/single_object_detection_bbox_reward": 0.9706001579761505, "step": 679, "temperature": 0.9 }, { "advantages": 2.416502684354782e-05, "completion_length": 69.875, "delta_ref_entropy_loss": 0.25927734375, "delta_ref_ppl": -0.19287109375, "entropy_loss": -0.4892578125, "epoch": 0.9517144856543037, "grad_norm": 1.857932485067777, "k1_kl": 0.19287109375, "k3_kl": 0.077880859375, "kimi_kl": 0.12158203125, "learning_rate": 4.7619047619047613e-08, "loss": 0.0031, "ppl": 0.43359375, "reward": 0.9634624123573303, "reward_std": 0.007469208387192339, "rewards/single_object_detection_bbox_reward": 0.9634624719619751, "step": 680, "temperature": 0.9 }, { "advantages": -8.019751930987695e-06, "completion_length": 136.0, "delta_ref_entropy_loss": 0.20849609375, "delta_ref_ppl": -0.169921875, "entropy_loss": -0.5087890625, "epoch": 0.9531140657802659, "grad_norm": 2.2767011756580273, "k1_kl": 0.17041015625, "k3_kl": 0.07373046875, "kimi_kl": 0.12841796875, "learning_rate": 4.621848739495798e-08, "loss": 0.003, "ppl": 0.451171875, "reward": 0.9238368272781372, "reward_std": 0.023540217895060778, "rewards/single_object_detection_bbox_reward": 0.923836886882782, "step": 681, "temperature": 0.9 }, { "advantages": 1.4946397186577087e-06, "completion_length": 41.5, "delta_ref_entropy_loss": 0.2978515625, "delta_ref_ppl": -0.24658203125, "entropy_loss": -0.4541015625, "epoch": 0.9545136459062281, "grad_norm": 5.487173809433768, "k1_kl": 0.24658203125, "k3_kl": 0.115478515625, "kimi_kl": 0.21337890625, "learning_rate": 4.481792717086835e-08, "loss": 0.0046, "ppl": 0.4072265625, "reward": 0.993513822555542, "reward_std": 0.003935830638511106, "rewards/single_object_detection_bbox_reward": 0.9935138821601868, "step": 682, "temperature": 0.9 }, { "advantages": 3.676861524581909e-06, "completion_length": 69.5625, "delta_ref_entropy_loss": 0.2255859375, "delta_ref_ppl": -0.1806640625, "entropy_loss": -0.52734375, "epoch": 0.9559132260321903, "grad_norm": 2.228147463247711, "k1_kl": 0.1806640625, "k3_kl": 0.077880859375, "kimi_kl": 0.126953125, "learning_rate": 4.341736694677871e-08, "loss": 0.0031, "ppl": 0.4697265625, "reward": 0.9601840376853943, "reward_std": 0.018306163605302572, "rewards/single_object_detection_bbox_reward": 0.9601840972900391, "step": 683, "temperature": 0.9 }, { "advantages": 1.1400187304388965e-05, "completion_length": 70.4375, "delta_ref_entropy_loss": 0.2353515625, "delta_ref_ppl": -0.20654296875, "entropy_loss": -0.509765625, "epoch": 0.9573128061581525, "grad_norm": 2.238645744395073, "k1_kl": 0.20654296875, "k3_kl": 0.09375, "kimi_kl": 0.17578125, "learning_rate": 4.2016806722689076e-08, "loss": 0.0037, "ppl": 0.447265625, "reward": 0.9437940120697021, "reward_std": 0.022566587664186954, "rewards/single_object_detection_bbox_reward": 0.9437940716743469, "step": 684, "temperature": 0.9 }, { "advantages": -3.6628252928494476e-06, "completion_length": 69.25, "delta_ref_entropy_loss": 0.2431640625, "delta_ref_ppl": -0.1865234375, "entropy_loss": -0.4951171875, "epoch": 0.9587123862841148, "grad_norm": 3.3896135812210697, "k1_kl": 0.18603515625, "k3_kl": 0.072509765625, "kimi_kl": 0.106689453125, "learning_rate": 4.0616246498599436e-08, "loss": 0.0029, "ppl": 0.4384765625, "reward": 0.9924481511116028, "reward_std": 0.006534527521580458, "rewards/single_object_detection_bbox_reward": 0.99244824051857, "step": 685, "temperature": 0.9 }, { "advantages": -2.4705593887119903e-05, "completion_length": 89.0, "delta_ref_entropy_loss": 0.21923828125, "delta_ref_ppl": -0.16552734375, "entropy_loss": -0.568359375, "epoch": 0.960111966410077, "grad_norm": 4.0651693079942115, "k1_kl": 0.16552734375, "k3_kl": 0.06640625, "kimi_kl": 0.099609375, "learning_rate": 3.9215686274509804e-08, "loss": 0.0027, "ppl": 0.505859375, "reward": 0.9547190070152283, "reward_std": 0.016478178091347218, "rewards/single_object_detection_bbox_reward": 0.9547190368175507, "step": 686, "temperature": 0.9 }, { "advantages": -2.800221409415826e-06, "completion_length": 78.0625, "delta_ref_entropy_loss": 0.25537109375, "delta_ref_ppl": -0.20654296875, "entropy_loss": -0.4794921875, "epoch": 0.9615115465360392, "grad_norm": 2.9065259457024477, "k1_kl": 0.20654296875, "k3_kl": 0.085205078125, "kimi_kl": 0.1357421875, "learning_rate": 3.7815126050420164e-08, "loss": 0.0034, "ppl": 0.416015625, "reward": 0.9593593180179596, "reward_std": 0.02326110890135169, "rewards/single_object_detection_bbox_reward": 0.9593594074249268, "step": 687, "temperature": 0.9 }, { "advantages": -1.6825006241560914e-06, "completion_length": 50.0, "delta_ref_entropy_loss": 0.24072265625, "delta_ref_ppl": -0.216796875, "entropy_loss": -0.564453125, "epoch": 0.9629111266620014, "grad_norm": 3.1092650146540812, "k1_kl": 0.216796875, "k3_kl": 0.1025390625, "kimi_kl": 0.20068359375, "learning_rate": 3.641456582633053e-08, "loss": 0.0041, "ppl": 0.4970703125, "reward": 0.9359869360923767, "reward_std": 0.0416277339681983, "rewards/single_object_detection_bbox_reward": 0.9359869956970215, "step": 688, "temperature": 0.9 }, { "advantages": 1.56754913405166e-06, "completion_length": 76.96875, "delta_ref_entropy_loss": 0.20947265625, "delta_ref_ppl": -0.18212890625, "entropy_loss": -0.5263671875, "epoch": 0.9643107067879636, "grad_norm": 3.283990981739139, "k1_kl": 0.18212890625, "k3_kl": 0.083251953125, "kimi_kl": 0.13525390625, "learning_rate": 3.501400560224089e-08, "loss": 0.0033, "ppl": 0.466796875, "reward": 0.9235495328903198, "reward_std": 0.02693348517641425, "rewards/single_object_detection_bbox_reward": 0.923549622297287, "step": 689, "temperature": 0.9 }, { "advantages": -2.303081055288203e-05, "completion_length": 69.0, "delta_ref_entropy_loss": 0.232421875, "delta_ref_ppl": -0.20361328125, "entropy_loss": -0.533203125, "epoch": 0.9657102869139258, "grad_norm": 3.0751124673635335, "k1_kl": 0.2041015625, "k3_kl": 0.093994140625, "kimi_kl": 0.174072265625, "learning_rate": 3.361344537815126e-08, "loss": 0.0038, "ppl": 0.4716796875, "reward": 0.9490193426609039, "reward_std": 0.013575292658060789, "rewards/single_object_detection_bbox_reward": 0.9490193724632263, "step": 690, "temperature": 0.9 }, { "advantages": -1.20286949822912e-06, "completion_length": 60.0625, "delta_ref_entropy_loss": 0.255859375, "delta_ref_ppl": -0.22802734375, "entropy_loss": -0.4677734375, "epoch": 0.967109867039888, "grad_norm": 3.4378439256851503, "k1_kl": 0.228515625, "k3_kl": 0.110107421875, "kimi_kl": 0.23486328125, "learning_rate": 3.221288515406162e-08, "loss": 0.0044, "ppl": 0.4169921875, "reward": 0.9888195693492889, "reward_std": 0.009374092653160915, "rewards/single_object_detection_bbox_reward": 0.9888196289539337, "step": 691, "temperature": 0.9 }, { "advantages": -2.1944621408920284e-06, "completion_length": 88.71875, "delta_ref_entropy_loss": 0.2490234375, "delta_ref_ppl": -0.224609375, "entropy_loss": -0.4873046875, "epoch": 0.9685094471658502, "grad_norm": 3.895150350124829, "k1_kl": 0.22412109375, "k3_kl": 0.10205078125, "kimi_kl": 0.18359375, "learning_rate": 3.081232492997199e-08, "loss": 0.0041, "ppl": 0.431640625, "reward": 0.9649063050746918, "reward_std": 0.008203716803109273, "rewards/single_object_detection_bbox_reward": 0.9649063646793365, "step": 692, "temperature": 0.9 }, { "advantages": 5.428280474006897e-06, "completion_length": 167.5625, "delta_ref_entropy_loss": 0.23095703125, "delta_ref_ppl": -0.16943359375, "entropy_loss": -0.5224609375, "epoch": 0.9699090272918125, "grad_norm": 2.3944422947272854, "k1_kl": 0.169921875, "k3_kl": 0.0714111328125, "kimi_kl": 0.126708984375, "learning_rate": 2.941176470588235e-08, "loss": 0.0029, "ppl": 0.458984375, "reward": 0.9182946383953094, "reward_std": 0.030334169045090675, "rewards/single_object_detection_bbox_reward": 0.9182946681976318, "step": 693, "temperature": 0.9 }, { "advantages": -9.243110980605707e-06, "completion_length": 66.21875, "delta_ref_entropy_loss": 0.248046875, "delta_ref_ppl": -0.22412109375, "entropy_loss": -0.515625, "epoch": 0.9713086074177747, "grad_norm": 6.602390777215559, "k1_kl": 0.22412109375, "k3_kl": 0.10791015625, "kimi_kl": 0.2001953125, "learning_rate": 2.8011204481792718e-08, "loss": 0.0043, "ppl": 0.4658203125, "reward": 0.9484541416168213, "reward_std": 0.02340947650372982, "rewards/single_object_detection_bbox_reward": 0.9484542906284332, "step": 694, "temperature": 0.9 }, { "advantages": -5.169905080038006e-06, "completion_length": 88.40625, "delta_ref_entropy_loss": 0.2685546875, "delta_ref_ppl": -0.25048828125, "entropy_loss": -0.5078125, "epoch": 0.9727081875437369, "grad_norm": 3.8654036636636806, "k1_kl": 0.25, "k3_kl": 0.119140625, "kimi_kl": 0.23974609375, "learning_rate": 2.661064425770308e-08, "loss": 0.0048, "ppl": 0.4482421875, "reward": 0.9369454383850098, "reward_std": 0.013432509964331985, "rewards/single_object_detection_bbox_reward": 0.9369455277919769, "step": 695, "temperature": 0.9 }, { "advantages": 1.298796178161865e-06, "completion_length": 40.0, "delta_ref_entropy_loss": 0.255859375, "delta_ref_ppl": -0.25244140625, "entropy_loss": -0.4873046875, "epoch": 0.9741077676696991, "grad_norm": 2.781139291119691, "k1_kl": 0.25244140625, "k3_kl": 0.13623046875, "kimi_kl": 0.27197265625, "learning_rate": 2.5210084033613446e-08, "loss": 0.0054, "ppl": 0.4384765625, "reward": 0.8855583667755127, "reward_std": 0.02321896585635841, "rewards/single_object_detection_bbox_reward": 0.8855584263801575, "step": 696, "temperature": 0.9 }, { "advantages": 2.4825071704981383e-06, "completion_length": 66.65625, "delta_ref_entropy_loss": 0.22314453125, "delta_ref_ppl": -0.19482421875, "entropy_loss": -0.4697265625, "epoch": 0.9755073477956613, "grad_norm": 2.8354325115858896, "k1_kl": 0.1943359375, "k3_kl": 0.091796875, "kimi_kl": 0.17431640625, "learning_rate": 2.3809523809523807e-08, "loss": 0.0037, "ppl": 0.4140625, "reward": 0.9851730167865753, "reward_std": 0.013298890611622483, "rewards/single_object_detection_bbox_reward": 0.9851730763912201, "step": 697, "temperature": 0.9 }, { "advantages": 1.0939448138458374e-05, "completion_length": 42.0, "delta_ref_entropy_loss": 0.23095703125, "delta_ref_ppl": -0.19873046875, "entropy_loss": -0.53125, "epoch": 0.9769069279216235, "grad_norm": 2.9944920133903166, "k1_kl": 0.19873046875, "k3_kl": 0.0859375, "kimi_kl": 0.149658203125, "learning_rate": 2.2408963585434174e-08, "loss": 0.0034, "ppl": 0.4716796875, "reward": 0.9790703356266022, "reward_std": 0.013643525307998061, "rewards/single_object_detection_bbox_reward": 0.979070395231247, "step": 698, "temperature": 0.9 }, { "advantages": -8.467054612992797e-07, "completion_length": 100.4375, "delta_ref_entropy_loss": 0.24853515625, "delta_ref_ppl": -0.2197265625, "entropy_loss": -0.51171875, "epoch": 0.9783065080475857, "grad_norm": 12.17333659337229, "k1_kl": 0.2197265625, "k3_kl": 0.099609375, "kimi_kl": 0.1748046875, "learning_rate": 2.1008403361344538e-08, "loss": 0.004, "ppl": 0.451171875, "reward": 0.9500268995761871, "reward_std": 0.03471473138779402, "rewards/single_object_detection_bbox_reward": 0.9500269591808319, "step": 699, "temperature": 0.9 }, { "advantages": -1.0587809811113402e-06, "completion_length": 77.625, "delta_ref_entropy_loss": 0.25390625, "delta_ref_ppl": -0.2314453125, "entropy_loss": -0.53515625, "epoch": 0.979706088173548, "grad_norm": 3.4548462510291973, "k1_kl": 0.2314453125, "k3_kl": 0.100341796875, "kimi_kl": 0.17529296875, "learning_rate": 1.9607843137254902e-08, "loss": 0.004, "ppl": 0.4697265625, "reward": 0.9572602212429047, "reward_std": 0.018771857488900423, "rewards/single_object_detection_bbox_reward": 0.9572602808475494, "step": 700, "temperature": 0.9 } ], "logging_steps": 1.0, "max_steps": 714, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }