{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990375360923965, "eval_steps": 500, "global_step": 1038, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 2465.2222900390625, "epoch": 0.0009624639076034649, "grad_norm": 0.11130716651678085, "kl": 0.0, "learning_rate": 1.923076923076923e-08, "loss": -0.013, "reward": 0.3222222179174423, "reward_std": 0.06036265566945076, "rewards/accuracy_multibox_reward": 0.3222222179174423, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2297.999969482422, "epoch": 0.0019249278152069298, "grad_norm": 0.20857006311416626, "kl": 0.0, "learning_rate": 3.846153846153846e-08, "loss": 0.0435, "reward": 0.4194444417953491, "reward_std": 0.2237897887825966, "rewards/accuracy_multibox_reward": 0.4194444417953491, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 2270.1250610351562, "epoch": 0.0028873917228103944, "grad_norm": 0.1847522258758545, "kl": 0.00010526180267333984, "learning_rate": 5.7692307692307695e-08, "loss": 0.0259, "reward": 0.4361111018806696, "reward_std": 0.2912161685526371, "rewards/accuracy_multibox_reward": 0.4361111018806696, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 2741.8055725097656, "epoch": 0.0038498556304138597, "grad_norm": 0.1558922678232193, "kl": 0.0001150369644165039, "learning_rate": 7.692307692307692e-08, "loss": -0.0067, "reward": 0.2361111119389534, "reward_std": 0.10240122582763433, "rewards/accuracy_multibox_reward": 0.2361111119389534, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 2746.4444580078125, "epoch": 0.004812319538017324, "grad_norm": 0.11506213992834091, "kl": 9.489059448242188e-05, "learning_rate": 9.615384615384616e-08, "loss": -0.0188, "reward": 0.34166666958481073, "reward_std": 0.17411227151751518, "rewards/accuracy_multibox_reward": 0.34166666958481073, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 2812.5695190429688, "epoch": 0.005774783445620789, "grad_norm": 0.11609864979982376, "kl": 0.00010800361633300781, "learning_rate": 1.1538461538461539e-07, "loss": -0.0048, "reward": 0.36666667833924294, "reward_std": 0.1676008477807045, "rewards/accuracy_multibox_reward": 0.36666667833924294, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 2724.4444274902344, "epoch": 0.006737247353224254, "grad_norm": 0.2222578376531601, "kl": 0.00012183189392089844, "learning_rate": 1.346153846153846e-07, "loss": -0.0114, "reward": 0.5444444641470909, "reward_std": 0.21295367553830147, "rewards/accuracy_multibox_reward": 0.5444444641470909, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 2656.986083984375, "epoch": 0.007699711260827719, "grad_norm": 0.22545209527015686, "kl": 0.00012922286987304688, "learning_rate": 1.5384615384615385e-07, "loss": 0.0031, "reward": 0.5888889133930206, "reward_std": 0.2788212336599827, "rewards/accuracy_multibox_reward": 0.5888889133930206, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 2722.4444885253906, "epoch": 0.008662175168431183, "grad_norm": 0.1569640040397644, "kl": 0.00011050701141357422, "learning_rate": 1.7307692307692305e-07, "loss": 0.0061, "reward": 0.4000000022351742, "reward_std": 0.19834503531455994, "rewards/accuracy_multibox_reward": 0.4000000022351742, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 2226.125, "epoch": 0.009624639076034648, "grad_norm": 0.11278218030929565, "kl": 0.00011169910430908203, "learning_rate": 1.9230769230769231e-07, "loss": 0.0546, "reward": 0.4611111208796501, "reward_std": 0.12696115672588348, "rewards/accuracy_multibox_reward": 0.4611111208796501, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 3053.9861450195312, "epoch": 0.010587102983638113, "grad_norm": 0.16099444031715393, "kl": 0.00013959407806396484, "learning_rate": 2.1153846153846152e-07, "loss": 0.0111, "reward": 0.39444445446133614, "reward_std": 0.20140227302908897, "rewards/accuracy_multibox_reward": 0.39444445446133614, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 2780.2916870117188, "epoch": 0.011549566891241578, "grad_norm": 0.18906927108764648, "kl": 0.00010120868682861328, "learning_rate": 2.3076923076923078e-07, "loss": 0.0438, "reward": 0.5805555805563927, "reward_std": 0.2288927398622036, "rewards/accuracy_multibox_reward": 0.5805555805563927, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 2806.02783203125, "epoch": 0.012512030798845043, "grad_norm": 0.09122008085250854, "kl": 0.00011861324310302734, "learning_rate": 2.5e-07, "loss": 0.0046, "reward": 0.3321428531780839, "reward_std": 0.10140847321599722, "rewards/accuracy_multibox_reward": 0.3321428531780839, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 2596.9166259765625, "epoch": 0.013474494706448507, "grad_norm": 0.13614681363105774, "kl": 8.928775787353516e-05, "learning_rate": 2.692307692307692e-07, "loss": 0.0144, "reward": 0.35833333618938923, "reward_std": 0.2205657339654863, "rewards/accuracy_multibox_reward": 0.35833333618938923, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 2618.5556030273438, "epoch": 0.014436958614051972, "grad_norm": 0.532120943069458, "kl": 0.00012755393981933594, "learning_rate": 2.884615384615384e-07, "loss": 0.0842, "reward": 0.22500000428408384, "reward_std": 0.19533459097146988, "rewards/accuracy_multibox_reward": 0.22500000428408384, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 2333.5000610351562, "epoch": 0.015399422521655439, "grad_norm": 0.14197005331516266, "kl": 0.00010943412780761719, "learning_rate": 3.076923076923077e-07, "loss": 0.0278, "reward": 0.34166666120290756, "reward_std": 0.21097959950566292, "rewards/accuracy_multibox_reward": 0.34166666120290756, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 2813.2500610351562, "epoch": 0.016361886429258902, "grad_norm": 0.4132319688796997, "kl": 0.0001150369644165039, "learning_rate": 3.269230769230769e-07, "loss": 0.0721, "reward": 0.5833333656191826, "reward_std": 0.3286023996770382, "rewards/accuracy_multibox_reward": 0.5833333656191826, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 2563.9722290039062, "epoch": 0.017324350336862367, "grad_norm": 0.20454616844654083, "kl": 0.00016307830810546875, "learning_rate": 3.461538461538461e-07, "loss": 0.0326, "reward": 0.4583333358168602, "reward_std": 0.334005338139832, "rewards/accuracy_multibox_reward": 0.4583333358168602, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 2966.0277709960938, "epoch": 0.01828681424446583, "grad_norm": 0.20834138989448547, "kl": 8.797645568847656e-05, "learning_rate": 3.6538461538461534e-07, "loss": -0.0056, "reward": 0.3888889029622078, "reward_std": 0.22609320655465126, "rewards/accuracy_multibox_reward": 0.3888889029622078, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 3015.8333740234375, "epoch": 0.019249278152069296, "grad_norm": 0.16505780816078186, "kl": 0.00015234947204589844, "learning_rate": 3.8461538461538463e-07, "loss": -0.0128, "reward": 0.42222224175930023, "reward_std": 0.16851050918921828, "rewards/accuracy_multibox_reward": 0.42222224175930023, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 3524.2916870117188, "epoch": 0.02021174205967276, "grad_norm": 0.21831244230270386, "kl": 0.000125885009765625, "learning_rate": 4.0384615384615386e-07, "loss": 0.0655, "reward": 0.3583333343267441, "reward_std": 0.2736743427813053, "rewards/accuracy_multibox_reward": 0.3583333343267441, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 2075.7500610351562, "epoch": 0.021174205967276226, "grad_norm": 0.784837543964386, "kl": 0.00013458728790283203, "learning_rate": 4.2307692307692304e-07, "loss": 0.1383, "reward": 0.4444444477558136, "reward_std": 0.2948262393474579, "rewards/accuracy_multibox_reward": 0.4444444477558136, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 3029.3750610351562, "epoch": 0.02213666987487969, "grad_norm": 0.2634040415287018, "kl": 9.167194366455078e-05, "learning_rate": 4.423076923076923e-07, "loss": 0.0279, "reward": 0.5000000149011612, "reward_std": 0.17780112847685814, "rewards/accuracy_multibox_reward": 0.5000000149011612, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 3167.9305419921875, "epoch": 0.023099133782483156, "grad_norm": 0.16189482808113098, "kl": 9.542703628540039e-05, "learning_rate": 4.6153846153846156e-07, "loss": -0.0311, "reward": 0.5125000327825546, "reward_std": 0.33778534457087517, "rewards/accuracy_multibox_reward": 0.5125000327825546, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 3359.3472900390625, "epoch": 0.02406159769008662, "grad_norm": 0.39204806089401245, "kl": 0.00014483928680419922, "learning_rate": 4.807692307692307e-07, "loss": 0.0731, "reward": 0.4750000163912773, "reward_std": 0.30297882854938507, "rewards/accuracy_multibox_reward": 0.4750000163912773, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 2899.263916015625, "epoch": 0.025024061597690085, "grad_norm": 0.39059585332870483, "kl": 9.298324584960938e-05, "learning_rate": 5e-07, "loss": 0.0194, "reward": 0.6879629790782928, "reward_std": 0.2638947209343314, "rewards/accuracy_multibox_reward": 0.6879629790782928, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 3168.791748046875, "epoch": 0.02598652550529355, "grad_norm": 0.34193095564842224, "kl": 9.202957153320312e-05, "learning_rate": 5.192307692307692e-07, "loss": 0.0562, "reward": 0.7972222715616226, "reward_std": 0.28743964433670044, "rewards/accuracy_multibox_reward": 0.7972222715616226, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 2829.6250915527344, "epoch": 0.026948989412897015, "grad_norm": 0.1456146091222763, "kl": 8.398294448852539e-05, "learning_rate": 5.384615384615384e-07, "loss": 0.0129, "reward": 0.5416666939854622, "reward_std": 0.11392283020541072, "rewards/accuracy_multibox_reward": 0.5416666939854622, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 2981.5695190429688, "epoch": 0.02791145332050048, "grad_norm": 0.14968328177928925, "kl": 9.739398956298828e-05, "learning_rate": 5.576923076923077e-07, "loss": -0.0153, "reward": 0.28888889495283365, "reward_std": 0.14623613748699427, "rewards/accuracy_multibox_reward": 0.28888889495283365, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 3410.8333129882812, "epoch": 0.028873917228103944, "grad_norm": 0.09670193493366241, "kl": 0.0001138448715209961, "learning_rate": 5.769230769230768e-07, "loss": 0.0409, "reward": 0.2444444615393877, "reward_std": 0.1291775479912758, "rewards/accuracy_multibox_reward": 0.2444444615393877, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 3719.9583129882812, "epoch": 0.029836381135707413, "grad_norm": 0.19139364361763, "kl": 0.00011837482452392578, "learning_rate": 5.961538461538461e-07, "loss": 0.0449, "reward": 0.37553420290350914, "reward_std": 0.3030422776937485, "rewards/accuracy_multibox_reward": 0.37553420290350914, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 3527.0833740234375, "epoch": 0.030798845043310877, "grad_norm": 0.12521016597747803, "kl": 0.00010192394256591797, "learning_rate": 6.153846153846154e-07, "loss": 0.0071, "reward": 0.3833333496004343, "reward_std": 0.2155737802386284, "rewards/accuracy_multibox_reward": 0.3833333496004343, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 2734.2361450195312, "epoch": 0.03176130895091434, "grad_norm": 0.24408525228500366, "kl": 0.00010687112808227539, "learning_rate": 6.346153846153845e-07, "loss": -0.0191, "reward": 0.6013889014720917, "reward_std": 0.2883816212415695, "rewards/accuracy_multibox_reward": 0.6013889014720917, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 3439.888916015625, "epoch": 0.032723772858517804, "grad_norm": 0.14492616057395935, "kl": 0.0001010894775390625, "learning_rate": 6.538461538461538e-07, "loss": -0.0203, "reward": 0.3166666943579912, "reward_std": 0.13116297498345375, "rewards/accuracy_multibox_reward": 0.3166666943579912, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 3109.875030517578, "epoch": 0.03368623676612127, "grad_norm": 0.15988919138908386, "kl": 0.00018680095672607422, "learning_rate": 6.730769230769231e-07, "loss": 0.0016, "reward": 0.4194444753229618, "reward_std": 0.09159003011882305, "rewards/accuracy_multibox_reward": 0.4194444753229618, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 2307.4305114746094, "epoch": 0.03464870067372473, "grad_norm": 0.1616358757019043, "kl": 0.00013339519500732422, "learning_rate": 6.923076923076922e-07, "loss": -0.0302, "reward": 0.6833333456888795, "reward_std": 0.22222047112882137, "rewards/accuracy_multibox_reward": 0.6833333456888795, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 3337.2777709960938, "epoch": 0.0356111645813282, "grad_norm": 0.20638230443000793, "kl": 0.00016510486602783203, "learning_rate": 7.115384615384616e-07, "loss": 0.016, "reward": 0.2833333471789956, "reward_std": 0.1835455223917961, "rewards/accuracy_multibox_reward": 0.2833333471789956, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 2937.6943969726562, "epoch": 0.03657362848893166, "grad_norm": 0.25408297777175903, "kl": 0.00022268295288085938, "learning_rate": 7.307692307692307e-07, "loss": 0.0317, "reward": 0.42500000819563866, "reward_std": 0.2801080383360386, "rewards/accuracy_multibox_reward": 0.42500000819563866, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 2954.5, "epoch": 0.03753609239653513, "grad_norm": 0.13980451226234436, "kl": 0.0002697110176086426, "learning_rate": 7.5e-07, "loss": -0.0057, "reward": 0.4583333432674408, "reward_std": 0.21448132768273354, "rewards/accuracy_multibox_reward": 0.4583333432674408, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 2871.6806030273438, "epoch": 0.03849855630413859, "grad_norm": 0.2506042718887329, "kl": 0.0002181529998779297, "learning_rate": 7.692307692307693e-07, "loss": 0.0454, "reward": 0.7472222447395325, "reward_std": 0.19567373720929027, "rewards/accuracy_multibox_reward": 0.7472222447395325, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2730.4444885253906, "epoch": 0.03946102021174206, "grad_norm": 0.08611779659986496, "kl": 0.0003705024719238281, "learning_rate": 7.884615384615384e-07, "loss": 0.0055, "reward": 0.47222224064171314, "reward_std": 0.08618255332112312, "rewards/accuracy_multibox_reward": 0.47222224064171314, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 3729.5416870117188, "epoch": 0.04042348411934552, "grad_norm": 0.07599527388811111, "kl": 0.0003304481506347656, "learning_rate": 8.076923076923077e-07, "loss": 0.0337, "reward": 0.27222223207354546, "reward_std": 0.1734677292406559, "rewards/accuracy_multibox_reward": 0.27222223207354546, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 3449.8611450195312, "epoch": 0.04138594802694899, "grad_norm": 0.11734402179718018, "kl": 0.00020182132720947266, "learning_rate": 8.269230769230768e-07, "loss": 0.0143, "reward": 0.3527777809649706, "reward_std": 0.14009357383474708, "rewards/accuracy_multibox_reward": 0.3527777809649706, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 3584.0834350585938, "epoch": 0.04234841193455245, "grad_norm": 0.09284485876560211, "kl": 0.00031280517578125, "learning_rate": 8.461538461538461e-07, "loss": 0.0029, "reward": 0.2861111257225275, "reward_std": 0.12435884028673172, "rewards/accuracy_multibox_reward": 0.2861111257225275, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 3340.0973205566406, "epoch": 0.04331087584215592, "grad_norm": 0.43972623348236084, "kl": 0.0005960464477539062, "learning_rate": 8.653846153846154e-07, "loss": 0.0539, "reward": 0.31111110746860504, "reward_std": 0.2948014587163925, "rewards/accuracy_multibox_reward": 0.31111110746860504, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 2780.749969482422, "epoch": 0.04427333974975938, "grad_norm": 0.19996361434459686, "kl": 0.0007853507995605469, "learning_rate": 8.846153846153846e-07, "loss": 0.0207, "reward": 0.8111111670732498, "reward_std": 0.17650396330282092, "rewards/accuracy_multibox_reward": 0.8111111670732498, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 2860.013916015625, "epoch": 0.04523580365736285, "grad_norm": 0.13719673454761505, "kl": 0.0021467208862304688, "learning_rate": 9.038461538461538e-07, "loss": 0.0431, "reward": 0.6000000257045031, "reward_std": 0.10100000537931919, "rewards/accuracy_multibox_reward": 0.6000000257045031, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 3601.1944580078125, "epoch": 0.04619826756496631, "grad_norm": 0.12301400303840637, "kl": 0.0006518363952636719, "learning_rate": 9.230769230769231e-07, "loss": 0.006, "reward": 0.28888888843357563, "reward_std": 0.24846430495381355, "rewards/accuracy_multibox_reward": 0.28888888843357563, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 3335.2916259765625, "epoch": 0.04716073147256978, "grad_norm": 0.15541821718215942, "kl": 0.0008196830749511719, "learning_rate": 9.423076923076923e-07, "loss": 0.0472, "reward": 0.28611111734062433, "reward_std": 0.12435883283615112, "rewards/accuracy_multibox_reward": 0.28611111734062433, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 2631.638916015625, "epoch": 0.04812319538017324, "grad_norm": 0.1420290619134903, "kl": 0.0010700225830078125, "learning_rate": 9.615384615384615e-07, "loss": 0.0111, "reward": 0.30555556528270245, "reward_std": 0.16550762578845024, "rewards/accuracy_multibox_reward": 0.30555556528270245, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 3693.041748046875, "epoch": 0.04908565928777671, "grad_norm": 0.11556524783372879, "kl": 0.0017480850219726562, "learning_rate": 9.807692307692306e-07, "loss": -0.0182, "reward": 0.3111111167818308, "reward_std": 0.059972384944558144, "rewards/accuracy_multibox_reward": 0.3111111167818308, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 3165.5138549804688, "epoch": 0.05004812319538017, "grad_norm": 0.0865161344408989, "kl": 0.001529693603515625, "learning_rate": 1e-06, "loss": 0.0099, "reward": 0.3000000147148967, "reward_std": 0.16352220997214317, "rewards/accuracy_multibox_reward": 0.3000000147148967, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 3531.1806030273438, "epoch": 0.05101058710298364, "grad_norm": 0.11084459722042084, "kl": 0.0037794113159179688, "learning_rate": 9.999977204580362e-07, "loss": -0.0084, "reward": 0.5027778074145317, "reward_std": 0.10566705744713545, "rewards/accuracy_multibox_reward": 0.5027778074145317, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 3453.7083129882812, "epoch": 0.0519730510105871, "grad_norm": 0.1519511491060257, "kl": 0.002765655517578125, "learning_rate": 9.999908818552392e-07, "loss": -0.028, "reward": 0.5333333536982536, "reward_std": 0.07664800435304642, "rewards/accuracy_multibox_reward": 0.5333333536982536, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 3402.236083984375, "epoch": 0.05293551491819057, "grad_norm": 0.24988575279712677, "kl": 0.0041217803955078125, "learning_rate": 9.999794842608932e-07, "loss": 0.0057, "reward": 0.5083333514630795, "reward_std": 0.11566951498389244, "rewards/accuracy_multibox_reward": 0.5083333514630795, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 3442.0416870117188, "epoch": 0.05389797882579403, "grad_norm": 0.09926177561283112, "kl": 0.0032711029052734375, "learning_rate": 9.999635277904707e-07, "loss": -0.0172, "reward": 0.21944445557892323, "reward_std": 0.13594409078359604, "rewards/accuracy_multibox_reward": 0.21944445557892323, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 3187.0277709960938, "epoch": 0.0548604427333975, "grad_norm": 0.1346801221370697, "kl": 0.0023555755615234375, "learning_rate": 9.99943012605631e-07, "loss": 0.0076, "reward": 0.5944444928318262, "reward_std": 0.15069184638559818, "rewards/accuracy_multibox_reward": 0.5944444928318262, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 3655.2638549804688, "epoch": 0.05582290664100096, "grad_norm": 0.08860195428133011, "kl": 0.0017242431640625, "learning_rate": 9.9991793891422e-07, "loss": 0.0245, "reward": 0.19722222723066807, "reward_std": 0.11203211918473244, "rewards/accuracy_multibox_reward": 0.19722222723066807, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 3465.0, "epoch": 0.05678537054860443, "grad_norm": 0.1310395449399948, "kl": 0.0026073455810546875, "learning_rate": 9.998883069702666e-07, "loss": 0.0511, "reward": 0.5361111424863338, "reward_std": 0.2253449782729149, "rewards/accuracy_multibox_reward": 0.5361111424863338, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 2772.2777709960938, "epoch": 0.05774783445620789, "grad_norm": 0.16688986122608185, "kl": 0.0018625259399414062, "learning_rate": 9.998541170739808e-07, "loss": -0.0203, "reward": 0.5194444768130779, "reward_std": 0.2851221263408661, "rewards/accuracy_multibox_reward": 0.5194444768130779, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 3090.4306030273438, "epoch": 0.05871029836381136, "grad_norm": 0.10028397291898727, "kl": 0.003421783447265625, "learning_rate": 9.998153695717503e-07, "loss": -0.0089, "reward": 0.5611111298203468, "reward_std": 0.08551743254065514, "rewards/accuracy_multibox_reward": 0.5611111298203468, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 4010.1805419921875, "epoch": 0.059672762271414825, "grad_norm": 0.24111774563789368, "kl": 0.0017642974853515625, "learning_rate": 9.997720648561382e-07, "loss": 0.038, "reward": 0.4694444611668587, "reward_std": 0.31071539549157023, "rewards/accuracy_multibox_reward": 0.4694444611668587, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 3576.125, "epoch": 0.06063522617901829, "grad_norm": 0.13691775500774384, "kl": 0.0020380020141601562, "learning_rate": 9.997242033658771e-07, "loss": 0.015, "reward": 0.3638889044523239, "reward_std": 0.208286851644516, "rewards/accuracy_multibox_reward": 0.3638889044523239, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 3448.8195190429688, "epoch": 0.061597690086621755, "grad_norm": 0.11842115223407745, "kl": 0.00231170654296875, "learning_rate": 9.99671785585866e-07, "loss": 0.0406, "reward": 0.5638888981193304, "reward_std": 0.17820694670081139, "rewards/accuracy_multibox_reward": 0.5638888981193304, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 3654.111083984375, "epoch": 0.06256015399422522, "grad_norm": 0.11574345082044601, "kl": 0.0021200180053710938, "learning_rate": 9.996148120471652e-07, "loss": 0.0317, "reward": 0.5861111208796501, "reward_std": 0.26666226610541344, "rewards/accuracy_multibox_reward": 0.5861111208796501, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 3715.916748046875, "epoch": 0.06352261790182868, "grad_norm": 0.14002302289009094, "kl": 0.0027618408203125, "learning_rate": 9.995532833269903e-07, "loss": 0.0225, "reward": 0.38055555522441864, "reward_std": 0.20574601739645004, "rewards/accuracy_multibox_reward": 0.38055555522441864, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 3440.0833129882812, "epoch": 0.06448508180943215, "grad_norm": 0.3106401264667511, "kl": 0.004134178161621094, "learning_rate": 9.994872000487072e-07, "loss": 0.0137, "reward": 0.3000000063329935, "reward_std": 0.21139422059059143, "rewards/accuracy_multibox_reward": 0.3000000063329935, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 3446.9722290039062, "epoch": 0.06544754571703561, "grad_norm": 0.0957900807261467, "kl": 0.001895904541015625, "learning_rate": 9.994165628818255e-07, "loss": 0.0083, "reward": 0.31111112236976624, "reward_std": 0.19840421248227358, "rewards/accuracy_multibox_reward": 0.31111112236976624, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 3254.138916015625, "epoch": 0.06641000962463908, "grad_norm": 0.10106165707111359, "kl": 0.0028934478759765625, "learning_rate": 9.99341372541991e-07, "loss": 0.0137, "reward": 0.6166666969656944, "reward_std": 0.16280359355732799, "rewards/accuracy_multibox_reward": 0.6166666969656944, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 3649.0694580078125, "epoch": 0.06737247353224254, "grad_norm": 0.14687857031822205, "kl": 0.001850128173828125, "learning_rate": 9.992616297909796e-07, "loss": 0.0192, "reward": 0.32500000298023224, "reward_std": 0.18547911942005157, "rewards/accuracy_multibox_reward": 0.32500000298023224, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 3336.0, "epoch": 0.068334937439846, "grad_norm": 0.0944090336561203, "kl": 0.003101348876953125, "learning_rate": 9.991773354366888e-07, "loss": -0.0091, "reward": 0.3638889044523239, "reward_std": 0.15311685157939792, "rewards/accuracy_multibox_reward": 0.3638889044523239, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 3078.4166870117188, "epoch": 0.06929740134744947, "grad_norm": 0.20393605530261993, "kl": 0.004413604736328125, "learning_rate": 9.990884903331299e-07, "loss": 0.0182, "reward": 0.33888889104127884, "reward_std": 0.26261031255126, "rewards/accuracy_multibox_reward": 0.33888889104127884, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 3025.4306640625, "epoch": 0.07025986525505294, "grad_norm": 0.15864180028438568, "kl": 0.002300262451171875, "learning_rate": 9.98995095380419e-07, "loss": 0.0384, "reward": 0.5805555731058121, "reward_std": 0.2967027500271797, "rewards/accuracy_multibox_reward": 0.5805555731058121, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 3702.1388549804688, "epoch": 0.0712223291626564, "grad_norm": 0.13274550437927246, "kl": 0.0021495819091796875, "learning_rate": 9.988971515247678e-07, "loss": 0.021, "reward": 0.1388888843357563, "reward_std": 0.10084376111626625, "rewards/accuracy_multibox_reward": 0.1388888843357563, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 3513.7084350585938, "epoch": 0.07218479307025986, "grad_norm": 0.19914722442626953, "kl": 0.004657745361328125, "learning_rate": 9.987946597584755e-07, "loss": 0.0122, "reward": 0.5601852238178253, "reward_std": 0.35124552994966507, "rewards/accuracy_multibox_reward": 0.5601852238178253, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 3718.388916015625, "epoch": 0.07314725697786333, "grad_norm": 0.1906462013721466, "kl": 0.00392913818359375, "learning_rate": 9.986876211199159e-07, "loss": 0.0051, "reward": 0.4416666701436043, "reward_std": 0.13074181601405144, "rewards/accuracy_multibox_reward": 0.4416666701436043, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 3189.77783203125, "epoch": 0.0741097208854668, "grad_norm": 0.2731522023677826, "kl": 0.0026454925537109375, "learning_rate": 9.985760366935296e-07, "loss": 0.01, "reward": 0.31388888880610466, "reward_std": 0.25756508484482765, "rewards/accuracy_multibox_reward": 0.31388888880610466, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 3500.638916015625, "epoch": 0.07507218479307026, "grad_norm": 0.35730600357055664, "kl": 0.00348663330078125, "learning_rate": 9.984599076098116e-07, "loss": 0.0227, "reward": 0.5833333432674408, "reward_std": 0.2766689187847078, "rewards/accuracy_multibox_reward": 0.5833333432674408, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 3585.8056640625, "epoch": 0.07603464870067372, "grad_norm": 0.08607491850852966, "kl": 0.001842498779296875, "learning_rate": 9.983392350453003e-07, "loss": 0.0024, "reward": 0.35555557906627655, "reward_std": 0.11049480689689517, "rewards/accuracy_multibox_reward": 0.35555557906627655, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 3604.6389770507812, "epoch": 0.07699711260827719, "grad_norm": 0.12056368589401245, "kl": 0.003692626953125, "learning_rate": 9.982140202225654e-07, "loss": 0.007, "reward": 0.3972222302109003, "reward_std": 0.097060427069664, "rewards/accuracy_multibox_reward": 0.3972222302109003, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 3425.8194580078125, "epoch": 0.07795957651588066, "grad_norm": 0.1707271784543991, "kl": 0.003154754638671875, "learning_rate": 9.980842644101953e-07, "loss": -0.0113, "reward": 0.5472222343087196, "reward_std": 0.20941452123224735, "rewards/accuracy_multibox_reward": 0.5472222343087196, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 3489.2222290039062, "epoch": 0.07892204042348412, "grad_norm": 0.5101723670959473, "kl": 0.0034847259521484375, "learning_rate": 9.97949968922785e-07, "loss": 0.0404, "reward": 0.5666666738688946, "reward_std": 0.19860261492431164, "rewards/accuracy_multibox_reward": 0.5666666738688946, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 3077.1944580078125, "epoch": 0.07988450433108758, "grad_norm": 0.5464356541633606, "kl": 0.001800537109375, "learning_rate": 9.978111351209218e-07, "loss": 0.1029, "reward": 0.550000011920929, "reward_std": 0.25563743663951755, "rewards/accuracy_multibox_reward": 0.550000011920929, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 3356.1806030273438, "epoch": 0.08084696823869104, "grad_norm": 0.12452831864356995, "kl": 0.0021991729736328125, "learning_rate": 9.976677644111724e-07, "loss": 0.0251, "reward": 0.6694444790482521, "reward_std": 0.10007737763226032, "rewards/accuracy_multibox_reward": 0.6694444790482521, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 3775.3750610351562, "epoch": 0.08180943214629452, "grad_norm": 0.07462698221206665, "kl": 0.0030727386474609375, "learning_rate": 9.975198582460682e-07, "loss": -0.0034, "reward": 0.20000000204890966, "reward_std": 0.14281124994158745, "rewards/accuracy_multibox_reward": 0.20000000204890966, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 3611.3472900390625, "epoch": 0.08277189605389798, "grad_norm": 0.13631397485733032, "kl": 0.0022258758544921875, "learning_rate": 9.973674181240905e-07, "loss": 0.0434, "reward": 0.3277777824550867, "reward_std": 0.06757262628525496, "rewards/accuracy_multibox_reward": 0.3277777824550867, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 3004.5556030273438, "epoch": 0.08373435996150144, "grad_norm": 0.15965494513511658, "kl": 0.004913330078125, "learning_rate": 9.972104455896557e-07, "loss": 0.0106, "reward": 0.3611111082136631, "reward_std": 0.3104199469089508, "rewards/accuracy_multibox_reward": 0.3611111082136631, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 3728.90283203125, "epoch": 0.0846968238691049, "grad_norm": 0.06844818592071533, "kl": 0.002758026123046875, "learning_rate": 9.97048942233099e-07, "loss": 0.0035, "reward": 0.2638888927176595, "reward_std": 0.08879294618964195, "rewards/accuracy_multibox_reward": 0.2638888927176595, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 3584.3056030273438, "epoch": 0.08565928777670838, "grad_norm": 0.10796435922384262, "kl": 0.0041675567626953125, "learning_rate": 9.968829096906589e-07, "loss": -0.0025, "reward": 0.2861111257225275, "reward_std": 0.19754209369421005, "rewards/accuracy_multibox_reward": 0.2861111257225275, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 3544.5556030273438, "epoch": 0.08662175168431184, "grad_norm": 0.18381601572036743, "kl": 0.003765106201171875, "learning_rate": 9.96712349644461e-07, "loss": 0.0351, "reward": 0.43611112982034683, "reward_std": 0.18288761749863625, "rewards/accuracy_multibox_reward": 0.43611112982034683, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 3639.3333740234375, "epoch": 0.0875842155919153, "grad_norm": 0.11833083629608154, "kl": 0.00388336181640625, "learning_rate": 9.965372638224994e-07, "loss": 0.0234, "reward": 0.39444446563720703, "reward_std": 0.1931697279214859, "rewards/accuracy_multibox_reward": 0.39444446563720703, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 3383.0972290039062, "epoch": 0.08854667949951876, "grad_norm": 0.12733034789562225, "kl": 0.006488800048828125, "learning_rate": 9.963576539986208e-07, "loss": 0.0141, "reward": 0.5138889066874981, "reward_std": 0.18835240975022316, "rewards/accuracy_multibox_reward": 0.5138889066874981, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 3633.1250610351562, "epoch": 0.08950914340712224, "grad_norm": 0.16952967643737793, "kl": 0.00450897216796875, "learning_rate": 9.96173521992506e-07, "loss": -0.004, "reward": 0.38333334028720856, "reward_std": 0.1083512376062572, "rewards/accuracy_multibox_reward": 0.38333334028720856, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 3057.263916015625, "epoch": 0.0904716073147257, "grad_norm": 0.136738121509552, "kl": 0.003498077392578125, "learning_rate": 9.95984869669651e-07, "loss": 0.0127, "reward": 0.6305555440485477, "reward_std": 0.24464774876832962, "rewards/accuracy_multibox_reward": 0.6305555440485477, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 2913.3472290039062, "epoch": 0.09143407122232916, "grad_norm": 0.1691979616880417, "kl": 0.004119873046875, "learning_rate": 9.957916989413486e-07, "loss": 0.0298, "reward": 0.39166667591780424, "reward_std": 0.18254700675606728, "rewards/accuracy_multibox_reward": 0.39166667591780424, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 3634.2222900390625, "epoch": 0.09239653512993262, "grad_norm": 0.13666173815727234, "kl": 0.00719451904296875, "learning_rate": 9.95594011764669e-07, "loss": 0.0098, "reward": 0.3666666727513075, "reward_std": 0.2188832312822342, "rewards/accuracy_multibox_reward": 0.3666666727513075, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 3291.4722900390625, "epoch": 0.0933589990375361, "grad_norm": 0.21308623254299164, "kl": 0.005107879638671875, "learning_rate": 9.953918101424396e-07, "loss": 0.0375, "reward": 0.4555555656552315, "reward_std": 0.18532183207571507, "rewards/accuracy_multibox_reward": 0.4555555656552315, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 3607.4027709960938, "epoch": 0.09432146294513956, "grad_norm": 0.15544836223125458, "kl": 0.00621795654296875, "learning_rate": 9.951850961232255e-07, "loss": 0.0129, "reward": 0.46666669473052025, "reward_std": 0.19853749265894294, "rewards/accuracy_multibox_reward": 0.46666669473052025, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 3839.5278930664062, "epoch": 0.09528392685274302, "grad_norm": 0.11245381832122803, "kl": 0.00554656982421875, "learning_rate": 9.949738718013078e-07, "loss": 0.018, "reward": 0.10000000335276127, "reward_std": 0.08164966106414795, "rewards/accuracy_multibox_reward": 0.10000000335276127, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 3567.2361450195312, "epoch": 0.09624639076034648, "grad_norm": 0.08032280206680298, "kl": 0.00496673583984375, "learning_rate": 9.94758139316663e-07, "loss": 0.0011, "reward": 0.20277778059244156, "reward_std": 0.006804134231060743, "rewards/accuracy_multibox_reward": 0.20277778059244156, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 3492.361083984375, "epoch": 0.09720885466794996, "grad_norm": 0.11768710613250732, "kl": 0.004337310791015625, "learning_rate": 9.94537900854941e-07, "loss": 0.0385, "reward": 0.5111111328005791, "reward_std": 0.18457387015223503, "rewards/accuracy_multibox_reward": 0.5111111328005791, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 3928.1944580078125, "epoch": 0.09817131857555342, "grad_norm": 0.14249296486377716, "kl": 0.0094451904296875, "learning_rate": 9.943131586474436e-07, "loss": 0.0057, "reward": 0.4194444492459297, "reward_std": 0.1372167943045497, "rewards/accuracy_multibox_reward": 0.4194444492459297, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 3752.8056030273438, "epoch": 0.09913378248315688, "grad_norm": 0.2437194436788559, "kl": 0.0050563812255859375, "learning_rate": 9.940839149711009e-07, "loss": 0.0119, "reward": 0.6277778018265963, "reward_std": 0.18959707394242287, "rewards/accuracy_multibox_reward": 0.6277778018265963, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 3488.5833740234375, "epoch": 0.10009624639076034, "grad_norm": 0.2315751314163208, "kl": 0.0062160491943359375, "learning_rate": 9.93850172148448e-07, "loss": -0.0151, "reward": 0.3805555636063218, "reward_std": 0.08353401115164161, "rewards/accuracy_multibox_reward": 0.3805555636063218, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 3617.25, "epoch": 0.10105871029836382, "grad_norm": 0.1759169101715088, "kl": 0.00568389892578125, "learning_rate": 9.936119325476042e-07, "loss": 0.04, "reward": 0.42222223803400993, "reward_std": 0.36728398501873016, "rewards/accuracy_multibox_reward": 0.42222223803400993, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 3828.90283203125, "epoch": 0.10202117420596728, "grad_norm": 0.15607592463493347, "kl": 0.005893707275390625, "learning_rate": 9.93369198582245e-07, "loss": 0.0301, "reward": 0.3194444589316845, "reward_std": 0.0962381474673748, "rewards/accuracy_multibox_reward": 0.3194444589316845, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 3599.3472900390625, "epoch": 0.10298363811357074, "grad_norm": 0.11216462403535843, "kl": 0.00664520263671875, "learning_rate": 9.931219727115805e-07, "loss": -0.0205, "reward": 0.38333335518836975, "reward_std": 0.19678517058491707, "rewards/accuracy_multibox_reward": 0.38333335518836975, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 3698.9999389648438, "epoch": 0.1039461020211742, "grad_norm": 0.1152242049574852, "kl": 0.005146026611328125, "learning_rate": 9.928702574403301e-07, "loss": 0.0203, "reward": 0.40820106863975525, "reward_std": 0.15179424732923508, "rewards/accuracy_multibox_reward": 0.40820106863975525, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 3671.3333740234375, "epoch": 0.10490856592877768, "grad_norm": 0.07912494987249374, "kl": 0.004741668701171875, "learning_rate": 9.926140553186956e-07, "loss": 0.0004, "reward": 0.2527777850627899, "reward_std": 0.10060762194916606, "rewards/accuracy_multibox_reward": 0.2527777850627899, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 3661.9027709960938, "epoch": 0.10587102983638114, "grad_norm": 0.10528931766748428, "kl": 0.006855010986328125, "learning_rate": 9.923533689423371e-07, "loss": -0.0016, "reward": 0.2805555500090122, "reward_std": 0.1905558556318283, "rewards/accuracy_multibox_reward": 0.2805555500090122, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 4041.6945190429688, "epoch": 0.1068334937439846, "grad_norm": 0.0787297785282135, "kl": 0.005962371826171875, "learning_rate": 9.920882009523458e-07, "loss": 0.0027, "reward": 0.29722223803400993, "reward_std": 0.12330532446503639, "rewards/accuracy_multibox_reward": 0.29722223803400993, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 3767.9722290039062, "epoch": 0.10779595765158806, "grad_norm": 0.1236087754368782, "kl": 0.004596710205078125, "learning_rate": 9.918185540352179e-07, "loss": 0.011, "reward": 0.47222223971039057, "reward_std": 0.14270470943301916, "rewards/accuracy_multibox_reward": 0.47222223971039057, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 3207.375, "epoch": 0.10875842155919153, "grad_norm": 0.1736755222082138, "kl": 0.004367828369140625, "learning_rate": 9.915444309228259e-07, "loss": -0.0009, "reward": 0.5722222402691841, "reward_std": 0.2549763135612011, "rewards/accuracy_multibox_reward": 0.5722222402691841, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 3685.375, "epoch": 0.109720885466795, "grad_norm": 0.1050589457154274, "kl": 0.003978729248046875, "learning_rate": 9.912658343923934e-07, "loss": 0.0072, "reward": 0.17500000074505806, "reward_std": 0.1587034948170185, "rewards/accuracy_multibox_reward": 0.17500000074505806, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 3612.736083984375, "epoch": 0.11068334937439846, "grad_norm": 0.0682823657989502, "kl": 0.006702423095703125, "learning_rate": 9.90982767266464e-07, "loss": -0.0076, "reward": 0.1333333346992731, "reward_std": 0.05443309945985675, "rewards/accuracy_multibox_reward": 0.1333333346992731, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 3987.2222900390625, "epoch": 0.11164581328200192, "grad_norm": 0.18245284259319305, "kl": 0.005523681640625, "learning_rate": 9.906952324128756e-07, "loss": 0.0122, "reward": 0.2583333421498537, "reward_std": 0.180131945759058, "rewards/accuracy_multibox_reward": 0.2583333421498537, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 3940.9165649414062, "epoch": 0.1126082771896054, "grad_norm": 0.08878901600837708, "kl": 0.00728607177734375, "learning_rate": 9.90403232744729e-07, "loss": 0.0144, "reward": 0.291666679084301, "reward_std": 0.11064173420891166, "rewards/accuracy_multibox_reward": 0.291666679084301, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 3939.3750610351562, "epoch": 0.11357074109720885, "grad_norm": 0.1614331156015396, "kl": 0.004207611083984375, "learning_rate": 9.9010677122036e-07, "loss": 0.0063, "reward": 0.4194444604218006, "reward_std": 0.23391840001568198, "rewards/accuracy_multibox_reward": 0.4194444604218006, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 3917.0972900390625, "epoch": 0.11453320500481232, "grad_norm": 0.1234181821346283, "kl": 0.005123138427734375, "learning_rate": 9.898058508433084e-07, "loss": 0.0269, "reward": 0.3527777912095189, "reward_std": 0.33327627182006836, "rewards/accuracy_multibox_reward": 0.3527777912095189, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 3083.9444580078125, "epoch": 0.11549566891241578, "grad_norm": 0.1449313461780548, "kl": 0.0035905838012695312, "learning_rate": 9.895004746622883e-07, "loss": 0.0389, "reward": 0.36944444850087166, "reward_std": 0.22871611639857292, "rewards/accuracy_multibox_reward": 0.36944444850087166, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 3723.5556030273438, "epoch": 0.11645813282001925, "grad_norm": 0.142880380153656, "kl": 0.0050716400146484375, "learning_rate": 9.891906457711564e-07, "loss": -0.0048, "reward": 0.3722222335636616, "reward_std": 0.177376851439476, "rewards/accuracy_multibox_reward": 0.3722222335636616, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 3960.5416870117188, "epoch": 0.11742059672762271, "grad_norm": 0.11253928393125534, "kl": 0.004901885986328125, "learning_rate": 9.888763673088815e-07, "loss": 0.0457, "reward": 0.4000000078231096, "reward_std": 0.2442757561802864, "rewards/accuracy_multibox_reward": 0.4000000078231096, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 4043.8333740234375, "epoch": 0.11838306063522618, "grad_norm": 0.12665565311908722, "kl": 0.004459381103515625, "learning_rate": 9.885576424595124e-07, "loss": 0.0505, "reward": 0.2277777846902609, "reward_std": 0.19980080891400576, "rewards/accuracy_multibox_reward": 0.2277777846902609, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 3565.02783203125, "epoch": 0.11934552454282965, "grad_norm": 0.0935640037059784, "kl": 0.004180908203125, "learning_rate": 9.88234474452145e-07, "loss": 0.0448, "reward": 0.48055557161569595, "reward_std": 0.2079591266810894, "rewards/accuracy_multibox_reward": 0.48055557161569595, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 3709.5140380859375, "epoch": 0.12030798845043311, "grad_norm": 0.3217061460018158, "kl": 0.004199981689453125, "learning_rate": 9.87906866560891e-07, "loss": -0.0652, "reward": 0.35833332501351833, "reward_std": 0.14870021026581526, "rewards/accuracy_multibox_reward": 0.35833332501351833, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 3448.3333740234375, "epoch": 0.12127045235803657, "grad_norm": 0.18528644740581512, "kl": 0.00522613525390625, "learning_rate": 9.875748221048432e-07, "loss": 0.0278, "reward": 0.4930555522441864, "reward_std": 0.11941156722605228, "rewards/accuracy_multibox_reward": 0.4930555522441864, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 3796.3888549804688, "epoch": 0.12223291626564003, "grad_norm": 0.13414032757282257, "kl": 0.0051727294921875, "learning_rate": 9.872383444480428e-07, "loss": 0.0415, "reward": 0.45000000298023224, "reward_std": 0.23648500815033913, "rewards/accuracy_multibox_reward": 0.45000000298023224, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 3870.8888549804688, "epoch": 0.12319538017324351, "grad_norm": 0.12725594639778137, "kl": 0.00487518310546875, "learning_rate": 9.868974369994451e-07, "loss": 0.0368, "reward": 0.42222224548459053, "reward_std": 0.14068505307659507, "rewards/accuracy_multibox_reward": 0.42222224548459053, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 3600.4722900390625, "epoch": 0.12415784408084697, "grad_norm": 0.13314376771450043, "kl": 0.007450103759765625, "learning_rate": 9.86552103212885e-07, "loss": -0.0019, "reward": 0.4888888970017433, "reward_std": 0.158585699275136, "rewards/accuracy_multibox_reward": 0.4888888970017433, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 3556.3889770507812, "epoch": 0.12512030798845045, "grad_norm": 0.16161026060581207, "kl": 0.00640869140625, "learning_rate": 9.862023465870421e-07, "loss": -0.0005, "reward": 0.40833333507180214, "reward_std": 0.16617706324905157, "rewards/accuracy_multibox_reward": 0.40833333507180214, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 3571.0833129882812, "epoch": 0.1260827718960539, "grad_norm": 0.12189988791942596, "kl": 0.0050201416015625, "learning_rate": 9.858481706654043e-07, "loss": 0.011, "reward": 0.24444444850087166, "reward_std": 0.1777322101406753, "rewards/accuracy_multibox_reward": 0.24444444850087166, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 3127.7083740234375, "epoch": 0.12704523580365737, "grad_norm": 0.17650698125362396, "kl": 0.00763702392578125, "learning_rate": 9.854895790362337e-07, "loss": -0.0021, "reward": 0.3333333469927311, "reward_std": 0.23127097636461258, "rewards/accuracy_multibox_reward": 0.3333333469927311, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 3653.166748046875, "epoch": 0.12800769971126083, "grad_norm": 0.12247216701507568, "kl": 0.005245208740234375, "learning_rate": 9.851265753325289e-07, "loss": 0.0436, "reward": 0.41111112385988235, "reward_std": 0.26003193110227585, "rewards/accuracy_multibox_reward": 0.41111112385988235, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 4011.8611450195312, "epoch": 0.1289701636188643, "grad_norm": 0.20165696740150452, "kl": 0.0062408447265625, "learning_rate": 9.847591632319882e-07, "loss": -0.0128, "reward": 0.18888889253139496, "reward_std": 0.04951331717893481, "rewards/accuracy_multibox_reward": 0.18888889253139496, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 3736.8056640625, "epoch": 0.12993262752646775, "grad_norm": 0.1663724035024643, "kl": 0.00638580322265625, "learning_rate": 9.843873464569727e-07, "loss": 0.0495, "reward": 0.36666668206453323, "reward_std": 0.19988172501325607, "rewards/accuracy_multibox_reward": 0.36666668206453323, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 3425.5972900390625, "epoch": 0.13089509143407121, "grad_norm": 0.13148580491542816, "kl": 0.004099845886230469, "learning_rate": 9.840111287744695e-07, "loss": 0.0228, "reward": 0.6444444581866264, "reward_std": 0.1945418524555862, "rewards/accuracy_multibox_reward": 0.6444444581866264, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 3664.90283203125, "epoch": 0.13185755534167468, "grad_norm": 0.12286718934774399, "kl": 0.006633758544921875, "learning_rate": 9.836305139960513e-07, "loss": 0.0294, "reward": 0.3166666654869914, "reward_std": 0.2699621021747589, "rewards/accuracy_multibox_reward": 0.3166666654869914, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 3443.0694580078125, "epoch": 0.13282001924927817, "grad_norm": 0.2325388491153717, "kl": 0.005558013916015625, "learning_rate": 9.8324550597784e-07, "loss": 0.042, "reward": 0.5861111283302307, "reward_std": 0.29276277869939804, "rewards/accuracy_multibox_reward": 0.5861111283302307, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 3745.861083984375, "epoch": 0.13378248315688163, "grad_norm": 0.1312907636165619, "kl": 0.0056915283203125, "learning_rate": 9.828561086204662e-07, "loss": 0.0276, "reward": 0.29722223430871964, "reward_std": 0.15524353086948395, "rewards/accuracy_multibox_reward": 0.29722223430871964, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 3726.0972900390625, "epoch": 0.1347449470644851, "grad_norm": 0.17527508735656738, "kl": 0.00432586669921875, "learning_rate": 9.824623258690308e-07, "loss": 0.0489, "reward": 0.4833333306014538, "reward_std": 0.24312062561511993, "rewards/accuracy_multibox_reward": 0.4833333306014538, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 3537.6805419921875, "epoch": 0.13570741097208855, "grad_norm": 0.18606775999069214, "kl": 0.00485992431640625, "learning_rate": 9.820641617130635e-07, "loss": 0.0198, "reward": 0.5194444507360458, "reward_std": 0.2554696383886039, "rewards/accuracy_multibox_reward": 0.5194444507360458, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 3892.0416870117188, "epoch": 0.136669874879692, "grad_norm": 0.17854081094264984, "kl": 0.0061969757080078125, "learning_rate": 9.816616201864842e-07, "loss": 0.0448, "reward": 0.3402777910232544, "reward_std": 0.2221098616719246, "rewards/accuracy_multibox_reward": 0.3402777910232544, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 3491.5416870117188, "epoch": 0.13763233878729547, "grad_norm": 0.15011605620384216, "kl": 0.003662109375, "learning_rate": 9.81254705367561e-07, "loss": 0.0186, "reward": 0.21388889849185944, "reward_std": 0.11700921878218651, "rewards/accuracy_multibox_reward": 0.21388889849185944, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 3892.4444580078125, "epoch": 0.13859480269489893, "grad_norm": 0.14944396913051605, "kl": 0.0042724609375, "learning_rate": 9.80843421378869e-07, "loss": 0.0044, "reward": 0.37222224473953247, "reward_std": 0.2646499630063772, "rewards/accuracy_multibox_reward": 0.37222224473953247, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 4113.111083984375, "epoch": 0.1395572666025024, "grad_norm": 0.08410988003015518, "kl": 0.003753662109375, "learning_rate": 9.804277723872486e-07, "loss": 0.0334, "reward": 0.18333334475755692, "reward_std": 0.14785372838377953, "rewards/accuracy_multibox_reward": 0.18333334475755692, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 3307.2084350585938, "epoch": 0.14051973051010588, "grad_norm": 0.15188713371753693, "kl": 0.00379180908203125, "learning_rate": 9.800077626037633e-07, "loss": 0.0051, "reward": 0.4158730246126652, "reward_std": 0.23412438482046127, "rewards/accuracy_multibox_reward": 0.4158730246126652, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 3835.7362060546875, "epoch": 0.14148219441770934, "grad_norm": 0.08457997441291809, "kl": 0.004669189453125, "learning_rate": 9.795833962836574e-07, "loss": 0.0004, "reward": 0.24722223542630672, "reward_std": 0.10980967991054058, "rewards/accuracy_multibox_reward": 0.24722223542630672, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 3564.6666870117188, "epoch": 0.1424446583253128, "grad_norm": 0.13392095267772675, "kl": 0.003963470458984375, "learning_rate": 9.791546777263122e-07, "loss": 0.0247, "reward": 0.4416666850447655, "reward_std": 0.1899460325948894, "rewards/accuracy_multibox_reward": 0.4416666850447655, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 3819.138916015625, "epoch": 0.14340712223291627, "grad_norm": 0.19325554370880127, "kl": 0.00379180908203125, "learning_rate": 9.787216112752035e-07, "loss": 0.0309, "reward": 0.46111113019287586, "reward_std": 0.2969263382256031, "rewards/accuracy_multibox_reward": 0.46111113019287586, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 3666.2916870117188, "epoch": 0.14436958614051973, "grad_norm": 0.13392120599746704, "kl": 0.005462646484375, "learning_rate": 9.782842013178559e-07, "loss": 0.0122, "reward": 0.1138888904824853, "reward_std": 0.13582700490951538, "rewards/accuracy_multibox_reward": 0.1138888904824853, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 3920.9722900390625, "epoch": 0.1453320500481232, "grad_norm": 0.17915813624858856, "kl": 0.005466461181640625, "learning_rate": 9.778424522858002e-07, "loss": 0.0246, "reward": 0.17222222592681646, "reward_std": 0.21288084238767624, "rewards/accuracy_multibox_reward": 0.17222222592681646, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 3778.041748046875, "epoch": 0.14629451395572665, "grad_norm": 0.16470938920974731, "kl": 0.00572967529296875, "learning_rate": 9.773963686545268e-07, "loss": 0.0245, "reward": 0.4138888865709305, "reward_std": 0.18004896119236946, "rewards/accuracy_multibox_reward": 0.4138888865709305, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 3388.9583740234375, "epoch": 0.1472569778633301, "grad_norm": 0.15592330694198608, "kl": 0.005641937255859375, "learning_rate": 9.769459549434422e-07, "loss": 0.012, "reward": 0.2944444566965103, "reward_std": 0.10107124224305153, "rewards/accuracy_multibox_reward": 0.2944444566965103, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 3804.9861450195312, "epoch": 0.1482194417709336, "grad_norm": 0.09174719452857971, "kl": 0.00565338134765625, "learning_rate": 9.764912157158215e-07, "loss": 0.0164, "reward": 0.23055556789040565, "reward_std": 0.13528107106685638, "rewards/accuracy_multibox_reward": 0.23055556789040565, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 3487.2916870117188, "epoch": 0.14918190567853706, "grad_norm": 0.12140607088804245, "kl": 0.0045623779296875, "learning_rate": 9.760321555787636e-07, "loss": -0.0095, "reward": 0.5777778029441833, "reward_std": 0.1274236962199211, "rewards/accuracy_multibox_reward": 0.5777778029441833, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 3878.2361450195312, "epoch": 0.15014436958614052, "grad_norm": 0.08763060718774796, "kl": 0.00701141357421875, "learning_rate": 9.755687791831427e-07, "loss": 0.0095, "reward": 0.1916666654869914, "reward_std": 0.19732000632211566, "rewards/accuracy_multibox_reward": 0.1916666654869914, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 3859.2222900390625, "epoch": 0.15110683349374399, "grad_norm": 0.14390024542808533, "kl": 0.006866455078125, "learning_rate": 9.751010912235634e-07, "loss": -0.0235, "reward": 0.302777785807848, "reward_std": 0.16032306477427483, "rewards/accuracy_multibox_reward": 0.302777785807848, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 4083.5833740234375, "epoch": 0.15206929740134745, "grad_norm": 0.1413411796092987, "kl": 0.00652313232421875, "learning_rate": 9.746290964383116e-07, "loss": 0.0147, "reward": 0.13055555894970894, "reward_std": 0.19393739104270935, "rewards/accuracy_multibox_reward": 0.13055555894970894, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 3892.916748046875, "epoch": 0.1530317613089509, "grad_norm": 0.07263165712356567, "kl": 0.006622314453125, "learning_rate": 9.74152799609307e-07, "loss": 0.0183, "reward": 0.1805555708706379, "reward_std": 0.10372589901089668, "rewards/accuracy_multibox_reward": 0.1805555708706379, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 3875.7361450195312, "epoch": 0.15399422521655437, "grad_norm": 0.17878884077072144, "kl": 0.0062255859375, "learning_rate": 9.736722055620542e-07, "loss": 0.0535, "reward": 0.34166667237877846, "reward_std": 0.30671266466379166, "rewards/accuracy_multibox_reward": 0.34166667237877846, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 3570.4583740234375, "epoch": 0.15495668912415783, "grad_norm": 0.16018472611904144, "kl": 0.0087890625, "learning_rate": 9.731873191655946e-07, "loss": -0.0397, "reward": 0.44722223468124866, "reward_std": 0.1670007831417024, "rewards/accuracy_multibox_reward": 0.44722223468124866, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 3762.2362060546875, "epoch": 0.15591915303176132, "grad_norm": 0.16481521725654602, "kl": 0.0063018798828125, "learning_rate": 9.726981453324568e-07, "loss": 0.0411, "reward": 0.3444444574415684, "reward_std": 0.3314964883029461, "rewards/accuracy_multibox_reward": 0.3444444574415684, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 4110.2777099609375, "epoch": 0.15688161693936478, "grad_norm": 0.07446856796741486, "kl": 0.0064697265625, "learning_rate": 9.722046890186067e-07, "loss": 0.0146, "reward": 0.06666667014360428, "reward_std": 0.10327956825494766, "rewards/accuracy_multibox_reward": 0.06666667014360428, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 3443.7222290039062, "epoch": 0.15784408084696824, "grad_norm": 0.08308328688144684, "kl": 0.00839996337890625, "learning_rate": 9.717069552233966e-07, "loss": 0.0237, "reward": 0.3083333373069763, "reward_std": 0.09981933888047934, "rewards/accuracy_multibox_reward": 0.3083333373069763, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 3726.7501220703125, "epoch": 0.1588065447545717, "grad_norm": 0.10596819221973419, "kl": 0.0059356689453125, "learning_rate": 9.712049489895158e-07, "loss": 0.0044, "reward": 0.32777778059244156, "reward_std": 0.16668182611465454, "rewards/accuracy_multibox_reward": 0.32777778059244156, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 3416.9583740234375, "epoch": 0.15976900866217517, "grad_norm": 0.12530213594436646, "kl": 0.00612640380859375, "learning_rate": 9.706986754029391e-07, "loss": -0.0014, "reward": 0.35277779772877693, "reward_std": 0.14521147217601538, "rewards/accuracy_multibox_reward": 0.35277779772877693, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 3281.9166870117188, "epoch": 0.16073147256977863, "grad_norm": 0.12662166357040405, "kl": 0.010345458984375, "learning_rate": 9.701881395928747e-07, "loss": 0.0299, "reward": 0.4083333546295762, "reward_std": 0.22899410128593445, "rewards/accuracy_multibox_reward": 0.4083333546295762, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 3612.9027099609375, "epoch": 0.1616939364773821, "grad_norm": 0.13573803007602692, "kl": 0.00595855712890625, "learning_rate": 9.696733467317127e-07, "loss": -0.0252, "reward": 0.28611112758517265, "reward_std": 0.24503237009048462, "rewards/accuracy_multibox_reward": 0.28611112758517265, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 3937.0694580078125, "epoch": 0.16265640038498555, "grad_norm": 0.09121771156787872, "kl": 0.00795745849609375, "learning_rate": 9.691543020349732e-07, "loss": 0.0202, "reward": 0.13611110672354698, "reward_std": 0.15312519297003746, "rewards/accuracy_multibox_reward": 0.13611110672354698, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 3249.5416870117188, "epoch": 0.16361886429258904, "grad_norm": 0.16358071565628052, "kl": 0.010894775390625, "learning_rate": 9.686310107612522e-07, "loss": 0.0233, "reward": 0.4090277887880802, "reward_std": 0.10734977200627327, "rewards/accuracy_multibox_reward": 0.4090277887880802, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 3975.986083984375, "epoch": 0.1645813282001925, "grad_norm": 0.10440467298030853, "kl": 0.00681304931640625, "learning_rate": 9.681034782121695e-07, "loss": 0.0235, "reward": 0.20000000670552254, "reward_std": 0.14410318993031979, "rewards/accuracy_multibox_reward": 0.20000000670552254, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 3522.9305419921875, "epoch": 0.16554379210779596, "grad_norm": 0.11578962951898575, "kl": 0.012451171875, "learning_rate": 9.67571709732314e-07, "loss": -0.0023, "reward": 0.24444444850087166, "reward_std": 0.06749687297269702, "rewards/accuracy_multibox_reward": 0.24444444850087166, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 3926.5694580078125, "epoch": 0.16650625601539942, "grad_norm": 0.1174028217792511, "kl": 0.0109710693359375, "learning_rate": 9.670357107091908e-07, "loss": 0.0381, "reward": 0.09722222574055195, "reward_std": 0.14455072954297066, "rewards/accuracy_multibox_reward": 0.09722222574055195, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 3769.3333740234375, "epoch": 0.16746871992300288, "grad_norm": 0.2790958285331726, "kl": 0.01125335693359375, "learning_rate": 9.664954865731655e-07, "loss": 0.039, "reward": 0.22777778189629316, "reward_std": 0.08164965733885765, "rewards/accuracy_multibox_reward": 0.22777778189629316, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 3649.9583740234375, "epoch": 0.16843118383060635, "grad_norm": 0.15021437406539917, "kl": 0.010162353515625, "learning_rate": 9.659510427974095e-07, "loss": -0.0076, "reward": 0.3383720777928829, "reward_std": 0.2205847203731537, "rewards/accuracy_multibox_reward": 0.3383720777928829, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 4131.0972900390625, "epoch": 0.1693936477382098, "grad_norm": 0.13074146211147308, "kl": 0.0146636962890625, "learning_rate": 9.654023848978448e-07, "loss": 0.0038, "reward": 0.24444445595145226, "reward_std": 0.23407471552491188, "rewards/accuracy_multibox_reward": 0.24444445595145226, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 3838.0277709960938, "epoch": 0.17035611164581327, "grad_norm": 0.37512320280075073, "kl": 0.011505126953125, "learning_rate": 9.648495184330876e-07, "loss": 0.0228, "reward": 0.21388889476656914, "reward_std": 0.0916629247367382, "rewards/accuracy_multibox_reward": 0.21388889476656914, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 3418.7084350585938, "epoch": 0.17131857555341676, "grad_norm": 0.09808792918920517, "kl": 0.010711669921875, "learning_rate": 9.64292449004393e-07, "loss": -0.0076, "reward": 0.33888888359069824, "reward_std": 0.06499818759039044, "rewards/accuracy_multibox_reward": 0.33888888359069824, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 4210.902893066406, "epoch": 0.17228103946102022, "grad_norm": 0.09819231182336807, "kl": 0.01296234130859375, "learning_rate": 9.637311822555969e-07, "loss": 0.0238, "reward": 0.28611110150814056, "reward_std": 0.23645970597863197, "rewards/accuracy_multibox_reward": 0.28611110150814056, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 3570.9861450195312, "epoch": 0.17324350336862368, "grad_norm": 0.1745200753211975, "kl": 0.0102081298828125, "learning_rate": 9.631657238730597e-07, "loss": 0.0108, "reward": 0.3500000089406967, "reward_std": 0.21217102464288473, "rewards/accuracy_multibox_reward": 0.3500000089406967, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 3778.1387939453125, "epoch": 0.17420596727622714, "grad_norm": 0.07609622180461884, "kl": 0.012664794921875, "learning_rate": 9.625960795856091e-07, "loss": 0.0316, "reward": 0.23333333805203438, "reward_std": 0.12836965546011925, "rewards/accuracy_multibox_reward": 0.23333333805203438, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 3817.09716796875, "epoch": 0.1751684311838306, "grad_norm": 0.17466215789318085, "kl": 0.0131683349609375, "learning_rate": 9.620222551644803e-07, "loss": 0.0188, "reward": 0.20555557310581207, "reward_std": 0.13771457970142365, "rewards/accuracy_multibox_reward": 0.20555557310581207, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 3842.5000610351562, "epoch": 0.17613089509143406, "grad_norm": 0.07578685134649277, "kl": 0.01538848876953125, "learning_rate": 9.614442564232594e-07, "loss": 0.0089, "reward": 0.25555556640028954, "reward_std": 0.06337888352572918, "rewards/accuracy_multibox_reward": 0.25555556640028954, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 3967.6666870117188, "epoch": 0.17709335899903753, "grad_norm": 0.05800415202975273, "kl": 0.011617660522460938, "learning_rate": 9.60862089217824e-07, "loss": 0.0082, "reward": 0.11666666902601719, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.11666666902601719, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 3610.7777709960938, "epoch": 0.17805582290664101, "grad_norm": 0.13688166439533234, "kl": 0.0148773193359375, "learning_rate": 9.602757594462832e-07, "loss": -0.0094, "reward": 0.21666666120290756, "reward_std": 0.054433105047792196, "rewards/accuracy_multibox_reward": 0.21666666120290756, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 3852.625, "epoch": 0.17901828681424448, "grad_norm": 0.07554133236408234, "kl": 0.011138916015625, "learning_rate": 9.596852730489184e-07, "loss": 0.0213, "reward": 0.22500000149011612, "reward_std": 0.13399580121040344, "rewards/accuracy_multibox_reward": 0.22500000149011612, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 3413.513916015625, "epoch": 0.17998075072184794, "grad_norm": 0.1946292519569397, "kl": 0.01605224609375, "learning_rate": 9.590906360081226e-07, "loss": 0.0073, "reward": 0.4444444663822651, "reward_std": 0.26935600861907005, "rewards/accuracy_multibox_reward": 0.4444444663822651, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 3370.3056030273438, "epoch": 0.1809432146294514, "grad_norm": 0.1966099590063095, "kl": 0.0149078369140625, "learning_rate": 9.584918543483411e-07, "loss": 0.0251, "reward": 0.4083333471789956, "reward_std": 0.1655353340320289, "rewards/accuracy_multibox_reward": 0.4083333471789956, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 3979.4861450195312, "epoch": 0.18190567853705486, "grad_norm": 0.13165150582790375, "kl": 0.016033172607421875, "learning_rate": 9.578889341360092e-07, "loss": 0.0182, "reward": 0.29722223803400993, "reward_std": 0.1869506686925888, "rewards/accuracy_multibox_reward": 0.29722223803400993, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 3329.611083984375, "epoch": 0.18286814244465832, "grad_norm": 0.2570968568325043, "kl": 0.0191650390625, "learning_rate": 9.572818814794907e-07, "loss": 0.0117, "reward": 0.21111111342906952, "reward_std": 0.09982090443372726, "rewards/accuracy_multibox_reward": 0.21111111342906952, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 4113.90283203125, "epoch": 0.18383060635226178, "grad_norm": 0.0756428986787796, "kl": 0.014373779296875, "learning_rate": 9.566707025290169e-07, "loss": 0.0221, "reward": 0.18611112236976624, "reward_std": 0.047628965228796005, "rewards/accuracy_multibox_reward": 0.18611112236976624, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 3653.5001220703125, "epoch": 0.18479307025986524, "grad_norm": 0.12549282610416412, "kl": 0.01275634765625, "learning_rate": 9.560554034766235e-07, "loss": 0.0287, "reward": 0.2944444613531232, "reward_std": 0.1513548525981605, "rewards/accuracy_multibox_reward": 0.2944444613531232, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 3801.0972290039062, "epoch": 0.18575553416746873, "grad_norm": 0.13181938230991364, "kl": 0.0201416015625, "learning_rate": 9.554359905560885e-07, "loss": 0.0282, "reward": 0.4250000072643161, "reward_std": 0.14029167126864195, "rewards/accuracy_multibox_reward": 0.4250000072643161, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 3453.1944580078125, "epoch": 0.1867179980750722, "grad_norm": 0.137934148311615, "kl": 0.01070404052734375, "learning_rate": 9.548124700428684e-07, "loss": 0.0264, "reward": 0.4555555656552315, "reward_std": 0.122441909275949, "rewards/accuracy_multibox_reward": 0.4555555656552315, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 4043.5695190429688, "epoch": 0.18768046198267566, "grad_norm": 0.14582735300064087, "kl": 0.014862060546875, "learning_rate": 9.541848482540351e-07, "loss": 0.032, "reward": 0.3166666701436043, "reward_std": 0.2764529548585415, "rewards/accuracy_multibox_reward": 0.3166666701436043, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 4105.888977050781, "epoch": 0.18864292589027912, "grad_norm": 0.13068167865276337, "kl": 0.019947052001953125, "learning_rate": 9.53553131548212e-07, "loss": 0.0531, "reward": 0.28055555559694767, "reward_std": 0.2641298249363899, "rewards/accuracy_multibox_reward": 0.28055555559694767, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 3775.3612060546875, "epoch": 0.18960538979788258, "grad_norm": 0.3541240096092224, "kl": 0.017303466796875, "learning_rate": 9.529173263255088e-07, "loss": -0.0095, "reward": 0.2722222153097391, "reward_std": 0.18751297891139984, "rewards/accuracy_multibox_reward": 0.2722222153097391, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 4305.40283203125, "epoch": 0.19056785370548604, "grad_norm": 0.11011335253715515, "kl": 0.015625, "learning_rate": 9.522774390274574e-07, "loss": 0.0116, "reward": 0.24166666343808174, "reward_std": 0.2237016912549734, "rewards/accuracy_multibox_reward": 0.24166666343808174, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 3941.8055419921875, "epoch": 0.1915303176130895, "grad_norm": 0.1520339846611023, "kl": 0.01438140869140625, "learning_rate": 9.516334761369466e-07, "loss": 0.0205, "reward": 0.48055557534098625, "reward_std": 0.19874423509463668, "rewards/accuracy_multibox_reward": 0.48055557534098625, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 3612.6387939453125, "epoch": 0.19249278152069296, "grad_norm": 0.08752023428678513, "kl": 0.016986846923828125, "learning_rate": 9.509854441781558e-07, "loss": 0.0001, "reward": 0.4277777820825577, "reward_std": 0.06303972192108631, "rewards/accuracy_multibox_reward": 0.4277777820825577, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 3696.3055419921875, "epoch": 0.19345524542829645, "grad_norm": 0.09001725912094116, "kl": 0.01519775390625, "learning_rate": 9.5033334971649e-07, "loss": 0.0132, "reward": 0.563888892531395, "reward_std": 0.17577658593654633, "rewards/accuracy_multibox_reward": 0.563888892531395, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 3652.013916015625, "epoch": 0.1944177093358999, "grad_norm": 0.11756515502929688, "kl": 0.013824462890625, "learning_rate": 9.496771993585123e-07, "loss": 0.0279, "reward": 0.39722222834825516, "reward_std": 0.2717311382293701, "rewards/accuracy_multibox_reward": 0.39722222834825516, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 4053.1250610351562, "epoch": 0.19538017324350337, "grad_norm": 0.19886107742786407, "kl": 0.0153350830078125, "learning_rate": 9.49016999751877e-07, "loss": 0.0356, "reward": 0.27777777425944805, "reward_std": 0.23628078773617744, "rewards/accuracy_multibox_reward": 0.27777777425944805, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 4033.0138549804688, "epoch": 0.19634263715110684, "grad_norm": 0.0993325486779213, "kl": 0.01755523681640625, "learning_rate": 9.483527575852629e-07, "loss": 0.004, "reward": 0.14722222555428743, "reward_std": 0.12648530676960945, "rewards/accuracy_multibox_reward": 0.14722222555428743, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 3801.3333129882812, "epoch": 0.1973051010587103, "grad_norm": 0.11120632290840149, "kl": 0.0141143798828125, "learning_rate": 9.47684479588305e-07, "loss": 0.0207, "reward": 0.20277778431773186, "reward_std": 0.15404100669547915, "rewards/accuracy_multibox_reward": 0.20277778431773186, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 3419.208251953125, "epoch": 0.19826756496631376, "grad_norm": 0.1339973509311676, "kl": 0.02536773681640625, "learning_rate": 9.470121725315267e-07, "loss": 0.0237, "reward": 0.525000024586916, "reward_std": 0.10515233501791954, "rewards/accuracy_multibox_reward": 0.525000024586916, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 3467.3472900390625, "epoch": 0.19923002887391722, "grad_norm": 0.1530306488275528, "kl": 0.021820068359375, "learning_rate": 9.463358432262709e-07, "loss": 0.0165, "reward": 0.36944444477558136, "reward_std": 0.10074726864695549, "rewards/accuracy_multibox_reward": 0.36944444477558136, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 3845.6944580078125, "epoch": 0.20019249278152068, "grad_norm": 0.13096755743026733, "kl": 0.023529052734375, "learning_rate": 9.45655498524631e-07, "loss": 0.022, "reward": 0.5583333447575569, "reward_std": 0.1723633618094027, "rewards/accuracy_multibox_reward": 0.5583333447575569, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 4238.916687011719, "epoch": 0.20115495668912417, "grad_norm": 0.1587723344564438, "kl": 0.01995849609375, "learning_rate": 9.449711453193817e-07, "loss": 0.0293, "reward": 0.2444444466382265, "reward_std": 0.22469089180231094, "rewards/accuracy_multibox_reward": 0.2444444466382265, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 3248.1805419921875, "epoch": 0.20211742059672763, "grad_norm": 0.25790685415267944, "kl": 0.022308349609375, "learning_rate": 9.44282790543909e-07, "loss": 0.0324, "reward": 0.2805555621162057, "reward_std": 0.15649517998099327, "rewards/accuracy_multibox_reward": 0.2805555621162057, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 3969.22216796875, "epoch": 0.2030798845043311, "grad_norm": 0.394692599773407, "kl": 0.019439697265625, "learning_rate": 9.435904411721399e-07, "loss": 0.0464, "reward": 0.2250000163912773, "reward_std": 0.21549124270677567, "rewards/accuracy_multibox_reward": 0.2250000163912773, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 3625.5972290039062, "epoch": 0.20404234841193455, "grad_norm": 0.1120128184556961, "kl": 0.0176544189453125, "learning_rate": 9.42894104218472e-07, "loss": 0.0069, "reward": 0.4111111033707857, "reward_std": 0.1587441023439169, "rewards/accuracy_multibox_reward": 0.4111111033707857, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 3736.6805419921875, "epoch": 0.20500481231953802, "grad_norm": 0.07531909644603729, "kl": 0.0236358642578125, "learning_rate": 9.421937867377021e-07, "loss": 0.0106, "reward": 0.3694444466382265, "reward_std": 0.09926874889060855, "rewards/accuracy_multibox_reward": 0.3694444466382265, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 4299.361083984375, "epoch": 0.20596727622714148, "grad_norm": 0.07095351815223694, "kl": 0.0220184326171875, "learning_rate": 9.414894958249554e-07, "loss": 0.0142, "reward": 0.10555555112659931, "reward_std": 0.09033814445137978, "rewards/accuracy_multibox_reward": 0.10555555112659931, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 3773.2639770507812, "epoch": 0.20692974013474494, "grad_norm": 0.1966848373413086, "kl": 0.012142181396484375, "learning_rate": 9.407812386156123e-07, "loss": 0.0389, "reward": 0.23055556043982506, "reward_std": 0.16367204813286662, "rewards/accuracy_multibox_reward": 0.23055556043982506, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 4142.777770996094, "epoch": 0.2078922040423484, "grad_norm": 0.1301807314157486, "kl": 0.020721435546875, "learning_rate": 9.400690222852377e-07, "loss": 0.0211, "reward": 0.16388889774680138, "reward_std": 0.13784046471118927, "rewards/accuracy_multibox_reward": 0.16388889774680138, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 3229.1527709960938, "epoch": 0.2088546679499519, "grad_norm": 0.09417244791984558, "kl": 0.01155853271484375, "learning_rate": 9.39352854049507e-07, "loss": 0.0153, "reward": 0.31666668877005577, "reward_std": 0.12247449159622192, "rewards/accuracy_multibox_reward": 0.31666668877005577, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 3639.25, "epoch": 0.20981713185755535, "grad_norm": 0.11257392913103104, "kl": 0.01763916015625, "learning_rate": 9.386327411641339e-07, "loss": 0.0147, "reward": 0.4194444604218006, "reward_std": 0.22715587355196476, "rewards/accuracy_multibox_reward": 0.4194444604218006, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 3924.02783203125, "epoch": 0.2107795957651588, "grad_norm": 0.06958074867725372, "kl": 0.022216796875, "learning_rate": 9.379086909247963e-07, "loss": 0.0046, "reward": 0.08611110597848892, "reward_std": 0.04270918294787407, "rewards/accuracy_multibox_reward": 0.08611110597848892, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 3787.9583740234375, "epoch": 0.21174205967276227, "grad_norm": 0.16281241178512573, "kl": 0.0164337158203125, "learning_rate": 9.371807106670627e-07, "loss": 0.0063, "reward": 0.305555559694767, "reward_std": 0.24845526739954948, "rewards/accuracy_multibox_reward": 0.305555559694767, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 3301.0833740234375, "epoch": 0.21270452358036573, "grad_norm": 0.21330398321151733, "kl": 0.013641357421875, "learning_rate": 9.364488077663177e-07, "loss": -0.0138, "reward": 0.2805555462837219, "reward_std": 0.1332874409854412, "rewards/accuracy_multibox_reward": 0.2805555462837219, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 3449.77783203125, "epoch": 0.2136669874879692, "grad_norm": 0.21594278514385223, "kl": 0.0190887451171875, "learning_rate": 9.357129896376873e-07, "loss": 0.0665, "reward": 0.29027779400348663, "reward_std": 0.15938228368759155, "rewards/accuracy_multibox_reward": 0.29027779400348663, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 3818.8195190429688, "epoch": 0.21462945139557266, "grad_norm": 0.12590175867080688, "kl": 0.025848388671875, "learning_rate": 9.349732637359641e-07, "loss": 0.0178, "reward": 0.3083333484828472, "reward_std": 0.16897395765408874, "rewards/accuracy_multibox_reward": 0.3083333484828472, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 3730.4027709960938, "epoch": 0.21559191530317612, "grad_norm": 0.059126317501068115, "kl": 0.0213623046875, "learning_rate": 9.342296375555314e-07, "loss": 0.0002, "reward": 0.013888888992369175, "reward_std": 0.03402068838477135, "rewards/accuracy_multibox_reward": 0.013888888992369175, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 4149.777893066406, "epoch": 0.2165543792107796, "grad_norm": 0.08872301876544952, "kl": 0.01983642578125, "learning_rate": 9.334821186302871e-07, "loss": 0.0073, "reward": 0.17222221195697784, "reward_std": 0.10931175947189331, "rewards/accuracy_multibox_reward": 0.17222221195697784, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 3746.3611450195312, "epoch": 0.21751684311838307, "grad_norm": 0.1335543990135193, "kl": 0.02423095703125, "learning_rate": 9.327307145335682e-07, "loss": 0.052, "reward": 0.41388891637325287, "reward_std": 0.19253847748041153, "rewards/accuracy_multibox_reward": 0.41388891637325287, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 3840.4862060546875, "epoch": 0.21847930702598653, "grad_norm": 0.09815480560064316, "kl": 0.021514892578125, "learning_rate": 9.319754328780733e-07, "loss": 0.0125, "reward": 0.14722223207354546, "reward_std": 0.17010344564914703, "rewards/accuracy_multibox_reward": 0.14722223207354546, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 3883.4166870117188, "epoch": 0.21944177093359, "grad_norm": 0.08889266848564148, "kl": 0.01714324951171875, "learning_rate": 9.312162813157853e-07, "loss": -0.0018, "reward": 0.0944444453343749, "reward_std": 0.1646435409784317, "rewards/accuracy_multibox_reward": 0.0944444453343749, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 3429.1112060546875, "epoch": 0.22040423484119345, "grad_norm": 0.051759421825408936, "kl": 0.01654052734375, "learning_rate": 9.304532675378946e-07, "loss": 0.0007, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.10000000149011612, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 3793.3194580078125, "epoch": 0.22136669874879691, "grad_norm": 0.1175106093287468, "kl": 0.01519775390625, "learning_rate": 9.296863992747214e-07, "loss": 0.0201, "reward": 0.38611113280057907, "reward_std": 0.24817372858524323, "rewards/accuracy_multibox_reward": 0.38611113280057907, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 3889.75, "epoch": 0.22232916265640038, "grad_norm": 0.11876222491264343, "kl": 0.017425537109375, "learning_rate": 9.289156842956357e-07, "loss": 0.0175, "reward": 0.21666667703539133, "reward_std": 0.16919447481632233, "rewards/accuracy_multibox_reward": 0.21666667703539133, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 3871.9444580078125, "epoch": 0.22329162656400384, "grad_norm": 0.11996452510356903, "kl": 0.020263671875, "learning_rate": 9.281411304089807e-07, "loss": 0.0205, "reward": 0.3861111253499985, "reward_std": 0.16633447259664536, "rewards/accuracy_multibox_reward": 0.3861111253499985, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 4256.041687011719, "epoch": 0.22425409047160733, "grad_norm": 0.12357217818498611, "kl": 0.0108184814453125, "learning_rate": 9.273627454619923e-07, "loss": 0.0114, "reward": 0.2277777846902609, "reward_std": 0.24911566451191902, "rewards/accuracy_multibox_reward": 0.2277777846902609, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 3960.4166259765625, "epoch": 0.2252165543792108, "grad_norm": 0.1322464793920517, "kl": 0.0279083251953125, "learning_rate": 9.265805373407199e-07, "loss": 0.0112, "reward": 0.38888890761882067, "reward_std": 0.21179041266441345, "rewards/accuracy_multibox_reward": 0.38888890761882067, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 4034.9444580078125, "epoch": 0.22617901828681425, "grad_norm": 0.09824764728546143, "kl": 0.0267486572265625, "learning_rate": 9.257945139699467e-07, "loss": 0.0356, "reward": 0.26944445818662643, "reward_std": 0.157456923276186, "rewards/accuracy_multibox_reward": 0.26944445818662643, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 3806.4861450195312, "epoch": 0.2271414821944177, "grad_norm": 0.10577450692653656, "kl": 0.02447509765625, "learning_rate": 9.250046833131097e-07, "loss": 0.0162, "reward": 0.3083333298563957, "reward_std": 0.15772783569991589, "rewards/accuracy_multibox_reward": 0.3083333298563957, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 3575.90283203125, "epoch": 0.22810394610202117, "grad_norm": 0.18376494944095612, "kl": 0.0223236083984375, "learning_rate": 9.242110533722184e-07, "loss": 0.0549, "reward": 0.5055555487051606, "reward_std": 0.25937413796782494, "rewards/accuracy_multibox_reward": 0.5055555487051606, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 3932.7916870117188, "epoch": 0.22906641000962463, "grad_norm": 0.1626387983560562, "kl": 0.03045654296875, "learning_rate": 9.234136321877736e-07, "loss": 0.0275, "reward": 0.27222223673015833, "reward_std": 0.24690594151616096, "rewards/accuracy_multibox_reward": 0.27222223673015833, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 3639.52783203125, "epoch": 0.2300288739172281, "grad_norm": 0.20477136969566345, "kl": 0.0242919921875, "learning_rate": 9.226124278386868e-07, "loss": 0.0049, "reward": 0.4305555820465088, "reward_std": 0.15649516507983208, "rewards/accuracy_multibox_reward": 0.4305555820465088, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 3838.7361450195312, "epoch": 0.23099133782483156, "grad_norm": 0.07073355466127396, "kl": 0.0217437744140625, "learning_rate": 9.218074484421977e-07, "loss": 0.0026, "reward": 0.1277777859941125, "reward_std": 0.08164966106414795, "rewards/accuracy_multibox_reward": 0.1277777859941125, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 3950.4028930664062, "epoch": 0.23195380173243504, "grad_norm": 0.20643223822116852, "kl": 0.024322509765625, "learning_rate": 9.209987021537921e-07, "loss": -0.0096, "reward": 0.19444444216787815, "reward_std": 0.11661261413246393, "rewards/accuracy_multibox_reward": 0.19444444216787815, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 3543.9306030273438, "epoch": 0.2329162656400385, "grad_norm": 0.09312759339809418, "kl": 0.0176239013671875, "learning_rate": 9.201861971671195e-07, "loss": -0.0067, "reward": 0.3850019443780184, "reward_std": 0.10711081977933645, "rewards/accuracy_multibox_reward": 0.3850019443780184, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 3739.3888549804688, "epoch": 0.23387872954764197, "grad_norm": 0.13449543714523315, "kl": 0.0245513916015625, "learning_rate": 9.193699417139096e-07, "loss": 0.0136, "reward": 0.25555556267499924, "reward_std": 0.0968865305185318, "rewards/accuracy_multibox_reward": 0.25555556267499924, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 3853.27783203125, "epoch": 0.23484119345524543, "grad_norm": 0.18399423360824585, "kl": 0.026214599609375, "learning_rate": 9.185499440638892e-07, "loss": 0.016, "reward": 0.18333333916962147, "reward_std": 0.09753628401085734, "rewards/accuracy_multibox_reward": 0.18333333916962147, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 3566.72216796875, "epoch": 0.2358036573628489, "grad_norm": 0.11895571649074554, "kl": 0.0148773193359375, "learning_rate": 9.177262125246988e-07, "loss": 0.0362, "reward": 0.4166666828095913, "reward_std": 0.16927633434534073, "rewards/accuracy_multibox_reward": 0.4166666828095913, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 3892.9444580078125, "epoch": 0.23676612127045235, "grad_norm": 0.10373275727033615, "kl": 0.0175323486328125, "learning_rate": 9.168987554418078e-07, "loss": 0.0079, "reward": 0.11944444663822651, "reward_std": 0.04762896476313472, "rewards/accuracy_multibox_reward": 0.11944444663822651, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 3638.7083740234375, "epoch": 0.2377285851780558, "grad_norm": 0.17320069670677185, "kl": 0.0206298828125, "learning_rate": 9.1606758119843e-07, "loss": 0.0023, "reward": 0.3305555656552315, "reward_std": 0.13866045325994492, "rewards/accuracy_multibox_reward": 0.3305555656552315, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 3670.4166870117188, "epoch": 0.2386910490856593, "grad_norm": 0.080434150993824, "kl": 0.01971435546875, "learning_rate": 9.152326982154389e-07, "loss": 0.006, "reward": 0.15000000223517418, "reward_std": 0.054772257804870605, "rewards/accuracy_multibox_reward": 0.15000000223517418, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 3671.3333740234375, "epoch": 0.23965351299326276, "grad_norm": 0.15996555984020233, "kl": 0.01988983154296875, "learning_rate": 9.143941149512828e-07, "loss": 0.0279, "reward": 0.31388889253139496, "reward_std": 0.1952805556356907, "rewards/accuracy_multibox_reward": 0.31388889253139496, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 3995.8333129882812, "epoch": 0.24061597690086622, "grad_norm": 0.10721436142921448, "kl": 0.027130126953125, "learning_rate": 9.135518399018983e-07, "loss": 0.0284, "reward": 0.16111111361533403, "reward_std": 0.09770363569259644, "rewards/accuracy_multibox_reward": 0.16111111361533403, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 3737.7223510742188, "epoch": 0.24157844080846969, "grad_norm": 0.15812598168849945, "kl": 0.0199432373046875, "learning_rate": 9.127058816006243e-07, "loss": 0.002, "reward": 0.4777778033167124, "reward_std": 0.16720083355903625, "rewards/accuracy_multibox_reward": 0.4777778033167124, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 3249.9444580078125, "epoch": 0.24254090471607315, "grad_norm": 0.12829096615314484, "kl": 0.0160980224609375, "learning_rate": 9.118562486181164e-07, "loss": 0.0031, "reward": 0.4055555574595928, "reward_std": 0.20447580143809319, "rewards/accuracy_multibox_reward": 0.4055555574595928, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 3911.0138549804688, "epoch": 0.2435033686236766, "grad_norm": 0.2833734154701233, "kl": 0.02130126953125, "learning_rate": 9.110029495622589e-07, "loss": 0.0206, "reward": 0.1388888992369175, "reward_std": 0.16200795397162437, "rewards/accuracy_multibox_reward": 0.1388888992369175, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 4221.208435058594, "epoch": 0.24446583253128007, "grad_norm": 0.14375948905944824, "kl": 0.0391845703125, "learning_rate": 9.10145993078079e-07, "loss": 0.0017, "reward": 0.0833333358168602, "reward_std": 0.17411427199840546, "rewards/accuracy_multibox_reward": 0.0833333358168602, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 2996.249969482422, "epoch": 0.24542829643888353, "grad_norm": 0.19870680570602417, "kl": 0.0240478515625, "learning_rate": 9.092853878476574e-07, "loss": 0.0321, "reward": 0.3916666693985462, "reward_std": 0.15393884386867285, "rewards/accuracy_multibox_reward": 0.3916666693985462, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 3814.4305419921875, "epoch": 0.24639076034648702, "grad_norm": 0.13255567848682404, "kl": 0.0204620361328125, "learning_rate": 9.084211425900421e-07, "loss": 0.0186, "reward": 0.2305555660277605, "reward_std": 0.19406627863645554, "rewards/accuracy_multibox_reward": 0.2305555660277605, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 3544.6944580078125, "epoch": 0.24735322425409048, "grad_norm": 0.20454497635364532, "kl": 0.01556396484375, "learning_rate": 9.075532660611589e-07, "loss": 0.0055, "reward": 0.32777778804302216, "reward_std": 0.12978702131658792, "rewards/accuracy_multibox_reward": 0.32777778804302216, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 3809.97216796875, "epoch": 0.24831568816169394, "grad_norm": 0.12961366772651672, "kl": 0.0228729248046875, "learning_rate": 9.066817670537232e-07, "loss": 0.0055, "reward": 0.2173611158505082, "reward_std": 0.12417552201077342, "rewards/accuracy_multibox_reward": 0.2173611158505082, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 3689.3194580078125, "epoch": 0.2492781520692974, "grad_norm": 0.15478357672691345, "kl": 0.02288818359375, "learning_rate": 9.05806654397151e-07, "loss": -0.0116, "reward": 0.17499999701976776, "reward_std": 0.04280824912711978, "rewards/accuracy_multibox_reward": 0.17499999701976776, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 4206.680725097656, "epoch": 0.2502406159769009, "grad_norm": 0.09190937876701355, "kl": 0.029815673828125, "learning_rate": 9.049279369574688e-07, "loss": 0.0071, "reward": 0.08888888359069824, "reward_std": 0.044305335730314255, "rewards/accuracy_multibox_reward": 0.08888888359069824, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 3545.3472290039062, "epoch": 0.25120307988450435, "grad_norm": 0.10095853358507156, "kl": 0.0184326171875, "learning_rate": 9.040456236372247e-07, "loss": -0.0072, "reward": 0.21111110597848892, "reward_std": 0.05675767362117767, "rewards/accuracy_multibox_reward": 0.21111110597848892, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 3688.77783203125, "epoch": 0.2521655437921078, "grad_norm": 0.07417663186788559, "kl": 0.020172119140625, "learning_rate": 9.031597233753974e-07, "loss": 0.0033, "reward": 0.09444444626569748, "reward_std": 0.10484267771244049, "rewards/accuracy_multibox_reward": 0.09444444626569748, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 3740.7777709960938, "epoch": 0.2531280076997113, "grad_norm": 0.06811701506376266, "kl": 0.027618408203125, "learning_rate": 9.022702451473062e-07, "loss": 0.0164, "reward": 0.16944445669651031, "reward_std": 0.05417735129594803, "rewards/accuracy_multibox_reward": 0.16944445669651031, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 3574.9306030273438, "epoch": 0.25409047160731474, "grad_norm": 0.08482251316308975, "kl": 0.014862060546875, "learning_rate": 9.013771979645198e-07, "loss": 0.0184, "reward": 0.5444444827735424, "reward_std": 0.17073044553399086, "rewards/accuracy_multibox_reward": 0.5444444827735424, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 3645.8194580078125, "epoch": 0.2550529355149182, "grad_norm": 0.07928410172462463, "kl": 0.011749267578125, "learning_rate": 9.004805908747649e-07, "loss": 0.0266, "reward": 0.4416666738688946, "reward_std": 0.17501812800765038, "rewards/accuracy_multibox_reward": 0.4416666738688946, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 4015.9445190429688, "epoch": 0.25601539942252166, "grad_norm": 0.07715762406587601, "kl": 0.0144805908203125, "learning_rate": 8.995804329618347e-07, "loss": 0.022, "reward": 0.23611110635101795, "reward_std": 0.1488310731947422, "rewards/accuracy_multibox_reward": 0.23611110635101795, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 4165.402648925781, "epoch": 0.2569778633301251, "grad_norm": 0.07266516238451004, "kl": 0.021240234375, "learning_rate": 8.986767333454972e-07, "loss": 0.0009, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 4051.3889770507812, "epoch": 0.2579403272377286, "grad_norm": 0.11230955272912979, "kl": 0.03021240234375, "learning_rate": 8.977695011814019e-07, "loss": -0.0003, "reward": 0.17222222685813904, "reward_std": 0.14286448433995247, "rewards/accuracy_multibox_reward": 0.17222222685813904, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 3550.8055419921875, "epoch": 0.25890279114533205, "grad_norm": 0.23930281400680542, "kl": 0.026824951171875, "learning_rate": 8.968587456609881e-07, "loss": 0.0334, "reward": 0.30277779046446085, "reward_std": 0.22747621685266495, "rewards/accuracy_multibox_reward": 0.30277779046446085, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 3731.8472290039062, "epoch": 0.2598652550529355, "grad_norm": 0.2879273593425751, "kl": 0.02099609375, "learning_rate": 8.959444760113906e-07, "loss": 0.0257, "reward": 0.13611111789941788, "reward_std": 0.15059919282794, "rewards/accuracy_multibox_reward": 0.13611111789941788, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 3823.1944580078125, "epoch": 0.26082771896053897, "grad_norm": 0.1468006819486618, "kl": 0.0194854736328125, "learning_rate": 8.950267014953477e-07, "loss": 0.0202, "reward": 0.35555557161569595, "reward_std": 0.2107452116906643, "rewards/accuracy_multibox_reward": 0.35555557161569595, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 4273.541687011719, "epoch": 0.26179018286814243, "grad_norm": 0.08609103411436081, "kl": 0.01666259765625, "learning_rate": 8.941054314111063e-07, "loss": 0.0094, "reward": 0.1388888880610466, "reward_std": 0.06299494160339236, "rewards/accuracy_multibox_reward": 0.1388888880610466, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 4260.097229003906, "epoch": 0.2627526467757459, "grad_norm": 0.12982021272182465, "kl": 0.037750244140625, "learning_rate": 8.931806750923274e-07, "loss": 0.0028, "reward": 0.11944444663822651, "reward_std": 0.04762896476313472, "rewards/accuracy_multibox_reward": 0.11944444663822651, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 3644.0695190429688, "epoch": 0.26371511068334935, "grad_norm": 0.2523293197154999, "kl": 0.021759033203125, "learning_rate": 8.922524419079927e-07, "loss": 0.0036, "reward": 0.16388889122754335, "reward_std": 0.08979802951216698, "rewards/accuracy_multibox_reward": 0.16388889122754335, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 3759.1805419921875, "epoch": 0.2646775745909528, "grad_norm": 0.0797586664557457, "kl": 0.0192108154296875, "learning_rate": 8.913207412623089e-07, "loss": 0.0099, "reward": 0.14444445446133614, "reward_std": 0.058890245854854584, "rewards/accuracy_multibox_reward": 0.14444445446133614, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 3540.0556640625, "epoch": 0.26564003849855633, "grad_norm": 0.11917101591825485, "kl": 0.01824951171875, "learning_rate": 8.903855825946123e-07, "loss": 0.013, "reward": 0.2819444630295038, "reward_std": 0.20388893876224756, "rewards/accuracy_multibox_reward": 0.2819444630295038, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 3730.7777709960938, "epoch": 0.2666025024061598, "grad_norm": 0.11781871318817139, "kl": 0.0241851806640625, "learning_rate": 8.89446975379274e-07, "loss": 0.0098, "reward": 0.3444444574415684, "reward_std": 0.1556864855811, "rewards/accuracy_multibox_reward": 0.3444444574415684, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 3412.8194580078125, "epoch": 0.26756496631376325, "grad_norm": 0.08007071912288666, "kl": 0.02425384521484375, "learning_rate": 8.885049291256028e-07, "loss": -0.0032, "reward": 0.11388888955116272, "reward_std": 0.0897926613688469, "rewards/accuracy_multibox_reward": 0.11388888955116272, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 3914.444580078125, "epoch": 0.2685274302213667, "grad_norm": 0.2536351680755615, "kl": 0.017791748046875, "learning_rate": 8.875594533777498e-07, "loss": -0.0204, "reward": 0.2888888865709305, "reward_std": 0.02221490489318967, "rewards/accuracy_multibox_reward": 0.2888888865709305, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 3754.9305419921875, "epoch": 0.2694898941289702, "grad_norm": 0.11386415362358093, "kl": 0.02142333984375, "learning_rate": 8.86610557714611e-07, "loss": 0.0239, "reward": 0.4055555686354637, "reward_std": 0.1998920552432537, "rewards/accuracy_multibox_reward": 0.4055555686354637, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 4320.263916015625, "epoch": 0.27045235803657364, "grad_norm": 0.08586591482162476, "kl": 0.023834228515625, "learning_rate": 8.856582517497311e-07, "loss": 0.0127, "reward": 0.0972222238779068, "reward_std": 0.07484552264213562, "rewards/accuracy_multibox_reward": 0.0972222238779068, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 4050.97216796875, "epoch": 0.2714148219441771, "grad_norm": 0.18207122385501862, "kl": 0.017120361328125, "learning_rate": 8.84702545131205e-07, "loss": 0.0109, "reward": 0.033333334140479565, "reward_std": 0.08164965361356735, "rewards/accuracy_multibox_reward": 0.033333334140479565, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 3787.6806640625, "epoch": 0.27237728585178056, "grad_norm": 0.12520091235637665, "kl": 0.02630615234375, "learning_rate": 8.83743447541581e-07, "loss": 0.0287, "reward": 0.44166668877005577, "reward_std": 0.194070715457201, "rewards/accuracy_multibox_reward": 0.44166668877005577, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 3772.2083129882812, "epoch": 0.273339749759384, "grad_norm": 0.09107410907745361, "kl": 0.0163116455078125, "learning_rate": 8.82780968697762e-07, "loss": 0.0188, "reward": 0.1583333369344473, "reward_std": 0.10137549880892038, "rewards/accuracy_multibox_reward": 0.1583333369344473, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 3422.6805419921875, "epoch": 0.2743022136669875, "grad_norm": 0.4255950152873993, "kl": 0.013397216796875, "learning_rate": 8.818151183509078e-07, "loss": 0.0282, "reward": 0.3138888943940401, "reward_std": 0.15372710302472115, "rewards/accuracy_multibox_reward": 0.3138888943940401, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 3446.5139770507812, "epoch": 0.27526467757459094, "grad_norm": 0.12794780731201172, "kl": 0.0192718505859375, "learning_rate": 8.80845906286336e-07, "loss": 0.014, "reward": 0.42222222685813904, "reward_std": 0.10987596027553082, "rewards/accuracy_multibox_reward": 0.42222222685813904, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 3834.84716796875, "epoch": 0.2762271414821944, "grad_norm": 0.22605697810649872, "kl": 0.022613525390625, "learning_rate": 8.798733423234219e-07, "loss": 0.0174, "reward": 0.1111111119389534, "reward_std": 0.09982088953256607, "rewards/accuracy_multibox_reward": 0.1111111119389534, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 3798.5972290039062, "epoch": 0.27718960538979787, "grad_norm": 0.08970917016267776, "kl": 0.017669677734375, "learning_rate": 8.788974363155007e-07, "loss": 0.0049, "reward": 0.07777778059244156, "reward_std": 0.13549776375293732, "rewards/accuracy_multibox_reward": 0.07777778059244156, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 3810.9861450195312, "epoch": 0.2781520692974013, "grad_norm": 0.12395250052213669, "kl": 0.02374267578125, "learning_rate": 8.779181981497669e-07, "loss": 0.0199, "reward": 0.17435897164978087, "reward_std": 0.07297342550009489, "rewards/accuracy_multibox_reward": 0.17435897164978087, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 3851.1527709960938, "epoch": 0.2791145332050048, "grad_norm": 0.07345845550298691, "kl": 0.0212860107421875, "learning_rate": 8.769356377471734e-07, "loss": 0.0046, "reward": 0.10277777351438999, "reward_std": 0.13684292882680893, "rewards/accuracy_multibox_reward": 0.10277777351438999, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 3608.9722290039062, "epoch": 0.28007699711260825, "grad_norm": 0.12635809183120728, "kl": 0.0208892822265625, "learning_rate": 8.759497650623324e-07, "loss": 0.0084, "reward": 0.5249999985098839, "reward_std": 0.06930387578904629, "rewards/accuracy_multibox_reward": 0.5249999985098839, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 3890.2222900390625, "epoch": 0.28103946102021177, "grad_norm": 0.12662087380886078, "kl": 0.01383209228515625, "learning_rate": 8.74960590083413e-07, "loss": 0.0303, "reward": 0.06388888973742723, "reward_std": 0.08879294991493225, "rewards/accuracy_multibox_reward": 0.06388888973742723, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 3688.9306640625, "epoch": 0.28200192492781523, "grad_norm": 0.11119332164525986, "kl": 0.0281524658203125, "learning_rate": 8.739681228320418e-07, "loss": 0.0085, "reward": 0.269444452598691, "reward_std": 0.181393189355731, "rewards/accuracy_multibox_reward": 0.269444452598691, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 3965.416748046875, "epoch": 0.2829643888354187, "grad_norm": 0.1089433878660202, "kl": 0.021148681640625, "learning_rate": 8.729723733631998e-07, "loss": 0.0085, "reward": 0.1666666716337204, "reward_std": 0.05270462483167648, "rewards/accuracy_multibox_reward": 0.1666666716337204, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 3419.52783203125, "epoch": 0.28392685274302215, "grad_norm": 0.12976950407028198, "kl": 0.0247039794921875, "learning_rate": 8.71973351765121e-07, "loss": 0.0161, "reward": 0.15555556304752827, "reward_std": 0.10107123851776123, "rewards/accuracy_multibox_reward": 0.15555556304752827, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 3990.77783203125, "epoch": 0.2848893166506256, "grad_norm": 0.08886986970901489, "kl": 0.024810791015625, "learning_rate": 8.709710681591906e-07, "loss": 0.0142, "reward": 0.10000000521540642, "reward_std": 0.10327956825494766, "rewards/accuracy_multibox_reward": 0.10000000521540642, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 3197.5555419921875, "epoch": 0.2858517805582291, "grad_norm": 0.1475580781698227, "kl": 0.0214385986328125, "learning_rate": 8.699655326998423e-07, "loss": 0.0054, "reward": 0.1666666753590107, "reward_std": 0.09660216048359871, "rewards/accuracy_multibox_reward": 0.1666666753590107, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 3959.4583740234375, "epoch": 0.28681424446583254, "grad_norm": 0.07950277626514435, "kl": 0.0271148681640625, "learning_rate": 8.68956755574455e-07, "loss": 0.004, "reward": 0.11666665971279144, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.11666665971279144, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 3413.736114501953, "epoch": 0.287776708373436, "grad_norm": 0.13760590553283691, "kl": 0.02294921875, "learning_rate": 8.679447470032501e-07, "loss": -0.0024, "reward": 0.15833333879709244, "reward_std": 0.09511926770210266, "rewards/accuracy_multibox_reward": 0.15833333879709244, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 3945.8472290039062, "epoch": 0.28873917228103946, "grad_norm": 0.11237850785255432, "kl": 0.0211944580078125, "learning_rate": 8.669295172391872e-07, "loss": 0.0133, "reward": 0.08333333674818277, "reward_std": 0.08819803968071938, "rewards/accuracy_multibox_reward": 0.08333333674818277, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 3435.5834350585938, "epoch": 0.2897016361886429, "grad_norm": 0.1475084125995636, "kl": 0.0286712646484375, "learning_rate": 8.659110765678615e-07, "loss": 0.0106, "reward": 0.3638888970017433, "reward_std": 0.07035869685932994, "rewards/accuracy_multibox_reward": 0.3638888970017433, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 3505.6111450195312, "epoch": 0.2906641000962464, "grad_norm": 0.2830542325973511, "kl": 0.0243988037109375, "learning_rate": 8.648894353073987e-07, "loss": 0.0135, "reward": 0.2638888955116272, "reward_std": 0.15590589493513107, "rewards/accuracy_multibox_reward": 0.2638888955116272, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 3520.763916015625, "epoch": 0.29162656400384984, "grad_norm": 0.11044588685035706, "kl": 0.016357421875, "learning_rate": 8.638646038083501e-07, "loss": 0.0093, "reward": 0.13055556267499924, "reward_std": 0.15318689495325089, "rewards/accuracy_multibox_reward": 0.13055556267499924, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 3809.1527709960938, "epoch": 0.2925890279114533, "grad_norm": 0.14650945365428925, "kl": 0.0189208984375, "learning_rate": 8.628365924535891e-07, "loss": 0.0578, "reward": 0.32500000298023224, "reward_std": 0.18890303000807762, "rewards/accuracy_multibox_reward": 0.32500000298023224, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 3866.8751220703125, "epoch": 0.29355149181905676, "grad_norm": 0.18905411660671234, "kl": 0.017730712890625, "learning_rate": 8.618054116582041e-07, "loss": 0.0321, "reward": 0.20833333767950535, "reward_std": 0.17265092954039574, "rewards/accuracy_multibox_reward": 0.20833333767950535, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 3636.0555419921875, "epoch": 0.2945139557266602, "grad_norm": 0.07070495188236237, "kl": 0.0254974365234375, "learning_rate": 8.607710718693951e-07, "loss": 0.0034, "reward": 0.14444444701075554, "reward_std": 0.08164965361356735, "rewards/accuracy_multibox_reward": 0.14444444701075554, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 3100.8333740234375, "epoch": 0.29547641963426374, "grad_norm": 0.12067365646362305, "kl": 0.014556884765625, "learning_rate": 8.597335835663662e-07, "loss": -0.0108, "reward": 0.2527777776122093, "reward_std": 0.042808253318071365, "rewards/accuracy_multibox_reward": 0.2527777776122093, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 3450.7222290039062, "epoch": 0.2964388835418672, "grad_norm": 0.3272677958011627, "kl": 0.0187225341796875, "learning_rate": 8.586929572602202e-07, "loss": 0.0187, "reward": 0.38333333656191826, "reward_std": 0.272291112691164, "rewards/accuracy_multibox_reward": 0.38333333656191826, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 3916.0834350585938, "epoch": 0.29740134744947067, "grad_norm": 0.16139155626296997, "kl": 0.02239990234375, "learning_rate": 8.576492034938517e-07, "loss": 0.0102, "reward": 0.505555547773838, "reward_std": 0.223561841994524, "rewards/accuracy_multibox_reward": 0.505555547773838, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 3628.1944580078125, "epoch": 0.2983638113570741, "grad_norm": 0.2633228302001953, "kl": 0.0180816650390625, "learning_rate": 8.56602332841841e-07, "loss": 0.0584, "reward": 0.3750000149011612, "reward_std": 0.2667866498231888, "rewards/accuracy_multibox_reward": 0.3750000149011612, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 3757.9027709960938, "epoch": 0.2993262752646776, "grad_norm": 0.10251495987176895, "kl": 0.024169921875, "learning_rate": 8.555523559103462e-07, "loss": 0.0169, "reward": 0.26944445818662643, "reward_std": 0.08945460431277752, "rewards/accuracy_multibox_reward": 0.26944445818662643, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 3648.5555419921875, "epoch": 0.30028873917228105, "grad_norm": 0.19866324961185455, "kl": 0.0247955322265625, "learning_rate": 8.544992833369961e-07, "loss": 0.0004, "reward": 0.08611111342906952, "reward_std": 0.12950152903795242, "rewards/accuracy_multibox_reward": 0.08611111342906952, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 3250.9166259765625, "epoch": 0.3012512030798845, "grad_norm": 0.37991511821746826, "kl": 0.0263519287109375, "learning_rate": 8.534431257907821e-07, "loss": 0.0365, "reward": 0.37499998696148396, "reward_std": 0.09419586090371013, "rewards/accuracy_multibox_reward": 0.37499998696148396, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 3602.6527709960938, "epoch": 0.30221366698748797, "grad_norm": 0.08071848005056381, "kl": 0.020263671875, "learning_rate": 8.52383893971951e-07, "loss": 0.0084, "reward": 0.19722222536802292, "reward_std": 0.08084797114133835, "rewards/accuracy_multibox_reward": 0.19722222536802292, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 3900.9722290039062, "epoch": 0.30317613089509143, "grad_norm": 0.08644905686378479, "kl": 0.0316162109375, "learning_rate": 8.513215986118954e-07, "loss": 0.0013, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 3714.5416870117188, "epoch": 0.3041385948026949, "grad_norm": 0.12465652823448181, "kl": 0.02166748046875, "learning_rate": 8.502562504730457e-07, "loss": 0.0351, "reward": 0.288888905197382, "reward_std": 0.194532900582999, "rewards/accuracy_multibox_reward": 0.288888905197382, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 3279.5416259765625, "epoch": 0.30510105871029836, "grad_norm": 0.13978981971740723, "kl": 0.026092529296875, "learning_rate": 8.491878603487612e-07, "loss": 0.0237, "reward": 0.19999999925494194, "reward_std": 0.21236572414636612, "rewards/accuracy_multibox_reward": 0.19999999925494194, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 3605.4583740234375, "epoch": 0.3060635226179018, "grad_norm": 0.04338908940553665, "kl": 0.017547607421875, "learning_rate": 8.4811643906322e-07, "loss": 0.0007, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.10000000149011612, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 3626.1944580078125, "epoch": 0.3070259865255053, "grad_norm": 0.14544837176799774, "kl": 0.0266571044921875, "learning_rate": 8.470419974713099e-07, "loss": 0.0244, "reward": 0.2722222227603197, "reward_std": 0.13409734517335892, "rewards/accuracy_multibox_reward": 0.2722222227603197, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 3763.1389770507812, "epoch": 0.30798845043310874, "grad_norm": 0.06241542845964432, "kl": 0.02356719970703125, "learning_rate": 8.459645464585188e-07, "loss": 0.0037, "reward": 0.08888889290392399, "reward_std": 0.10180631652474403, "rewards/accuracy_multibox_reward": 0.08888889290392399, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 3727.0833129882812, "epoch": 0.3089509143407122, "grad_norm": 0.11057883501052856, "kl": 0.0277252197265625, "learning_rate": 8.448840969408233e-07, "loss": 0.0159, "reward": 0.13888889737427235, "reward_std": 0.15835123136639595, "rewards/accuracy_multibox_reward": 0.13888889737427235, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 3972.25, "epoch": 0.30991337824831566, "grad_norm": 0.1880810558795929, "kl": 0.021636962890625, "learning_rate": 8.438006598645793e-07, "loss": 0.0331, "reward": 0.3055555559694767, "reward_std": 0.15703379828482866, "rewards/accuracy_multibox_reward": 0.3055555559694767, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 3564.7362060546875, "epoch": 0.3108758421559192, "grad_norm": 0.11208683252334595, "kl": 0.0209503173828125, "learning_rate": 8.427142462064098e-07, "loss": 0.013, "reward": 0.32777777686715126, "reward_std": 0.12436037044972181, "rewards/accuracy_multibox_reward": 0.32777777686715126, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 3816.3750610351562, "epoch": 0.31183830606352264, "grad_norm": 0.09422220289707184, "kl": 0.01776123046875, "learning_rate": 8.416248669730953e-07, "loss": 0.0225, "reward": 0.08611111529171467, "reward_std": 0.09926874935626984, "rewards/accuracy_multibox_reward": 0.08611111529171467, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 3510.361083984375, "epoch": 0.3128007699711261, "grad_norm": 0.10775721073150635, "kl": 0.0235443115234375, "learning_rate": 8.40532533201461e-07, "loss": 0.0221, "reward": 0.25833334773778915, "reward_std": 0.14847273007035255, "rewards/accuracy_multibox_reward": 0.25833334773778915, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 3694.3472290039062, "epoch": 0.31376323387872956, "grad_norm": 0.1538933515548706, "kl": 0.023406982421875, "learning_rate": 8.394372559582654e-07, "loss": 0.0005, "reward": 0.34722222574055195, "reward_std": 0.06484221573919058, "rewards/accuracy_multibox_reward": 0.34722222574055195, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 3506.8750610351562, "epoch": 0.314725697786333, "grad_norm": 0.11298210918903351, "kl": 0.0137176513671875, "learning_rate": 8.383390463400882e-07, "loss": 0.0331, "reward": 0.4138888977468014, "reward_std": 0.18385350983589888, "rewards/accuracy_multibox_reward": 0.4138888977468014, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 3960.0973510742188, "epoch": 0.3156881616939365, "grad_norm": 0.12746112048625946, "kl": 0.03106689453125, "learning_rate": 8.372379154732177e-07, "loss": 0.0178, "reward": 0.11111111380159855, "reward_std": 0.08385797962546349, "rewards/accuracy_multibox_reward": 0.11111111380159855, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 3543.0833129882812, "epoch": 0.31665062560153995, "grad_norm": 0.6437938809394836, "kl": 0.037811279296875, "learning_rate": 8.361338745135388e-07, "loss": 0.0339, "reward": 0.1861111167818308, "reward_std": 0.13061749562621117, "rewards/accuracy_multibox_reward": 0.1861111167818308, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 3619.9722900390625, "epoch": 0.3176130895091434, "grad_norm": 0.23140501976013184, "kl": 0.0366973876953125, "learning_rate": 8.350269346464187e-07, "loss": 0.0549, "reward": 0.3027777783572674, "reward_std": 0.2633451297879219, "rewards/accuracy_multibox_reward": 0.3027777783572674, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 3425.90283203125, "epoch": 0.31857555341674687, "grad_norm": 0.7611210942268372, "kl": 0.034576416015625, "learning_rate": 8.339171070865948e-07, "loss": 0.0213, "reward": 0.11388888396322727, "reward_std": 0.08702389150857925, "rewards/accuracy_multibox_reward": 0.11388888396322727, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 3764.8472900390625, "epoch": 0.31953801732435033, "grad_norm": 0.3939546048641205, "kl": 0.02770233154296875, "learning_rate": 8.328044030780604e-07, "loss": 0.0131, "reward": 0.23888889886438847, "reward_std": 0.14189605880528688, "rewards/accuracy_multibox_reward": 0.23888889886438847, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 3612.388916015625, "epoch": 0.3205004812319538, "grad_norm": 0.13332480192184448, "kl": 0.035430908203125, "learning_rate": 8.316888338939512e-07, "loss": 0.0147, "reward": 0.15833333507180214, "reward_std": 0.15086374804377556, "rewards/accuracy_multibox_reward": 0.15833333507180214, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 3465.5, "epoch": 0.32146294513955725, "grad_norm": 0.13514494895935059, "kl": 0.01751708984375, "learning_rate": 8.305704108364301e-07, "loss": 0.003, "reward": 0.2361111156642437, "reward_std": 0.13509192783385515, "rewards/accuracy_multibox_reward": 0.2361111156642437, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 2619.6944580078125, "epoch": 0.3224254090471607, "grad_norm": 0.09563828259706497, "kl": 0.022064208984375, "learning_rate": 8.294491452365743e-07, "loss": 0.0009, "reward": 0.20000000298023224, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.20000000298023224, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 3071.5138549804688, "epoch": 0.3233878729547642, "grad_norm": 0.0971384271979332, "kl": 0.0132293701171875, "learning_rate": 8.283250484542593e-07, "loss": 0.0167, "reward": 0.191666672937572, "reward_std": 0.17125433310866356, "rewards/accuracy_multibox_reward": 0.191666672937572, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 3083.0, "epoch": 0.32435033686236764, "grad_norm": 0.16158224642276764, "kl": 0.026641845703125, "learning_rate": 8.271981318780441e-07, "loss": -0.0049, "reward": 0.4638889208436012, "reward_std": 0.11142591293901205, "rewards/accuracy_multibox_reward": 0.4638889208436012, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 3383.7083740234375, "epoch": 0.3253128007699711, "grad_norm": 0.23850443959236145, "kl": 0.034698486328125, "learning_rate": 8.260684069250559e-07, "loss": -0.0327, "reward": 0.12777777388691902, "reward_std": 0.08385797590017319, "rewards/accuracy_multibox_reward": 0.12777777388691902, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 3888.4722290039062, "epoch": 0.3262752646775746, "grad_norm": 0.16044527292251587, "kl": 0.0333251953125, "learning_rate": 8.249358850408743e-07, "loss": 0.0074, "reward": 0.1250000037252903, "reward_std": 0.09211019054055214, "rewards/accuracy_multibox_reward": 0.1250000037252903, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 3944.90283203125, "epoch": 0.3272377285851781, "grad_norm": 0.11109481751918793, "kl": 0.037017822265625, "learning_rate": 8.238005776994159e-07, "loss": 0.018, "reward": 0.2083333395421505, "reward_std": 0.13323572278022766, "rewards/accuracy_multibox_reward": 0.2083333395421505, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 4041.1389770507812, "epoch": 0.32820019249278154, "grad_norm": 0.0847981870174408, "kl": 0.03313446044921875, "learning_rate": 8.226624964028172e-07, "loss": -0.0005, "reward": 0.0972222238779068, "reward_std": 0.09721050411462784, "rewards/accuracy_multibox_reward": 0.0972222238779068, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 3692.4723510742188, "epoch": 0.329162656400385, "grad_norm": 0.09407465904951096, "kl": 0.0259552001953125, "learning_rate": 8.21521652681318e-07, "loss": 0.0051, "reward": 0.3611111044883728, "reward_std": 0.1489720782265067, "rewards/accuracy_multibox_reward": 0.3611111044883728, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 3917.3056640625, "epoch": 0.33012512030798846, "grad_norm": 0.10268981009721756, "kl": 0.032745361328125, "learning_rate": 8.203780580931462e-07, "loss": 0.0051, "reward": 0.03055555559694767, "reward_std": 0.07484552264213562, "rewards/accuracy_multibox_reward": 0.03055555559694767, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 2932.861114501953, "epoch": 0.3310875842155919, "grad_norm": 0.20973597466945648, "kl": 0.0291748046875, "learning_rate": 8.192317242243986e-07, "loss": -0.0063, "reward": 0.38333333283662796, "reward_std": 0.13522373046725988, "rewards/accuracy_multibox_reward": 0.38333333283662796, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 3406.6944580078125, "epoch": 0.3320500481231954, "grad_norm": 0.12534824013710022, "kl": 0.02239990234375, "learning_rate": 8.180826626889251e-07, "loss": -0.0142, "reward": 0.29722223430871964, "reward_std": 0.10252998862415552, "rewards/accuracy_multibox_reward": 0.29722223430871964, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 3564.5416870117188, "epoch": 0.33301251203079885, "grad_norm": 0.15438194572925568, "kl": 0.022735595703125, "learning_rate": 8.169308851282098e-07, "loss": 0.0447, "reward": 0.22499999776482582, "reward_std": 0.18468710407614708, "rewards/accuracy_multibox_reward": 0.22499999776482582, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 3665.486083984375, "epoch": 0.3339749759384023, "grad_norm": 0.21131783723831177, "kl": 0.033721923828125, "learning_rate": 8.157764032112541e-07, "loss": 0.0201, "reward": 0.049999997951090336, "reward_std": 0.09021150693297386, "rewards/accuracy_multibox_reward": 0.049999997951090336, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 3369.9583129882812, "epoch": 0.33493743984600577, "grad_norm": 0.21260809898376465, "kl": 0.02093505859375, "learning_rate": 8.146192286344576e-07, "loss": 0.0169, "reward": 0.2222222276031971, "reward_std": 0.10241011157631874, "rewards/accuracy_multibox_reward": 0.2222222276031971, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 3102.7222290039062, "epoch": 0.33589990375360923, "grad_norm": 0.14697478711605072, "kl": 0.029296875, "learning_rate": 8.134593731215008e-07, "loss": 0.0423, "reward": 0.297222213819623, "reward_std": 0.1446264609694481, "rewards/accuracy_multibox_reward": 0.297222213819623, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 3067.6806030273438, "epoch": 0.3368623676612127, "grad_norm": 0.7539258599281311, "kl": 0.019439697265625, "learning_rate": 8.12296848423225e-07, "loss": 0.0404, "reward": 0.1388888955116272, "reward_std": 0.0680413767695427, "rewards/accuracy_multibox_reward": 0.1388888955116272, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 3825.7222900390625, "epoch": 0.33782483156881615, "grad_norm": 0.2723618745803833, "kl": 0.0390625, "learning_rate": 8.111316663175136e-07, "loss": -0.006, "reward": 0.1944444440305233, "reward_std": 0.10633629653602839, "rewards/accuracy_multibox_reward": 0.1944444440305233, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 3861.875, "epoch": 0.3387872954764196, "grad_norm": 0.09912211447954178, "kl": 0.04205322265625, "learning_rate": 8.099638386091735e-07, "loss": 0.003, "reward": 0.04444444552063942, "reward_std": 0.08164966106414795, "rewards/accuracy_multibox_reward": 0.04444444552063942, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 3679.0833740234375, "epoch": 0.3397497593840231, "grad_norm": 0.131086528301239, "kl": 0.01995086669921875, "learning_rate": 8.087933771298146e-07, "loss": 0.0743, "reward": 0.1250000037252903, "reward_std": 0.09518228471279144, "rewards/accuracy_multibox_reward": 0.1250000037252903, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 3532.125, "epoch": 0.34071222329162654, "grad_norm": 29.465490341186523, "kl": 0.21270751953125, "learning_rate": 8.076202937377308e-07, "loss": -0.0166, "reward": 0.236111119389534, "reward_std": 0.0950021855533123, "rewards/accuracy_multibox_reward": 0.236111119389534, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 3061.8055419921875, "epoch": 0.34167468719923005, "grad_norm": 0.35739514231681824, "kl": 0.0213470458984375, "learning_rate": 8.064446003177788e-07, "loss": 0.0223, "reward": 0.27222223952412605, "reward_std": 0.1885304576717317, "rewards/accuracy_multibox_reward": 0.27222223952412605, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 3422.75, "epoch": 0.3426371511068335, "grad_norm": 0.13083553314208984, "kl": 0.0328826904296875, "learning_rate": 8.052663087812587e-07, "loss": 0.0316, "reward": 0.22500000894069672, "reward_std": 0.20443101599812508, "rewards/accuracy_multibox_reward": 0.22500000894069672, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 3859.8056030273438, "epoch": 0.343599615014437, "grad_norm": 0.07086029648780823, "kl": 0.0338134765625, "learning_rate": 8.04085431065793e-07, "loss": 0.0038, "reward": 0.10000000149011612, "reward_std": 0.010540924035012722, "rewards/accuracy_multibox_reward": 0.10000000149011612, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 3301.5555419921875, "epoch": 0.34456207892204044, "grad_norm": 0.1433977484703064, "kl": 0.0400390625, "learning_rate": 8.029019791352046e-07, "loss": -0.0048, "reward": 0.26388888619840145, "reward_std": 0.14288689149543643, "rewards/accuracy_multibox_reward": 0.26388888619840145, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 3894.1943969726562, "epoch": 0.3455245428296439, "grad_norm": 0.15955905616283417, "kl": 0.0543212890625, "learning_rate": 8.017159649793977e-07, "loss": 0.0219, "reward": 0.21388889476656914, "reward_std": 0.10861241817474365, "rewards/accuracy_multibox_reward": 0.21388889476656914, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 3033.0556030273438, "epoch": 0.34648700673724736, "grad_norm": 0.16739636659622192, "kl": 0.04217529296875, "learning_rate": 8.005274006142347e-07, "loss": 0.031, "reward": 0.3916666954755783, "reward_std": 0.14368731901049614, "rewards/accuracy_multibox_reward": 0.3916666954755783, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 3647.2362060546875, "epoch": 0.3474494706448508, "grad_norm": 0.30153796076774597, "kl": 0.054290771484375, "learning_rate": 7.993362980814148e-07, "loss": 0.0211, "reward": 0.2805555574595928, "reward_std": 0.1547694755718112, "rewards/accuracy_multibox_reward": 0.2805555574595928, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 4177.3055419921875, "epoch": 0.3484119345524543, "grad_norm": 0.11328305304050446, "kl": 0.04266357421875, "learning_rate": 7.981426694483521e-07, "loss": 0.0122, "reward": 0.04722222313284874, "reward_std": 0.052086107432842255, "rewards/accuracy_multibox_reward": 0.04722222313284874, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 3757.1806640625, "epoch": 0.34937439846005774, "grad_norm": 0.18688535690307617, "kl": 0.044952392578125, "learning_rate": 7.96946526808054e-07, "loss": -0.0031, "reward": 0.10277778282761574, "reward_std": 0.08566047623753548, "rewards/accuracy_multibox_reward": 0.10277778282761574, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 3843.1112060546875, "epoch": 0.3503368623676612, "grad_norm": 0.1305060237646103, "kl": 0.05010986328125, "learning_rate": 7.95747882278997e-07, "loss": 0.0032, "reward": 0.12222222238779068, "reward_std": 0.0494314543902874, "rewards/accuracy_multibox_reward": 0.12222222238779068, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 4024.2777709960938, "epoch": 0.35129932627526467, "grad_norm": 0.16686484217643738, "kl": 0.0716552734375, "learning_rate": 7.94546748005006e-07, "loss": 0.0247, "reward": 0.2638888955116272, "reward_std": 0.1315615363419056, "rewards/accuracy_multibox_reward": 0.2638888955116272, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 3587.8055419921875, "epoch": 0.35226179018286813, "grad_norm": 0.11898510158061981, "kl": 0.040924072265625, "learning_rate": 7.933431361551296e-07, "loss": 0.015, "reward": 0.11944444850087166, "reward_std": 0.10977436602115631, "rewards/accuracy_multibox_reward": 0.11944444850087166, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 4169.569519042969, "epoch": 0.3532242540904716, "grad_norm": 0.10186997801065445, "kl": 0.0611572265625, "learning_rate": 7.921370589235177e-07, "loss": 0.0094, "reward": 0.1111111156642437, "reward_std": 0.09060795605182648, "rewards/accuracy_multibox_reward": 0.1111111156642437, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 3866.9722290039062, "epoch": 0.35418671799807505, "grad_norm": 0.10068448632955551, "kl": 0.054168701171875, "learning_rate": 7.909285285292977e-07, "loss": 0.0067, "reward": 0.12222222238779068, "reward_std": 0.0494314581155777, "rewards/accuracy_multibox_reward": 0.12222222238779068, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 3640.7362060546875, "epoch": 0.3551491819056785, "grad_norm": 0.14164845645427704, "kl": 0.05645751953125, "learning_rate": 7.897175572164507e-07, "loss": 0.0005, "reward": 0.5666666626930237, "reward_std": 0.11381005588918924, "rewards/accuracy_multibox_reward": 0.5666666626930237, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 3733.138916015625, "epoch": 0.35611164581328203, "grad_norm": 0.1986931711435318, "kl": 0.031829833984375, "learning_rate": 7.885041572536875e-07, "loss": 0.0149, "reward": 0.19722222164273262, "reward_std": 0.10857411101460457, "rewards/accuracy_multibox_reward": 0.19722222164273262, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 4022.2223510742188, "epoch": 0.3570741097208855, "grad_norm": 0.09916967898607254, "kl": 0.03692626953125, "learning_rate": 7.872883409343243e-07, "loss": 0.0089, "reward": 0.0833333358168602, "reward_std": 0.042163703590631485, "rewards/accuracy_multibox_reward": 0.0833333358168602, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 3389.6944580078125, "epoch": 0.35803657362848895, "grad_norm": 0.1387748271226883, "kl": 0.0465087890625, "learning_rate": 7.860701205761579e-07, "loss": 0.0016, "reward": 0.28611112385988235, "reward_std": 0.22695961594581604, "rewards/accuracy_multibox_reward": 0.28611112385988235, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 3633.7916870117188, "epoch": 0.3589990375360924, "grad_norm": 0.2233714610338211, "kl": 0.05828857421875, "learning_rate": 7.848495085213414e-07, "loss": 0.0355, "reward": 0.3083333261311054, "reward_std": 0.21901171654462814, "rewards/accuracy_multibox_reward": 0.3083333261311054, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 3786.819580078125, "epoch": 0.3599615014436959, "grad_norm": 0.1191701889038086, "kl": 0.0431365966796875, "learning_rate": 7.836265171362589e-07, "loss": 0.0131, "reward": 0.1333333272486925, "reward_std": 0.08164965361356735, "rewards/accuracy_multibox_reward": 0.1333333272486925, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 3794.2222290039062, "epoch": 0.36092396535129934, "grad_norm": 0.24200160801410675, "kl": 0.0546875, "learning_rate": 7.824011588114e-07, "loss": -0.0029, "reward": 0.2527777887880802, "reward_std": 0.2344372756779194, "rewards/accuracy_multibox_reward": 0.2527777887880802, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 3791.2916870117188, "epoch": 0.3618864292589028, "grad_norm": 0.11816313862800598, "kl": 0.04156494140625, "learning_rate": 7.811734459612345e-07, "loss": 0.0205, "reward": 0.1083333371207118, "reward_std": 0.13155697286128998, "rewards/accuracy_multibox_reward": 0.1083333371207118, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 3843.013916015625, "epoch": 0.36284889316650626, "grad_norm": 0.12038025259971619, "kl": 0.05108642578125, "learning_rate": 7.79943391024087e-07, "loss": 0.0053, "reward": 0.16944444552063942, "reward_std": 0.10783059895038605, "rewards/accuracy_multibox_reward": 0.16944444552063942, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 2567.3472900390625, "epoch": 0.3638113570741097, "grad_norm": 0.18741610646247864, "kl": 0.0465087890625, "learning_rate": 7.787110064620097e-07, "loss": 0.023, "reward": 0.1972222262993455, "reward_std": 0.03402068838477135, "rewards/accuracy_multibox_reward": 0.1972222262993455, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 3196.875, "epoch": 0.3647738209817132, "grad_norm": 0.12501658499240875, "kl": 0.030914306640625, "learning_rate": 7.774763047606577e-07, "loss": 0.0185, "reward": 0.13472222164273262, "reward_std": 0.09920763969421387, "rewards/accuracy_multibox_reward": 0.13472222164273262, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 3425.77783203125, "epoch": 0.36573628488931664, "grad_norm": 0.2270078957080841, "kl": 0.032196044921875, "learning_rate": 7.762392984291614e-07, "loss": 0.0322, "reward": 0.10277777910232544, "reward_std": 0.08566047996282578, "rewards/accuracy_multibox_reward": 0.10277777910232544, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 4042.7362060546875, "epoch": 0.3666987487969201, "grad_norm": 0.16370971500873566, "kl": 0.04437255859375, "learning_rate": 7.75e-07, "loss": 0.0036, "reward": 0.22546296566724777, "reward_std": 0.14039592444896698, "rewards/accuracy_multibox_reward": 0.22546296566724777, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 3390.986083984375, "epoch": 0.36766121270452357, "grad_norm": 0.23939941823482513, "kl": 0.039794921875, "learning_rate": 7.737584220288747e-07, "loss": 0.0098, "reward": 0.2861111145466566, "reward_std": 0.2254300694912672, "rewards/accuracy_multibox_reward": 0.2861111145466566, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 3239.8750610351562, "epoch": 0.368623676612127, "grad_norm": 0.18506558239459991, "kl": 0.053955078125, "learning_rate": 7.725145770945815e-07, "loss": 0.0053, "reward": 0.26111111231148243, "reward_std": 0.13409734098240733, "rewards/accuracy_multibox_reward": 0.26111111231148243, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 3277.6805419921875, "epoch": 0.3695861405197305, "grad_norm": 0.16627445816993713, "kl": 0.031829833984375, "learning_rate": 7.712684777988835e-07, "loss": 0.0284, "reward": 0.23333333805203438, "reward_std": 0.13991066440939903, "rewards/accuracy_multibox_reward": 0.23333333805203438, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 3303.7500610351562, "epoch": 0.37054860442733395, "grad_norm": 0.09121895581483841, "kl": 0.03668212890625, "learning_rate": 7.700201367663837e-07, "loss": 0.0015, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 3236.9166259765625, "epoch": 0.37151106833493747, "grad_norm": 0.08927532285451889, "kl": 0.0382080078125, "learning_rate": 7.687695666443965e-07, "loss": 0.0068, "reward": 0.1111111156642437, "reward_std": 0.10070512443780899, "rewards/accuracy_multibox_reward": 0.1111111156642437, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 3614.4583740234375, "epoch": 0.37247353224254093, "grad_norm": 0.0668332502245903, "kl": 0.038665771484375, "learning_rate": 7.675167801028202e-07, "loss": 0.0016, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.10000000149011612, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 3806.0834350585938, "epoch": 0.3734359961501444, "grad_norm": 0.3677464425563812, "kl": 0.0253448486328125, "learning_rate": 7.662617898340077e-07, "loss": -0.0166, "reward": 0.3750000149011612, "reward_std": 0.15930559486150742, "rewards/accuracy_multibox_reward": 0.3750000149011612, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 3004.6250610351562, "epoch": 0.37439846005774785, "grad_norm": 0.1573367416858673, "kl": 0.035614013671875, "learning_rate": 7.650046085526393e-07, "loss": -0.0034, "reward": 0.11388889141380787, "reward_std": 0.15146656334400177, "rewards/accuracy_multibox_reward": 0.11388889141380787, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 3171.930633544922, "epoch": 0.3753609239653513, "grad_norm": 0.478659451007843, "kl": 0.0343017578125, "learning_rate": 7.637452489955926e-07, "loss": 0.0573, "reward": 0.39722220599651337, "reward_std": 0.17039216682314873, "rewards/accuracy_multibox_reward": 0.39722220599651337, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 3745.8194580078125, "epoch": 0.3763233878729548, "grad_norm": 0.10107823461294174, "kl": 0.049224853515625, "learning_rate": 7.62483723921814e-07, "loss": 0.0119, "reward": 0.3333333358168602, "reward_std": 0.07160157570615411, "rewards/accuracy_multibox_reward": 0.3333333358168602, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 3712.5277709960938, "epoch": 0.37728585178055823, "grad_norm": 0.10918932408094406, "kl": 0.0313720703125, "learning_rate": 7.612200461121896e-07, "loss": 0.0009, "reward": 0.17499999701976776, "reward_std": 0.06324157444760203, "rewards/accuracy_multibox_reward": 0.17499999701976776, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 3846.263916015625, "epoch": 0.3782483156881617, "grad_norm": 0.2952110171318054, "kl": 0.05499267578125, "learning_rate": 7.599542283694152e-07, "loss": 0.063, "reward": 0.15277778077870607, "reward_std": 0.08646837994456291, "rewards/accuracy_multibox_reward": 0.15277778077870607, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 3810.1388549804688, "epoch": 0.37921077959576516, "grad_norm": 0.1533743292093277, "kl": 0.03912353515625, "learning_rate": 7.586862835178673e-07, "loss": 0.0008, "reward": 0.2166666779667139, "reward_std": 0.2364606000483036, "rewards/accuracy_multibox_reward": 0.2166666779667139, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 3389.9583129882812, "epoch": 0.3801732435033686, "grad_norm": 0.16574697196483612, "kl": 0.03525543212890625, "learning_rate": 7.574162244034725e-07, "loss": 0.0206, "reward": 0.2527777925133705, "reward_std": 0.09541380405426025, "rewards/accuracy_multibox_reward": 0.2527777925133705, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 4019.4862060546875, "epoch": 0.3811357074109721, "grad_norm": 0.33421239256858826, "kl": 0.050537109375, "learning_rate": 7.561440638935776e-07, "loss": 0.0366, "reward": 0.21944444812834263, "reward_std": 0.1902601160109043, "rewards/accuracy_multibox_reward": 0.21944444812834263, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 3676.1805419921875, "epoch": 0.38209817131857554, "grad_norm": 0.28768956661224365, "kl": 0.0347442626953125, "learning_rate": 7.548698148768194e-07, "loss": 0.0243, "reward": 0.28611111640930176, "reward_std": 0.11321617243811488, "rewards/accuracy_multibox_reward": 0.28611111640930176, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 3744.7083129882812, "epoch": 0.383060635226179, "grad_norm": 0.1380336582660675, "kl": 0.039825439453125, "learning_rate": 7.535934902629941e-07, "loss": 0.0065, "reward": 0.31111110374331474, "reward_std": 0.14028968615457416, "rewards/accuracy_multibox_reward": 0.31111110374331474, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 3555.513916015625, "epoch": 0.38402309913378246, "grad_norm": 0.22839130461215973, "kl": 0.056884765625, "learning_rate": 7.523151029829257e-07, "loss": -0.0075, "reward": 0.2847222313284874, "reward_std": 0.128792776260525, "rewards/accuracy_multibox_reward": 0.2847222313284874, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 3408.916748046875, "epoch": 0.3849855630413859, "grad_norm": 0.29369664192199707, "kl": 0.047119140625, "learning_rate": 7.510346659883367e-07, "loss": -0.0084, "reward": 0.18611110746860504, "reward_std": 0.12031529005616903, "rewards/accuracy_multibox_reward": 0.18611110746860504, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 3772.4722290039062, "epoch": 0.3859480269489894, "grad_norm": 0.4488571882247925, "kl": 0.0443115234375, "learning_rate": 7.49752192251715e-07, "loss": 0.036, "reward": 0.11944444105029106, "reward_std": 0.12166458740830421, "rewards/accuracy_multibox_reward": 0.11944444105029106, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 2600.361114501953, "epoch": 0.3869104908565929, "grad_norm": 0.16591526567935944, "kl": 0.0390625, "learning_rate": 7.484676947661842e-07, "loss": -0.0303, "reward": 0.32777778059244156, "reward_std": 0.12471285741776228, "rewards/accuracy_multibox_reward": 0.32777778059244156, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 3251.513916015625, "epoch": 0.38787295476419636, "grad_norm": 0.3027108311653137, "kl": 0.0562744140625, "learning_rate": 7.471811865453701e-07, "loss": 0.0136, "reward": 0.24444444850087166, "reward_std": 0.13205386325716972, "rewards/accuracy_multibox_reward": 0.24444444850087166, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 3380.40283203125, "epoch": 0.3888354186717998, "grad_norm": 0.12195685505867004, "kl": 0.05242919921875, "learning_rate": 7.458926806232708e-07, "loss": 0.027, "reward": 0.08611110597848892, "reward_std": 0.04270918294787407, "rewards/accuracy_multibox_reward": 0.08611110597848892, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 3536.263885498047, "epoch": 0.3897978825794033, "grad_norm": 0.11115583032369614, "kl": 0.0465087890625, "learning_rate": 7.446021900541229e-07, "loss": 0.0086, "reward": 0.17500000726431608, "reward_std": 0.13182609528303146, "rewards/accuracy_multibox_reward": 0.17500000726431608, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 3095.6806030273438, "epoch": 0.39076034648700675, "grad_norm": 0.20518086850643158, "kl": 0.039764404296875, "learning_rate": 7.433097279122709e-07, "loss": -0.0396, "reward": 0.2930555511265993, "reward_std": 0.1359320767223835, "rewards/accuracy_multibox_reward": 0.2930555511265993, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 3759.9027099609375, "epoch": 0.3917228103946102, "grad_norm": 0.10258515179157257, "kl": 0.0655517578125, "learning_rate": 7.420153072920328e-07, "loss": 0.0154, "reward": 0.17222221940755844, "reward_std": 0.09726227819919586, "rewards/accuracy_multibox_reward": 0.17222221940755844, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 3393.375, "epoch": 0.39268527430221367, "grad_norm": 0.2605299949645996, "kl": 0.06683349609375, "learning_rate": 7.407189413075696e-07, "loss": 0.0369, "reward": 0.1944444552063942, "reward_std": 0.20412414520978928, "rewards/accuracy_multibox_reward": 0.1944444552063942, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 3340.8611450195312, "epoch": 0.39364773820981713, "grad_norm": 0.28157827258110046, "kl": 0.051849365234375, "learning_rate": 7.394206430927507e-07, "loss": 0.0458, "reward": 0.21666667237877846, "reward_std": 0.16539975255727768, "rewards/accuracy_multibox_reward": 0.21666667237877846, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 3851.236083984375, "epoch": 0.3946102021174206, "grad_norm": 0.1586942821741104, "kl": 0.045166015625, "learning_rate": 7.381204258010219e-07, "loss": -0.0077, "reward": 0.11388889327645302, "reward_std": 0.0916629247367382, "rewards/accuracy_multibox_reward": 0.11388889327645302, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 3750.8057250976562, "epoch": 0.39557266602502406, "grad_norm": 0.2038753181695938, "kl": 0.03948974609375, "learning_rate": 7.368183026052713e-07, "loss": 0.0268, "reward": 0.17222222313284874, "reward_std": 0.2221895530819893, "rewards/accuracy_multibox_reward": 0.17222222313284874, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 3087.6111755371094, "epoch": 0.3965351299326275, "grad_norm": 0.0741092711687088, "kl": 0.033477783203125, "learning_rate": 7.355142866976968e-07, "loss": 0.0074, "reward": 0.12777777947485447, "reward_std": 0.04303314909338951, "rewards/accuracy_multibox_reward": 0.12777777947485447, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 3581.1944580078125, "epoch": 0.397497593840231, "grad_norm": 0.19178606569766998, "kl": 0.043670654296875, "learning_rate": 7.342083912896716e-07, "loss": 0.0292, "reward": 0.125, "reward_std": 0.09728332608938217, "rewards/accuracy_multibox_reward": 0.125, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 3496.6945190429688, "epoch": 0.39846005774783444, "grad_norm": 0.15376611053943634, "kl": 0.071044921875, "learning_rate": 7.329006296116104e-07, "loss": 0.0152, "reward": 0.15833333600312471, "reward_std": 0.16319824010133743, "rewards/accuracy_multibox_reward": 0.15833333600312471, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 2900.3472900390625, "epoch": 0.3994225216554379, "grad_norm": 0.17677362263202667, "kl": 0.0482177734375, "learning_rate": 7.315910149128365e-07, "loss": 0.0266, "reward": 0.13333333563059568, "reward_std": 0.08739172574132681, "rewards/accuracy_multibox_reward": 0.13333333563059568, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 3380.9444274902344, "epoch": 0.40038498556304136, "grad_norm": 0.09783168137073517, "kl": 0.04034423828125, "learning_rate": 7.302795604614458e-07, "loss": 0.0146, "reward": 0.23333334177732468, "reward_std": 0.06873702444136143, "rewards/accuracy_multibox_reward": 0.23333334177732468, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 3490.4305419921875, "epoch": 0.4013474494706448, "grad_norm": 0.2137366384267807, "kl": 0.0411376953125, "learning_rate": 7.289662795441738e-07, "loss": 0.0105, "reward": 0.09895833872724324, "reward_std": 0.10579968942329288, "rewards/accuracy_multibox_reward": 0.09895833872724324, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 3537.0695190429688, "epoch": 0.40230991337824834, "grad_norm": 0.08821175247430801, "kl": 0.0587615966796875, "learning_rate": 7.276511854662603e-07, "loss": 0.0031, "reward": 0.15000000223517418, "reward_std": 0.054772257804870605, "rewards/accuracy_multibox_reward": 0.15000000223517418, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 3511.569549560547, "epoch": 0.4032723772858518, "grad_norm": 0.15202607214450836, "kl": 0.05023193359375, "learning_rate": 7.263342915513146e-07, "loss": 0.0302, "reward": 0.288888905197382, "reward_std": 0.1439753733575344, "rewards/accuracy_multibox_reward": 0.288888905197382, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 3592.7083129882812, "epoch": 0.40423484119345526, "grad_norm": 0.20059779286384583, "kl": 0.05615234375, "learning_rate": 7.250156111411813e-07, "loss": 0.0556, "reward": 0.17777777928858995, "reward_std": 0.1382910804823041, "rewards/accuracy_multibox_reward": 0.17777777928858995, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 4077.1806640625, "epoch": 0.4051973051010587, "grad_norm": 0.1816830188035965, "kl": 0.05316162109375, "learning_rate": 7.236951575958039e-07, "loss": 0.0168, "reward": 0.06666666828095913, "reward_std": 0.09971507266163826, "rewards/accuracy_multibox_reward": 0.06666666828095913, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 3171.1806030273438, "epoch": 0.4061597690086622, "grad_norm": 0.35944288969039917, "kl": 0.04656982421875, "learning_rate": 7.223729442930903e-07, "loss": 0.0376, "reward": 0.31111111491918564, "reward_std": 0.22131222486495972, "rewards/accuracy_multibox_reward": 0.31111111491918564, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 2968.8611450195312, "epoch": 0.40712223291626565, "grad_norm": 0.3136230409145355, "kl": 0.04620361328125, "learning_rate": 7.210489846287766e-07, "loss": 0.0266, "reward": 0.40277777798473835, "reward_std": 0.23079419136047363, "rewards/accuracy_multibox_reward": 0.40277777798473835, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 3825.1806030273438, "epoch": 0.4080846968238691, "grad_norm": 0.24501006305217743, "kl": 0.05340576171875, "learning_rate": 7.197232920162926e-07, "loss": 0.0259, "reward": 0.2805555472150445, "reward_std": 0.17734061088413, "rewards/accuracy_multibox_reward": 0.2805555472150445, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 4042.4306030273438, "epoch": 0.40904716073147257, "grad_norm": 0.10687070339918137, "kl": 0.065673828125, "learning_rate": 7.183958798866247e-07, "loss": 0.0027, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 2732.4166870117188, "epoch": 0.41000962463907603, "grad_norm": 0.1714727133512497, "kl": 0.06207275390625, "learning_rate": 7.170667616881803e-07, "loss": 0.0399, "reward": 0.1611111182719469, "reward_std": 0.16283666342496872, "rewards/accuracy_multibox_reward": 0.1611111182719469, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 3607.263916015625, "epoch": 0.4109720885466795, "grad_norm": 0.30600279569625854, "kl": 0.0660400390625, "learning_rate": 7.157359508866511e-07, "loss": -0.0318, "reward": 0.2833333471789956, "reward_std": 0.1775226630270481, "rewards/accuracy_multibox_reward": 0.2833333471789956, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 3503.2916870117188, "epoch": 0.41193455245428295, "grad_norm": 0.25976884365081787, "kl": 0.05682373046875, "learning_rate": 7.144034609648778e-07, "loss": -0.01, "reward": 0.20555556006729603, "reward_std": 0.12347529456019402, "rewards/accuracy_multibox_reward": 0.20555556006729603, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 3941.3472290039062, "epoch": 0.4128970163618864, "grad_norm": 0.17574220895767212, "kl": 0.085205078125, "learning_rate": 7.13069305422712e-07, "loss": 0.0167, "reward": 0.11666667088866234, "reward_std": 0.142631147056818, "rewards/accuracy_multibox_reward": 0.11666667088866234, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 3702.5833740234375, "epoch": 0.4138594802694899, "grad_norm": 0.2651737928390503, "kl": 0.0689697265625, "learning_rate": 7.117334977768806e-07, "loss": 0.0525, "reward": 0.30277779698371887, "reward_std": 0.2308831699192524, "rewards/accuracy_multibox_reward": 0.30277779698371887, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 2958.013885498047, "epoch": 0.41482194417709334, "grad_norm": 0.16968749463558197, "kl": 0.05865478515625, "learning_rate": 7.103960515608489e-07, "loss": -0.0191, "reward": 0.2361111119389534, "reward_std": 0.11648200917989016, "rewards/accuracy_multibox_reward": 0.2361111119389534, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 3745.8056030273438, "epoch": 0.4157844080846968, "grad_norm": 0.1656910479068756, "kl": 0.066650390625, "learning_rate": 7.09056980324682e-07, "loss": 0.0292, "reward": 0.13055555056780577, "reward_std": 0.12435884028673172, "rewards/accuracy_multibox_reward": 0.13055555056780577, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 3034.875, "epoch": 0.4167468719923003, "grad_norm": 0.1277688890695572, "kl": 0.07403564453125, "learning_rate": 7.077162976349094e-07, "loss": 0.013, "reward": 0.08611110597848892, "reward_std": 0.006804138422012329, "rewards/accuracy_multibox_reward": 0.08611110597848892, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 3657.02783203125, "epoch": 0.4177093358999038, "grad_norm": 0.3083247244358063, "kl": 0.0816650390625, "learning_rate": 7.063740170743864e-07, "loss": 0.0371, "reward": 0.18333333544433117, "reward_std": 0.20853915065526962, "rewards/accuracy_multibox_reward": 0.18333333544433117, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 3379.8056030273438, "epoch": 0.41867179980750724, "grad_norm": 0.3438844680786133, "kl": 0.05462646484375, "learning_rate": 7.050301522421569e-07, "loss": 0.0221, "reward": 0.29444445855915546, "reward_std": 0.25219083204865456, "rewards/accuracy_multibox_reward": 0.29444445855915546, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 3426.5833740234375, "epoch": 0.4196342637151107, "grad_norm": 0.1938169300556183, "kl": 0.0933837890625, "learning_rate": 7.036847167533152e-07, "loss": 0.004, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 3209.4722900390625, "epoch": 0.42059672762271416, "grad_norm": 0.26037320494651794, "kl": 0.06982421875, "learning_rate": 7.023377242388691e-07, "loss": 0.0472, "reward": 0.25833333283662796, "reward_std": 0.12648529931902885, "rewards/accuracy_multibox_reward": 0.25833333283662796, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 3574.138916015625, "epoch": 0.4215591915303176, "grad_norm": 0.1938478648662567, "kl": 0.1080322265625, "learning_rate": 7.009891883456001e-07, "loss": 0.0224, "reward": 0.18194444850087166, "reward_std": 0.09299542009830475, "rewards/accuracy_multibox_reward": 0.18194444850087166, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 3599.9306030273438, "epoch": 0.4225216554379211, "grad_norm": 0.3221508860588074, "kl": 0.079345703125, "learning_rate": 6.996391227359271e-07, "loss": 0.0259, "reward": 0.2638888880610466, "reward_std": 0.10831554606556892, "rewards/accuracy_multibox_reward": 0.2638888880610466, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 3584.40283203125, "epoch": 0.42348411934552455, "grad_norm": 0.2532755136489868, "kl": 0.0965576171875, "learning_rate": 6.982875410877666e-07, "loss": 0.0379, "reward": 0.1666666716337204, "reward_std": 0.2383199855685234, "rewards/accuracy_multibox_reward": 0.1666666716337204, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 3249.0416259765625, "epoch": 0.424446583253128, "grad_norm": 0.14027166366577148, "kl": 0.087158203125, "learning_rate": 6.969344570943945e-07, "loss": 0.0182, "reward": 0.2000000039115548, "reward_std": 0.04082482261583209, "rewards/accuracy_multibox_reward": 0.2000000039115548, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 3752.0, "epoch": 0.42540904716073147, "grad_norm": 0.2763819694519043, "kl": 0.1259765625, "learning_rate": 6.955798844643072e-07, "loss": 0.0168, "reward": 0.23333333432674408, "reward_std": 0.11073195189237595, "rewards/accuracy_multibox_reward": 0.23333333432674408, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 3227.0833740234375, "epoch": 0.42637151106833493, "grad_norm": 0.4600532352924347, "kl": 0.0872802734375, "learning_rate": 6.942238369210833e-07, "loss": 0.0365, "reward": 0.3722222149372101, "reward_std": 0.22076303511857986, "rewards/accuracy_multibox_reward": 0.3722222149372101, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 3410.625, "epoch": 0.4273339749759384, "grad_norm": 0.6325965523719788, "kl": 0.086181640625, "learning_rate": 6.928663282032441e-07, "loss": -0.0139, "reward": 0.05000000260770321, "reward_std": 0.09352945536375046, "rewards/accuracy_multibox_reward": 0.05000000260770321, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 3503.1666870117188, "epoch": 0.42829643888354185, "grad_norm": 0.2220795601606369, "kl": 0.103515625, "learning_rate": 6.915073720641145e-07, "loss": 0.0219, "reward": 0.16265431325882673, "reward_std": 0.1317676182370633, "rewards/accuracy_multibox_reward": 0.16265431325882673, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 3333.5694580078125, "epoch": 0.4292589027911453, "grad_norm": 0.6250410079956055, "kl": 0.134521484375, "learning_rate": 6.901469822716834e-07, "loss": 0.0701, "reward": 0.12777777947485447, "reward_std": 0.18886961042881012, "rewards/accuracy_multibox_reward": 0.12777777947485447, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 3888.72216796875, "epoch": 0.4302213666987488, "grad_norm": 0.36426642537117004, "kl": 0.11181640625, "learning_rate": 6.887851726084648e-07, "loss": -0.0197, "reward": 0.04722222313284874, "reward_std": 0.08845379576086998, "rewards/accuracy_multibox_reward": 0.04722222313284874, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 3095.15283203125, "epoch": 0.43118383060635224, "grad_norm": 0.41637134552001953, "kl": 0.07763671875, "learning_rate": 6.874219568713575e-07, "loss": 0.0111, "reward": 0.16458333283662796, "reward_std": 0.27581701800227165, "rewards/accuracy_multibox_reward": 0.16458333283662796, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 2776.513916015625, "epoch": 0.43214629451395575, "grad_norm": 1.1361815929412842, "kl": 0.07379150390625, "learning_rate": 6.860573488715061e-07, "loss": -0.019, "reward": 0.23055555950850248, "reward_std": 0.168673537671566, "rewards/accuracy_multibox_reward": 0.23055555950850248, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 3400.1389770507812, "epoch": 0.4331087584215592, "grad_norm": 0.16649413108825684, "kl": 0.08648681640625, "learning_rate": 6.846913624341604e-07, "loss": 0.0215, "reward": 0.26944444328546524, "reward_std": 0.060768479481339455, "rewards/accuracy_multibox_reward": 0.26944444328546524, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 3601.3055419921875, "epoch": 0.4340712223291627, "grad_norm": 0.25001445412635803, "kl": 0.11181640625, "learning_rate": 6.833240113985353e-07, "loss": 0.0198, "reward": 0.2083333358168602, "reward_std": 0.09359721839427948, "rewards/accuracy_multibox_reward": 0.2083333358168602, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 3295.8333740234375, "epoch": 0.43503368623676614, "grad_norm": 0.4502008557319641, "kl": 0.07061767578125, "learning_rate": 6.819553096176712e-07, "loss": 0.0362, "reward": 0.21111110970377922, "reward_std": 0.1002201079390943, "rewards/accuracy_multibox_reward": 0.21111110970377922, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 4114.3890380859375, "epoch": 0.4359961501443696, "grad_norm": 0.22959484159946442, "kl": 0.1279296875, "learning_rate": 6.805852709582932e-07, "loss": 0.0123, "reward": 0.03611110895872116, "reward_std": 0.056190814822912216, "rewards/accuracy_multibox_reward": 0.03611110895872116, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 3921.9722900390625, "epoch": 0.43695861405197306, "grad_norm": 0.2004883736371994, "kl": 0.07623291015625, "learning_rate": 6.792139093006707e-07, "loss": -0.0052, "reward": 0.08611110597848892, "reward_std": 0.09045813232660294, "rewards/accuracy_multibox_reward": 0.08611110597848892, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 4158.277893066406, "epoch": 0.4379210779595765, "grad_norm": 0.3461688160896301, "kl": 0.07720947265625, "learning_rate": 6.778412385384768e-07, "loss": 0.0071, "reward": 0.1527777835726738, "reward_std": 0.1599409021437168, "rewards/accuracy_multibox_reward": 0.1527777835726738, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 3314.3195190429688, "epoch": 0.43888354186718, "grad_norm": 0.39656302332878113, "kl": 0.077392578125, "learning_rate": 6.764672725786476e-07, "loss": 0.019, "reward": 0.24444443825632334, "reward_std": 0.13530466333031654, "rewards/accuracy_multibox_reward": 0.24444443825632334, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 4011.1111450195312, "epoch": 0.43984600577478344, "grad_norm": 0.18435296416282654, "kl": 0.0953369140625, "learning_rate": 6.75092025341241e-07, "loss": 0.0039, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 3800.5556030273438, "epoch": 0.4408084696823869, "grad_norm": 0.3965436816215515, "kl": 0.0902099609375, "learning_rate": 6.737155107592962e-07, "loss": 0.0187, "reward": 0.16944444365799427, "reward_std": 0.16866211965680122, "rewards/accuracy_multibox_reward": 0.16944444365799427, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 3804.0416870117188, "epoch": 0.44177093358999037, "grad_norm": 0.4827900528907776, "kl": 0.09796142578125, "learning_rate": 6.723377427786918e-07, "loss": -0.0057, "reward": 0.11666666809469461, "reward_std": 0.04082482261583209, "rewards/accuracy_multibox_reward": 0.11666666809469461, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 3273.5556030273438, "epoch": 0.44273339749759383, "grad_norm": 0.1559198498725891, "kl": 0.04840087890625, "learning_rate": 6.709587353580058e-07, "loss": -0.0088, "reward": 0.180555559694767, "reward_std": 0.050564064644277096, "rewards/accuracy_multibox_reward": 0.180555559694767, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 3795.8056030273438, "epoch": 0.4436958614051973, "grad_norm": 0.6637396216392517, "kl": 0.09814453125, "learning_rate": 6.695785024683723e-07, "loss": -0.0325, "reward": 0.22777778655290604, "reward_std": 0.11769295483827591, "rewards/accuracy_multibox_reward": 0.22777778655290604, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 3204.0416870117188, "epoch": 0.44465832531280075, "grad_norm": 0.4115016460418701, "kl": 0.078369140625, "learning_rate": 6.681970580933417e-07, "loss": 0.0252, "reward": 0.308333333581686, "reward_std": 0.31836244463920593, "rewards/accuracy_multibox_reward": 0.308333333581686, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 3629.8750610351562, "epoch": 0.4456207892204042, "grad_norm": 0.15529295802116394, "kl": 0.0782470703125, "learning_rate": 6.668144162287384e-07, "loss": 0.0114, "reward": 0.01666666753590107, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.01666666753590107, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 3417.9445190429688, "epoch": 0.4465832531280077, "grad_norm": 0.5855889320373535, "kl": 0.0654296875, "learning_rate": 6.654305908825178e-07, "loss": 0.0551, "reward": 0.29999999329447746, "reward_std": 0.152607764583081, "rewards/accuracy_multibox_reward": 0.29999999329447746, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 3145.236114501953, "epoch": 0.4475457170356112, "grad_norm": 0.35842758417129517, "kl": 0.060638427734375, "learning_rate": 6.64045596074627e-07, "loss": -0.003, "reward": 0.4027777910232544, "reward_std": 0.11830805940553546, "rewards/accuracy_multibox_reward": 0.4027777910232544, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 3789.1805419921875, "epoch": 0.44850818094321465, "grad_norm": 0.41061052680015564, "kl": 0.0814208984375, "learning_rate": 6.626594458368606e-07, "loss": 0.0605, "reward": 0.2388888904824853, "reward_std": 0.22858282178640366, "rewards/accuracy_multibox_reward": 0.2388888904824853, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 3970.4860229492188, "epoch": 0.4494706448508181, "grad_norm": 0.1800222545862198, "kl": 0.09234619140625, "learning_rate": 6.612721542127189e-07, "loss": 0.0093, "reward": 0.18055556155741215, "reward_std": 0.08084797114133835, "rewards/accuracy_multibox_reward": 0.18055556155741215, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 2924.3889770507812, "epoch": 0.4504331087584216, "grad_norm": 0.13980431854724884, "kl": 0.059814453125, "learning_rate": 6.598837352572664e-07, "loss": 0.0163, "reward": 0.18333332613110542, "reward_std": 0.09209848940372467, "rewards/accuracy_multibox_reward": 0.18333332613110542, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 3593.8334350585938, "epoch": 0.45139557266602504, "grad_norm": 0.19357290863990784, "kl": 0.0625, "learning_rate": 6.584942030369887e-07, "loss": 0.0226, "reward": 0.11666666902601719, "reward_std": 0.11755470186471939, "rewards/accuracy_multibox_reward": 0.11666666902601719, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 3675.0000610351562, "epoch": 0.4523580365736285, "grad_norm": 0.2853853404521942, "kl": 0.04705810546875, "learning_rate": 6.571035716296505e-07, "loss": -0.0139, "reward": 0.1750000063329935, "reward_std": 0.27937041595578194, "rewards/accuracy_multibox_reward": 0.1750000063329935, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 3483.236083984375, "epoch": 0.45332050048123196, "grad_norm": 0.9503285884857178, "kl": 0.0578460693359375, "learning_rate": 6.557118551241521e-07, "loss": -0.0181, "reward": 0.19999999925494194, "reward_std": 0.21214578673243523, "rewards/accuracy_multibox_reward": 0.19999999925494194, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 3305.6527709960938, "epoch": 0.4542829643888354, "grad_norm": 0.21742840111255646, "kl": 0.0498046875, "learning_rate": 6.543190676203877e-07, "loss": 0.0304, "reward": 0.19351852033287287, "reward_std": 0.1694956123828888, "rewards/accuracy_multibox_reward": 0.19351852033287287, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 3802.7083740234375, "epoch": 0.4552454282964389, "grad_norm": 0.17480701208114624, "kl": 0.06451416015625, "learning_rate": 6.529252232291021e-07, "loss": 0.0047, "reward": 0.0555555559694767, "reward_std": 0.1110745258629322, "rewards/accuracy_multibox_reward": 0.0555555559694767, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 3622.6944580078125, "epoch": 0.45620789220404234, "grad_norm": 0.16504710912704468, "kl": 0.064544677734375, "learning_rate": 6.515303360717477e-07, "loss": 0.0234, "reward": 0.08611110597848892, "reward_std": 0.04270917922258377, "rewards/accuracy_multibox_reward": 0.08611110597848892, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 3683.5833740234375, "epoch": 0.4571703561116458, "grad_norm": 0.16695918142795563, "kl": 0.05352783203125, "learning_rate": 6.501344202803414e-07, "loss": 0.0136, "reward": 0.22500000428408384, "reward_std": 0.2669190987944603, "rewards/accuracy_multibox_reward": 0.22500000428408384, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 3154.4583740234375, "epoch": 0.45813282001924927, "grad_norm": 0.15023912489414215, "kl": 0.04058837890625, "learning_rate": 6.487374899973217e-07, "loss": -0.0002, "reward": 0.19999999180436134, "reward_std": 0.11028015986084938, "rewards/accuracy_multibox_reward": 0.19999999180436134, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 3671.3473510742188, "epoch": 0.4590952839268527, "grad_norm": 0.18750423192977905, "kl": 0.0599365234375, "learning_rate": 6.473395593754045e-07, "loss": -0.015, "reward": 0.24722222425043583, "reward_std": 0.1625748099759221, "rewards/accuracy_multibox_reward": 0.24722222425043583, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 3549.4166870117188, "epoch": 0.4600577478344562, "grad_norm": 0.4796087145805359, "kl": 0.06060791015625, "learning_rate": 6.459406425774415e-07, "loss": 0.0411, "reward": 0.4611111208796501, "reward_std": 0.2712896056473255, "rewards/accuracy_multibox_reward": 0.4611111208796501, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 3517.5000610351562, "epoch": 0.46102021174205965, "grad_norm": 0.17003844678401947, "kl": 0.037445068359375, "learning_rate": 6.44540753776275e-07, "loss": 0.007, "reward": 0.0833333358168602, "reward_std": 0.09246460348367691, "rewards/accuracy_multibox_reward": 0.0833333358168602, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 3711.9583740234375, "epoch": 0.4619826756496631, "grad_norm": 0.223869189620018, "kl": 0.042236328125, "learning_rate": 6.431399071545951e-07, "loss": 0.0074, "reward": 0.2805555686354637, "reward_std": 0.2079591527581215, "rewards/accuracy_multibox_reward": 0.2805555686354637, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 3880.291748046875, "epoch": 0.4629451395572666, "grad_norm": 0.24836479127407074, "kl": 0.05706787109375, "learning_rate": 6.417381169047957e-07, "loss": -0.0066, "reward": 0.27361112274229527, "reward_std": 0.1961144618690014, "rewards/accuracy_multibox_reward": 0.27361112274229527, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 3509.3472900390625, "epoch": 0.4639076034648701, "grad_norm": 0.10472600162029266, "kl": 0.0440673828125, "learning_rate": 6.403353972288311e-07, "loss": 0.0151, "reward": 0.27222222834825516, "reward_std": 0.13549776375293732, "rewards/accuracy_multibox_reward": 0.27222222834825516, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 3748.4166870117188, "epoch": 0.46487006737247355, "grad_norm": 0.16970203816890717, "kl": 0.044586181640625, "learning_rate": 6.389317623380718e-07, "loss": 0.02, "reward": 0.09907407592982054, "reward_std": 0.0911332368850708, "rewards/accuracy_multibox_reward": 0.09907407592982054, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 3467.7361450195312, "epoch": 0.465832531280077, "grad_norm": 0.5755534172058105, "kl": 0.03839111328125, "learning_rate": 6.375272264531607e-07, "loss": 0.0495, "reward": 0.26944445446133614, "reward_std": 0.2598174959421158, "rewards/accuracy_multibox_reward": 0.26944445446133614, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 3812.52783203125, "epoch": 0.4667949951876805, "grad_norm": 0.11556749045848846, "kl": 0.046051025390625, "learning_rate": 6.361218038038687e-07, "loss": 0.0151, "reward": 0.0833333432674408, "reward_std": 0.042163703590631485, "rewards/accuracy_multibox_reward": 0.0833333432674408, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 3896.4722290039062, "epoch": 0.46775745909528393, "grad_norm": 0.17998448014259338, "kl": 0.038665771484375, "learning_rate": 6.34715508628951e-07, "loss": 0.0193, "reward": 0.07777777872979641, "reward_std": 0.1632993184030056, "rewards/accuracy_multibox_reward": 0.07777777872979641, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 3395.3750610351562, "epoch": 0.4687199230028874, "grad_norm": 0.37251973152160645, "kl": 0.04229736328125, "learning_rate": 6.333083551760028e-07, "loss": -0.0274, "reward": 0.3444444537162781, "reward_std": 0.15064706280827522, "rewards/accuracy_multibox_reward": 0.3444444537162781, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 3577.5972290039062, "epoch": 0.46968238691049086, "grad_norm": 0.30518800020217896, "kl": 0.058563232421875, "learning_rate": 6.319003577013141e-07, "loss": 0.0533, "reward": 0.1083333371207118, "reward_std": 0.1704426109790802, "rewards/accuracy_multibox_reward": 0.1083333371207118, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 3391.9444580078125, "epoch": 0.4706448508180943, "grad_norm": 0.24555133283138275, "kl": 0.049774169921875, "learning_rate": 6.304915304697267e-07, "loss": 0.0318, "reward": 0.29166666604578495, "reward_std": 0.29754167050123215, "rewards/accuracy_multibox_reward": 0.29166666604578495, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 3672.7500610351562, "epoch": 0.4716073147256978, "grad_norm": 0.2618182897567749, "kl": 0.053497314453125, "learning_rate": 6.290818877544883e-07, "loss": 0.048, "reward": 0.19722221046686172, "reward_std": 0.10685836710035801, "rewards/accuracy_multibox_reward": 0.19722221046686172, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 3566.0834350585938, "epoch": 0.47256977863330124, "grad_norm": 0.24052606523036957, "kl": 0.0484619140625, "learning_rate": 6.276714438371091e-07, "loss": 0.0695, "reward": 0.17777778208255768, "reward_std": 0.09327251464128494, "rewards/accuracy_multibox_reward": 0.17777778208255768, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 3254.375, "epoch": 0.4735322425409047, "grad_norm": 0.4133087992668152, "kl": 0.0417022705078125, "learning_rate": 6.262602130072159e-07, "loss": 0.0541, "reward": 0.3222222290933132, "reward_std": 0.3228658474981785, "rewards/accuracy_multibox_reward": 0.3222222290933132, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 3333.916748046875, "epoch": 0.47449470644850816, "grad_norm": 0.49887409806251526, "kl": 0.07647705078125, "learning_rate": 6.248482095624086e-07, "loss": -0.0221, "reward": 0.31388889253139496, "reward_std": 0.25283197313547134, "rewards/accuracy_multibox_reward": 0.31388889253139496, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 3406.3751220703125, "epoch": 0.4754571703561116, "grad_norm": 0.4571772813796997, "kl": 0.04443359375, "learning_rate": 6.234354478081139e-07, "loss": 0.0461, "reward": 0.26944445073604584, "reward_std": 0.20909829530864954, "rewards/accuracy_multibox_reward": 0.26944445073604584, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 3484.8889770507812, "epoch": 0.4764196342637151, "grad_norm": 0.1948314905166626, "kl": 0.066253662109375, "learning_rate": 6.220219420574419e-07, "loss": 0.0031, "reward": 0.302777785807848, "reward_std": 0.1343880551867187, "rewards/accuracy_multibox_reward": 0.302777785807848, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 3839.6945190429688, "epoch": 0.4773820981713186, "grad_norm": 0.1769389659166336, "kl": 0.08502197265625, "learning_rate": 6.206077066310398e-07, "loss": 0.0035, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 3677.5000610351562, "epoch": 0.47834456207892206, "grad_norm": 0.4380002021789551, "kl": 0.078369140625, "learning_rate": 6.191927558569479e-07, "loss": 0.0089, "reward": 0.19722222536802292, "reward_std": 0.28856465220451355, "rewards/accuracy_multibox_reward": 0.19722222536802292, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 3238.90283203125, "epoch": 0.4793070259865255, "grad_norm": 0.5803089141845703, "kl": 0.060211181640625, "learning_rate": 6.177771040704533e-07, "loss": 0.0676, "reward": 0.19722222723066807, "reward_std": 0.24444737657904625, "rewards/accuracy_multibox_reward": 0.19722222723066807, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 3233.125, "epoch": 0.480269489894129, "grad_norm": 0.32682004570961, "kl": 0.04998779296875, "learning_rate": 6.16360765613946e-07, "loss": -0.0159, "reward": 0.2277777772396803, "reward_std": 0.19545596465468407, "rewards/accuracy_multibox_reward": 0.2277777772396803, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 3905.1666870117188, "epoch": 0.48123195380173245, "grad_norm": 0.11550021916627884, "kl": 0.0860595703125, "learning_rate": 6.149437548367719e-07, "loss": 0.0189, "reward": 0.06111111305654049, "reward_std": 0.08989017084240913, "rewards/accuracy_multibox_reward": 0.06111111305654049, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 3399.513916015625, "epoch": 0.4821944177093359, "grad_norm": 0.4278028905391693, "kl": 0.07415771484375, "learning_rate": 6.135260860950894e-07, "loss": -0.0174, "reward": 0.4361111279577017, "reward_std": 0.2019997825846076, "rewards/accuracy_multibox_reward": 0.4361111279577017, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 3721.8473510742188, "epoch": 0.48315688161693937, "grad_norm": 0.13588866591453552, "kl": 0.08544921875, "learning_rate": 6.121077737517221e-07, "loss": 0.0015, "reward": 0.03333333507180214, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 3656.9445190429688, "epoch": 0.48411934552454283, "grad_norm": 0.12102865427732468, "kl": 0.084228515625, "learning_rate": 6.106888321760144e-07, "loss": 0.0139, "reward": 0.08888888359069824, "reward_std": 0.044305335730314255, "rewards/accuracy_multibox_reward": 0.08888888359069824, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 3697.013916015625, "epoch": 0.4850818094321463, "grad_norm": 0.328457236289978, "kl": 0.07861328125, "learning_rate": 6.092692757436862e-07, "loss": 0.0364, "reward": 0.058333334513008595, "reward_std": 0.11787866801023483, "rewards/accuracy_multibox_reward": 0.058333334513008595, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 3648.6666870117188, "epoch": 0.48604427333974976, "grad_norm": 0.18200773000717163, "kl": 0.072021484375, "learning_rate": 6.078491188366859e-07, "loss": 0.0202, "reward": 0.26388889737427235, "reward_std": 0.13930176524445415, "rewards/accuracy_multibox_reward": 0.26388889737427235, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 3418.77783203125, "epoch": 0.4870067372473532, "grad_norm": 1.0849215984344482, "kl": 0.07501220703125, "learning_rate": 6.064283758430455e-07, "loss": 0.1054, "reward": 0.211111125536263, "reward_std": 0.22042057663202286, "rewards/accuracy_multibox_reward": 0.211111125536263, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 3636.5556030273438, "epoch": 0.4879692011549567, "grad_norm": 0.10336239635944366, "kl": 0.0819091796875, "learning_rate": 6.050070611567355e-07, "loss": 0.0112, "reward": 0.11666666902601719, "reward_std": 0.0513657545670867, "rewards/accuracy_multibox_reward": 0.11666666902601719, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 3102.0000610351562, "epoch": 0.48893166506256014, "grad_norm": 0.18454711139202118, "kl": 0.06256103515625, "learning_rate": 6.03585189177518e-07, "loss": -0.0176, "reward": 0.06666666455566883, "reward_std": 0.10381977260112762, "rewards/accuracy_multibox_reward": 0.06666666455566883, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 2927.3750610351562, "epoch": 0.4898941289701636, "grad_norm": 0.3327130079269409, "kl": 0.05462646484375, "learning_rate": 6.021627743108008e-07, "loss": 0.0186, "reward": 0.3444444537162781, "reward_std": 0.17123108357191086, "rewards/accuracy_multibox_reward": 0.3444444537162781, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 3409.0972900390625, "epoch": 0.49085659287776706, "grad_norm": 0.2277969866991043, "kl": 0.1015625, "learning_rate": 6.007398309674927e-07, "loss": 0.0146, "reward": 0.06736111268401146, "reward_std": 0.1553131826221943, "rewards/accuracy_multibox_reward": 0.06736111268401146, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 3984.1666870117188, "epoch": 0.4918190567853705, "grad_norm": 0.13976094126701355, "kl": 0.1060791015625, "learning_rate": 5.993163735638561e-07, "loss": 0.0092, "reward": 0.04722222313284874, "reward_std": 0.05314201861619949, "rewards/accuracy_multibox_reward": 0.04722222313284874, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 3701.0416870117188, "epoch": 0.49278152069297404, "grad_norm": 0.28218549489974976, "kl": 0.1214599609375, "learning_rate": 5.978924165213613e-07, "loss": 0.009, "reward": 0.2638888880610466, "reward_std": 0.18938182294368744, "rewards/accuracy_multibox_reward": 0.2638888880610466, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 3520.4166870117188, "epoch": 0.4937439846005775, "grad_norm": 0.5459305047988892, "kl": 0.1156005859375, "learning_rate": 5.964679742665413e-07, "loss": 0.0597, "reward": 0.25833332631736994, "reward_std": 0.1592269316315651, "rewards/accuracy_multibox_reward": 0.25833332631736994, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 3877.47216796875, "epoch": 0.49470644850818096, "grad_norm": 0.12950903177261353, "kl": 0.0872802734375, "learning_rate": 5.950430612308444e-07, "loss": 0.009, "reward": 0.01666666753590107, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.01666666753590107, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 3420.3194580078125, "epoch": 0.4956689124157844, "grad_norm": 0.3592342138290405, "kl": 0.123779296875, "learning_rate": 5.936176918504882e-07, "loss": 0.0255, "reward": 0.22500000521540642, "reward_std": 0.13839105796068907, "rewards/accuracy_multibox_reward": 0.22500000521540642, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 3483.4166870117188, "epoch": 0.4966313763233879, "grad_norm": 0.3443153202533722, "kl": 0.12890625, "learning_rate": 5.921918805663149e-07, "loss": 0.0398, "reward": 0.04722222313284874, "reward_std": 0.05314201861619949, "rewards/accuracy_multibox_reward": 0.04722222313284874, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 3626.27783203125, "epoch": 0.49759384023099135, "grad_norm": 0.5960866212844849, "kl": 0.11224365234375, "learning_rate": 5.907656418236426e-07, "loss": 0.0494, "reward": 0.23888888582587242, "reward_std": 0.2542063407599926, "rewards/accuracy_multibox_reward": 0.23888888582587242, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 3622.1527709960938, "epoch": 0.4985563041385948, "grad_norm": 0.6622093915939331, "kl": 0.1234130859375, "learning_rate": 5.8933899007212e-07, "loss": 0.0512, "reward": 0.1888888943940401, "reward_std": 0.15383683517575264, "rewards/accuracy_multibox_reward": 0.1888888943940401, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 4044.5556030273438, "epoch": 0.49951876804619827, "grad_norm": 0.16040094196796417, "kl": 0.1534423828125, "learning_rate": 5.879119397655812e-07, "loss": 0.0085, "reward": 0.01666666753590107, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.01666666753590107, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 3584.6527709960938, "epoch": 0.5004812319538018, "grad_norm": 0.2761547565460205, "kl": 0.11083984375, "learning_rate": 5.864845053618975e-07, "loss": 0.0433, "reward": 0.15555555559694767, "reward_std": 0.1083974577486515, "rewards/accuracy_multibox_reward": 0.15555555559694767, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 3815.9444580078125, "epoch": 0.5014436958614052, "grad_norm": 0.5335500240325928, "kl": 0.1695556640625, "learning_rate": 5.850567013228314e-07, "loss": 0.0686, "reward": 0.284722238779068, "reward_std": 0.21446025371551514, "rewards/accuracy_multibox_reward": 0.284722238779068, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 3361.52783203125, "epoch": 0.5024061597690087, "grad_norm": 0.7363224029541016, "kl": 0.145263671875, "learning_rate": 5.836285421138909e-07, "loss": -0.0468, "reward": 0.31111111491918564, "reward_std": 0.07836419809609652, "rewards/accuracy_multibox_reward": 0.31111111491918564, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 3785.4027709960938, "epoch": 0.5033686236766122, "grad_norm": 0.386507511138916, "kl": 0.17529296875, "learning_rate": 5.822000422041817e-07, "loss": 0.0385, "reward": 0.1555555621162057, "reward_std": 0.29228584095835686, "rewards/accuracy_multibox_reward": 0.1555555621162057, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 3487.6388549804688, "epoch": 0.5043310875842156, "grad_norm": 0.36943694949150085, "kl": 0.197265625, "learning_rate": 5.807712160662618e-07, "loss": 0.0567, "reward": 0.058333334513008595, "reward_std": 0.1156703345477581, "rewards/accuracy_multibox_reward": 0.058333334513008595, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 3613.1806030273438, "epoch": 0.5052935514918191, "grad_norm": 0.838036298751831, "kl": 0.14892578125, "learning_rate": 5.793420781759938e-07, "loss": 0.0885, "reward": 0.26388889364898205, "reward_std": 0.27887914702296257, "rewards/accuracy_multibox_reward": 0.26388889364898205, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 3629.15283203125, "epoch": 0.5062560153994226, "grad_norm": 0.1435745805501938, "kl": 0.154052734375, "learning_rate": 5.77912643012399e-07, "loss": 0.016, "reward": 0.12222222611308098, "reward_std": 0.097181785851717, "rewards/accuracy_multibox_reward": 0.12222222611308098, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 3662.763916015625, "epoch": 0.507218479307026, "grad_norm": 0.437927782535553, "kl": 0.1873779296875, "learning_rate": 5.764829250575106e-07, "loss": 0.0435, "reward": 0.28611110895872116, "reward_std": 0.1245713415555656, "rewards/accuracy_multibox_reward": 0.28611110895872116, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 3541.2916870117188, "epoch": 0.5081809432146295, "grad_norm": 0.2493341565132141, "kl": 0.222900390625, "learning_rate": 5.750529387962268e-07, "loss": 0.0393, "reward": 0.09444444812834263, "reward_std": 0.09573778137564659, "rewards/accuracy_multibox_reward": 0.09444444812834263, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 3806.4722900390625, "epoch": 0.5091434071222329, "grad_norm": 0.16213388741016388, "kl": 0.25634765625, "learning_rate": 5.736226987161637e-07, "loss": 0.0392, "reward": 0.03333333507180214, "reward_std": 0.08164966106414795, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 3877.3194580078125, "epoch": 0.5101058710298364, "grad_norm": 0.33630263805389404, "kl": 0.267822265625, "learning_rate": 5.721922193075095e-07, "loss": 0.0529, "reward": 0.21944444999098778, "reward_std": 0.20147881284356117, "rewards/accuracy_multibox_reward": 0.21944444999098778, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 3432.9722900390625, "epoch": 0.5110683349374399, "grad_norm": 0.4518989324569702, "kl": 0.3046875, "learning_rate": 5.707615150628765e-07, "loss": 0.0164, "reward": 0.30000001192092896, "reward_std": 0.10941680427640676, "rewards/accuracy_multibox_reward": 0.30000001192092896, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 3880.02783203125, "epoch": 0.5120307988450433, "grad_norm": 0.24155446887016296, "kl": 0.268798828125, "learning_rate": 5.693306004771556e-07, "loss": 0.0279, "reward": 0.15000000223517418, "reward_std": 0.17690759152173996, "rewards/accuracy_multibox_reward": 0.15000000223517418, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 3839.8611450195312, "epoch": 0.5129932627526468, "grad_norm": 0.4043300747871399, "kl": 0.24462890625, "learning_rate": 5.678994900473683e-07, "loss": 0.0496, "reward": 0.06666667014360428, "reward_std": 0.13328944519162178, "rewards/accuracy_multibox_reward": 0.06666667014360428, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 3654.8055419921875, "epoch": 0.5139557266602502, "grad_norm": 0.2262481153011322, "kl": 0.22412109375, "learning_rate": 5.664681982725199e-07, "loss": 0.0335, "reward": 0.11944444850087166, "reward_std": 0.11742250621318817, "rewards/accuracy_multibox_reward": 0.11944444850087166, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 3583.125, "epoch": 0.5149181905678537, "grad_norm": 0.38706448674201965, "kl": 0.260009765625, "learning_rate": 5.650367396534536e-07, "loss": 0.0131, "reward": 0.13333333376795053, "reward_std": 0.19597504287958145, "rewards/accuracy_multibox_reward": 0.13333333376795053, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 3449.4862060546875, "epoch": 0.5158806544754572, "grad_norm": 0.27941134572029114, "kl": 0.210205078125, "learning_rate": 5.636051286927029e-07, "loss": 0.009, "reward": 0.1138888904824853, "reward_std": 0.03402068838477135, "rewards/accuracy_multibox_reward": 0.1138888904824853, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 3432.388916015625, "epoch": 0.5168431183830606, "grad_norm": 0.16507090628147125, "kl": 0.196533203125, "learning_rate": 5.621733798943439e-07, "loss": 0.0255, "reward": 0.20555556658655405, "reward_std": 0.09020515158772469, "rewards/accuracy_multibox_reward": 0.20555556658655405, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 4046.041748046875, "epoch": 0.5178055822906641, "grad_norm": 0.3485652208328247, "kl": 0.408203125, "learning_rate": 5.607415077638504e-07, "loss": 0.0639, "reward": 0.24722222983837128, "reward_std": 0.2145540937781334, "rewards/accuracy_multibox_reward": 0.24722222983837128, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 4297.6944580078125, "epoch": 0.5187680461982676, "grad_norm": 0.2475382536649704, "kl": 0.324951171875, "learning_rate": 5.593095268079448e-07, "loss": 0.0347, "reward": 0.21111111342906952, "reward_std": 0.25067583844065666, "rewards/accuracy_multibox_reward": 0.21111111342906952, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 3801.6250610351562, "epoch": 0.519730510105871, "grad_norm": 0.5661216974258423, "kl": 0.27685546875, "learning_rate": 5.578774515344526e-07, "loss": 0.0624, "reward": 0.2638888955116272, "reward_std": 0.319710448384285, "rewards/accuracy_multibox_reward": 0.2638888955116272, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 3985.7777709960938, "epoch": 0.5206929740134745, "grad_norm": 0.3925462067127228, "kl": 0.30615234375, "learning_rate": 5.564452964521543e-07, "loss": 0.0394, "reward": 0.24166667833924294, "reward_std": 0.24361707270145416, "rewards/accuracy_multibox_reward": 0.24166667833924294, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 3808.5139770507812, "epoch": 0.5216554379210779, "grad_norm": 0.46257349848747253, "kl": 0.37890625, "learning_rate": 5.550130760706395e-07, "loss": 0.0166, "reward": 0.09444444626569748, "reward_std": 0.10383758693933487, "rewards/accuracy_multibox_reward": 0.09444444626569748, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 3945.5972900390625, "epoch": 0.5226179018286814, "grad_norm": 0.3608599305152893, "kl": 0.2978515625, "learning_rate": 5.535808049001591e-07, "loss": 0.013, "reward": 0.022916667396202683, "reward_std": 0.05613413732498884, "rewards/accuracy_multibox_reward": 0.022916667396202683, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 3616.22216796875, "epoch": 0.5235803657362849, "grad_norm": 0.28265345096588135, "kl": 0.2763671875, "learning_rate": 5.521484974514784e-07, "loss": 0.0342, "reward": 0.2711805496364832, "reward_std": 0.21610762923955917, "rewards/accuracy_multibox_reward": 0.2711805496364832, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 4035.0972290039062, "epoch": 0.5245428296438883, "grad_norm": 0.41573020815849304, "kl": 0.32373046875, "learning_rate": 5.507161682357306e-07, "loss": 0.0096, "reward": 0.12777778040617704, "reward_std": 0.12247448787093163, "rewards/accuracy_multibox_reward": 0.12777778040617704, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 4019.2916259765625, "epoch": 0.5255052935514918, "grad_norm": 0.5005810856819153, "kl": 0.310546875, "learning_rate": 5.492838317642694e-07, "loss": -0.0056, "reward": 0.25000001210719347, "reward_std": 0.19890336971729994, "rewards/accuracy_multibox_reward": 0.25000001210719347, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 4041.3194580078125, "epoch": 0.5264677574590952, "grad_norm": 0.28957727551460266, "kl": 0.33203125, "learning_rate": 5.478515025485215e-07, "loss": 0.0343, "reward": 0.033333334140479565, "reward_std": 0.08164965733885765, "rewards/accuracy_multibox_reward": 0.033333334140479565, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 3867.9583740234375, "epoch": 0.5274302213666987, "grad_norm": 0.23154979944229126, "kl": 0.25830078125, "learning_rate": 5.46419195099841e-07, "loss": 0.0206, "reward": 0.1527777798473835, "reward_std": 0.09706042427569628, "rewards/accuracy_multibox_reward": 0.1527777798473835, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 3896.0833740234375, "epoch": 0.5283926852743022, "grad_norm": 0.4197494089603424, "kl": 0.291015625, "learning_rate": 5.449869239293605e-07, "loss": 0.0067, "reward": 0.1861111167818308, "reward_std": 0.13922948949038982, "rewards/accuracy_multibox_reward": 0.1861111167818308, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 3945.8888549804688, "epoch": 0.5293551491819056, "grad_norm": 0.21555788815021515, "kl": 0.271240234375, "learning_rate": 5.435547035478456e-07, "loss": 0.0263, "reward": 0.22500000335276127, "reward_std": 0.13788525387644768, "rewards/accuracy_multibox_reward": 0.22500000335276127, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 3886.5416870117188, "epoch": 0.5303176130895092, "grad_norm": 0.18396471440792084, "kl": 0.25341796875, "learning_rate": 5.421225484655475e-07, "loss": 0.0322, "reward": 0.16805555718019605, "reward_std": 0.12328904587775469, "rewards/accuracy_multibox_reward": 0.16805555718019605, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 3961.3195190429688, "epoch": 0.5312800769971127, "grad_norm": 0.38764241337776184, "kl": 0.268798828125, "learning_rate": 5.406904731920552e-07, "loss": 0.0253, "reward": 0.25833334121853113, "reward_std": 0.17644505575299263, "rewards/accuracy_multibox_reward": 0.25833334121853113, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 3772.0972900390625, "epoch": 0.5322425409047161, "grad_norm": 0.21507379412651062, "kl": 0.25146484375, "learning_rate": 5.392584922361496e-07, "loss": 0.0434, "reward": 0.08809523936361074, "reward_std": 0.1353185772895813, "rewards/accuracy_multibox_reward": 0.08809523936361074, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 3877.5416259765625, "epoch": 0.5332050048123196, "grad_norm": 0.3207280933856964, "kl": 0.260986328125, "learning_rate": 5.378266201056561e-07, "loss": 0.0594, "reward": 0.11388889327645302, "reward_std": 0.14718842506408691, "rewards/accuracy_multibox_reward": 0.11388889327645302, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 4113.916687011719, "epoch": 0.534167468719923, "grad_norm": 0.150282084941864, "kl": 0.2236328125, "learning_rate": 5.363948713072973e-07, "loss": 0.0395, "reward": 0.08414039202034473, "reward_std": 0.1224910356104374, "rewards/accuracy_multibox_reward": 0.08414039202034473, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 4086.736083984375, "epoch": 0.5351299326275265, "grad_norm": 0.20541338622570038, "kl": 0.226318359375, "learning_rate": 5.349632603465466e-07, "loss": 0.019, "reward": 0.08055555634200573, "reward_std": 0.14115842804312706, "rewards/accuracy_multibox_reward": 0.08055555634200573, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 4064.4028930664062, "epoch": 0.53609239653513, "grad_norm": 0.20287075638771057, "kl": 0.260009765625, "learning_rate": 5.335318017274802e-07, "loss": 0.0206, "reward": 0.05277777835726738, "reward_std": 0.05813458189368248, "rewards/accuracy_multibox_reward": 0.05277777835726738, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 3986.8195190429688, "epoch": 0.5370548604427334, "grad_norm": 0.3114657998085022, "kl": 0.237060546875, "learning_rate": 5.321005099526319e-07, "loss": 0.0534, "reward": 0.08611111529171467, "reward_std": 0.18091841042041779, "rewards/accuracy_multibox_reward": 0.08611111529171467, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 3881.4861450195312, "epoch": 0.5380173243503369, "grad_norm": 0.15287050604820251, "kl": 0.257080078125, "learning_rate": 5.306693995228443e-07, "loss": 0.0173, "reward": 0.0833333358168602, "reward_std": 0.13642191886901855, "rewards/accuracy_multibox_reward": 0.0833333358168602, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 3950.6387939453125, "epoch": 0.5389797882579404, "grad_norm": 0.22614151239395142, "kl": 0.204345703125, "learning_rate": 5.292384849371234e-07, "loss": -0.0067, "reward": 0.18055556807667017, "reward_std": 0.095002181828022, "rewards/accuracy_multibox_reward": 0.18055556807667017, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 4000.6944580078125, "epoch": 0.5399422521655438, "grad_norm": 0.4775259792804718, "kl": 0.21533203125, "learning_rate": 5.278077806924906e-07, "loss": 0.0545, "reward": 0.21944443974643946, "reward_std": 0.2693425826728344, "rewards/accuracy_multibox_reward": 0.21944443974643946, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 3906.1250610351562, "epoch": 0.5409047160731473, "grad_norm": 0.18929684162139893, "kl": 0.2080078125, "learning_rate": 5.263773012838364e-07, "loss": 0.0269, "reward": 0.19166666641831398, "reward_std": 0.12112953094765544, "rewards/accuracy_multibox_reward": 0.19166666641831398, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 3594.27783203125, "epoch": 0.5418671799807507, "grad_norm": 0.8284193277359009, "kl": 0.19091796875, "learning_rate": 5.249470612037734e-07, "loss": 0.1004, "reward": 0.22486772760748863, "reward_std": 0.23692556843161583, "rewards/accuracy_multibox_reward": 0.22486772760748863, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 3826.6943969726562, "epoch": 0.5428296438883542, "grad_norm": 0.2741878628730774, "kl": 0.2080078125, "learning_rate": 5.235170749424893e-07, "loss": 0.0316, "reward": 0.13611110672354698, "reward_std": 0.0963914506137371, "rewards/accuracy_multibox_reward": 0.13611110672354698, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 3559.27783203125, "epoch": 0.5437921077959577, "grad_norm": 0.5702493786811829, "kl": 0.173095703125, "learning_rate": 5.220873569876011e-07, "loss": 0.0638, "reward": 0.18888888508081436, "reward_std": 0.18491437286138535, "rewards/accuracy_multibox_reward": 0.18888888508081436, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 3911.6666870117188, "epoch": 0.5447545717035611, "grad_norm": 0.28968921303749084, "kl": 0.227783203125, "learning_rate": 5.206579218240062e-07, "loss": 0.0048, "reward": 0.19444444123655558, "reward_std": 0.08502903953194618, "rewards/accuracy_multibox_reward": 0.19444444123655558, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 4070.2222900390625, "epoch": 0.5457170356111646, "grad_norm": 0.5173502564430237, "kl": 0.202392578125, "learning_rate": 5.192287839337383e-07, "loss": 0.0201, "reward": 0.18333333916962147, "reward_std": 0.12644988298416138, "rewards/accuracy_multibox_reward": 0.18333333916962147, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 3985.3750610351562, "epoch": 0.546679499518768, "grad_norm": 0.278224378824234, "kl": 0.29052734375, "learning_rate": 5.177999577958183e-07, "loss": 0.0154, "reward": 0.0694444477558136, "reward_std": 0.05417735129594803, "rewards/accuracy_multibox_reward": 0.0694444477558136, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 3872.5277709960938, "epoch": 0.5476419634263715, "grad_norm": 0.21936258673667908, "kl": 0.256103515625, "learning_rate": 5.163714578861091e-07, "loss": 0.0498, "reward": 0.1138888904824853, "reward_std": 0.18125754967331886, "rewards/accuracy_multibox_reward": 0.1138888904824853, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 4067.9583740234375, "epoch": 0.548604427333975, "grad_norm": 0.41079092025756836, "kl": 0.3232421875, "learning_rate": 5.149432986771686e-07, "loss": 0.0159, "reward": 0.06689814664423466, "reward_std": 0.11205330118536949, "rewards/accuracy_multibox_reward": 0.06689814664423466, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 4034.27783203125, "epoch": 0.5495668912415784, "grad_norm": 0.2903394103050232, "kl": 0.30224609375, "learning_rate": 5.135154946381026e-07, "loss": 0.0154, "reward": 0.06666667014360428, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.06666667014360428, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 3993.6527709960938, "epoch": 0.5505293551491819, "grad_norm": 0.24677245318889618, "kl": 0.268798828125, "learning_rate": 5.120880602344187e-07, "loss": 0.0129, "reward": 0.06388889066874981, "reward_std": 0.09291093796491623, "rewards/accuracy_multibox_reward": 0.06388889066874981, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 3848.8472900390625, "epoch": 0.5514918190567853, "grad_norm": 0.26237183809280396, "kl": 0.244873046875, "learning_rate": 5.106610099278801e-07, "loss": 0.0387, "reward": 0.2083333283662796, "reward_std": 0.15753640979528427, "rewards/accuracy_multibox_reward": 0.2083333283662796, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 3980.3889770507812, "epoch": 0.5524542829643888, "grad_norm": 0.2823127210140228, "kl": 0.2423095703125, "learning_rate": 5.092343581763576e-07, "loss": 0.0146, "reward": 0.12222221586853266, "reward_std": 0.04314939584583044, "rewards/accuracy_multibox_reward": 0.12222221586853266, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 3815.4861450195312, "epoch": 0.5534167468719923, "grad_norm": 0.220231294631958, "kl": 0.282470703125, "learning_rate": 5.078081194336853e-07, "loss": 0.0481, "reward": 0.14038461120799184, "reward_std": 0.1469232514500618, "rewards/accuracy_multibox_reward": 0.14038461120799184, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 4188.263916015625, "epoch": 0.5543792107795957, "grad_norm": 0.35187411308288574, "kl": 0.2255859375, "learning_rate": 5.063823081495117e-07, "loss": 0.0327, "reward": 0.20000000298023224, "reward_std": 0.21368711441755295, "rewards/accuracy_multibox_reward": 0.20000000298023224, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 4164.5001220703125, "epoch": 0.5553416746871992, "grad_norm": 0.3526155650615692, "kl": 0.2578125, "learning_rate": 5.049569387691557e-07, "loss": 0.0104, "reward": 0.20000001043081284, "reward_std": 0.12005183845758438, "rewards/accuracy_multibox_reward": 0.20000001043081284, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 4240.6251220703125, "epoch": 0.5563041385948027, "grad_norm": 0.22636735439300537, "kl": 0.271240234375, "learning_rate": 5.035320257334587e-07, "loss": 0.0332, "reward": 0.11805555783212185, "reward_std": 0.1747179590165615, "rewards/accuracy_multibox_reward": 0.11805555783212185, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 3875.375, "epoch": 0.5572666025024061, "grad_norm": 0.24035298824310303, "kl": 0.2490234375, "learning_rate": 5.021075834786386e-07, "loss": 0.0489, "reward": 0.08888888917863369, "reward_std": 0.09095283225178719, "rewards/accuracy_multibox_reward": 0.08888888917863369, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 4056.59716796875, "epoch": 0.5582290664100096, "grad_norm": 0.3038846552371979, "kl": 0.22265625, "learning_rate": 5.006836264361441e-07, "loss": 0.0011, "reward": 0.2944444492459297, "reward_std": 0.052447676192969084, "rewards/accuracy_multibox_reward": 0.2944444492459297, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 3964.0416259765625, "epoch": 0.559191530317613, "grad_norm": 0.18873348832130432, "kl": 0.270751953125, "learning_rate": 4.992601690325073e-07, "loss": 0.0353, "reward": 0.29444443993270397, "reward_std": 0.15684858337044716, "rewards/accuracy_multibox_reward": 0.29444443993270397, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 3816.3612060546875, "epoch": 0.5601539942252165, "grad_norm": 0.31492364406585693, "kl": 0.183349609375, "learning_rate": 4.978372256891992e-07, "loss": -0.0074, "reward": 0.2638888955116272, "reward_std": 0.08167735114693642, "rewards/accuracy_multibox_reward": 0.2638888955116272, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 3932.4027709960938, "epoch": 0.5611164581328201, "grad_norm": 0.2693656086921692, "kl": 0.2135009765625, "learning_rate": 4.964148108224821e-07, "loss": 0.0153, "reward": 0.2805555574595928, "reward_std": 0.17288116738200188, "rewards/accuracy_multibox_reward": 0.2805555574595928, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 3270.4722900390625, "epoch": 0.5620789220404235, "grad_norm": 0.21125417947769165, "kl": 0.1429443359375, "learning_rate": 4.949929388432645e-07, "loss": 0.0019, "reward": 0.22453704103827477, "reward_std": 0.15575147792696953, "rewards/accuracy_multibox_reward": 0.22453704103827477, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 3780.2222290039062, "epoch": 0.563041385948027, "grad_norm": 0.5510251522064209, "kl": 0.172607421875, "learning_rate": 4.935716241569547e-07, "loss": 0.0664, "reward": 0.1388888880610466, "reward_std": 0.20453689992427826, "rewards/accuracy_multibox_reward": 0.1388888880610466, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 3892.083251953125, "epoch": 0.5640038498556305, "grad_norm": 0.17825822532176971, "kl": 0.19921875, "learning_rate": 4.921508811633142e-07, "loss": 0.0222, "reward": 0.21944444440305233, "reward_std": 0.13553372025489807, "rewards/accuracy_multibox_reward": 0.21944444440305233, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 3817.6388549804688, "epoch": 0.5649663137632339, "grad_norm": 0.3809424638748169, "kl": 0.23779296875, "learning_rate": 4.907307242563137e-07, "loss": 0.043, "reward": 0.25000000558793545, "reward_std": 0.2036091717891395, "rewards/accuracy_multibox_reward": 0.25000000558793545, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 3609.0972290039062, "epoch": 0.5659287776708374, "grad_norm": 0.18043872714042664, "kl": 0.160888671875, "learning_rate": 4.893111678239854e-07, "loss": 0.0244, "reward": 0.15555555373430252, "reward_std": 0.10576354712247849, "rewards/accuracy_multibox_reward": 0.15555555373430252, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 4050.4027709960938, "epoch": 0.5668912415784408, "grad_norm": 0.21128909289836884, "kl": 0.171630859375, "learning_rate": 4.878922262482779e-07, "loss": 0.0339, "reward": 0.1305555598810315, "reward_std": 0.12648530304431915, "rewards/accuracy_multibox_reward": 0.1305555598810315, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 4172.805603027344, "epoch": 0.5678537054860443, "grad_norm": 0.17088694870471954, "kl": 0.24755859375, "learning_rate": 4.864739139049108e-07, "loss": 0.0245, "reward": 0.13611110672354698, "reward_std": 0.09876299649477005, "rewards/accuracy_multibox_reward": 0.13611110672354698, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 3984.7083740234375, "epoch": 0.5688161693936478, "grad_norm": 0.3422278165817261, "kl": 0.265380859375, "learning_rate": 4.85056245163228e-07, "loss": 0.0242, "reward": 0.1111111156642437, "reward_std": 0.14499705284833908, "rewards/accuracy_multibox_reward": 0.1111111156642437, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 3244.5972290039062, "epoch": 0.5697786333012512, "grad_norm": 0.5458455681800842, "kl": 0.157958984375, "learning_rate": 4.836392343860541e-07, "loss": 0.0446, "reward": 0.2750000096857548, "reward_std": 0.2083052210509777, "rewards/accuracy_multibox_reward": 0.2750000096857548, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 3723.5556640625, "epoch": 0.5707410972088547, "grad_norm": 0.16273659467697144, "kl": 0.21826171875, "learning_rate": 4.822228959295465e-07, "loss": 0.0256, "reward": 0.32777778804302216, "reward_std": 0.10618913266807795, "rewards/accuracy_multibox_reward": 0.32777778804302216, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 3917.34716796875, "epoch": 0.5717035611164581, "grad_norm": 0.21477922797203064, "kl": 0.21923828125, "learning_rate": 4.808072441430521e-07, "loss": 0.0375, "reward": 0.08611110970377922, "reward_std": 0.11096306517720222, "rewards/accuracy_multibox_reward": 0.08611110970377922, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 3833.7083740234375, "epoch": 0.5726660250240616, "grad_norm": 0.33271124958992004, "kl": 0.2578125, "learning_rate": 4.7939229336896e-07, "loss": 0.0404, "reward": 0.15555556118488312, "reward_std": 0.21942578256130219, "rewards/accuracy_multibox_reward": 0.15555556118488312, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 3799.52783203125, "epoch": 0.5736284889316651, "grad_norm": 0.16301937401294708, "kl": 0.2109375, "learning_rate": 4.779780579425582e-07, "loss": 0.0176, "reward": 0.060108025558292866, "reward_std": 0.06696937419474125, "rewards/accuracy_multibox_reward": 0.060108025558292866, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 3839.5694580078125, "epoch": 0.5745909528392685, "grad_norm": 0.2432936578989029, "kl": 0.236328125, "learning_rate": 4.765645521918862e-07, "loss": 0.015, "reward": 0.32500001043081284, "reward_std": 0.1483340859413147, "rewards/accuracy_multibox_reward": 0.32500001043081284, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 3913.763916015625, "epoch": 0.575553416746872, "grad_norm": 0.16713795065879822, "kl": 0.2109375, "learning_rate": 4.7515179043759146e-07, "loss": 0.0182, "reward": 0.15555554814636707, "reward_std": 0.1818312145769596, "rewards/accuracy_multibox_reward": 0.15555554814636707, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 4004.0554809570312, "epoch": 0.5765158806544755, "grad_norm": 0.29602164030075073, "kl": 0.25146484375, "learning_rate": 4.7373978699278405e-07, "loss": 0.0435, "reward": 0.13055555894970894, "reward_std": 0.1624976322054863, "rewards/accuracy_multibox_reward": 0.13055555894970894, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 3919.4583740234375, "epoch": 0.5774783445620789, "grad_norm": 0.3672622740268707, "kl": 0.232421875, "learning_rate": 4.723285561628909e-07, "loss": 0.0671, "reward": 0.2527777785435319, "reward_std": 0.1730136126279831, "rewards/accuracy_multibox_reward": 0.2527777785435319, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 3917.2222290039062, "epoch": 0.5784408084696824, "grad_norm": 0.22706127166748047, "kl": 0.283447265625, "learning_rate": 4.709181122455117e-07, "loss": 0.0308, "reward": 0.044444444589316845, "reward_std": 0.08164965733885765, "rewards/accuracy_multibox_reward": 0.044444444589316845, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 3954.4445190429688, "epoch": 0.5794032723772858, "grad_norm": 0.304601788520813, "kl": 0.191650390625, "learning_rate": 4.6950846953027347e-07, "loss": 0.0023, "reward": 0.12500000931322575, "reward_std": 0.05623559094965458, "rewards/accuracy_multibox_reward": 0.12500000931322575, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 3734.9304809570312, "epoch": 0.5803657362848893, "grad_norm": 0.4020278751850128, "kl": 0.232177734375, "learning_rate": 4.6809964229868593e-07, "loss": 0.0421, "reward": 0.16666666883975267, "reward_std": 0.09215527027845383, "rewards/accuracy_multibox_reward": 0.16666666883975267, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 3822.1945190429688, "epoch": 0.5813282001924928, "grad_norm": 0.3287241756916046, "kl": 0.2412109375, "learning_rate": 4.6669164482399734e-07, "loss": 0.0455, "reward": 0.29118590615689754, "reward_std": 0.20695828180760145, "rewards/accuracy_multibox_reward": 0.29118590615689754, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 4176.166748046875, "epoch": 0.5822906641000962, "grad_norm": 0.21249189972877502, "kl": 0.224609375, "learning_rate": 4.6528449137104885e-07, "loss": 0.0196, "reward": 0.180555559694767, "reward_std": 0.20166004076600075, "rewards/accuracy_multibox_reward": 0.180555559694767, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 3775.541748046875, "epoch": 0.5832531280076997, "grad_norm": 0.2730811834335327, "kl": 0.2314453125, "learning_rate": 4.6387819619613134e-07, "loss": 0.0135, "reward": 0.08611110970377922, "reward_std": 0.11196815222501755, "rewards/accuracy_multibox_reward": 0.08611110970377922, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 4185.5694580078125, "epoch": 0.5842155919153031, "grad_norm": 0.30413639545440674, "kl": 0.262451171875, "learning_rate": 4.624727735468393e-07, "loss": 0.0149, "reward": 0.15555555745959282, "reward_std": 0.0633788825944066, "rewards/accuracy_multibox_reward": 0.15555555745959282, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 4114.34716796875, "epoch": 0.5851780558229066, "grad_norm": 0.3653811812400818, "kl": 0.286376953125, "learning_rate": 4.6106823766192813e-07, "loss": -0.0045, "reward": 0.10000000521540642, "reward_std": 0.1472368724644184, "rewards/accuracy_multibox_reward": 0.10000000521540642, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 3854.77783203125, "epoch": 0.5861405197305101, "grad_norm": 0.2597335875034332, "kl": 0.2802734375, "learning_rate": 4.59664602771169e-07, "loss": 0.0353, "reward": 0.23888888396322727, "reward_std": 0.13956326618790627, "rewards/accuracy_multibox_reward": 0.23888888396322727, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 4224.0694580078125, "epoch": 0.5871029836381135, "grad_norm": 0.33247604966163635, "kl": 0.243896484375, "learning_rate": 4.582618830952043e-07, "loss": 0.0187, "reward": 0.12777777016162872, "reward_std": 0.1587858498096466, "rewards/accuracy_multibox_reward": 0.12777777016162872, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 3844.2916259765625, "epoch": 0.588065447545717, "grad_norm": 0.349383145570755, "kl": 0.1884765625, "learning_rate": 4.5686009284540494e-07, "loss": -0.0054, "reward": 0.03333333507180214, "reward_std": 0.08164966106414795, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 3687.4861450195312, "epoch": 0.5890279114533205, "grad_norm": 0.16623732447624207, "kl": 0.233154296875, "learning_rate": 4.554592462237249e-07, "loss": 0.024, "reward": 0.12222222238779068, "reward_std": 0.09889999032020569, "rewards/accuracy_multibox_reward": 0.12222222238779068, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 3967.3472290039062, "epoch": 0.5899903753609239, "grad_norm": 0.2683219909667969, "kl": 0.220458984375, "learning_rate": 4.540593574225585e-07, "loss": 0.0211, "reward": 0.2027777750045061, "reward_std": 0.06837743986397982, "rewards/accuracy_multibox_reward": 0.2027777750045061, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 3986.791748046875, "epoch": 0.5909528392685275, "grad_norm": 0.23495875298976898, "kl": 0.206787109375, "learning_rate": 4.526604406245955e-07, "loss": 0.0046, "reward": 0.03333333507180214, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 3655.486083984375, "epoch": 0.591915303176131, "grad_norm": 0.2708987891674042, "kl": 0.14111328125, "learning_rate": 4.5126251000267846e-07, "loss": 0.0166, "reward": 0.2166666705161333, "reward_std": 0.1418878436088562, "rewards/accuracy_multibox_reward": 0.2166666705161333, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 4287.833312988281, "epoch": 0.5928777670837344, "grad_norm": 0.32113194465637207, "kl": 0.257568359375, "learning_rate": 4.4986557971965856e-07, "loss": 0.0304, "reward": 0.1770502720028162, "reward_std": 0.20743243396282196, "rewards/accuracy_multibox_reward": 0.1770502720028162, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 3946.4862060546875, "epoch": 0.5938402309913379, "grad_norm": 0.17708338797092438, "kl": 0.179443359375, "learning_rate": 4.484696639282521e-07, "loss": 0.0305, "reward": 0.19166667200624943, "reward_std": 0.103608806617558, "rewards/accuracy_multibox_reward": 0.19166667200624943, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 4011.8612060546875, "epoch": 0.5948026948989413, "grad_norm": 0.2947791516780853, "kl": 0.1865234375, "learning_rate": 4.4707477677089787e-07, "loss": 0.042, "reward": 0.10833332687616348, "reward_std": 0.19044022262096405, "rewards/accuracy_multibox_reward": 0.10833332687616348, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 3743.9305419921875, "epoch": 0.5957651588065448, "grad_norm": 0.40439558029174805, "kl": 0.1883544921875, "learning_rate": 4.4568093237961226e-07, "loss": 0.0442, "reward": 0.20086805615574121, "reward_std": 0.08377594826743007, "rewards/accuracy_multibox_reward": 0.20086805615574121, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 3671.1666870117188, "epoch": 0.5967276227141483, "grad_norm": 0.4145714044570923, "kl": 0.221923828125, "learning_rate": 4.4428814487584796e-07, "loss": 0.0281, "reward": 0.12777778133749962, "reward_std": 0.12704842910170555, "rewards/accuracy_multibox_reward": 0.12777778133749962, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 3930.0834350585938, "epoch": 0.5976900866217517, "grad_norm": 0.5296496748924255, "kl": 0.1455078125, "learning_rate": 4.428964283703496e-07, "loss": 0.0616, "reward": 0.1833333382382989, "reward_std": 0.18836922943592072, "rewards/accuracy_multibox_reward": 0.1833333382382989, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 3988.3195190429688, "epoch": 0.5986525505293552, "grad_norm": 0.23592084646224976, "kl": 0.1826171875, "learning_rate": 4.415057969630113e-07, "loss": 0.0001, "reward": 0.10833333525806665, "reward_std": 0.04262732435017824, "rewards/accuracy_multibox_reward": 0.10833333525806665, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 3559.90283203125, "epoch": 0.5996150144369586, "grad_norm": 0.36289486289024353, "kl": 0.152099609375, "learning_rate": 4.4011626474273355e-07, "loss": 0.0479, "reward": 0.11388889327645302, "reward_std": 0.1373002640902996, "rewards/accuracy_multibox_reward": 0.11388889327645302, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 3764.5972290039062, "epoch": 0.6005774783445621, "grad_norm": 0.15039704740047455, "kl": 0.1480712890625, "learning_rate": 4.3872784578728094e-07, "loss": 0.0094, "reward": 0.1722222249954939, "reward_std": 0.10420371312648058, "rewards/accuracy_multibox_reward": 0.1722222249954939, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 3862.1805419921875, "epoch": 0.6015399422521656, "grad_norm": 0.25626716017723083, "kl": 0.183349609375, "learning_rate": 4.373405541631394e-07, "loss": 0.0086, "reward": 0.08888888359069824, "reward_std": 0.11280136555433273, "rewards/accuracy_multibox_reward": 0.08888888359069824, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 3373.9027709960938, "epoch": 0.602502406159769, "grad_norm": 0.27734580636024475, "kl": 0.1282958984375, "learning_rate": 4.35954403925373e-07, "loss": -0.0021, "reward": 0.006172839552164078, "reward_std": 0.0151203079149127, "rewards/accuracy_multibox_reward": 0.006172839552164078, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 4306.77783203125, "epoch": 0.6034648700673725, "grad_norm": 0.2362910658121109, "kl": 0.2509765625, "learning_rate": 4.345694091174822e-07, "loss": 0.012, "reward": 0.07222221791744232, "reward_std": 0.11238162964582443, "rewards/accuracy_multibox_reward": 0.07222221791744232, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 4041.4166259765625, "epoch": 0.6044273339749759, "grad_norm": 0.2150043249130249, "kl": 0.239990234375, "learning_rate": 4.331855837712618e-07, "loss": 0.0316, "reward": 0.12777778320014477, "reward_std": 0.14152995496988297, "rewards/accuracy_multibox_reward": 0.12777778320014477, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 3925.52783203125, "epoch": 0.6053897978825794, "grad_norm": 0.8266373872756958, "kl": 0.147705078125, "learning_rate": 4.318029419066582e-07, "loss": 0.0545, "reward": 0.3397085703909397, "reward_std": 0.3692466877400875, "rewards/accuracy_multibox_reward": 0.3397085703909397, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 3629.8472900390625, "epoch": 0.6063522617901829, "grad_norm": 0.4267980754375458, "kl": 0.1722412109375, "learning_rate": 4.304214975316277e-07, "loss": 0.0098, "reward": 0.13333334308117628, "reward_std": 0.11119078379124403, "rewards/accuracy_multibox_reward": 0.13333334308117628, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 3769.6944580078125, "epoch": 0.6073147256977863, "grad_norm": 0.21871833503246307, "kl": 0.189453125, "learning_rate": 4.290412646419942e-07, "loss": 0.0081, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.10000000149011612, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 4046.0555419921875, "epoch": 0.6082771896053898, "grad_norm": 0.22810685634613037, "kl": 0.193359375, "learning_rate": 4.276622572213081e-07, "loss": 0.0094, "reward": 0.01666666753590107, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.01666666753590107, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 4072.8334350585938, "epoch": 0.6092396535129933, "grad_norm": 0.21080957353115082, "kl": 0.21728515625, "learning_rate": 4.262844892407039e-07, "loss": 0.0174, "reward": 0.1666666753590107, "reward_std": 0.09971506893634796, "rewards/accuracy_multibox_reward": 0.1666666753590107, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 3675.5972900390625, "epoch": 0.6102021174205967, "grad_norm": 0.20213140547275543, "kl": 0.168212890625, "learning_rate": 4.2490797465875893e-07, "loss": 0.0095, "reward": 0.14722222927957773, "reward_std": 0.08845378831028938, "rewards/accuracy_multibox_reward": 0.14722222927957773, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 4236.5140380859375, "epoch": 0.6111645813282002, "grad_norm": 0.20005442202091217, "kl": 0.1748046875, "learning_rate": 4.235327274213524e-07, "loss": 0.0065, "reward": 0.05833332985639572, "reward_std": 0.0830860286951065, "rewards/accuracy_multibox_reward": 0.05833332985639572, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 3916.3056030273438, "epoch": 0.6121270452358036, "grad_norm": 0.2801712155342102, "kl": 0.1834716796875, "learning_rate": 4.221587614615232e-07, "loss": 0.0441, "reward": 0.22777778655290604, "reward_std": 0.13435428589582443, "rewards/accuracy_multibox_reward": 0.22777778655290604, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 3765.5972290039062, "epoch": 0.6130895091434071, "grad_norm": 0.20007814466953278, "kl": 0.14697265625, "learning_rate": 4.2078609069932925e-07, "loss": 0.0055, "reward": 0.09444444812834263, "reward_std": 0.07979300245642662, "rewards/accuracy_multibox_reward": 0.09444444812834263, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 3799.5000610351562, "epoch": 0.6140519730510106, "grad_norm": 0.5685856938362122, "kl": 0.14111328125, "learning_rate": 4.194147290417068e-07, "loss": 0.0435, "reward": 0.22341269254684448, "reward_std": 0.18263425678014755, "rewards/accuracy_multibox_reward": 0.22341269254684448, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 3879.291748046875, "epoch": 0.615014436958614, "grad_norm": 0.22935132682323456, "kl": 0.157470703125, "learning_rate": 4.1804469038232883e-07, "loss": 0.0257, "reward": 0.11388889141380787, "reward_std": 0.08845379948616028, "rewards/accuracy_multibox_reward": 0.11388889141380787, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 3750.15283203125, "epoch": 0.6159769008662175, "grad_norm": 0.33813372254371643, "kl": 0.1953125, "learning_rate": 4.166759886014648e-07, "loss": 0.0355, "reward": 0.08611110970377922, "reward_std": 0.14640231803059578, "rewards/accuracy_multibox_reward": 0.08611110970377922, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 3888.263916015625, "epoch": 0.6169393647738209, "grad_norm": 0.29978710412979126, "kl": 0.205078125, "learning_rate": 4.1530863756583967e-07, "loss": 0.009, "reward": 0.1666666753590107, "reward_std": 0.2058177851140499, "rewards/accuracy_multibox_reward": 0.1666666753590107, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 3986.8472900390625, "epoch": 0.6179018286814244, "grad_norm": 0.40522632002830505, "kl": 0.192626953125, "learning_rate": 4.1394265112849394e-07, "loss": 0.0577, "reward": 0.11666666157543659, "reward_std": 0.16984771937131882, "rewards/accuracy_multibox_reward": 0.11666666157543659, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 3839.4306030273438, "epoch": 0.6188642925890279, "grad_norm": 0.17607729136943817, "kl": 0.1875, "learning_rate": 4.125780431286423e-07, "loss": 0.0102, "reward": 0.1527777798473835, "reward_std": 0.09706042520701885, "rewards/accuracy_multibox_reward": 0.1527777798473835, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 3429.875, "epoch": 0.6198267564966313, "grad_norm": 0.2845510244369507, "kl": 0.140869140625, "learning_rate": 4.1121482739153524e-07, "loss": 0.01, "reward": 0.21782408468425274, "reward_std": 0.23705870658159256, "rewards/accuracy_multibox_reward": 0.21782408468425274, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 3359.3056030273438, "epoch": 0.6207892204042348, "grad_norm": 0.6876607537269592, "kl": 0.14056396484375, "learning_rate": 4.098530177283167e-07, "loss": 0.0538, "reward": 0.32499999925494194, "reward_std": 0.3077152967453003, "rewards/accuracy_multibox_reward": 0.32499999925494194, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 4095.4861450195312, "epoch": 0.6217516843118384, "grad_norm": 0.34777340292930603, "kl": 0.231689453125, "learning_rate": 4.0849262793588546e-07, "loss": 0.0379, "reward": 0.11666666902601719, "reward_std": 0.12247449159622192, "rewards/accuracy_multibox_reward": 0.11666666902601719, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 4244.5, "epoch": 0.6227141482194418, "grad_norm": 0.31340381503105164, "kl": 0.242431640625, "learning_rate": 4.071336717967558e-07, "loss": 0.0109, "reward": 0.0518518527969718, "reward_std": 0.06342634558677673, "rewards/accuracy_multibox_reward": 0.0518518527969718, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 3893.25, "epoch": 0.6236766121270453, "grad_norm": 0.21620874106884003, "kl": 0.197021484375, "learning_rate": 4.057761630789167e-07, "loss": 0.0036, "reward": 0.09444444812834263, "reward_std": 0.14432327821850777, "rewards/accuracy_multibox_reward": 0.09444444812834263, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 3873.041748046875, "epoch": 0.6246390760346487, "grad_norm": 0.5552299618721008, "kl": 0.2080078125, "learning_rate": 4.0442011553569276e-07, "loss": 0.0515, "reward": 0.17393162846565247, "reward_std": 0.16587120294570923, "rewards/accuracy_multibox_reward": 0.17393162846565247, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 3941.7500610351562, "epoch": 0.6256015399422522, "grad_norm": 0.2998940944671631, "kl": 0.215576171875, "learning_rate": 4.030655429056056e-07, "loss": 0.0203, "reward": 0.31275253742933273, "reward_std": 0.26552341133356094, "rewards/accuracy_multibox_reward": 0.31275253742933273, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 3610.2222900390625, "epoch": 0.6265640038498557, "grad_norm": 0.33133426308631897, "kl": 0.1915283203125, "learning_rate": 4.017124589122334e-07, "loss": 0.0543, "reward": 0.23055556509643793, "reward_std": 0.16821619868278503, "rewards/accuracy_multibox_reward": 0.23055556509643793, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 4053.5834350585938, "epoch": 0.6275264677574591, "grad_norm": 0.21892429888248444, "kl": 0.25830078125, "learning_rate": 4.0036087726407287e-07, "loss": 0.0366, "reward": 0.11111111380159855, "reward_std": 0.09327250719070435, "rewards/accuracy_multibox_reward": 0.11111111380159855, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 3827.5416259765625, "epoch": 0.6284889316650626, "grad_norm": 0.24630647897720337, "kl": 0.2275390625, "learning_rate": 3.9901081165439987e-07, "loss": 0.0301, "reward": 0.2305555520579219, "reward_std": 0.18525847047567368, "rewards/accuracy_multibox_reward": 0.2305555520579219, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 4000.4722900390625, "epoch": 0.629451395572666, "grad_norm": 0.6962487101554871, "kl": 0.330078125, "learning_rate": 3.9766227576113096e-07, "loss": 0.0695, "reward": 0.2500000074505806, "reward_std": 0.24321002140641212, "rewards/accuracy_multibox_reward": 0.2500000074505806, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 3892.6527099609375, "epoch": 0.6304138594802695, "grad_norm": 0.30400577187538147, "kl": 0.290283203125, "learning_rate": 3.9631528324668493e-07, "loss": 0.0343, "reward": 0.2000000048428774, "reward_std": 0.11392355803400278, "rewards/accuracy_multibox_reward": 0.2000000048428774, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 3294.65283203125, "epoch": 0.631376323387873, "grad_norm": 0.25785964727401733, "kl": 0.238525390625, "learning_rate": 3.9496984775784315e-07, "loss": -0.0013, "reward": 0.21111111715435982, "reward_std": 0.10239813383668661, "rewards/accuracy_multibox_reward": 0.21111111715435982, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 3826.791748046875, "epoch": 0.6323387872954764, "grad_norm": 0.5379136800765991, "kl": 0.277099609375, "learning_rate": 3.9362598292561367e-07, "loss": -0.0057, "reward": 0.016812865622341633, "reward_std": 0.04118294268846512, "rewards/accuracy_multibox_reward": 0.016812865622341633, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 4143.458312988281, "epoch": 0.6333012512030799, "grad_norm": 0.36112621426582336, "kl": 0.280517578125, "learning_rate": 3.922837023650906e-07, "loss": 0.0113, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 3937.5416870117188, "epoch": 0.6342637151106834, "grad_norm": 0.8162462115287781, "kl": 0.3060302734375, "learning_rate": 3.9094301967531806e-07, "loss": 0.0768, "reward": 0.5333333481103182, "reward_std": 0.38630059361457825, "rewards/accuracy_multibox_reward": 0.5333333481103182, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 3520.2083740234375, "epoch": 0.6352261790182868, "grad_norm": 0.4137125611305237, "kl": 0.265625, "learning_rate": 3.8960394843915114e-07, "loss": 0.0106, "reward": 0.08333333674818277, "reward_std": 0.1751791089773178, "rewards/accuracy_multibox_reward": 0.08333333674818277, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 3898.0416870117188, "epoch": 0.6361886429258903, "grad_norm": 0.6535941362380981, "kl": 0.2939453125, "learning_rate": 3.882665022231193e-07, "loss": 0.0614, "reward": 0.14444444701075554, "reward_std": 0.20398468151688576, "rewards/accuracy_multibox_reward": 0.14444444701075554, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 3650.1666870117188, "epoch": 0.6371511068334937, "grad_norm": 0.6627215147018433, "kl": 0.281005859375, "learning_rate": 3.869306945772881e-07, "loss": 0.0058, "reward": 0.17222222685813904, "reward_std": 0.06131125055253506, "rewards/accuracy_multibox_reward": 0.17222222685813904, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 3578.40283203125, "epoch": 0.6381135707410972, "grad_norm": 0.6300303936004639, "kl": 0.266845703125, "learning_rate": 3.855965390351222e-07, "loss": -0.0084, "reward": 0.26527778059244156, "reward_std": 0.10366082563996315, "rewards/accuracy_multibox_reward": 0.26527778059244156, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 3916.77783203125, "epoch": 0.6390760346487007, "grad_norm": 0.29005637764930725, "kl": 0.35498046875, "learning_rate": 3.84264049113349e-07, "loss": 0.026, "reward": 0.09444444812834263, "reward_std": 0.07979300245642662, "rewards/accuracy_multibox_reward": 0.09444444812834263, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 3991.7777099609375, "epoch": 0.6400384985563041, "grad_norm": 0.40841221809387207, "kl": 0.3330078125, "learning_rate": 3.8293323831181966e-07, "loss": 0.0464, "reward": 0.18333332985639572, "reward_std": 0.24011782556772232, "rewards/accuracy_multibox_reward": 0.18333332985639572, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 4054.7222900390625, "epoch": 0.6410009624639076, "grad_norm": 0.3186582326889038, "kl": 0.34814453125, "learning_rate": 3.816041201133752e-07, "loss": 0.019, "reward": 0.06944444496184587, "reward_std": 0.13784046843647957, "rewards/accuracy_multibox_reward": 0.06944444496184587, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 3861.2222290039062, "epoch": 0.641963426371511, "grad_norm": 0.43369582295417786, "kl": 0.29443359375, "learning_rate": 3.8027670798370736e-07, "loss": 0.0223, "reward": 0.31111111491918564, "reward_std": 0.24514074996113777, "rewards/accuracy_multibox_reward": 0.31111111491918564, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 4027.0000610351562, "epoch": 0.6429258902791145, "grad_norm": 0.5350426435470581, "kl": 0.37255859375, "learning_rate": 3.789510153712233e-07, "loss": 0.0567, "reward": 0.2888888958841562, "reward_std": 0.19529825076460838, "rewards/accuracy_multibox_reward": 0.2888888958841562, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 3803.2777709960938, "epoch": 0.643888354186718, "grad_norm": 0.23920631408691406, "kl": 0.21435546875, "learning_rate": 3.776270557069099e-07, "loss": 0.0303, "reward": 0.08055555820465088, "reward_std": 0.040023140609264374, "rewards/accuracy_multibox_reward": 0.08055555820465088, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 3793.5278930664062, "epoch": 0.6448508180943214, "grad_norm": 0.30436474084854126, "kl": 0.2813720703125, "learning_rate": 3.763048424041961e-07, "loss": 0.0134, "reward": 0.37777779437601566, "reward_std": 0.20312510523945093, "rewards/accuracy_multibox_reward": 0.37777779437601566, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 3695.4306030273438, "epoch": 0.6458132820019249, "grad_norm": 0.5222062468528748, "kl": 0.296142578125, "learning_rate": 3.749843888588187e-07, "loss": 0.0416, "reward": 0.33750000037252903, "reward_std": 0.18162836972624063, "rewards/accuracy_multibox_reward": 0.33750000037252903, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 3883.6943969726562, "epoch": 0.6467757459095284, "grad_norm": 0.30882808566093445, "kl": 0.31982421875, "learning_rate": 3.736657084486852e-07, "loss": 0.0197, "reward": 0.10000000521540642, "reward_std": 0.10327956825494766, "rewards/accuracy_multibox_reward": 0.10000000521540642, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 3905.111083984375, "epoch": 0.6477382098171318, "grad_norm": 0.2614690661430359, "kl": 0.32470703125, "learning_rate": 3.7234881453373977e-07, "loss": 0.0243, "reward": 0.17777777463197708, "reward_std": 0.17270660400390625, "rewards/accuracy_multibox_reward": 0.17777777463197708, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 3721.7501220703125, "epoch": 0.6487006737247353, "grad_norm": 0.33878082036972046, "kl": 0.216064453125, "learning_rate": 3.710337204558264e-07, "loss": 0.0363, "reward": 0.23333334550261497, "reward_std": 0.16296062245965004, "rewards/accuracy_multibox_reward": 0.23333334550261497, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 4185.833312988281, "epoch": 0.6496631376323387, "grad_norm": 0.3346640467643738, "kl": 0.36279296875, "learning_rate": 3.6972043953855414e-07, "loss": 0.022, "reward": 0.22499999590218067, "reward_std": 0.12775749154388905, "rewards/accuracy_multibox_reward": 0.22499999590218067, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 3866.40283203125, "epoch": 0.6506256015399422, "grad_norm": 0.6601501107215881, "kl": 0.37353515625, "learning_rate": 3.684089850871636e-07, "loss": 0.0011, "reward": 0.1138888904824853, "reward_std": 0.03402068838477135, "rewards/accuracy_multibox_reward": 0.1138888904824853, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 3788.5695190429688, "epoch": 0.6515880654475458, "grad_norm": 0.2152867466211319, "kl": 0.232177734375, "learning_rate": 3.670993703883895e-07, "loss": 0.0177, "reward": 0.05000000260770321, "reward_std": 0.0924646146595478, "rewards/accuracy_multibox_reward": 0.05000000260770321, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 3827.2361450195312, "epoch": 0.6525505293551492, "grad_norm": 0.5893759727478027, "kl": 0.27978515625, "learning_rate": 3.6579160871032845e-07, "loss": 0.058, "reward": 0.25555556267499924, "reward_std": 0.17990728095173836, "rewards/accuracy_multibox_reward": 0.25555556267499924, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 3922.916748046875, "epoch": 0.6535129932627527, "grad_norm": 0.31547895073890686, "kl": 0.273681640625, "learning_rate": 3.6448571330230316e-07, "loss": 0.0356, "reward": 0.23456791415810585, "reward_std": 0.1515463888645172, "rewards/accuracy_multibox_reward": 0.23456791415810585, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 3998.7361450195312, "epoch": 0.6544754571703562, "grad_norm": 0.8393051624298096, "kl": 0.3515625, "learning_rate": 3.631816973947286e-07, "loss": 0.0796, "reward": 0.3138888981193304, "reward_std": 0.24396883323788643, "rewards/accuracy_multibox_reward": 0.3138888981193304, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 3513.7777709960938, "epoch": 0.6554379210779596, "grad_norm": 0.19181416928768158, "kl": 0.261962890625, "learning_rate": 3.618795741989782e-07, "loss": 0.0263, "reward": 0.18194445222616196, "reward_std": 0.10865546111017466, "rewards/accuracy_multibox_reward": 0.18194445222616196, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 3761.47216796875, "epoch": 0.6564003849855631, "grad_norm": 0.3826933205127716, "kl": 0.302978515625, "learning_rate": 3.6057935690724927e-07, "loss": 0.0291, "reward": 0.16111111640930176, "reward_std": 0.08162937685847282, "rewards/accuracy_multibox_reward": 0.16111111640930176, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 3738.6805419921875, "epoch": 0.6573628488931665, "grad_norm": 0.447295218706131, "kl": 0.319091796875, "learning_rate": 3.5928105869243043e-07, "loss": 0.0171, "reward": 0.16018519271165133, "reward_std": 0.16153568029403687, "rewards/accuracy_multibox_reward": 0.16018519271165133, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 3286.5138549804688, "epoch": 0.65832531280077, "grad_norm": 0.6815807223320007, "kl": 0.26171875, "learning_rate": 3.5798469270796714e-07, "loss": 0.0612, "reward": 0.261111106723547, "reward_std": 0.21763264387845993, "rewards/accuracy_multibox_reward": 0.261111106723547, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 3734.166748046875, "epoch": 0.6592877767083735, "grad_norm": 0.44469356536865234, "kl": 0.33544921875, "learning_rate": 3.566902720877293e-07, "loss": 0.0687, "reward": 0.236111119389534, "reward_std": 0.13517379760742188, "rewards/accuracy_multibox_reward": 0.236111119389534, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 3987.8611450195312, "epoch": 0.6602502406159769, "grad_norm": 0.28584492206573486, "kl": 0.3447265625, "learning_rate": 3.553978099458771e-07, "loss": 0.0218, "reward": 0.08333333674818277, "reward_std": 0.08819804340600967, "rewards/accuracy_multibox_reward": 0.08333333674818277, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 3737.5138549804688, "epoch": 0.6612127045235804, "grad_norm": 0.301457017660141, "kl": 0.226806640625, "learning_rate": 3.5410731937672923e-07, "loss": 0.01, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.10000000149011612, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 3532.875, "epoch": 0.6621751684311838, "grad_norm": 0.3546752333641052, "kl": 0.302001953125, "learning_rate": 3.5281881345462994e-07, "loss": 0.0325, "reward": 0.15000000223517418, "reward_std": 0.18456309288740158, "rewards/accuracy_multibox_reward": 0.15000000223517418, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 3458.7362060546875, "epoch": 0.6631376323387873, "grad_norm": 0.2200569361448288, "kl": 0.259521484375, "learning_rate": 3.5153230523381574e-07, "loss": 0.021, "reward": 0.06111111305654049, "reward_std": 0.08989017084240913, "rewards/accuracy_multibox_reward": 0.06111111305654049, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 3729.0834350585938, "epoch": 0.6641000962463908, "grad_norm": 0.2077282965183258, "kl": 0.3193359375, "learning_rate": 3.502478077482849e-07, "loss": 0.0262, "reward": 0.13611111603677273, "reward_std": 0.15077951923012733, "rewards/accuracy_multibox_reward": 0.13611111603677273, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 3959.8472900390625, "epoch": 0.6650625601539942, "grad_norm": 0.4314574897289276, "kl": 0.3203125, "learning_rate": 3.489653340116633e-07, "loss": 0.0376, "reward": 0.23981481976807117, "reward_std": 0.17820755764842033, "rewards/accuracy_multibox_reward": 0.23981481976807117, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 3814.4165649414062, "epoch": 0.6660250240615977, "grad_norm": 0.24665242433547974, "kl": 0.2886962890625, "learning_rate": 3.4768489701707425e-07, "loss": 0.0228, "reward": 0.5055555775761604, "reward_std": 0.15583274513483047, "rewards/accuracy_multibox_reward": 0.5055555775761604, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 4031.40283203125, "epoch": 0.6669874879692012, "grad_norm": 0.22178517282009125, "kl": 0.2861328125, "learning_rate": 3.4640650973700615e-07, "loss": 0.0307, "reward": 0.08055555820465088, "reward_std": 0.10372589156031609, "rewards/accuracy_multibox_reward": 0.08055555820465088, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 3598.0277709960938, "epoch": 0.6679499518768046, "grad_norm": 0.4568510055541992, "kl": 0.264892578125, "learning_rate": 3.451301851231806e-07, "loss": 0.047, "reward": 0.3166666738688946, "reward_std": 0.2142295055091381, "rewards/accuracy_multibox_reward": 0.3166666738688946, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 3965.5277099609375, "epoch": 0.6689124157844081, "grad_norm": 0.2905183732509613, "kl": 0.3193359375, "learning_rate": 3.4385593610642226e-07, "loss": 0.0163, "reward": 0.13611111044883728, "reward_std": 0.06705055385828018, "rewards/accuracy_multibox_reward": 0.13611111044883728, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 3898.4862060546875, "epoch": 0.6698748796920115, "grad_norm": 0.19660653173923492, "kl": 0.255859375, "learning_rate": 3.425837755965274e-07, "loss": 0.0162, "reward": 0.01666666753590107, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.01666666753590107, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 3794.0834350585938, "epoch": 0.670837343599615, "grad_norm": 0.25608956813812256, "kl": 0.358154296875, "learning_rate": 3.413137164821325e-07, "loss": 0.0239, "reward": 0.2888888902962208, "reward_std": 0.24344339221715927, "rewards/accuracy_multibox_reward": 0.2888888902962208, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 3739.1945190429688, "epoch": 0.6717998075072185, "grad_norm": 0.2835664451122284, "kl": 0.259765625, "learning_rate": 3.400457716305848e-07, "loss": 0.0124, "reward": 0.16111110523343086, "reward_std": 0.067263288423419, "rewards/accuracy_multibox_reward": 0.16111110523343086, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 3865.5416870117188, "epoch": 0.6727622714148219, "grad_norm": 0.40369707345962524, "kl": 0.33642578125, "learning_rate": 3.3877995388781045e-07, "loss": 0.0403, "reward": 0.20833333302289248, "reward_std": 0.2826799303293228, "rewards/accuracy_multibox_reward": 0.20833333302289248, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 3955.0833740234375, "epoch": 0.6737247353224254, "grad_norm": 0.34322983026504517, "kl": 0.27197265625, "learning_rate": 3.37516276078186e-07, "loss": 0.0298, "reward": 0.06944444589316845, "reward_std": 0.09370484948158264, "rewards/accuracy_multibox_reward": 0.06944444589316845, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 3642.9583740234375, "epoch": 0.6746871992300288, "grad_norm": 0.20746324956417084, "kl": 0.27001953125, "learning_rate": 3.3625475100440736e-07, "loss": 0.031, "reward": 0.13611111417412758, "reward_std": 0.19279820099473, "rewards/accuracy_multibox_reward": 0.13611111417412758, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 4052.2222290039062, "epoch": 0.6756496631376323, "grad_norm": 0.4129761755466461, "kl": 0.244873046875, "learning_rate": 3.349953914473608e-07, "loss": 0.0063, "reward": 0.013888888992369175, "reward_std": 0.03402068838477135, "rewards/accuracy_multibox_reward": 0.013888888992369175, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 4103.319396972656, "epoch": 0.6766121270452358, "grad_norm": 0.3208429217338562, "kl": 0.289306640625, "learning_rate": 3.337382101659923e-07, "loss": 0.0192, "reward": 0.08333333674818277, "reward_std": 0.1418512798845768, "rewards/accuracy_multibox_reward": 0.08333333674818277, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 3925.1527709960938, "epoch": 0.6775745909528392, "grad_norm": 0.5495811104774475, "kl": 0.248779296875, "learning_rate": 3.3248321989717995e-07, "loss": -0.0007, "reward": 0.06388888973742723, "reward_std": 0.12423219904303551, "rewards/accuracy_multibox_reward": 0.06388888973742723, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 3910.0833740234375, "epoch": 0.6785370548604427, "grad_norm": 0.28982505202293396, "kl": 0.263671875, "learning_rate": 3.312304333556036e-07, "loss": 0.0118, "reward": 0.09999999683350325, "reward_std": 0.16596302762627602, "rewards/accuracy_multibox_reward": 0.09999999683350325, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 3768.569580078125, "epoch": 0.6794995187680462, "grad_norm": 0.2630541920661926, "kl": 0.2822265625, "learning_rate": 3.2997986323361627e-07, "loss": 0.0286, "reward": 0.17777778394520283, "reward_std": 0.08874451369047165, "rewards/accuracy_multibox_reward": 0.17777778394520283, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 3955.5693969726562, "epoch": 0.6804619826756496, "grad_norm": 0.6013598442077637, "kl": 0.271728515625, "learning_rate": 3.287315222011165e-07, "loss": 0.039, "reward": 0.3194444328546524, "reward_std": 0.2611636854708195, "rewards/accuracy_multibox_reward": 0.3194444328546524, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 3679.5972290039062, "epoch": 0.6814244465832531, "grad_norm": 0.18735507130622864, "kl": 0.23583984375, "learning_rate": 3.274854229054186e-07, "loss": 0.0185, "reward": 0.0694444477558136, "reward_std": 0.05417735502123833, "rewards/accuracy_multibox_reward": 0.0694444477558136, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 3928.666748046875, "epoch": 0.6823869104908566, "grad_norm": 0.27725911140441895, "kl": 0.2603759765625, "learning_rate": 3.262415779711253e-07, "loss": 0.0222, "reward": 0.18750000419095159, "reward_std": 0.16072114743292332, "rewards/accuracy_multibox_reward": 0.18750000419095159, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 3551.2222290039062, "epoch": 0.6833493743984601, "grad_norm": 0.34686797857284546, "kl": 0.15869140625, "learning_rate": 3.250000000000001e-07, "loss": 0.0387, "reward": 0.3750000223517418, "reward_std": 0.16881208075210452, "rewards/accuracy_multibox_reward": 0.3750000223517418, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 3725.5554809570312, "epoch": 0.6843118383060636, "grad_norm": 0.40388748049736023, "kl": 0.2218017578125, "learning_rate": 3.2376070157083857e-07, "loss": 0.034, "reward": 0.30935388803482056, "reward_std": 0.13160740584135056, "rewards/accuracy_multibox_reward": 0.30935388803482056, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 3553.15283203125, "epoch": 0.685274302213667, "grad_norm": 0.24716556072235107, "kl": 0.17724609375, "learning_rate": 3.225236952393422e-07, "loss": 0.004, "reward": 0.23611110541969538, "reward_std": 0.08845378551632166, "rewards/accuracy_multibox_reward": 0.23611110541969538, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 3946.6806640625, "epoch": 0.6862367661212705, "grad_norm": 0.2206210494041443, "kl": 0.2257080078125, "learning_rate": 3.212889935379902e-07, "loss": 0.0119, "reward": 0.1305555570870638, "reward_std": 0.058169892989099026, "rewards/accuracy_multibox_reward": 0.1305555570870638, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 4229.666687011719, "epoch": 0.687199230028874, "grad_norm": 0.2505021095275879, "kl": 0.23193359375, "learning_rate": 3.2005660897591314e-07, "loss": 0.0135, "reward": 0.05277778208255768, "reward_std": 0.05813458189368248, "rewards/accuracy_multibox_reward": 0.05277778208255768, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 3991.7638549804688, "epoch": 0.6881616939364774, "grad_norm": 0.323823481798172, "kl": 0.24267578125, "learning_rate": 3.1882655403876547e-07, "loss": 0.026, "reward": 0.2083333320915699, "reward_std": 0.17673566192388535, "rewards/accuracy_multibox_reward": 0.2083333320915699, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 4000.0416870117188, "epoch": 0.6891241578440809, "grad_norm": 0.6307852864265442, "kl": 0.181396484375, "learning_rate": 3.1759884118860005e-07, "loss": 0.0546, "reward": 0.23333332128822803, "reward_std": 0.1670653186738491, "rewards/accuracy_multibox_reward": 0.23333332128822803, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 3881.6251220703125, "epoch": 0.6900866217516843, "grad_norm": 0.39660483598709106, "kl": 0.177490234375, "learning_rate": 3.1637348286374125e-07, "loss": 0.0394, "reward": 0.1333333384245634, "reward_std": 0.14830171316862106, "rewards/accuracy_multibox_reward": 0.1333333384245634, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 4055.15283203125, "epoch": 0.6910490856592878, "grad_norm": 0.609953761100769, "kl": 0.258056640625, "learning_rate": 3.151504914786586e-07, "loss": 0.053, "reward": 0.09722222480922937, "reward_std": 0.17918993532657623, "rewards/accuracy_multibox_reward": 0.09722222480922937, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 3862.5556030273438, "epoch": 0.6920115495668913, "grad_norm": 0.18857114017009735, "kl": 0.1884765625, "learning_rate": 3.139298794238422e-07, "loss": 0.0121, "reward": 0.07500000111758709, "reward_std": 0.12927862256765366, "rewards/accuracy_multibox_reward": 0.07500000111758709, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 4187.8194580078125, "epoch": 0.6929740134744947, "grad_norm": 0.3680835962295532, "kl": 0.255615234375, "learning_rate": 3.1271165906567574e-07, "loss": 0.0441, "reward": 0.25833334028720856, "reward_std": 0.195224367082119, "rewards/accuracy_multibox_reward": 0.25833334028720856, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 3949.47216796875, "epoch": 0.6939364773820982, "grad_norm": 0.3153010308742523, "kl": 0.214599609375, "learning_rate": 3.114958427463125e-07, "loss": 0.0235, "reward": 0.27222222089767456, "reward_std": 0.2540128827095032, "rewards/accuracy_multibox_reward": 0.27222222089767456, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 3790.541748046875, "epoch": 0.6948989412897016, "grad_norm": 0.30683475732803345, "kl": 0.186279296875, "learning_rate": 3.102824427835494e-07, "loss": 0.0279, "reward": 0.10555556416511536, "reward_std": 0.11036816239356995, "rewards/accuracy_multibox_reward": 0.10555556416511536, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 3596.888916015625, "epoch": 0.6958614051973051, "grad_norm": 0.1727837175130844, "kl": 0.16064453125, "learning_rate": 3.0907147147070234e-07, "loss": 0.0121, "reward": 0.2361111156642437, "reward_std": 0.058443918358534575, "rewards/accuracy_multibox_reward": 0.2361111156642437, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 3739.8056030273438, "epoch": 0.6968238691049086, "grad_norm": 0.3520987033843994, "kl": 0.24658203125, "learning_rate": 3.0786294107648234e-07, "loss": 0.0208, "reward": 0.19930554926395416, "reward_std": 0.09284630045294762, "rewards/accuracy_multibox_reward": 0.19930554926395416, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 3683.388916015625, "epoch": 0.697786333012512, "grad_norm": 0.48412322998046875, "kl": 0.22412109375, "learning_rate": 3.0665686384487044e-07, "loss": 0.0474, "reward": 0.1750000026077032, "reward_std": 0.13637348264455795, "rewards/accuracy_multibox_reward": 0.1750000026077032, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 4047.3195190429688, "epoch": 0.6987487969201155, "grad_norm": 0.35670384764671326, "kl": 0.26953125, "learning_rate": 3.054532519949941e-07, "loss": 0.0096, "reward": 0.03055555559694767, "reward_std": 0.047628965228796005, "rewards/accuracy_multibox_reward": 0.03055555559694767, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 4119.527893066406, "epoch": 0.699711260827719, "grad_norm": 0.30320310592651367, "kl": 0.274658203125, "learning_rate": 3.04252117721003e-07, "loss": 0.0157, "reward": 0.10000000149011612, "reward_std": 0.010540921241044998, "rewards/accuracy_multibox_reward": 0.10000000149011612, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 3604.4444580078125, "epoch": 0.7006737247353224, "grad_norm": 0.1997496485710144, "kl": 0.186767578125, "learning_rate": 3.030534731919461e-07, "loss": 0.0044, "reward": 0.04907407611608505, "reward_std": 0.10816721618175507, "rewards/accuracy_multibox_reward": 0.04907407611608505, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 4062.3750610351562, "epoch": 0.7016361886429259, "grad_norm": 0.25820231437683105, "kl": 0.24072265625, "learning_rate": 3.0185733055164776e-07, "loss": 0.0405, "reward": 0.09444444626569748, "reward_std": 0.0995594672858715, "rewards/accuracy_multibox_reward": 0.09444444626569748, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 3739.263916015625, "epoch": 0.7025986525505293, "grad_norm": 0.22249667346477509, "kl": 0.274658203125, "learning_rate": 3.006637019185852e-07, "loss": 0.0151, "reward": 0.19722223281860352, "reward_std": 0.08979267627000809, "rewards/accuracy_multibox_reward": 0.19722223281860352, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 3797.0416259765625, "epoch": 0.7035611164581328, "grad_norm": 0.26554879546165466, "kl": 0.2509765625, "learning_rate": 2.9947259938576535e-07, "loss": 0.0381, "reward": 0.07777777966111898, "reward_std": 0.13328944519162178, "rewards/accuracy_multibox_reward": 0.07777777966111898, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 3819.6389770507812, "epoch": 0.7045235803657363, "grad_norm": 0.25098928809165955, "kl": 0.2452392578125, "learning_rate": 2.982840350206023e-07, "loss": 0.0067, "reward": 0.05674603255465627, "reward_std": 0.06785477977246046, "rewards/accuracy_multibox_reward": 0.05674603255465627, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 3990.3611450195312, "epoch": 0.7054860442733397, "grad_norm": 0.5062599778175354, "kl": 0.2464599609375, "learning_rate": 2.9709802086479533e-07, "loss": 0.0396, "reward": 0.21543210931122303, "reward_std": 0.23596147820353508, "rewards/accuracy_multibox_reward": 0.21543210931122303, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 3787.8333129882812, "epoch": 0.7064485081809432, "grad_norm": 0.7298231720924377, "kl": 0.224609375, "learning_rate": 2.959145689342072e-07, "loss": 0.0564, "reward": 0.24444443732500076, "reward_std": 0.19198717176914215, "rewards/accuracy_multibox_reward": 0.24444443732500076, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 4043.15283203125, "epoch": 0.7074109720885466, "grad_norm": 0.41299310326576233, "kl": 0.255615234375, "learning_rate": 2.94733691218741e-07, "loss": 0.0533, "reward": 0.15000000223517418, "reward_std": 0.17922843992710114, "rewards/accuracy_multibox_reward": 0.15000000223517418, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 3769.0, "epoch": 0.7083734359961501, "grad_norm": 0.3718836009502411, "kl": 0.1746826171875, "learning_rate": 2.9355539968222117e-07, "loss": 0.038, "reward": 0.23333333432674408, "reward_std": 0.14886673167347908, "rewards/accuracy_multibox_reward": 0.23333333432674408, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 3786.263916015625, "epoch": 0.7093358999037536, "grad_norm": 0.2398822009563446, "kl": 0.26123046875, "learning_rate": 2.9237970626226916e-07, "loss": 0.0342, "reward": 0.10000000521540642, "reward_std": 0.10327956825494766, "rewards/accuracy_multibox_reward": 0.10000000521540642, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 4246.680603027344, "epoch": 0.710298363811357, "grad_norm": 0.24324241280555725, "kl": 0.32470703125, "learning_rate": 2.9120662287018527e-07, "loss": 0.0341, "reward": 0.06388889066874981, "reward_std": 0.09926875308156013, "rewards/accuracy_multibox_reward": 0.06388889066874981, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 3788.6805419921875, "epoch": 0.7112608277189605, "grad_norm": 0.2493879795074463, "kl": 0.27099609375, "learning_rate": 2.9003616139082663e-07, "loss": 0.0123, "reward": 0.044444444589316845, "reward_std": 0.08164965733885765, "rewards/accuracy_multibox_reward": 0.044444444589316845, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 4143.958312988281, "epoch": 0.7122232916265641, "grad_norm": 0.19001366198062897, "kl": 0.2626953125, "learning_rate": 2.8886833368248653e-07, "loss": 0.0144, "reward": 0.05873842630535364, "reward_std": 0.08252592757344246, "rewards/accuracy_multibox_reward": 0.05873842630535364, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 3754.4723510742188, "epoch": 0.7131857555341675, "grad_norm": 0.2311333566904068, "kl": 0.18328857421875, "learning_rate": 2.87703151576775e-07, "loss": 0.022, "reward": 0.03611110895872116, "reward_std": 0.056190814822912216, "rewards/accuracy_multibox_reward": 0.03611110895872116, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 3650.638916015625, "epoch": 0.714148219441771, "grad_norm": 0.3940192759037018, "kl": 0.26611328125, "learning_rate": 2.865406268784991e-07, "loss": 0.0307, "reward": 0.20555555913597345, "reward_std": 0.17522955313324928, "rewards/accuracy_multibox_reward": 0.20555555913597345, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 3753.5139770507812, "epoch": 0.7151106833493744, "grad_norm": 0.38743409514427185, "kl": 0.328857421875, "learning_rate": 2.853807713655423e-07, "loss": 0.0167, "reward": 0.08333333767950535, "reward_std": 0.09352945536375046, "rewards/accuracy_multibox_reward": 0.08333333767950535, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 4006.0556640625, "epoch": 0.7160731472569779, "grad_norm": 0.2972193956375122, "kl": 0.29638671875, "learning_rate": 2.8422359678874597e-07, "loss": 0.0134, "reward": 0.19583333563059568, "reward_std": 0.2810147739946842, "rewards/accuracy_multibox_reward": 0.19583333563059568, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 3473.1527709960938, "epoch": 0.7170356111645814, "grad_norm": 0.33019182085990906, "kl": 0.2091064453125, "learning_rate": 2.830691148717902e-07, "loss": -0.0118, "reward": 0.325000018812716, "reward_std": 0.159113559871912, "rewards/accuracy_multibox_reward": 0.325000018812716, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 3696.3888549804688, "epoch": 0.7179980750721848, "grad_norm": 0.28523820638656616, "kl": 0.26220703125, "learning_rate": 2.81917337311075e-07, "loss": 0.0305, "reward": 0.1749999914318323, "reward_std": 0.135981697589159, "rewards/accuracy_multibox_reward": 0.1749999914318323, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 3692.6806640625, "epoch": 0.7189605389797883, "grad_norm": 0.595460832118988, "kl": 0.2451171875, "learning_rate": 2.8076827577560116e-07, "loss": 0.0528, "reward": 0.1749999988824129, "reward_std": 0.1445154957473278, "rewards/accuracy_multibox_reward": 0.1749999988824129, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 3800.4306030273438, "epoch": 0.7199230028873917, "grad_norm": 0.41025403141975403, "kl": 0.2822265625, "learning_rate": 2.7962194190685376e-07, "loss": 0.0442, "reward": 0.16944444924592972, "reward_std": 0.152824517339468, "rewards/accuracy_multibox_reward": 0.16944444924592972, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 3864.5694580078125, "epoch": 0.7208854667949952, "grad_norm": 0.6564794182777405, "kl": 0.271728515625, "learning_rate": 2.7847834731868193e-07, "loss": 0.0623, "reward": 0.3416666779667139, "reward_std": 0.3496321365237236, "rewards/accuracy_multibox_reward": 0.3416666779667139, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 3930.486083984375, "epoch": 0.7218479307025987, "grad_norm": 0.4742569327354431, "kl": 0.2294921875, "learning_rate": 2.773375035971829e-07, "loss": 0.0427, "reward": 0.37500003166496754, "reward_std": 0.19291163235902786, "rewards/accuracy_multibox_reward": 0.37500003166496754, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 4087.2362060546875, "epoch": 0.7228103946102021, "grad_norm": 0.3424828350543976, "kl": 0.38525390625, "learning_rate": 2.761994223005841e-07, "loss": 0.0482, "reward": 0.12222221866250038, "reward_std": 0.1112096980214119, "rewards/accuracy_multibox_reward": 0.12222221866250038, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 3786.2083740234375, "epoch": 0.7237728585178056, "grad_norm": 0.2562057673931122, "kl": 0.26513671875, "learning_rate": 2.750641149591256e-07, "loss": 0.0178, "reward": 0.30525362491607666, "reward_std": 0.054821706376969814, "rewards/accuracy_multibox_reward": 0.30525362491607666, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 3960.111083984375, "epoch": 0.7247353224254091, "grad_norm": 0.28142768144607544, "kl": 0.358154296875, "learning_rate": 2.739315930749442e-07, "loss": 0.0393, "reward": 0.23055555671453476, "reward_std": 0.19406402856111526, "rewards/accuracy_multibox_reward": 0.23055555671453476, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 3444.8056030273438, "epoch": 0.7256977863330125, "grad_norm": 0.30529800057411194, "kl": 0.2236328125, "learning_rate": 2.728018681219559e-07, "loss": 0.0312, "reward": 0.15555556677281857, "reward_std": 0.10461794957518578, "rewards/accuracy_multibox_reward": 0.15555556677281857, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 3754.1527709960938, "epoch": 0.726660250240616, "grad_norm": 0.6951242685317993, "kl": 0.33447265625, "learning_rate": 2.7167495154574085e-07, "loss": -0.0011, "reward": 0.15555556118488312, "reward_std": 0.061311256140470505, "rewards/accuracy_multibox_reward": 0.15555556118488312, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 3988.2361450195312, "epoch": 0.7276227141482194, "grad_norm": 0.4391360580921173, "kl": 0.3193359375, "learning_rate": 2.7055085476342574e-07, "loss": 0.0137, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 3853.625, "epoch": 0.7285851780558229, "grad_norm": 0.31256696581840515, "kl": 0.375, "learning_rate": 2.6942958916356994e-07, "loss": 0.0443, "reward": 0.28055556304752827, "reward_std": 0.1139418724924326, "rewards/accuracy_multibox_reward": 0.28055556304752827, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 3914.6666870117188, "epoch": 0.7295476419634264, "grad_norm": 0.46993640065193176, "kl": 0.322021484375, "learning_rate": 2.683111661060489e-07, "loss": 0.0484, "reward": 0.17500001192092896, "reward_std": 0.11981615796685219, "rewards/accuracy_multibox_reward": 0.17500001192092896, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 3973.944580078125, "epoch": 0.7305101058710298, "grad_norm": 0.32694894075393677, "kl": 0.296875, "learning_rate": 2.671955969219395e-07, "loss": 0.0293, "reward": 0.2031986527144909, "reward_std": 0.1838572509586811, "rewards/accuracy_multibox_reward": 0.2031986527144909, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 3704.9862060546875, "epoch": 0.7314725697786333, "grad_norm": 0.46089595556259155, "kl": 0.29345703125, "learning_rate": 2.6608289291340525e-07, "loss": -0.0007, "reward": 0.24444444850087166, "reward_std": 0.07492489088326693, "rewards/accuracy_multibox_reward": 0.24444444850087166, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 3736.5416870117188, "epoch": 0.7324350336862367, "grad_norm": 0.4101853370666504, "kl": 0.247802734375, "learning_rate": 2.6497306535358133e-07, "loss": 0.0083, "reward": 0.06666667014360428, "reward_std": 0.09215527027845383, "rewards/accuracy_multibox_reward": 0.06666667014360428, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 3598.0139770507812, "epoch": 0.7333974975938402, "grad_norm": 0.25594067573547363, "kl": 0.272216796875, "learning_rate": 2.6386612548646126e-07, "loss": 0.0378, "reward": 0.3361111208796501, "reward_std": 0.22332081850618124, "rewards/accuracy_multibox_reward": 0.3361111208796501, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 3375.25, "epoch": 0.7343599615014437, "grad_norm": 0.4724268317222595, "kl": 0.255126953125, "learning_rate": 2.627620845267824e-07, "loss": 0.0392, "reward": 0.31388889625668526, "reward_std": 0.21276536583900452, "rewards/accuracy_multibox_reward": 0.31388889625668526, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 4206.916687011719, "epoch": 0.7353224254090471, "grad_norm": 0.6205666661262512, "kl": 0.3662109375, "learning_rate": 2.616609536599119e-07, "loss": 0.0039, "reward": 0.1111111044883728, "reward_std": 0.008606628514826298, "rewards/accuracy_multibox_reward": 0.1111111044883728, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 3675.7500610351562, "epoch": 0.7362848893166506, "grad_norm": 0.7920295000076294, "kl": 0.33984375, "learning_rate": 2.6056274404173464e-07, "loss": 0.0867, "reward": 0.302777785807848, "reward_std": 0.24296868965029716, "rewards/accuracy_multibox_reward": 0.302777785807848, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 3654.40283203125, "epoch": 0.737247353224254, "grad_norm": 0.18745167553424835, "kl": 0.3056640625, "learning_rate": 2.5946746679853893e-07, "loss": 0.039, "reward": 0.3250000197440386, "reward_std": 0.0562355974689126, "rewards/accuracy_multibox_reward": 0.3250000197440386, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 3786.375, "epoch": 0.7382098171318575, "grad_norm": 0.32861021161079407, "kl": 0.3115234375, "learning_rate": 2.5837513302690456e-07, "loss": 0.0183, "reward": 0.22222222574055195, "reward_std": 0.04943145625293255, "rewards/accuracy_multibox_reward": 0.22222222574055195, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 3776.111083984375, "epoch": 0.739172281039461, "grad_norm": 0.278815895318985, "kl": 0.248779296875, "learning_rate": 2.572857537935902e-07, "loss": 0.0145, "reward": 0.28333333879709244, "reward_std": 0.17784330621361732, "rewards/accuracy_multibox_reward": 0.28333333879709244, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 3469.8333740234375, "epoch": 0.7401347449470644, "grad_norm": 0.3224775493144989, "kl": 0.293212890625, "learning_rate": 2.5619934013542084e-07, "loss": 0.0215, "reward": 0.11666666902601719, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.11666666902601719, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 3989.9999389648438, "epoch": 0.7410972088546679, "grad_norm": 0.5526517629623413, "kl": 0.35205078125, "learning_rate": 2.5511590305917667e-07, "loss": 0.007, "reward": 0.10833333432674408, "reward_std": 0.04262732341885567, "rewards/accuracy_multibox_reward": 0.10833333432674408, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 3708.0416259765625, "epoch": 0.7420596727622714, "grad_norm": 0.2507631182670593, "kl": 0.276123046875, "learning_rate": 2.540354535414812e-07, "loss": 0.0488, "reward": 0.28981480933725834, "reward_std": 0.14766887947916985, "rewards/accuracy_multibox_reward": 0.28981480933725834, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 3959.5972900390625, "epoch": 0.7430221366698749, "grad_norm": 1.1509993076324463, "kl": 0.29541015625, "learning_rate": 2.5295800252869013e-07, "loss": 0.0939, "reward": 0.28611112013459206, "reward_std": 0.272820308804512, "rewards/accuracy_multibox_reward": 0.28611112013459206, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 3821.638916015625, "epoch": 0.7439846005774784, "grad_norm": 0.47117507457733154, "kl": 0.321044921875, "learning_rate": 2.5188356093678e-07, "loss": 0.0045, "reward": 0.1938368109986186, "reward_std": 0.14734359504655004, "rewards/accuracy_multibox_reward": 0.1938368109986186, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 3848.6250610351562, "epoch": 0.7449470644850819, "grad_norm": 0.28501561284065247, "kl": 0.2562255859375, "learning_rate": 2.508121396512389e-07, "loss": 0.0115, "reward": 0.07777778152376413, "reward_std": 0.13328944519162178, "rewards/accuracy_multibox_reward": 0.07777778152376413, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 3821.1944580078125, "epoch": 0.7459095283926853, "grad_norm": 0.36888453364372253, "kl": 0.279541015625, "learning_rate": 2.497437495269544e-07, "loss": 0.0431, "reward": 0.1833333382382989, "reward_std": 0.25108223408460617, "rewards/accuracy_multibox_reward": 0.1833333382382989, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 3298.666748046875, "epoch": 0.7468719923002888, "grad_norm": 0.43211060762405396, "kl": 0.190185546875, "learning_rate": 2.4867840138810456e-07, "loss": 0.0428, "reward": 0.14444445073604584, "reward_std": 0.09060795605182648, "rewards/accuracy_multibox_reward": 0.14444445073604584, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 3832.27783203125, "epoch": 0.7478344562078922, "grad_norm": 0.2708978056907654, "kl": 0.29443359375, "learning_rate": 2.47616106028049e-07, "loss": 0.038, "reward": 0.30000000447034836, "reward_std": 0.16733022686094046, "rewards/accuracy_multibox_reward": 0.30000000447034836, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 4054.5972900390625, "epoch": 0.7487969201154957, "grad_norm": 0.3202114999294281, "kl": 0.27587890625, "learning_rate": 2.465568742092178e-07, "loss": 0.0138, "reward": 0.0694444477558136, "reward_std": 0.05519326403737068, "rewards/accuracy_multibox_reward": 0.0694444477558136, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 3704.0556030273438, "epoch": 0.7497593840230992, "grad_norm": 0.2416752576828003, "kl": 0.2666015625, "learning_rate": 2.455007166630039e-07, "loss": 0.0083, "reward": 0.1944444440305233, "reward_std": 0.11795268580317497, "rewards/accuracy_multibox_reward": 0.1944444440305233, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 3943.638916015625, "epoch": 0.7507218479307026, "grad_norm": 0.24068506062030792, "kl": 0.302490234375, "learning_rate": 2.444476440896538e-07, "loss": 0.0276, "reward": 0.22222222574055195, "reward_std": 0.20350589603185654, "rewards/accuracy_multibox_reward": 0.22222222574055195, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 3826.6250610351562, "epoch": 0.7516843118383061, "grad_norm": 0.25055021047592163, "kl": 0.33984375, "learning_rate": 2.43397667158159e-07, "loss": 0.0226, "reward": 0.2222222276031971, "reward_std": 0.06757263466715813, "rewards/accuracy_multibox_reward": 0.2222222276031971, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 4027.5972900390625, "epoch": 0.7526467757459095, "grad_norm": 0.6667999029159546, "kl": 0.3564453125, "learning_rate": 2.4235079650614827e-07, "loss": 0.0314, "reward": 0.13633879274129868, "reward_std": 0.14619679376482964, "rewards/accuracy_multibox_reward": 0.13633879274129868, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 4123.2222900390625, "epoch": 0.753609239653513, "grad_norm": 0.4054693579673767, "kl": 0.385986328125, "learning_rate": 2.4130704273977983e-07, "loss": 0.0222, "reward": 0.12777778785675764, "reward_std": 0.15569349750876427, "rewards/accuracy_multibox_reward": 0.12777778785675764, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 3961.916748046875, "epoch": 0.7545717035611165, "grad_norm": 0.29372578859329224, "kl": 0.303466796875, "learning_rate": 2.402664164336339e-07, "loss": 0.0357, "reward": 0.11984127387404442, "reward_std": 0.12337626703083515, "rewards/accuracy_multibox_reward": 0.11984127387404442, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 3993.8333740234375, "epoch": 0.7555341674687199, "grad_norm": 0.4740871787071228, "kl": 0.3603515625, "learning_rate": 2.392289281306049e-07, "loss": 0.0244, "reward": 0.17135599628090858, "reward_std": 0.14752696827054024, "rewards/accuracy_multibox_reward": 0.17135599628090858, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 3859.138916015625, "epoch": 0.7564966313763234, "grad_norm": 0.19913062453269958, "kl": 0.253662109375, "learning_rate": 2.3819458834179583e-07, "loss": 0.0182, "reward": 0.15833334252238274, "reward_std": 0.1020620740018785, "rewards/accuracy_multibox_reward": 0.15833334252238274, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 4024.27783203125, "epoch": 0.7574590952839269, "grad_norm": 0.19603808224201202, "kl": 0.30859375, "learning_rate": 2.3716340754641099e-07, "loss": 0.0429, "reward": 0.17777778208255768, "reward_std": 0.16557767987251282, "rewards/accuracy_multibox_reward": 0.17777778208255768, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 3809.041748046875, "epoch": 0.7584215591915303, "grad_norm": 0.3426586091518402, "kl": 0.2578125, "learning_rate": 2.361353961916498e-07, "loss": 0.0136, "reward": 0.04722222313284874, "reward_std": 0.08845380693674088, "rewards/accuracy_multibox_reward": 0.04722222313284874, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 4023.2084350585938, "epoch": 0.7593840230991338, "grad_norm": 0.2815620005130768, "kl": 0.270751953125, "learning_rate": 2.3511056469260142e-07, "loss": 0.0207, "reward": 0.11944444477558136, "reward_std": 0.09701564535498619, "rewards/accuracy_multibox_reward": 0.11944444477558136, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 3651.263916015625, "epoch": 0.7603464870067372, "grad_norm": 0.21571071445941925, "kl": 0.232421875, "learning_rate": 2.3408892343213849e-07, "loss": 0.0218, "reward": 0.3222222179174423, "reward_std": 0.12682723114266992, "rewards/accuracy_multibox_reward": 0.3222222179174423, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 3688.041748046875, "epoch": 0.7613089509143407, "grad_norm": 0.2537418007850647, "kl": 0.265625, "learning_rate": 2.3307048276081284e-07, "loss": 0.0277, "reward": 0.22777778655290604, "reward_std": 0.11269130278378725, "rewards/accuracy_multibox_reward": 0.22777778655290604, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 3420.625, "epoch": 0.7622714148219442, "grad_norm": 0.169183149933815, "kl": 0.17919921875, "learning_rate": 2.3205525299675016e-07, "loss": 0.0107, "reward": 0.18333333544433117, "reward_std": 0.1577126607298851, "rewards/accuracy_multibox_reward": 0.18333333544433117, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 3690.2500610351562, "epoch": 0.7632338787295476, "grad_norm": 0.32716259360313416, "kl": 0.198486328125, "learning_rate": 2.3104324442554506e-07, "loss": 0.0068, "reward": 0.013888888992369175, "reward_std": 0.034020692110061646, "rewards/accuracy_multibox_reward": 0.013888888992369175, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 3883.3334350585938, "epoch": 0.7641963426371511, "grad_norm": 0.2401774525642395, "kl": 0.32958984375, "learning_rate": 2.300344673001577e-07, "loss": 0.0316, "reward": 0.15555555745959282, "reward_std": 0.14498375356197357, "rewards/accuracy_multibox_reward": 0.15555555745959282, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 3791.5416870117188, "epoch": 0.7651588065447545, "grad_norm": 0.78798508644104, "kl": 0.25048828125, "learning_rate": 2.2902893184080931e-07, "loss": 0.0622, "reward": 0.2527777813374996, "reward_std": 0.19142402336001396, "rewards/accuracy_multibox_reward": 0.2527777813374996, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 3748.25, "epoch": 0.766121270452358, "grad_norm": 0.2752157151699066, "kl": 0.2252197265625, "learning_rate": 2.2802664823487896e-07, "loss": 0.0112, "reward": 0.2805555611848831, "reward_std": 0.11833405820652843, "rewards/accuracy_multibox_reward": 0.2805555611848831, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 3721.194580078125, "epoch": 0.7670837343599615, "grad_norm": 0.20432044565677643, "kl": 0.21533203125, "learning_rate": 2.2702762663680033e-07, "loss": 0.0248, "reward": 0.11111111380159855, "reward_std": 0.14944519102573395, "rewards/accuracy_multibox_reward": 0.11111111380159855, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 3738.5693969726562, "epoch": 0.7680461982675649, "grad_norm": 0.2516527473926544, "kl": 0.274169921875, "learning_rate": 2.260318771679582e-07, "loss": 0.0284, "reward": 0.26111109741032124, "reward_std": 0.15369971096515656, "rewards/accuracy_multibox_reward": 0.26111109741032124, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 3434.8333740234375, "epoch": 0.7690086621751684, "grad_norm": 0.34230250120162964, "kl": 0.192138671875, "learning_rate": 2.2503940991658693e-07, "loss": 0.006, "reward": 0.16388890147209167, "reward_std": 0.05844391882419586, "rewards/accuracy_multibox_reward": 0.16388890147209167, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 3716.9583129882812, "epoch": 0.7699711260827719, "grad_norm": 0.21641255915164948, "kl": 0.223388671875, "learning_rate": 2.240502349376677e-07, "loss": 0.0218, "reward": 0.11666666716337204, "reward_std": 0.09621395543217659, "rewards/accuracy_multibox_reward": 0.11666666716337204, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 4140.638916015625, "epoch": 0.7709335899903753, "grad_norm": 0.41825219988822937, "kl": 0.328125, "learning_rate": 2.2306436225282666e-07, "loss": 0.0124, "reward": 0.03333333507180214, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 4214.361083984375, "epoch": 0.7718960538979788, "grad_norm": 0.2792278528213501, "kl": 0.322998046875, "learning_rate": 2.2208180185023302e-07, "loss": 0.0146, "reward": 0.08333333674818277, "reward_std": 0.1418960615992546, "rewards/accuracy_multibox_reward": 0.08333333674818277, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 4094.8472290039062, "epoch": 0.7728585178055823, "grad_norm": 0.1834118366241455, "kl": 0.228759765625, "learning_rate": 2.2110256368449923e-07, "loss": 0.026, "reward": 0.10277778282761574, "reward_std": 0.15059919282794, "rewards/accuracy_multibox_reward": 0.10277778282761574, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 3842.2500610351562, "epoch": 0.7738209817131858, "grad_norm": 0.19599953293800354, "kl": 0.2001953125, "learning_rate": 2.2012665767657823e-07, "loss": 0.0113, "reward": 0.05000000074505806, "reward_std": 0.09246460348367691, "rewards/accuracy_multibox_reward": 0.05000000074505806, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 3990.8055419921875, "epoch": 0.7747834456207893, "grad_norm": 0.17936751246452332, "kl": 0.23828125, "learning_rate": 2.1915409371366411e-07, "loss": 0.0228, "reward": 0.1983796339482069, "reward_std": 0.2026684246957302, "rewards/accuracy_multibox_reward": 0.1983796339482069, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 4093.486083984375, "epoch": 0.7757459095283927, "grad_norm": 0.2244546264410019, "kl": 0.2431640625, "learning_rate": 2.181848816490922e-07, "loss": 0.027, "reward": 0.16944444924592972, "reward_std": 0.16027548909187317, "rewards/accuracy_multibox_reward": 0.16944444924592972, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 4101.388916015625, "epoch": 0.7767083734359962, "grad_norm": 0.2602750360965729, "kl": 0.2489013671875, "learning_rate": 2.1721903130223805e-07, "loss": 0.0232, "reward": 0.15555555932223797, "reward_std": 0.14281125739216805, "rewards/accuracy_multibox_reward": 0.15555555932223797, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 3802.1806640625, "epoch": 0.7776708373435997, "grad_norm": 0.19958814978599548, "kl": 0.17333984375, "learning_rate": 2.162565524584191e-07, "loss": 0.013, "reward": 0.03333333507180214, "reward_std": 0.08164966106414795, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 3629.736083984375, "epoch": 0.7786333012512031, "grad_norm": 0.41378453373908997, "kl": 0.172607421875, "learning_rate": 2.1529745486879513e-07, "loss": 0.0472, "reward": 0.23888887465000153, "reward_std": 0.15675868652760983, "rewards/accuracy_multibox_reward": 0.23888887465000153, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 3692.4306030273438, "epoch": 0.7795957651588066, "grad_norm": 0.2748502194881439, "kl": 0.18212890625, "learning_rate": 2.1434174825026897e-07, "loss": 0.0226, "reward": 0.11111111380159855, "reward_std": 0.08765211328864098, "rewards/accuracy_multibox_reward": 0.11111111380159855, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 3782.02783203125, "epoch": 0.78055822906641, "grad_norm": 0.22261366248130798, "kl": 0.216796875, "learning_rate": 2.1338944228538893e-07, "loss": 0.0122, "reward": 0.03055555559694767, "reward_std": 0.047628965228796005, "rewards/accuracy_multibox_reward": 0.03055555559694767, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 3856.263916015625, "epoch": 0.7815206929740135, "grad_norm": 0.20962795615196228, "kl": 0.208251953125, "learning_rate": 2.124405466222502e-07, "loss": 0.0055, "reward": 0.2583333346992731, "reward_std": 0.15516270697116852, "rewards/accuracy_multibox_reward": 0.2583333346992731, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 3821.0556030273438, "epoch": 0.782483156881617, "grad_norm": 0.2369692474603653, "kl": 0.20849609375, "learning_rate": 2.1149507087439726e-07, "loss": 0.0095, "reward": 0.01944444514811039, "reward_std": 0.047628965228796005, "rewards/accuracy_multibox_reward": 0.01944444514811039, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 3600.0000610351562, "epoch": 0.7834456207892204, "grad_norm": 0.8733448386192322, "kl": 0.18017578125, "learning_rate": 2.105530246207259e-07, "loss": 0.0275, "reward": 0.4555555582046509, "reward_std": 0.1749221794307232, "rewards/accuracy_multibox_reward": 0.4555555582046509, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 3515.2222290039062, "epoch": 0.7844080846968239, "grad_norm": 0.2652549743652344, "kl": 0.167724609375, "learning_rate": 2.0961441740538763e-07, "loss": 0.0273, "reward": 0.2861111108213663, "reward_std": 0.15081237256526947, "rewards/accuracy_multibox_reward": 0.2861111108213663, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 4238.0556640625, "epoch": 0.7853705486044273, "grad_norm": 0.20037966966629028, "kl": 0.2373046875, "learning_rate": 2.086792587376912e-07, "loss": 0.0151, "reward": 0.08472222462296486, "reward_std": 0.17135443165898323, "rewards/accuracy_multibox_reward": 0.08472222462296486, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 3940.9445190429688, "epoch": 0.7863330125120308, "grad_norm": 0.2762700319290161, "kl": 0.2197265625, "learning_rate": 2.077475580920072e-07, "loss": 0.0092, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 3636.3194580078125, "epoch": 0.7872954764196343, "grad_norm": 0.7184471487998962, "kl": 0.1690673828125, "learning_rate": 2.0681932490767262e-07, "loss": 0.0906, "reward": 0.194444440305233, "reward_std": 0.1514606811106205, "rewards/accuracy_multibox_reward": 0.194444440305233, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 3989.0416870117188, "epoch": 0.7882579403272377, "grad_norm": 0.2927410304546356, "kl": 0.243896484375, "learning_rate": 2.058945685888937e-07, "loss": 0.0102, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 3930.0416870117188, "epoch": 0.7892204042348412, "grad_norm": 0.31956663727760315, "kl": 0.213623046875, "learning_rate": 2.0497329850465213e-07, "loss": 0.0343, "reward": 0.17638888768851757, "reward_std": 0.21128249168395996, "rewards/accuracy_multibox_reward": 0.17638888768851757, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 3939.4444580078125, "epoch": 0.7901828681424446, "grad_norm": 0.48809170722961426, "kl": 0.202880859375, "learning_rate": 2.0405552398860937e-07, "loss": 0.0488, "reward": 0.10000000335276127, "reward_std": 0.1472368724644184, "rewards/accuracy_multibox_reward": 0.10000000335276127, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 3823.3194580078125, "epoch": 0.7911453320500481, "grad_norm": 0.18232478201389313, "kl": 0.191162109375, "learning_rate": 2.0314125433901202e-07, "loss": 0.0066, "reward": 0.10000000335276127, "reward_std": 0.13328943401575089, "rewards/accuracy_multibox_reward": 0.10000000335276127, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 4248.1944580078125, "epoch": 0.7921077959576516, "grad_norm": 0.17635326087474823, "kl": 0.294677734375, "learning_rate": 2.0223049881859812e-07, "loss": 0.0294, "reward": 0.02599206380546093, "reward_std": 0.057068031281232834, "rewards/accuracy_multibox_reward": 0.02599206380546093, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 3661.40283203125, "epoch": 0.793070259865255, "grad_norm": 0.43424326181411743, "kl": 0.17138671875, "learning_rate": 2.0132326665450275e-07, "loss": 0.0181, "reward": 0.14444445073604584, "reward_std": 0.12881609052419662, "rewards/accuracy_multibox_reward": 0.14444445073604584, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 3700.0, "epoch": 0.7940327237728585, "grad_norm": 0.2077682614326477, "kl": 0.153076171875, "learning_rate": 2.0041956703816514e-07, "loss": 0.013, "reward": 0.14999999664723873, "reward_std": 0.1341955065727234, "rewards/accuracy_multibox_reward": 0.14999999664723873, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 4019.33349609375, "epoch": 0.794995187680462, "grad_norm": 0.1422128826379776, "kl": 0.272705078125, "learning_rate": 1.9951940912523502e-07, "loss": 0.0212, "reward": 0.2222222164273262, "reward_std": 0.05443310737609863, "rewards/accuracy_multibox_reward": 0.2222222164273262, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 3982.666748046875, "epoch": 0.7959576515880654, "grad_norm": 0.3669414222240448, "kl": 0.158203125, "learning_rate": 1.986228020354802e-07, "loss": 0.0131, "reward": 0.18888889625668526, "reward_std": 0.1125350072979927, "rewards/accuracy_multibox_reward": 0.18888889625668526, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 3879.0556030273438, "epoch": 0.7969201154956689, "grad_norm": 0.2660183906555176, "kl": 0.1993408203125, "learning_rate": 1.9772975485269377e-07, "loss": 0.0264, "reward": 0.2111111218109727, "reward_std": 0.2235187627375126, "rewards/accuracy_multibox_reward": 0.2111111218109727, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 3972.9862060546875, "epoch": 0.7978825794032723, "grad_norm": 0.41529011726379395, "kl": 0.23583984375, "learning_rate": 1.9684027662460257e-07, "loss": -0.0023, "reward": 0.1041666641831398, "reward_std": 0.03061862103641033, "rewards/accuracy_multibox_reward": 0.1041666641831398, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 3809.0694580078125, "epoch": 0.7988450433108758, "grad_norm": 0.34452080726623535, "kl": 0.2109375, "learning_rate": 1.9595437636277534e-07, "loss": 0.0283, "reward": 0.305555559694767, "reward_std": 0.10087906057015061, "rewards/accuracy_multibox_reward": 0.305555559694767, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 3333.3472290039062, "epoch": 0.7998075072184793, "grad_norm": 0.57805335521698, "kl": 0.15447998046875, "learning_rate": 1.950720630425312e-07, "loss": 0.0443, "reward": 0.213888899423182, "reward_std": 0.14009357243776321, "rewards/accuracy_multibox_reward": 0.213888899423182, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 4113.888854980469, "epoch": 0.8007699711260827, "grad_norm": 0.39273205399513245, "kl": 0.216064453125, "learning_rate": 1.9419334560284905e-07, "loss": 0.0216, "reward": 0.34531250927830115, "reward_std": 0.18120475276373327, "rewards/accuracy_multibox_reward": 0.34531250927830115, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 3814.3056640625, "epoch": 0.8017324350336862, "grad_norm": 0.46475183963775635, "kl": 0.16796875, "learning_rate": 1.9331823294627687e-07, "loss": 0.0246, "reward": 0.2805555537343025, "reward_std": 0.06705888919532299, "rewards/accuracy_multibox_reward": 0.2805555537343025, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 3757.3333740234375, "epoch": 0.8026948989412896, "grad_norm": 0.2319643497467041, "kl": 0.15716552734375, "learning_rate": 1.9244673393884104e-07, "loss": 0.0294, "reward": 0.1444444414228201, "reward_std": 0.10607289150357246, "rewards/accuracy_multibox_reward": 0.1444444414228201, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 3756.3195190429688, "epoch": 0.8036573628488932, "grad_norm": 0.8225255012512207, "kl": 0.293212890625, "learning_rate": 1.9157885740995795e-07, "loss": 0.076, "reward": 0.28611111734062433, "reward_std": 0.32891758531332016, "rewards/accuracy_multibox_reward": 0.28611111734062433, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 3503.3333740234375, "epoch": 0.8046198267564967, "grad_norm": 0.2036581039428711, "kl": 0.231689453125, "learning_rate": 1.9071461215234257e-07, "loss": 0.0273, "reward": 0.06388888508081436, "reward_std": 0.09922395646572113, "rewards/accuracy_multibox_reward": 0.06388888508081436, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 3902.5000610351562, "epoch": 0.8055822906641001, "grad_norm": 0.17802253365516663, "kl": 0.2568359375, "learning_rate": 1.89854006921921e-07, "loss": 0.0189, "reward": 0.15000000223517418, "reward_std": 0.13742699846625328, "rewards/accuracy_multibox_reward": 0.15000000223517418, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 4128.625061035156, "epoch": 0.8065447545717036, "grad_norm": 0.28310808539390564, "kl": 0.2646484375, "learning_rate": 1.8899705043774094e-07, "loss": 0.0126, "reward": 0.15833334624767303, "reward_std": 0.06046589836478233, "rewards/accuracy_multibox_reward": 0.15833334624767303, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 3639.6944580078125, "epoch": 0.8075072184793071, "grad_norm": 0.37549152970314026, "kl": 0.1641845703125, "learning_rate": 1.881437513818837e-07, "loss": 0.0307, "reward": 0.17500000447034836, "reward_std": 0.18404622375965118, "rewards/accuracy_multibox_reward": 0.17500000447034836, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 4061.361083984375, "epoch": 0.8084696823869105, "grad_norm": 0.22845321893692017, "kl": 0.226806640625, "learning_rate": 1.8729411839937576e-07, "loss": 0.0101, "reward": 0.13333333656191826, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.13333333656191826, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 4032.0416870117188, "epoch": 0.809432146294514, "grad_norm": 0.4229394495487213, "kl": 0.271728515625, "learning_rate": 1.8644816009810177e-07, "loss": 0.033, "reward": 0.2847222425043583, "reward_std": 0.21231283200904727, "rewards/accuracy_multibox_reward": 0.2847222425043583, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 3929.8333129882812, "epoch": 0.8103946102021174, "grad_norm": 0.33233436942100525, "kl": 0.259521484375, "learning_rate": 1.8560588504871721e-07, "loss": 0.0145, "reward": 0.04444444552063942, "reward_std": 0.04906534031033516, "rewards/accuracy_multibox_reward": 0.04444444552063942, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 3815.1943969726562, "epoch": 0.8113570741097209, "grad_norm": 0.4347555935382843, "kl": 0.282470703125, "learning_rate": 1.8476730178456097e-07, "loss": 0.0507, "reward": 0.21388889476656914, "reward_std": 0.1379275880753994, "rewards/accuracy_multibox_reward": 0.21388889476656914, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 3967.9722900390625, "epoch": 0.8123195380173244, "grad_norm": 0.20236709713935852, "kl": 0.24755859375, "learning_rate": 1.8393241880157006e-07, "loss": 0.0264, "reward": 0.05000000260770321, "reward_std": 0.0924646146595478, "rewards/accuracy_multibox_reward": 0.05000000260770321, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 3599.3611450195312, "epoch": 0.8132820019249278, "grad_norm": 0.24206921458244324, "kl": 0.238037109375, "learning_rate": 1.831012445581923e-07, "loss": 0.0292, "reward": 0.22500000149011612, "reward_std": 0.09855551645159721, "rewards/accuracy_multibox_reward": 0.22500000149011612, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 3978.916748046875, "epoch": 0.8142444658325313, "grad_norm": 0.33771976828575134, "kl": 0.2265625, "learning_rate": 1.8227378747530103e-07, "loss": 0.0134, "reward": 0.13958333618938923, "reward_std": 0.17408283427357674, "rewards/accuracy_multibox_reward": 0.13958333618938923, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 4024.3751220703125, "epoch": 0.8152069297401348, "grad_norm": 0.24038581550121307, "kl": 0.25, "learning_rate": 1.8145005593611076e-07, "loss": 0.0245, "reward": 0.1805555671453476, "reward_std": 0.047628968954086304, "rewards/accuracy_multibox_reward": 0.1805555671453476, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 3669.0138549804688, "epoch": 0.8161693936477382, "grad_norm": 0.6167483925819397, "kl": 0.1982421875, "learning_rate": 1.8063005828609046e-07, "loss": 0.016, "reward": 0.45771604776382446, "reward_std": 0.32632962986826897, "rewards/accuracy_multibox_reward": 0.45771604776382446, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 3770.2361450195312, "epoch": 0.8171318575553417, "grad_norm": 0.15312175452709198, "kl": 0.2061767578125, "learning_rate": 1.798138028328805e-07, "loss": 0.0207, "reward": 0.18611112236976624, "reward_std": 0.05325010418891907, "rewards/accuracy_multibox_reward": 0.18611112236976624, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 3997.4722900390625, "epoch": 0.8180943214629451, "grad_norm": 0.2231951802968979, "kl": 0.32763671875, "learning_rate": 1.7900129784620796e-07, "loss": 0.0245, "reward": 0.03333333507180214, "reward_std": 0.08164966106414795, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 3710.236083984375, "epoch": 0.8190567853705486, "grad_norm": 0.7686970829963684, "kl": 0.266845703125, "learning_rate": 1.7819255155780238e-07, "loss": 0.0674, "reward": 0.22361111268401146, "reward_std": 0.2222573421895504, "rewards/accuracy_multibox_reward": 0.22361111268401146, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 3949.5972900390625, "epoch": 0.8200192492781521, "grad_norm": 0.29913100600242615, "kl": 0.255859375, "learning_rate": 1.7738757216131328e-07, "loss": 0.0318, "reward": 0.09444444719702005, "reward_std": 0.13774657994508743, "rewards/accuracy_multibox_reward": 0.09444444719702005, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 4021.291748046875, "epoch": 0.8209817131857555, "grad_norm": 0.2608039081096649, "kl": 0.2794189453125, "learning_rate": 1.765863678122264e-07, "loss": 0.0122, "reward": 0.047222224064171314, "reward_std": 0.08566047623753548, "rewards/accuracy_multibox_reward": 0.047222224064171314, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 3920.6806030273438, "epoch": 0.821944177093359, "grad_norm": 0.5526895523071289, "kl": 0.246826171875, "learning_rate": 1.757889466277817e-07, "loss": 0.0511, "reward": 0.16944444365799427, "reward_std": 0.24898959696292877, "rewards/accuracy_multibox_reward": 0.16944444365799427, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 3562.1943969726562, "epoch": 0.8229066410009624, "grad_norm": 0.49584734439849854, "kl": 0.20703125, "learning_rate": 1.7499531668689011e-07, "loss": 0.0196, "reward": 0.28611111640930176, "reward_std": 0.20086828246712685, "rewards/accuracy_multibox_reward": 0.28611111640930176, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 3524.986083984375, "epoch": 0.8238691049085659, "grad_norm": 0.24370816349983215, "kl": 0.25146484375, "learning_rate": 1.742054860300532e-07, "loss": 0.0377, "reward": 0.13333333656191826, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.13333333656191826, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 4126.0556640625, "epoch": 0.8248315688161694, "grad_norm": 0.36314627528190613, "kl": 0.34619140625, "learning_rate": 1.7341946265928013e-07, "loss": 0.0495, "reward": 0.11944444105029106, "reward_std": 0.09563052654266357, "rewards/accuracy_multibox_reward": 0.11944444105029106, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 3760.9306030273438, "epoch": 0.8257940327237728, "grad_norm": 0.18969938158988953, "kl": 0.215087890625, "learning_rate": 1.726372545380077e-07, "loss": 0.0177, "reward": 0.1111111044883728, "reward_std": 0.008606628514826298, "rewards/accuracy_multibox_reward": 0.1111111044883728, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 3787.52783203125, "epoch": 0.8267564966313763, "grad_norm": 0.2658759355545044, "kl": 0.2315673828125, "learning_rate": 1.718588695910193e-07, "loss": 0.0045, "reward": 0.18888889253139496, "reward_std": 0.10713772848248482, "rewards/accuracy_multibox_reward": 0.18888889253139496, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 3510.8472290039062, "epoch": 0.8277189605389798, "grad_norm": 0.805979311466217, "kl": 0.232421875, "learning_rate": 1.7108431570436428e-07, "loss": -0.0349, "reward": 0.4000000078231096, "reward_std": 0.16084513813257217, "rewards/accuracy_multibox_reward": 0.4000000078231096, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 3854.90283203125, "epoch": 0.8286814244465832, "grad_norm": 0.31592103838920593, "kl": 0.223388671875, "learning_rate": 1.703136007252786e-07, "loss": 0.035, "reward": 0.16388889588415623, "reward_std": 0.15171364322304726, "rewards/accuracy_multibox_reward": 0.16388889588415623, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 3657.3194580078125, "epoch": 0.8296438883541867, "grad_norm": 0.34158456325531006, "kl": 0.242431640625, "learning_rate": 1.6954673246210532e-07, "loss": 0.0124, "reward": 0.41388889867812395, "reward_std": 0.2244861088693142, "rewards/accuracy_multibox_reward": 0.41388889867812395, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 3751.4166870117188, "epoch": 0.8306063522617901, "grad_norm": 0.20980679988861084, "kl": 0.1966552734375, "learning_rate": 1.687837186842147e-07, "loss": 0.0283, "reward": 0.21111112460494041, "reward_std": 0.09188881516456604, "rewards/accuracy_multibox_reward": 0.21111112460494041, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 3730.5556030273438, "epoch": 0.8315688161693936, "grad_norm": 0.49739521741867065, "kl": 0.197265625, "learning_rate": 1.680245671219268e-07, "loss": 0.0376, "reward": 0.23055557534098625, "reward_std": 0.1314869448542595, "rewards/accuracy_multibox_reward": 0.23055557534098625, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 3961.4306030273438, "epoch": 0.8325312800769971, "grad_norm": 0.2755904793739319, "kl": 0.260986328125, "learning_rate": 1.6726928546643173e-07, "loss": 0.0309, "reward": 0.17777778394520283, "reward_std": 0.12201833352446556, "rewards/accuracy_multibox_reward": 0.17777778394520283, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 3804.3056640625, "epoch": 0.8334937439846006, "grad_norm": 0.265863299369812, "kl": 0.22607421875, "learning_rate": 1.6651788136971284e-07, "loss": 0.0179, "reward": 0.22615740448236465, "reward_std": 0.1081370497122407, "rewards/accuracy_multibox_reward": 0.22615740448236465, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 3885.25, "epoch": 0.8344562078922041, "grad_norm": 0.39819931983947754, "kl": 0.306884765625, "learning_rate": 1.6577036244446863e-07, "loss": 0.0273, "reward": 0.3895833548158407, "reward_std": 0.254192310385406, "rewards/accuracy_multibox_reward": 0.3895833548158407, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 3866.6805419921875, "epoch": 0.8354186717998076, "grad_norm": 0.27799192070961, "kl": 0.258544921875, "learning_rate": 1.6502673626403593e-07, "loss": 0.0127, "reward": 0.09365079924464226, "reward_std": 0.11458496376872063, "rewards/accuracy_multibox_reward": 0.09365079924464226, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 3867.2361450195312, "epoch": 0.836381135707411, "grad_norm": 0.2633301913738251, "kl": 0.281494140625, "learning_rate": 1.6428701036231274e-07, "loss": 0.021, "reward": 0.07222221791744232, "reward_std": 0.05643744021654129, "rewards/accuracy_multibox_reward": 0.07222221791744232, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 3909.3612060546875, "epoch": 0.8373435996150145, "grad_norm": 0.2526848018169403, "kl": 0.32568359375, "learning_rate": 1.6355119223368235e-07, "loss": 0.0284, "reward": 0.18055556900799274, "reward_std": 0.12167280912399292, "rewards/accuracy_multibox_reward": 0.18055556900799274, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 3860.5833740234375, "epoch": 0.8383060635226179, "grad_norm": 0.6999472379684448, "kl": 0.274658203125, "learning_rate": 1.6281928933293738e-07, "loss": 0.0675, "reward": 0.19166667386889458, "reward_std": 0.28475601226091385, "rewards/accuracy_multibox_reward": 0.19166667386889458, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 3698.2639770507812, "epoch": 0.8392685274302214, "grad_norm": 0.38150233030319214, "kl": 0.279541015625, "learning_rate": 1.6209130907520358e-07, "loss": -0.0021, "reward": 0.0694444477558136, "reward_std": 0.034020692110061646, "rewards/accuracy_multibox_reward": 0.0694444477558136, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 3900.2639770507812, "epoch": 0.8402309913378249, "grad_norm": 0.22077417373657227, "kl": 0.29296875, "learning_rate": 1.6136725883586598e-07, "loss": 0.0179, "reward": 0.20277779176831245, "reward_std": 0.1506333127617836, "rewards/accuracy_multibox_reward": 0.20277779176831245, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 3837.1112060546875, "epoch": 0.8411934552454283, "grad_norm": 0.3729349374771118, "kl": 0.2213134765625, "learning_rate": 1.6064714595049298e-07, "loss": 0.0375, "reward": 0.263888880610466, "reward_std": 0.10530199203640223, "rewards/accuracy_multibox_reward": 0.263888880610466, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 4067.8333740234375, "epoch": 0.8421559191530318, "grad_norm": 0.4646424651145935, "kl": 0.2412109375, "learning_rate": 1.599309777147622e-07, "loss": 0.0399, "reward": 0.2611111253499985, "reward_std": 0.2065117284655571, "rewards/accuracy_multibox_reward": 0.2611111253499985, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 3671.236083984375, "epoch": 0.8431183830606352, "grad_norm": 0.5691246390342712, "kl": 0.19775390625, "learning_rate": 1.5921876138438765e-07, "loss": 0.056, "reward": 0.21111112460494041, "reward_std": 0.10115310689434409, "rewards/accuracy_multibox_reward": 0.21111112460494041, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 3979.2637939453125, "epoch": 0.8440808469682387, "grad_norm": 0.23209092020988464, "kl": 0.2308349609375, "learning_rate": 1.5851050417504464e-07, "loss": 0.0159, "reward": 0.24166668206453323, "reward_std": 0.07759147044271231, "rewards/accuracy_multibox_reward": 0.24166668206453323, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 4016.7361450195312, "epoch": 0.8450433108758422, "grad_norm": 0.3832041919231415, "kl": 0.24609375, "learning_rate": 1.5780621326229776e-07, "loss": 0.0069, "reward": 0.12222222425043583, "reward_std": 0.049431455321609974, "rewards/accuracy_multibox_reward": 0.12222222425043583, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 3811.4306640625, "epoch": 0.8460057747834456, "grad_norm": 0.16674433648586273, "kl": 0.190673828125, "learning_rate": 1.57105895781528e-07, "loss": 0.0201, "reward": 0.2166666705161333, "reward_std": 0.05443310318514705, "rewards/accuracy_multibox_reward": 0.2166666705161333, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 4074.5556030273438, "epoch": 0.8469682386910491, "grad_norm": 0.41961783170700073, "kl": 0.2861328125, "learning_rate": 1.5640955882786016e-07, "loss": 0.0155, "reward": 0.05000000260770321, "reward_std": 0.0924646146595478, "rewards/accuracy_multibox_reward": 0.05000000260770321, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 3989.8750610351562, "epoch": 0.8479307025986526, "grad_norm": 0.40608739852905273, "kl": 0.2939453125, "learning_rate": 1.5571720945609107e-07, "loss": 0.0465, "reward": 0.21666666865348816, "reward_std": 0.20585821941494942, "rewards/accuracy_multibox_reward": 0.21666666865348816, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 4092.8888549804688, "epoch": 0.848893166506256, "grad_norm": 0.3591742515563965, "kl": 0.328125, "learning_rate": 1.5502885468061827e-07, "loss": 0.0375, "reward": 0.11134259402751923, "reward_std": 0.09317319840192795, "rewards/accuracy_multibox_reward": 0.11134259402751923, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 3864.7223510742188, "epoch": 0.8498556304138595, "grad_norm": 0.2619227170944214, "kl": 0.2822265625, "learning_rate": 1.5434450147536904e-07, "loss": 0.0249, "reward": 0.22083333134651184, "reward_std": 0.08878178894519806, "rewards/accuracy_multibox_reward": 0.22083333134651184, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 3585.486083984375, "epoch": 0.8508180943214629, "grad_norm": 0.22555166482925415, "kl": 0.229248046875, "learning_rate": 1.53664156773729e-07, "loss": 0.0358, "reward": 0.18750000186264515, "reward_std": 0.1622828058898449, "rewards/accuracy_multibox_reward": 0.18750000186264515, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 3728.7916259765625, "epoch": 0.8517805582290664, "grad_norm": 0.2465713769197464, "kl": 0.2177734375, "learning_rate": 1.5298782746847325e-07, "loss": 0.0221, "reward": 0.1277777785435319, "reward_std": 0.0408248258754611, "rewards/accuracy_multibox_reward": 0.1277777785435319, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 3824.1251831054688, "epoch": 0.8527430221366699, "grad_norm": 0.2955557405948639, "kl": 0.251220703125, "learning_rate": 1.5231552041169494e-07, "loss": 0.0328, "reward": 0.32006175071001053, "reward_std": 0.2455812320113182, "rewards/accuracy_multibox_reward": 0.32006175071001053, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 3785.4166870117188, "epoch": 0.8537054860442733, "grad_norm": 0.31887286901474, "kl": 0.23486328125, "learning_rate": 1.5164724241473707e-07, "loss": 0.0153, "reward": 0.11388889141380787, "reward_std": 0.15404100716114044, "rewards/accuracy_multibox_reward": 0.11388889141380787, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 3751.194580078125, "epoch": 0.8546679499518768, "grad_norm": 0.3069356083869934, "kl": 0.25537109375, "learning_rate": 1.5098300024812305e-07, "loss": 0.0335, "reward": 0.21111112646758556, "reward_std": 0.1502409130334854, "rewards/accuracy_multibox_reward": 0.21111112646758556, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 4082.5833129882812, "epoch": 0.8556304138594802, "grad_norm": 0.25197887420654297, "kl": 0.33837890625, "learning_rate": 1.503228006414877e-07, "loss": 0.0281, "reward": 0.22222222574055195, "reward_std": 0.15545956371352077, "rewards/accuracy_multibox_reward": 0.22222222574055195, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 4060.819580078125, "epoch": 0.8565928777670837, "grad_norm": 0.37335216999053955, "kl": 0.25390625, "learning_rate": 1.4966665028350988e-07, "loss": 0.0096, "reward": 0.01666666753590107, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.01666666753590107, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 3826.416748046875, "epoch": 0.8575553416746872, "grad_norm": 0.37466076016426086, "kl": 0.25927734375, "learning_rate": 1.4901455582184406e-07, "loss": 0.0345, "reward": 0.24191920831799507, "reward_std": 0.1989148184657097, "rewards/accuracy_multibox_reward": 0.24191920831799507, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 3462.6805419921875, "epoch": 0.8585178055822906, "grad_norm": 0.4422335624694824, "kl": 0.1829833984375, "learning_rate": 1.4836652386305347e-07, "loss": 0.0071, "reward": 0.030555556528270245, "reward_std": 0.07484551891684532, "rewards/accuracy_multibox_reward": 0.030555556528270245, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 4189.069396972656, "epoch": 0.8594802694898941, "grad_norm": 0.2539936900138855, "kl": 0.3251953125, "learning_rate": 1.477225609725426e-07, "loss": 0.0355, "reward": 0.125, "reward_std": 0.11533895879983902, "rewards/accuracy_multibox_reward": 0.125, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 3378.6111450195312, "epoch": 0.8604427333974976, "grad_norm": 0.40038588643074036, "kl": 0.16259765625, "learning_rate": 1.4708267367449117e-07, "loss": 0.031, "reward": 0.22249695099890232, "reward_std": 0.2428729236125946, "rewards/accuracy_multibox_reward": 0.22249695099890232, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 3980.75, "epoch": 0.861405197305101, "grad_norm": 0.47304776310920715, "kl": 0.27978515625, "learning_rate": 1.464468684517879e-07, "loss": 0.0474, "reward": 0.20277779176831245, "reward_std": 0.11582577601075172, "rewards/accuracy_multibox_reward": 0.20277779176831245, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 3826.4722290039062, "epoch": 0.8623676612127045, "grad_norm": 0.4658720791339874, "kl": 0.29736328125, "learning_rate": 1.458151517459646e-07, "loss": 0.0161, "reward": 0.03333333507180214, "reward_std": 0.05270462483167648, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 3937.1945190429688, "epoch": 0.8633301251203079, "grad_norm": 0.4861120283603668, "kl": 0.259033203125, "learning_rate": 1.4518752995713148e-07, "loss": 0.0072, "reward": 0.18611111491918564, "reward_std": 0.04762896476313472, "rewards/accuracy_multibox_reward": 0.18611111491918564, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 3731.8056030273438, "epoch": 0.8642925890279115, "grad_norm": 0.3472088873386383, "kl": 0.275634765625, "learning_rate": 1.4456400944391144e-07, "loss": 0.0071, "reward": 0.18611112236976624, "reward_std": 0.15401794016361237, "rewards/accuracy_multibox_reward": 0.18611112236976624, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 3789.6111450195312, "epoch": 0.865255052935515, "grad_norm": 0.5843220353126526, "kl": 0.30419921875, "learning_rate": 1.4394459652337636e-07, "loss": 0.0437, "reward": 0.22777778282761574, "reward_std": 0.23747585713863373, "rewards/accuracy_multibox_reward": 0.22777778282761574, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 3714.5556030273438, "epoch": 0.8662175168431184, "grad_norm": 0.4083632826805115, "kl": 0.274658203125, "learning_rate": 1.4332929747098308e-07, "loss": 0.0174, "reward": 0.24444444850087166, "reward_std": 0.09246460348367691, "rewards/accuracy_multibox_reward": 0.24444444850087166, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 4079.9166870117188, "epoch": 0.8671799807507219, "grad_norm": 0.28399428725242615, "kl": 0.26416015625, "learning_rate": 1.4271811852050913e-07, "loss": 0.0137, "reward": 0.08888889197260141, "reward_std": 0.09247295185923576, "rewards/accuracy_multibox_reward": 0.08888889197260141, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 3624.666748046875, "epoch": 0.8681424446583254, "grad_norm": 0.6600984334945679, "kl": 0.322021484375, "learning_rate": 1.4211106586399078e-07, "loss": 0.0054, "reward": 0.15123003348708153, "reward_std": 0.15787888690829277, "rewards/accuracy_multibox_reward": 0.15123003348708153, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 4010.3195190429688, "epoch": 0.8691049085659288, "grad_norm": 0.8764727711677551, "kl": 0.240478515625, "learning_rate": 1.4150814565165873e-07, "loss": 0.0494, "reward": 0.28263890743255615, "reward_std": 0.24845078215003014, "rewards/accuracy_multibox_reward": 0.28263890743255615, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 3884.52783203125, "epoch": 0.8700673724735323, "grad_norm": 0.6173692345619202, "kl": 0.261474609375, "learning_rate": 1.409093639918773e-07, "loss": 0.0447, "reward": 0.11666667275130749, "reward_std": 0.1451692394912243, "rewards/accuracy_multibox_reward": 0.11666667275130749, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 3730.3056030273438, "epoch": 0.8710298363811357, "grad_norm": 0.2500438988208771, "kl": 0.200439453125, "learning_rate": 1.4031472695108177e-07, "loss": 0.0196, "reward": 0.04722222313284874, "reward_std": 0.052086107432842255, "rewards/accuracy_multibox_reward": 0.04722222313284874, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 3828.52783203125, "epoch": 0.8719923002887392, "grad_norm": 0.366105854511261, "kl": 0.285888671875, "learning_rate": 1.3972424055371678e-07, "loss": 0.028, "reward": 0.24166667088866234, "reward_std": 0.10558969480916858, "rewards/accuracy_multibox_reward": 0.24166667088866234, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 3756.9027709960938, "epoch": 0.8729547641963427, "grad_norm": 0.38488173484802246, "kl": 0.25537109375, "learning_rate": 1.3913791078217582e-07, "loss": 0.0422, "reward": 0.1111111156642437, "reward_std": 0.10199343413114548, "rewards/accuracy_multibox_reward": 0.1111111156642437, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 3859.4028930664062, "epoch": 0.8739172281039461, "grad_norm": 0.16096054017543793, "kl": 0.208251953125, "learning_rate": 1.385557435767404e-07, "loss": 0.0163, "reward": 0.21957071870565414, "reward_std": 0.06154652405530214, "rewards/accuracy_multibox_reward": 0.21957071870565414, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 3710.263916015625, "epoch": 0.8748796920115496, "grad_norm": 0.385374516248703, "kl": 0.2001953125, "learning_rate": 1.3797774483551978e-07, "loss": 0.0176, "reward": 0.2777777798473835, "reward_std": 0.195420291274786, "rewards/accuracy_multibox_reward": 0.2777777798473835, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 4018.84716796875, "epoch": 0.875842155919153, "grad_norm": 0.29954925179481506, "kl": 0.297119140625, "learning_rate": 1.3740392041439097e-07, "loss": 0.0354, "reward": 0.10000000521540642, "reward_std": 0.10327956825494766, "rewards/accuracy_multibox_reward": 0.10000000521540642, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 4122.361145019531, "epoch": 0.8768046198267565, "grad_norm": 0.6562490463256836, "kl": 0.30322265625, "learning_rate": 1.3683427612694016e-07, "loss": 0.0534, "reward": 0.1666666716337204, "reward_std": 0.196169912815094, "rewards/accuracy_multibox_reward": 0.1666666716337204, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 3845.3195190429688, "epoch": 0.87776708373436, "grad_norm": 0.3467956781387329, "kl": 0.2159423828125, "learning_rate": 1.362688177444032e-07, "loss": 0.0256, "reward": 0.28333333507180214, "reward_std": 0.12681302800774574, "rewards/accuracy_multibox_reward": 0.28333333507180214, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 3771.666748046875, "epoch": 0.8787295476419634, "grad_norm": 0.8673956990242004, "kl": 0.210693359375, "learning_rate": 1.35707550995607e-07, "loss": 0.0469, "reward": 0.1527777872979641, "reward_std": 0.17866531014442444, "rewards/accuracy_multibox_reward": 0.1527777872979641, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 3953.7084350585938, "epoch": 0.8796920115495669, "grad_norm": 0.5552398562431335, "kl": 0.2646484375, "learning_rate": 1.3515048156691242e-07, "loss": -0.0013, "reward": 0.19166667014360428, "reward_std": 0.051109469961375, "rewards/accuracy_multibox_reward": 0.19166667014360428, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 3995.9583129882812, "epoch": 0.8806544754571703, "grad_norm": 0.23217399418354034, "kl": 0.25341796875, "learning_rate": 1.3459761510215533e-07, "loss": 0.0141, "reward": 0.14444444701075554, "reward_std": 0.05244768364354968, "rewards/accuracy_multibox_reward": 0.14444444701075554, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 4126.444519042969, "epoch": 0.8816169393647738, "grad_norm": 0.34894633293151855, "kl": 0.36767578125, "learning_rate": 1.3404895720259052e-07, "loss": 0.0412, "reward": 0.1908730138093233, "reward_std": 0.15257573826238513, "rewards/accuracy_multibox_reward": 0.1908730138093233, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 3809.1112060546875, "epoch": 0.8825794032723773, "grad_norm": 0.5202602744102478, "kl": 0.2666015625, "learning_rate": 1.3350451342683452e-07, "loss": -0.0076, "reward": 0.1805555671453476, "reward_std": 0.047628968954086304, "rewards/accuracy_multibox_reward": 0.1805555671453476, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 3823.1111450195312, "epoch": 0.8835418671799807, "grad_norm": 0.329961359500885, "kl": 0.22021484375, "learning_rate": 1.3296428929080914e-07, "loss": 0.051, "reward": 0.1611111108213663, "reward_std": 0.17064599692821503, "rewards/accuracy_multibox_reward": 0.1611111108213663, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 3788.02783203125, "epoch": 0.8845043310875842, "grad_norm": 0.3528249263763428, "kl": 0.23681640625, "learning_rate": 1.3242829026768596e-07, "loss": 0.0362, "reward": 0.0861111069098115, "reward_std": 0.13600712269544601, "rewards/accuracy_multibox_reward": 0.0861111069098115, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 4089.888916015625, "epoch": 0.8854667949951877, "grad_norm": 0.4870060682296753, "kl": 0.34423828125, "learning_rate": 1.3189652178783055e-07, "loss": 0.0156, "reward": 0.10833333805203438, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.10833333805203438, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 4117.888977050781, "epoch": 0.8864292589027911, "grad_norm": 0.28239157795906067, "kl": 0.2958984375, "learning_rate": 1.3136898923874783e-07, "loss": 0.0358, "reward": 0.2531565763056278, "reward_std": 0.1080734170973301, "rewards/accuracy_multibox_reward": 0.2531565763056278, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 3650.4027099609375, "epoch": 0.8873917228103946, "grad_norm": 0.295401930809021, "kl": 0.24658203125, "learning_rate": 1.3084569796502683e-07, "loss": 0.0127, "reward": 0.14444444701075554, "reward_std": 0.04906534031033516, "rewards/accuracy_multibox_reward": 0.14444444701075554, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 4042.9306030273438, "epoch": 0.888354186717998, "grad_norm": 0.43557778000831604, "kl": 0.300048828125, "learning_rate": 1.3032665326828722e-07, "loss": 0.0101, "reward": 0.22708333283662796, "reward_std": 0.17413102462887764, "rewards/accuracy_multibox_reward": 0.22708333283662796, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 3672.7501220703125, "epoch": 0.8893166506256015, "grad_norm": 0.3498525619506836, "kl": 0.27587890625, "learning_rate": 1.2981186040712535e-07, "loss": 0.0597, "reward": 0.20833334419876337, "reward_std": 0.14124062610790133, "rewards/accuracy_multibox_reward": 0.20833334419876337, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 3327.4861450195312, "epoch": 0.890279114533205, "grad_norm": 0.2497827708721161, "kl": 0.192626953125, "learning_rate": 1.2930132459706088e-07, "loss": 0.0108, "reward": 0.2825396843254566, "reward_std": 0.1409766818396747, "rewards/accuracy_multibox_reward": 0.2825396843254566, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 3881.8334350585938, "epoch": 0.8912415784408084, "grad_norm": 0.2946717143058777, "kl": 0.2685546875, "learning_rate": 1.2879505101048419e-07, "loss": 0.0271, "reward": 0.07777777872979641, "reward_std": 0.13829107955098152, "rewards/accuracy_multibox_reward": 0.07777777872979641, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 3811.5555419921875, "epoch": 0.8922040423484119, "grad_norm": 0.1982443630695343, "kl": 0.235107421875, "learning_rate": 1.2829304477660348e-07, "loss": 0.0164, "reward": 0.11388889700174332, "reward_std": 0.044561613351106644, "rewards/accuracy_multibox_reward": 0.11388889700174332, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 3649.861083984375, "epoch": 0.8931665062560153, "grad_norm": 0.262771338224411, "kl": 0.2294921875, "learning_rate": 1.277953109813933e-07, "loss": 0.0149, "reward": 0.22920752316713333, "reward_std": 0.20107705611735582, "rewards/accuracy_multibox_reward": 0.22920752316713333, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 3636.4166259765625, "epoch": 0.8941289701636189, "grad_norm": 0.3109305202960968, "kl": 0.262451171875, "learning_rate": 1.273018546675431e-07, "loss": 0.015, "reward": 0.10000000149011612, "reward_std": 0.10609812662005424, "rewards/accuracy_multibox_reward": 0.10000000149011612, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 4013.6806030273438, "epoch": 0.8950914340712224, "grad_norm": 0.3161824345588684, "kl": 0.241943359375, "learning_rate": 1.268126808344053e-07, "loss": 0.021, "reward": 0.19722222536802292, "reward_std": 0.2578936479985714, "rewards/accuracy_multibox_reward": 0.19722222536802292, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 3938.4444580078125, "epoch": 0.8960538979788258, "grad_norm": 0.28733542561531067, "kl": 0.238037109375, "learning_rate": 1.2632779443794588e-07, "loss": 0.0129, "reward": 0.07777778059244156, "reward_std": 0.038968171924352646, "rewards/accuracy_multibox_reward": 0.07777778059244156, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 3987.9444580078125, "epoch": 0.8970163618864293, "grad_norm": 0.4489271640777588, "kl": 0.248046875, "learning_rate": 1.258472003906931e-07, "loss": 0.0143, "reward": 0.18333333916962147, "reward_std": 0.1329801008105278, "rewards/accuracy_multibox_reward": 0.18333333916962147, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 3836.4583740234375, "epoch": 0.8979788257940328, "grad_norm": 0.22419902682304382, "kl": 0.22412109375, "learning_rate": 1.2537090356168836e-07, "loss": 0.0244, "reward": 0.10277778282761574, "reward_std": 0.13582701236009598, "rewards/accuracy_multibox_reward": 0.10277778282761574, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 3632.1250610351562, "epoch": 0.8989412897016362, "grad_norm": 0.22683072090148926, "kl": 0.20458984375, "learning_rate": 1.2489890877643659e-07, "loss": -0.0183, "reward": 0.2736111283302307, "reward_std": 0.0870560659095645, "rewards/accuracy_multibox_reward": 0.2736111283302307, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 3973.52783203125, "epoch": 0.8999037536092397, "grad_norm": 0.2952636778354645, "kl": 0.2685546875, "learning_rate": 1.2443122081685733e-07, "loss": 0.014, "reward": 0.13055555894970894, "reward_std": 0.09479539841413498, "rewards/accuracy_multibox_reward": 0.13055555894970894, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 3603.0972900390625, "epoch": 0.9008662175168431, "grad_norm": 0.5726350545883179, "kl": 0.216796875, "learning_rate": 1.2396784442123653e-07, "loss": 0.0395, "reward": 0.3055555550381541, "reward_std": 0.2393084019422531, "rewards/accuracy_multibox_reward": 0.3055555550381541, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 3828.0972290039062, "epoch": 0.9018286814244466, "grad_norm": 0.4043099284172058, "kl": 0.28759765625, "learning_rate": 1.2350878428417837e-07, "loss": 0.0338, "reward": 0.1611111182719469, "reward_std": 0.17639262229204178, "rewards/accuracy_multibox_reward": 0.1611111182719469, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 3765.3889770507812, "epoch": 0.9027911453320501, "grad_norm": 0.2922109067440033, "kl": 0.2442626953125, "learning_rate": 1.2305404505655783e-07, "loss": 0.01, "reward": 0.2888888940215111, "reward_std": 0.05443309899419546, "rewards/accuracy_multibox_reward": 0.2888888940215111, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 3747.513916015625, "epoch": 0.9037536092396535, "grad_norm": 0.19409580528736115, "kl": 0.153076171875, "learning_rate": 1.2260363134547326e-07, "loss": 0.0135, "reward": 0.10000000428408384, "reward_std": 0.1300387792289257, "rewards/accuracy_multibox_reward": 0.10000000428408384, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 4039.2084350585938, "epoch": 0.904716073147257, "grad_norm": 0.2663358449935913, "kl": 0.22998046875, "learning_rate": 1.2215754771419997e-07, "loss": 0.0103, "reward": 0.013888888992369175, "reward_std": 0.034020692110061646, "rewards/accuracy_multibox_reward": 0.013888888992369175, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 3848.9722290039062, "epoch": 0.9056785370548605, "grad_norm": 0.2796497642993927, "kl": 0.251708984375, "learning_rate": 1.217157986821441e-07, "loss": 0.0183, "reward": 0.2277777874842286, "reward_std": 0.08739173412322998, "rewards/accuracy_multibox_reward": 0.2277777874842286, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 3773.7083740234375, "epoch": 0.9066410009624639, "grad_norm": 0.6984164118766785, "kl": 0.19091796875, "learning_rate": 1.2127838872479642e-07, "loss": 0.0322, "reward": 0.35277777910232544, "reward_std": 0.26130034402012825, "rewards/accuracy_multibox_reward": 0.35277777910232544, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 3792.791748046875, "epoch": 0.9076034648700674, "grad_norm": 0.27103838324546814, "kl": 0.241455078125, "learning_rate": 1.208453222736876e-07, "loss": 0.0187, "reward": 0.25555555522441864, "reward_std": 0.1702961064875126, "rewards/accuracy_multibox_reward": 0.25555555522441864, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 3997.8195190429688, "epoch": 0.9085659287776708, "grad_norm": 0.2652348279953003, "kl": 0.300048828125, "learning_rate": 1.2041660371634257e-07, "loss": 0.024, "reward": 0.10833333525806665, "reward_std": 0.11381369456648827, "rewards/accuracy_multibox_reward": 0.10833333525806665, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 3851.6527709960938, "epoch": 0.9095283926852743, "grad_norm": 0.2905953824520111, "kl": 0.2601318359375, "learning_rate": 1.1999223739623666e-07, "loss": 0.0128, "reward": 0.08611111715435982, "reward_std": 0.14865543320775032, "rewards/accuracy_multibox_reward": 0.08611111715435982, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 3833.3056640625, "epoch": 0.9104908565928778, "grad_norm": 0.43012842535972595, "kl": 0.275634765625, "learning_rate": 1.1957222761275148e-07, "loss": 0.0342, "reward": 0.1858796365559101, "reward_std": 0.15453240182250738, "rewards/accuracy_multibox_reward": 0.1858796365559101, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 3868.1804809570312, "epoch": 0.9114533205004812, "grad_norm": 0.2933727204799652, "kl": 0.252685546875, "learning_rate": 1.1915657862113096e-07, "loss": 0.0159, "reward": 0.1776289800181985, "reward_std": 0.16362204775214195, "rewards/accuracy_multibox_reward": 0.1776289800181985, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 4075.916748046875, "epoch": 0.9124157844080847, "grad_norm": 0.3571476638317108, "kl": 0.359375, "learning_rate": 1.1874529463243885e-07, "loss": 0.0196, "reward": 0.08333333302289248, "reward_std": 0.17411427199840546, "rewards/accuracy_multibox_reward": 0.08333333302289248, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 3785.6250610351562, "epoch": 0.9133782483156881, "grad_norm": 0.6690623164176941, "kl": 0.19140625, "learning_rate": 1.1833837981351568e-07, "loss": 0.0328, "reward": 0.26944445818662643, "reward_std": 0.2894003689289093, "rewards/accuracy_multibox_reward": 0.26944445818662643, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 3642.611083984375, "epoch": 0.9143407122232916, "grad_norm": 0.2699919044971466, "kl": 0.22021484375, "learning_rate": 1.1793583828693651e-07, "loss": 0.0128, "reward": 0.14166667312383652, "reward_std": 0.20472294837236404, "rewards/accuracy_multibox_reward": 0.14166667312383652, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 3760.1666870117188, "epoch": 0.9153031761308951, "grad_norm": 0.6406789422035217, "kl": 0.156982421875, "learning_rate": 1.1753767413096932e-07, "loss": 0.0405, "reward": 0.27114199195057154, "reward_std": 0.2615858316421509, "rewards/accuracy_multibox_reward": 0.27114199195057154, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 4187.833435058594, "epoch": 0.9162656400384985, "grad_norm": 0.29907041788101196, "kl": 0.333984375, "learning_rate": 1.1714389137953379e-07, "loss": 0.0187, "reward": 0.1527777798473835, "reward_std": 0.05813458189368248, "rewards/accuracy_multibox_reward": 0.1527777798473835, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 3738.4583740234375, "epoch": 0.917228103946102, "grad_norm": 0.34832027554512024, "kl": 0.21826171875, "learning_rate": 1.1675449402216003e-07, "loss": 0.0036, "reward": 0.12222222425043583, "reward_std": 0.04943145625293255, "rewards/accuracy_multibox_reward": 0.12222222425043583, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 3523.0972290039062, "epoch": 0.9181905678537055, "grad_norm": 0.5037846565246582, "kl": 0.22607421875, "learning_rate": 1.163694860039487e-07, "loss": 0.0432, "reward": 0.315277773886919, "reward_std": 0.1720314584672451, "rewards/accuracy_multibox_reward": 0.315277773886919, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 3915.888916015625, "epoch": 0.9191530317613089, "grad_norm": 0.27529340982437134, "kl": 0.212158203125, "learning_rate": 1.159888712255306e-07, "loss": 0.0068, "reward": 0.14166667312383652, "reward_std": 0.15061244368553162, "rewards/accuracy_multibox_reward": 0.14166667312383652, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 3641.4444580078125, "epoch": 0.9201154956689124, "grad_norm": 0.6079003214836121, "kl": 0.16015625, "learning_rate": 1.1561265354302716e-07, "loss": 0.0534, "reward": 0.308333333581686, "reward_std": 0.20722875371575356, "rewards/accuracy_multibox_reward": 0.308333333581686, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 4024.0972290039062, "epoch": 0.9210779595765158, "grad_norm": 0.24720457196235657, "kl": 0.240966796875, "learning_rate": 1.1524083676801186e-07, "loss": 0.0294, "reward": 0.15833333134651184, "reward_std": 0.0992770865559578, "rewards/accuracy_multibox_reward": 0.15833333134651184, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 4038.3750610351562, "epoch": 0.9220404234841193, "grad_norm": 0.2213372141122818, "kl": 0.194091796875, "learning_rate": 1.148734246674711e-07, "loss": 0.0214, "reward": 0.04722222313284874, "reward_std": 0.052086107432842255, "rewards/accuracy_multibox_reward": 0.04722222313284874, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 3927.8750610351562, "epoch": 0.9230028873917228, "grad_norm": 0.4381406307220459, "kl": 0.250244140625, "learning_rate": 1.1451042096376613e-07, "loss": 0.0185, "reward": 0.3388889078050852, "reward_std": 0.19140937738120556, "rewards/accuracy_multibox_reward": 0.3388889078050852, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 4110.77783203125, "epoch": 0.9239653512993262, "grad_norm": 0.2562219202518463, "kl": 0.240966796875, "learning_rate": 1.1415182933459558e-07, "loss": 0.024, "reward": 0.11111111380159855, "reward_std": 0.13328943774104118, "rewards/accuracy_multibox_reward": 0.11111111380159855, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 4104.6251220703125, "epoch": 0.9249278152069298, "grad_norm": 0.3922714293003082, "kl": 0.314697265625, "learning_rate": 1.137976534129579e-07, "loss": 0.0139, "reward": 0.13333333004266024, "reward_std": 0.2307513952255249, "rewards/accuracy_multibox_reward": 0.13333333004266024, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 3840.3750610351562, "epoch": 0.9258902791145333, "grad_norm": 0.4769600033760071, "kl": 0.1944580078125, "learning_rate": 1.1344789678711483e-07, "loss": 0.0006, "reward": 0.28333333507180214, "reward_std": 0.19440969079732895, "rewards/accuracy_multibox_reward": 0.28333333507180214, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 3856.763916015625, "epoch": 0.9268527430221367, "grad_norm": 0.3143102824687958, "kl": 0.22412109375, "learning_rate": 1.1310256300055488e-07, "loss": 0.0173, "reward": 0.10277777351438999, "reward_std": 0.08353400975465775, "rewards/accuracy_multibox_reward": 0.10277777351438999, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 3862.02783203125, "epoch": 0.9278152069297402, "grad_norm": 0.6050894260406494, "kl": 0.236572265625, "learning_rate": 1.1276165555195729e-07, "loss": 0.0477, "reward": 0.18333333171904087, "reward_std": 0.16919448971748352, "rewards/accuracy_multibox_reward": 0.18333333171904087, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 4041.52783203125, "epoch": 0.9287776708373436, "grad_norm": 0.36923089623451233, "kl": 0.2939453125, "learning_rate": 1.1242517789515692e-07, "loss": 0.036, "reward": 0.10277778282761574, "reward_std": 0.10581713542342186, "rewards/accuracy_multibox_reward": 0.10277778282761574, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 3839.5416870117188, "epoch": 0.9297401347449471, "grad_norm": 0.32944971323013306, "kl": 0.182373046875, "learning_rate": 1.1209313343910911e-07, "loss": 0.0243, "reward": 0.18055556062608957, "reward_std": 0.21894989535212517, "rewards/accuracy_multibox_reward": 0.18055556062608957, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 3897.8056030273438, "epoch": 0.9307025986525506, "grad_norm": 0.5192247629165649, "kl": 0.218994140625, "learning_rate": 1.1176552554785503e-07, "loss": 0.0221, "reward": 0.3083333447575569, "reward_std": 0.26800958812236786, "rewards/accuracy_multibox_reward": 0.3083333447575569, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 3599.8612670898438, "epoch": 0.931665062560154, "grad_norm": 0.420758992433548, "kl": 0.279296875, "learning_rate": 1.1144235754048768e-07, "loss": 0.0397, "reward": 0.2430555485188961, "reward_std": 0.19031472504138947, "rewards/accuracy_multibox_reward": 0.2430555485188961, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 3564.3333740234375, "epoch": 0.9326275264677575, "grad_norm": 0.18444304168224335, "kl": 0.240478515625, "learning_rate": 1.1112363269111848e-07, "loss": 0.0235, "reward": 0.03333333507180214, "reward_std": 0.05270462483167648, "rewards/accuracy_multibox_reward": 0.03333333507180214, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 3804.4306640625, "epoch": 0.933589990375361, "grad_norm": 0.39489081501960754, "kl": 0.2431640625, "learning_rate": 1.1080935422884358e-07, "loss": 0.0059, "reward": 0.2500000149011612, "reward_std": 0.09853708557784557, "rewards/accuracy_multibox_reward": 0.2500000149011612, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 3902.4722900390625, "epoch": 0.9345524542829644, "grad_norm": 0.3543093800544739, "kl": 0.28759765625, "learning_rate": 1.1049952533771161e-07, "loss": 0.0488, "reward": 0.25833334121853113, "reward_std": 0.20052914321422577, "rewards/accuracy_multibox_reward": 0.25833334121853113, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 3642.1111450195312, "epoch": 0.9355149181905679, "grad_norm": 0.23651286959648132, "kl": 0.23779296875, "learning_rate": 1.1019414915669143e-07, "loss": 0.0273, "reward": 0.16388890147209167, "reward_std": 0.05844391882419586, "rewards/accuracy_multibox_reward": 0.16388890147209167, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 4104.902770996094, "epoch": 0.9364773820981713, "grad_norm": 0.310626745223999, "kl": 0.28759765625, "learning_rate": 1.0989322877963984e-07, "loss": 0.0192, "reward": 0.06388889066874981, "reward_std": 0.09291093796491623, "rewards/accuracy_multibox_reward": 0.06388889066874981, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 4015.8612060546875, "epoch": 0.9374398460057748, "grad_norm": 0.21327954530715942, "kl": 0.2464599609375, "learning_rate": 1.0959676725527084e-07, "loss": 0.0171, "reward": 0.20069445110857487, "reward_std": 0.07249332591891289, "rewards/accuracy_multibox_reward": 0.20069445110857487, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 4063.4722290039062, "epoch": 0.9384023099133783, "grad_norm": 0.5447343587875366, "kl": 0.26611328125, "learning_rate": 1.0930476758712441e-07, "loss": 0.007, "reward": 0.1138888904824853, "reward_std": 0.03402068838477135, "rewards/accuracy_multibox_reward": 0.1138888904824853, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 3925.361083984375, "epoch": 0.9393647738209817, "grad_norm": 0.2984704077243805, "kl": 0.2579345703125, "learning_rate": 1.0901723273353597e-07, "loss": 0.0193, "reward": 0.19166666828095913, "reward_std": 0.09554864838719368, "rewards/accuracy_multibox_reward": 0.19166666828095913, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 3787.2361450195312, "epoch": 0.9403272377285852, "grad_norm": 0.3002322018146515, "kl": 0.20703125, "learning_rate": 1.0873416560760674e-07, "loss": 0.0152, "reward": 0.2750000171363354, "reward_std": 0.18974793329834938, "rewards/accuracy_multibox_reward": 0.2750000171363354, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 3995.77783203125, "epoch": 0.9412897016361886, "grad_norm": 0.2578088939189911, "kl": 0.283447265625, "learning_rate": 1.0845556907717402e-07, "loss": 0.0216, "reward": 0.241666654124856, "reward_std": 0.15922324731945992, "rewards/accuracy_multibox_reward": 0.241666654124856, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 3780.9583740234375, "epoch": 0.9422521655437921, "grad_norm": 0.2301158457994461, "kl": 0.17578125, "learning_rate": 1.0818144596478223e-07, "loss": 0.0052, "reward": 0.14444444607943296, "reward_std": 0.19629273191094398, "rewards/accuracy_multibox_reward": 0.14444444607943296, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 3967.5693969726562, "epoch": 0.9432146294513956, "grad_norm": 0.6535288095474243, "kl": 0.21142578125, "learning_rate": 1.0791179904765416e-07, "loss": 0.049, "reward": 0.19722222350537777, "reward_std": 0.19265750795602798, "rewards/accuracy_multibox_reward": 0.19722222350537777, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 3933.2777099609375, "epoch": 0.944177093358999, "grad_norm": 0.4792691767215729, "kl": 0.2568359375, "learning_rate": 1.0764663105766295e-07, "loss": 0.0263, "reward": 0.19722223468124866, "reward_std": 0.08979266881942749, "rewards/accuracy_multibox_reward": 0.19722223468124866, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 4013.0556030273438, "epoch": 0.9451395572666025, "grad_norm": 0.2699962556362152, "kl": 0.21728515625, "learning_rate": 1.0738594468130451e-07, "loss": 0.0124, "reward": 0.0694444477558136, "reward_std": 0.05417735129594803, "rewards/accuracy_multibox_reward": 0.0694444477558136, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 3612.5833129882812, "epoch": 0.9461020211742059, "grad_norm": 0.27803468704223633, "kl": 0.197265625, "learning_rate": 1.0712974255966995e-07, "loss": 0.0109, "reward": 0.17777778208255768, "reward_std": 0.14591310545802116, "rewards/accuracy_multibox_reward": 0.17777778208255768, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 4103.819519042969, "epoch": 0.9470644850818094, "grad_norm": 0.4213709831237793, "kl": 0.273681640625, "learning_rate": 1.0687802728841937e-07, "loss": 0.0113, "reward": 0.0416666679084301, "reward_std": 0.04564354941248894, "rewards/accuracy_multibox_reward": 0.0416666679084301, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 3824.4722290039062, "epoch": 0.9480269489894129, "grad_norm": 0.28937506675720215, "kl": 0.22119140625, "learning_rate": 1.0663080141775502e-07, "loss": 0.0193, "reward": 0.2750000227242708, "reward_std": 0.18413343653082848, "rewards/accuracy_multibox_reward": 0.2750000227242708, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 3566.9722290039062, "epoch": 0.9489894128970163, "grad_norm": 0.8688193559646606, "kl": 0.1654052734375, "learning_rate": 1.063880674523958e-07, "loss": 0.0594, "reward": 0.28888891264796257, "reward_std": 0.18345598131418228, "rewards/accuracy_multibox_reward": 0.28888891264796257, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 3940.0972290039062, "epoch": 0.9499518768046198, "grad_norm": 0.19001103937625885, "kl": 0.287841796875, "learning_rate": 1.0614982785155183e-07, "loss": 0.0291, "reward": 0.0833333358168602, "reward_std": 0.10641204193234444, "rewards/accuracy_multibox_reward": 0.0833333358168602, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 4014.5556030273438, "epoch": 0.9509143407122232, "grad_norm": 0.36962026357650757, "kl": 0.257080078125, "learning_rate": 1.0591608502889927e-07, "loss": 0.0288, "reward": 0.1888888869434595, "reward_std": 0.11216938961297274, "rewards/accuracy_multibox_reward": 0.1888888869434595, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 3384.0973510742188, "epoch": 0.9518768046198267, "grad_norm": 0.2873578369617462, "kl": 0.1458740234375, "learning_rate": 1.0568684135255627e-07, "loss": 0.0216, "reward": 0.15000000223517418, "reward_std": 0.1594335064291954, "rewards/accuracy_multibox_reward": 0.15000000223517418, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 3854.5278930664062, "epoch": 0.9528392685274302, "grad_norm": 0.3282194137573242, "kl": 0.22802734375, "learning_rate": 1.0546209914505873e-07, "loss": 0.0327, "reward": 0.14722222462296486, "reward_std": 0.16230680420994759, "rewards/accuracy_multibox_reward": 0.14722222462296486, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 4014.3195190429688, "epoch": 0.9538017324350336, "grad_norm": 0.5646254420280457, "kl": 0.29736328125, "learning_rate": 1.0524186068333691e-07, "loss": 0.0413, "reward": 0.34444444812834263, "reward_std": 0.20820896280929446, "rewards/accuracy_multibox_reward": 0.34444444812834263, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 3923.263916015625, "epoch": 0.9547641963426372, "grad_norm": 0.2466593235731125, "kl": 0.250732421875, "learning_rate": 1.0502612819869216e-07, "loss": 0.015, "reward": 0.236111119389534, "reward_std": 0.16909019742161036, "rewards/accuracy_multibox_reward": 0.236111119389534, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 4032.3472900390625, "epoch": 0.9557266602502407, "grad_norm": 0.43053990602493286, "kl": 0.2724609375, "learning_rate": 1.0481490387677449e-07, "loss": 0.0115, "reward": 0.11388889327645302, "reward_std": 0.10479073226451874, "rewards/accuracy_multibox_reward": 0.11388889327645302, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 3757.861083984375, "epoch": 0.9566891241578441, "grad_norm": 0.2604324221611023, "kl": 0.1678466796875, "learning_rate": 1.0460818985756038e-07, "loss": 0.0068, "reward": 0.09166666585952044, "reward_std": 0.16726534441113472, "rewards/accuracy_multibox_reward": 0.09166666585952044, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 4111.861145019531, "epoch": 0.9576515880654476, "grad_norm": 0.24586614966392517, "kl": 0.274658203125, "learning_rate": 1.044059882353311e-07, "loss": 0.0242, "reward": 0.11666667461395264, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.11666667461395264, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 3807.5139770507812, "epoch": 0.958614051973051, "grad_norm": 0.3831951916217804, "kl": 0.201416015625, "learning_rate": 1.042083010586514e-07, "loss": 0.0394, "reward": 0.1527777798473835, "reward_std": 0.06867550686001778, "rewards/accuracy_multibox_reward": 0.1527777798473835, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 3768.3333740234375, "epoch": 0.9595765158806545, "grad_norm": 0.4571351110935211, "kl": 0.2069091796875, "learning_rate": 1.04015130330349e-07, "loss": 0.0521, "reward": 0.12777777388691902, "reward_std": 0.1834840252995491, "rewards/accuracy_multibox_reward": 0.12777777388691902, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 3814.6251220703125, "epoch": 0.960538979788258, "grad_norm": 0.3037548363208771, "kl": 0.199951171875, "learning_rate": 1.0382647800749392e-07, "loss": 0.0094, "reward": 0.25555556267499924, "reward_std": 0.058189758099615574, "rewards/accuracy_multibox_reward": 0.25555556267499924, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 3900.3333740234375, "epoch": 0.9615014436958614, "grad_norm": 0.3584466278553009, "kl": 0.227783203125, "learning_rate": 1.0364234600137908e-07, "loss": 0.0238, "reward": 0.24583334475755692, "reward_std": 0.0996430478990078, "rewards/accuracy_multibox_reward": 0.24583334475755692, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 3713.7916259765625, "epoch": 0.9624639076034649, "grad_norm": 0.29268255829811096, "kl": 0.2132568359375, "learning_rate": 1.0346273617750056e-07, "loss": 0.0213, "reward": 0.2944444362074137, "reward_std": 0.14491229131817818, "rewards/accuracy_multibox_reward": 0.2944444362074137, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 4042.5, "epoch": 0.9634263715110684, "grad_norm": 0.3486381471157074, "kl": 0.251220703125, "learning_rate": 1.0328765035553901e-07, "loss": 0.0144, "reward": 0.0805555572733283, "reward_std": 0.13730024546384811, "rewards/accuracy_multibox_reward": 0.0805555572733283, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 3988.263916015625, "epoch": 0.9643888354186718, "grad_norm": 0.36531713604927063, "kl": 0.30810546875, "learning_rate": 1.0311709030934099e-07, "loss": 0.0389, "reward": 0.2500000046566129, "reward_std": 0.17545313388109207, "rewards/accuracy_multibox_reward": 0.2500000046566129, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 3793.638916015625, "epoch": 0.9653512993262753, "grad_norm": 0.36344191431999207, "kl": 0.261474609375, "learning_rate": 1.0295105776690106e-07, "loss": 0.0028, "reward": 0.13333334401249886, "reward_std": 0.05477226059883833, "rewards/accuracy_multibox_reward": 0.13333334401249886, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 3776.5138549804688, "epoch": 0.9663137632338787, "grad_norm": 0.19859755039215088, "kl": 0.18359375, "learning_rate": 1.0278955441034438e-07, "loss": 0.0117, "reward": 0.2166666779667139, "reward_std": 0.05443310318514705, "rewards/accuracy_multibox_reward": 0.2166666779667139, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 3800.0694580078125, "epoch": 0.9672762271414822, "grad_norm": 0.2820530831813812, "kl": 0.1724853515625, "learning_rate": 1.026325818759094e-07, "loss": 0.0082, "reward": 0.24166666716337204, "reward_std": 0.06680067535489798, "rewards/accuracy_multibox_reward": 0.24166666716337204, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 4084.6112060546875, "epoch": 0.9682386910490857, "grad_norm": 0.2388535887002945, "kl": 0.2197265625, "learning_rate": 1.0248014175393175e-07, "loss": 0.0181, "reward": 0.14464285969734192, "reward_std": 0.1297646313905716, "rewards/accuracy_multibox_reward": 0.14464285969734192, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 3837.166748046875, "epoch": 0.9692011549566891, "grad_norm": 0.2813622057437897, "kl": 0.177001953125, "learning_rate": 1.0233223558882752e-07, "loss": -0.0083, "reward": 0.10555555671453476, "reward_std": 0.15515022352337837, "rewards/accuracy_multibox_reward": 0.10555555671453476, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 3738.8193969726562, "epoch": 0.9701636188642926, "grad_norm": 0.4079531729221344, "kl": 0.2159423828125, "learning_rate": 1.021888648790782e-07, "loss": 0.0152, "reward": 0.26944445073604584, "reward_std": 0.1930055394768715, "rewards/accuracy_multibox_reward": 0.26944445073604584, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 3560.791748046875, "epoch": 0.971126082771896, "grad_norm": 0.3345167934894562, "kl": 0.17132568359375, "learning_rate": 1.0205003107721504e-07, "loss": 0.0247, "reward": 0.13611111510545015, "reward_std": 0.14682206511497498, "rewards/accuracy_multibox_reward": 0.13611111510545015, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 4052.4306030273438, "epoch": 0.9720885466794995, "grad_norm": 0.2500181794166565, "kl": 0.24853515625, "learning_rate": 1.0191573558980467e-07, "loss": 0.0419, "reward": 0.18055556062608957, "reward_std": 0.08566047251224518, "rewards/accuracy_multibox_reward": 0.18055556062608957, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 3906.59716796875, "epoch": 0.973051010587103, "grad_norm": 0.24951599538326263, "kl": 0.1942138671875, "learning_rate": 1.0178597977743464e-07, "loss": 0.0117, "reward": 0.12222222425043583, "reward_std": 0.04943145625293255, "rewards/accuracy_multibox_reward": 0.12222222425043583, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 3641.736083984375, "epoch": 0.9740134744947064, "grad_norm": 0.22136180102825165, "kl": 0.1845703125, "learning_rate": 1.0166076495469963e-07, "loss": 0.0239, "reward": 0.0833333358168602, "reward_std": 0.10641203820705414, "rewards/accuracy_multibox_reward": 0.0833333358168602, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 4048.7500610351562, "epoch": 0.9749759384023099, "grad_norm": 0.5981465578079224, "kl": 0.288818359375, "learning_rate": 1.0154009239018829e-07, "loss": 0.0338, "reward": 0.16111112385988235, "reward_std": 0.05671145906671882, "rewards/accuracy_multibox_reward": 0.16111112385988235, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 3839.1944580078125, "epoch": 0.9759384023099134, "grad_norm": 0.38396987318992615, "kl": 0.298583984375, "learning_rate": 1.0142396330647038e-07, "loss": 0.0489, "reward": 0.1444444488734007, "reward_std": 0.19279421493411064, "rewards/accuracy_multibox_reward": 0.1444444488734007, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 3725.6666870117188, "epoch": 0.9769008662175168, "grad_norm": 0.3346000015735626, "kl": 0.2386474609375, "learning_rate": 1.013123788800841e-07, "loss": 0.0066, "reward": 0.06666667014360428, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.06666667014360428, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 3793.6944580078125, "epoch": 0.9778633301251203, "grad_norm": 0.33745238184928894, "kl": 0.2017822265625, "learning_rate": 1.0120534024152455e-07, "loss": 0.033, "reward": 0.20555556006729603, "reward_std": 0.1933566927909851, "rewards/accuracy_multibox_reward": 0.20555556006729603, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 3631.4445190429688, "epoch": 0.9788257940327237, "grad_norm": 0.31040406227111816, "kl": 0.23388671875, "learning_rate": 1.0110284847523205e-07, "loss": 0.0153, "reward": 0.30231481324881315, "reward_std": 0.2141357958316803, "rewards/accuracy_multibox_reward": 0.30231481324881315, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 3535.3611450195312, "epoch": 0.9797882579403272, "grad_norm": 0.47274160385131836, "kl": 0.204559326171875, "learning_rate": 1.0100490461958109e-07, "loss": 0.0441, "reward": 0.302777785807848, "reward_std": 0.22018730733543634, "rewards/accuracy_multibox_reward": 0.302777785807848, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 3655.9305419921875, "epoch": 0.9807507218479307, "grad_norm": 0.22589954733848572, "kl": 0.233642578125, "learning_rate": 1.0091150966687007e-07, "loss": 0.0076, "reward": 0.09104938432574272, "reward_std": 0.14717301866039634, "rewards/accuracy_multibox_reward": 0.09104938432574272, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 3849.5001220703125, "epoch": 0.9817131857555341, "grad_norm": 0.3344518840312958, "kl": 0.251220703125, "learning_rate": 1.0082266456331112e-07, "loss": 0.0072, "reward": 0.17500000540167093, "reward_std": 0.0942671038210392, "rewards/accuracy_multibox_reward": 0.17500000540167093, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 3956.3611450195312, "epoch": 0.9826756496631376, "grad_norm": 0.8990132808685303, "kl": 0.23681640625, "learning_rate": 1.0073837020902033e-07, "loss": 0.047, "reward": 0.21666668076068163, "reward_std": 0.26237690076231956, "rewards/accuracy_multibox_reward": 0.21666668076068163, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 3735.5000610351562, "epoch": 0.983638113570741, "grad_norm": 0.34757357835769653, "kl": 0.1861572265625, "learning_rate": 1.0065862745800896e-07, "loss": 0.0237, "reward": 0.33333334513008595, "reward_std": 0.2384423464536667, "rewards/accuracy_multibox_reward": 0.33333334513008595, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 3839.0000610351562, "epoch": 0.9846005774783445, "grad_norm": 0.5126814842224121, "kl": 0.266357421875, "learning_rate": 1.0058343711817453e-07, "loss": 0.051, "reward": 0.1407407484948635, "reward_std": 0.1503763496875763, "rewards/accuracy_multibox_reward": 0.1407407484948635, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 3660.4722290039062, "epoch": 0.9855630413859481, "grad_norm": 0.3811315596103668, "kl": 0.160400390625, "learning_rate": 1.0051279995129272e-07, "loss": 0.0108, "reward": 0.1680555623024702, "reward_std": 0.1365414746105671, "rewards/accuracy_multibox_reward": 0.1680555623024702, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 4138.652770996094, "epoch": 0.9865255052935515, "grad_norm": 0.3163141906261444, "kl": 0.24755859375, "learning_rate": 1.0044671667300971e-07, "loss": 0.0179, "reward": 0.20000000670552254, "reward_std": 0.10434440895915031, "rewards/accuracy_multibox_reward": 0.20000000670552254, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 3928.5972900390625, "epoch": 0.987487969201155, "grad_norm": 0.38235148787498474, "kl": 0.26416015625, "learning_rate": 1.0038518795283488e-07, "loss": 0.0109, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 3696.4722900390625, "epoch": 0.9884504331087585, "grad_norm": 0.2143331617116928, "kl": 0.17724609375, "learning_rate": 1.0032821441413395e-07, "loss": 0.0239, "reward": 0.15833333879709244, "reward_std": 0.05851395055651665, "rewards/accuracy_multibox_reward": 0.15833333879709244, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 4109.1527099609375, "epoch": 0.9894128970163619, "grad_norm": 0.30638381838798523, "kl": 0.344970703125, "learning_rate": 1.0027579663412287e-07, "loss": 0.0286, "reward": 0.07222222350537777, "reward_std": 0.144644595682621, "rewards/accuracy_multibox_reward": 0.07222222350537777, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 4024.8749389648438, "epoch": 0.9903753609239654, "grad_norm": 0.39146688580513, "kl": 0.216064453125, "learning_rate": 1.0022793514386174e-07, "loss": 0.0035, "reward": 0.11666666902601719, "reward_std": 0.040824830532073975, "rewards/accuracy_multibox_reward": 0.11666666902601719, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 4003.8751220703125, "epoch": 0.9913378248315688, "grad_norm": 0.21366339921951294, "kl": 0.2197265625, "learning_rate": 1.0018463042824956e-07, "loss": 0.0139, "reward": 0.06944444589316845, "reward_std": 0.14009356871247292, "rewards/accuracy_multibox_reward": 0.06944444589316845, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 3912.7222290039062, "epoch": 0.9923002887391723, "grad_norm": 0.34053146839141846, "kl": 0.193359375, "learning_rate": 1.0014588292601931e-07, "loss": 0.0091, "reward": 0.11388889141380787, "reward_std": 0.08979266881942749, "rewards/accuracy_multibox_reward": 0.11388889141380787, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 3870.166748046875, "epoch": 0.9932627526467758, "grad_norm": 0.23659369349479675, "kl": 0.2509765625, "learning_rate": 1.0011169302973344e-07, "loss": 0.0282, "reward": 0.13055556267499924, "reward_std": 0.15197336673736572, "rewards/accuracy_multibox_reward": 0.13055556267499924, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 3651.3889770507812, "epoch": 0.9942252165543792, "grad_norm": 0.17036062479019165, "kl": 0.189453125, "learning_rate": 1.0008206108577992e-07, "loss": 0.0129, "reward": 0.06666667014360428, "reward_std": 0.05163978412747383, "rewards/accuracy_multibox_reward": 0.06666667014360428, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 3651.5693969726562, "epoch": 0.9951876804619827, "grad_norm": 0.43690645694732666, "kl": 0.29345703125, "learning_rate": 1.0005698739436888e-07, "loss": 0.0452, "reward": 0.28055555559694767, "reward_std": 0.1453881487250328, "rewards/accuracy_multibox_reward": 0.28055555559694767, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 3887.8611450195312, "epoch": 0.9961501443695862, "grad_norm": 0.344142347574234, "kl": 0.275634765625, "learning_rate": 1.0003647220952936e-07, "loss": 0.013, "reward": 0.01944444514811039, "reward_std": 0.047628965228796005, "rewards/accuracy_multibox_reward": 0.01944444514811039, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 4036.9584350585938, "epoch": 0.9971126082771896, "grad_norm": 0.21288059651851654, "kl": 0.245361328125, "learning_rate": 1.000205157391067e-07, "loss": 0.0212, "reward": 0.15555554628372192, "reward_std": 0.06493871659040451, "rewards/accuracy_multibox_reward": 0.15555554628372192, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 3541.763916015625, "epoch": 0.9980750721847931, "grad_norm": 0.31325817108154297, "kl": 0.139404296875, "learning_rate": 1.0000911814476073e-07, "loss": -0.0176, "reward": 0.22268518898636103, "reward_std": 0.14211585745215416, "rewards/accuracy_multibox_reward": 0.22268518898636103, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 3941.1136474609375, "epoch": 0.9990375360923965, "grad_norm": 0.3806009888648987, "kl": 0.249755859375, "learning_rate": 1.0000227954196384e-07, "loss": 0.0103, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_multibox_reward": 0.0, "step": 1038 }, { "epoch": 0.9990375360923965, "step": 1038, "total_flos": 0.0, "train_loss": 0.021128982487537827, "train_runtime": 125728.8986, "train_samples_per_second": 0.595, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 1039, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }