{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997759689343589, "eval_steps": 500, "global_step": 3347, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1683.430908203125, "epoch": 0.00029870808752146963, "grad_norm": 0.11874296516180038, "kl": 0.0, "learning_rate": 8.955223880597015e-10, "loss": 0.0856, "reward": 0.5251116305589676, "reward_std": 0.2270953133702278, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4023437649011612, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1679.0982971191406, "epoch": 0.0005974161750429393, "grad_norm": 0.11954871565103531, "kl": 0.0, "learning_rate": 1.791044776119403e-09, "loss": 0.1002, "reward": 0.5167411044239998, "reward_std": 0.1966444756835699, "rewards/accuracy_reward": 0.13616072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3805803805589676, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 1751.3438110351562, "epoch": 0.0008961242625644089, "grad_norm": 0.09308329224586487, "kl": 6.429851055145264e-06, "learning_rate": 2.6865671641791046e-09, "loss": 0.063, "reward": 0.4414062649011612, "reward_std": 0.16928499191999435, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3833705559372902, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 1711.8549499511719, "epoch": 0.0011948323500858785, "grad_norm": 0.0860733687877655, "kl": 2.206861972808838e-05, "learning_rate": 3.582089552238806e-09, "loss": 0.0692, "reward": 0.5078125223517418, "reward_std": 0.1853974275290966, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3582589477300644, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 1624.1830749511719, "epoch": 0.0014935404376073482, "grad_norm": 0.12636132538318634, "kl": 3.0338764190673828e-05, "learning_rate": 4.477611940298507e-09, "loss": 0.1052, "reward": 0.5044643059372902, "reward_std": 0.1568475440144539, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857313156128, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1542.5000610351562, "epoch": 0.0017922485251288178, "grad_norm": 0.11773305386304855, "kl": 4.184246063232422e-05, "learning_rate": 5.373134328358209e-09, "loss": 0.102, "reward": 0.573660746216774, "reward_std": 0.26920509710907936, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000149011612, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 1743.5916137695312, "epoch": 0.0020909566126502874, "grad_norm": 0.11479566246271133, "kl": 3.972649574279785e-05, "learning_rate": 6.26865671641791e-09, "loss": 0.0695, "reward": 0.3945312649011612, "reward_std": 0.15671708807349205, "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.361049123108387, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 1743.6607971191406, "epoch": 0.002389664700171757, "grad_norm": 0.14383816719055176, "kl": 4.0203332901000977e-05, "learning_rate": 7.164179104477612e-09, "loss": 0.1012, "reward": 0.4832589477300644, "reward_std": 0.158252177760005, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3671875149011612, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 1729.5201416015625, "epoch": 0.0026883727876932267, "grad_norm": 0.10122495889663696, "kl": 3.5136938095092773e-05, "learning_rate": 8.059701492537314e-09, "loss": 0.0797, "reward": 0.5357143133878708, "reward_std": 0.24270031973719597, "rewards/accuracy_reward": 0.12946428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500223517418, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 1696.5848999023438, "epoch": 0.0029870808752146963, "grad_norm": 0.12080354988574982, "kl": 3.916025161743164e-05, "learning_rate": 8.955223880597015e-09, "loss": 0.1088, "reward": 0.540736623108387, "reward_std": 0.2305934838950634, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3822544887661934, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1698.4554443359375, "epoch": 0.003285788962736166, "grad_norm": 0.11098581552505493, "kl": 3.4421682357788086e-05, "learning_rate": 9.850746268656716e-09, "loss": 0.0771, "reward": 0.4771205484867096, "reward_std": 0.18985513597726822, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3699776977300644, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 1676.4755249023438, "epoch": 0.0035844970502576356, "grad_norm": 0.10070298612117767, "kl": 3.325939178466797e-05, "learning_rate": 1.0746268656716418e-08, "loss": 0.0889, "reward": 0.5078125298023224, "reward_std": 0.1448839157819748, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3895089477300644, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1737.149658203125, "epoch": 0.003883205137779105, "grad_norm": 0.0972321555018425, "kl": 3.510713577270508e-05, "learning_rate": 1.1641791044776118e-08, "loss": 0.0663, "reward": 0.474888414144516, "reward_std": 0.1746129635721445, "rewards/accuracy_reward": 0.10267857671715319, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372209832072258, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 1687.1250915527344, "epoch": 0.004181913225300575, "grad_norm": 0.12681637704372406, "kl": 7.262825965881348e-05, "learning_rate": 1.253731343283582e-08, "loss": 0.1209, "reward": 0.5474330633878708, "reward_std": 0.21763423457741737, "rewards/accuracy_reward": 0.16071429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3867187723517418, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 1603.5937805175781, "epoch": 0.004480621312822045, "grad_norm": 0.11380653083324432, "kl": 4.380941390991211e-05, "learning_rate": 1.3432835820895521e-08, "loss": 0.0962, "reward": 0.5357143133878708, "reward_std": 0.20932238921523094, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794642984867096, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1741.5425109863281, "epoch": 0.004779329400343514, "grad_norm": 0.11433465033769608, "kl": 3.641843795776367e-05, "learning_rate": 1.4328358208955224e-08, "loss": 0.0967, "reward": 0.4614955633878708, "reward_std": 0.19961045682430267, "rewards/accuracy_reward": 0.07366071967408061, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3878348395228386, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 1572.6429443359375, "epoch": 0.005078037487864984, "grad_norm": 0.13021007180213928, "kl": 2.8431415557861328e-05, "learning_rate": 1.5223880597014923e-08, "loss": 0.0907, "reward": 0.541294664144516, "reward_std": 0.19459424167871475, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4185268133878708, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1641.7232971191406, "epoch": 0.005376745575386453, "grad_norm": 0.10701103508472443, "kl": 3.933906555175781e-05, "learning_rate": 1.6119402985074627e-08, "loss": 0.0827, "reward": 0.5385044887661934, "reward_std": 0.2357155755162239, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3911830559372902, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 1596.5960693359375, "epoch": 0.0056754536629079234, "grad_norm": 0.1234470009803772, "kl": 3.522634506225586e-05, "learning_rate": 1.7014925373134328e-08, "loss": 0.0796, "reward": 0.5518973469734192, "reward_std": 0.2196791172027588, "rewards/accuracy_reward": 0.16964287124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3822544738650322, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1752.3638916015625, "epoch": 0.005974161750429393, "grad_norm": 0.08828148245811462, "kl": 3.904104232788086e-05, "learning_rate": 1.791044776119403e-08, "loss": 0.0626, "reward": 0.5284598469734192, "reward_std": 0.20157122611999512, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3677455559372902, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1700.6139221191406, "epoch": 0.006272869837950863, "grad_norm": 0.12157674133777618, "kl": 3.802776336669922e-05, "learning_rate": 1.880597014925373e-08, "loss": 0.1115, "reward": 0.5039062649011612, "reward_std": 0.1885027512907982, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.381138414144516, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 1702.3661499023438, "epoch": 0.006571577925472332, "grad_norm": 0.09368129819631577, "kl": 3.5762786865234375e-05, "learning_rate": 1.970149253731343e-08, "loss": 0.0741, "reward": 0.4475446566939354, "reward_std": 0.18034331500530243, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3850446566939354, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1792.2567749023438, "epoch": 0.006870286012993802, "grad_norm": 0.10874330997467041, "kl": 3.737211227416992e-05, "learning_rate": 2.0597014925373132e-08, "loss": 0.0937, "reward": 0.459821455180645, "reward_std": 0.21897622756659985, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.357142873108387, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 1681.0201416015625, "epoch": 0.007168994100515271, "grad_norm": 0.11087338626384735, "kl": 3.483891487121582e-05, "learning_rate": 2.1492537313432836e-08, "loss": 0.0742, "reward": 0.6143973544239998, "reward_std": 0.25623535737395287, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3911830559372902, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1731.19873046875, "epoch": 0.007467702188036741, "grad_norm": 0.10914663225412369, "kl": 3.1381845474243164e-05, "learning_rate": 2.2388059701492534e-08, "loss": 0.0882, "reward": 0.5066964477300644, "reward_std": 0.2024353388696909, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794643059372902, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1697.46435546875, "epoch": 0.00776641027555821, "grad_norm": 0.10319439321756363, "kl": 3.68654727935791e-05, "learning_rate": 2.3283582089552235e-08, "loss": 0.0845, "reward": 0.4832589477300644, "reward_std": 0.23020702600479126, "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.385044664144516, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 1671.9822082519531, "epoch": 0.00806511836307968, "grad_norm": 0.10780474543571472, "kl": 4.792213439941406e-05, "learning_rate": 2.417910447761194e-08, "loss": 0.0768, "reward": 0.5926339402794838, "reward_std": 0.18774145282804966, "rewards/accuracy_reward": 0.2075892968568951, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3850446566939354, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1706.9397888183594, "epoch": 0.00836382645060115, "grad_norm": 0.1165231466293335, "kl": 3.311038017272949e-05, "learning_rate": 2.507462686567164e-08, "loss": 0.0765, "reward": 0.5089285969734192, "reward_std": 0.18680529296398163, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.399553582072258, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 1606.0134887695312, "epoch": 0.008662534538122619, "grad_norm": 0.13598506152629852, "kl": 4.00543212890625e-05, "learning_rate": 2.5970149253731345e-08, "loss": 0.1165, "reward": 0.447544664144516, "reward_std": 0.1740584746003151, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3761160895228386, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 1650.3572082519531, "epoch": 0.00896124262564409, "grad_norm": 0.10819530487060547, "kl": 3.585219383239746e-05, "learning_rate": 2.6865671641791042e-08, "loss": 0.0702, "reward": 0.670200914144516, "reward_std": 0.32369451224803925, "rewards/accuracy_reward": 0.2678571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4023437649011612, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1590.3951416015625, "epoch": 0.009259950713165559, "grad_norm": 0.11822541058063507, "kl": 3.826618194580078e-05, "learning_rate": 2.7761194029850743e-08, "loss": 0.0995, "reward": 0.4475446715950966, "reward_std": 0.1643381118774414, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375149011612, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 1675.6272888183594, "epoch": 0.009558658800687028, "grad_norm": 0.10473611205816269, "kl": 4.100799560546875e-05, "learning_rate": 2.8656716417910448e-08, "loss": 0.0779, "reward": 0.4927455633878708, "reward_std": 0.168385099619627, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3699776902794838, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 1752.7656860351562, "epoch": 0.009857366888208497, "grad_norm": 0.09839142858982086, "kl": 4.32133674621582e-05, "learning_rate": 2.955223880597015e-08, "loss": 0.0767, "reward": 0.4447544887661934, "reward_std": 0.16519702784717083, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.368861623108387, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 1693.5090026855469, "epoch": 0.010156074975729968, "grad_norm": 0.09762740880250931, "kl": 3.129243850708008e-05, "learning_rate": 3.0447761194029846e-08, "loss": 0.0745, "reward": 0.5066964477300644, "reward_std": 0.18808690086007118, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372767873108387, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 1674.43310546875, "epoch": 0.010454783063251438, "grad_norm": 0.11876723915338516, "kl": 3.835558891296387e-05, "learning_rate": 3.134328358208955e-08, "loss": 0.0931, "reward": 0.5228794813156128, "reward_std": 0.22836889699101448, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4001116305589676, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1699.9509582519531, "epoch": 0.010753491150772907, "grad_norm": 0.12609705328941345, "kl": 4.4405460357666016e-05, "learning_rate": 3.2238805970149255e-08, "loss": 0.0987, "reward": 0.4436384215950966, "reward_std": 0.19404086098074913, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3833705484867096, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 1681.7032165527344, "epoch": 0.011052199238294378, "grad_norm": 0.11757643520832062, "kl": 3.3795833587646484e-05, "learning_rate": 3.313432835820895e-08, "loss": 0.0997, "reward": 0.5016741305589676, "reward_std": 0.21826454252004623, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3789062649011612, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 1760.0871276855469, "epoch": 0.011350907325815847, "grad_norm": 0.10776340961456299, "kl": 4.45246696472168e-05, "learning_rate": 3.4029850746268657e-08, "loss": 0.0755, "reward": 0.4056919813156128, "reward_std": 0.11691828817129135, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3632812649011612, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 1706.7679443359375, "epoch": 0.011649615413337316, "grad_norm": 0.1445479393005371, "kl": 4.8220157623291016e-05, "learning_rate": 3.4925373134328354e-08, "loss": 0.1105, "reward": 0.4933035969734192, "reward_std": 0.24934273958206177, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3883928805589676, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 1526.2634582519531, "epoch": 0.011948323500858785, "grad_norm": 0.1240275427699089, "kl": 3.832578659057617e-05, "learning_rate": 3.582089552238806e-08, "loss": 0.0874, "reward": 0.6540178954601288, "reward_std": 0.26514703407883644, "rewards/accuracy_reward": 0.22991072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4241071715950966, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 1692.5737609863281, "epoch": 0.012247031588380256, "grad_norm": 0.11402413249015808, "kl": 3.9458274841308594e-05, "learning_rate": 3.671641791044776e-08, "loss": 0.0944, "reward": 0.561383955180645, "reward_std": 0.21303674206137657, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380580373108387, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 1805.6786499023438, "epoch": 0.012545739675901725, "grad_norm": 0.10592972487211227, "kl": 4.51505184173584e-05, "learning_rate": 3.761194029850746e-08, "loss": 0.087, "reward": 0.4224330559372902, "reward_std": 0.14282661490142345, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.348772332072258, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 1762.7456359863281, "epoch": 0.012844447763423195, "grad_norm": 0.11416666954755783, "kl": 3.5822391510009766e-05, "learning_rate": 3.850746268656716e-08, "loss": 0.0916, "reward": 0.4575893133878708, "reward_std": 0.20021788962185383, "rewards/accuracy_reward": 0.08258929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000149011612, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 1630.8706359863281, "epoch": 0.013143155850944664, "grad_norm": 0.11323259025812149, "kl": 4.214048385620117e-05, "learning_rate": 3.940298507462686e-08, "loss": 0.0857, "reward": 0.5212053656578064, "reward_std": 0.26041167601943016, "rewards/accuracy_reward": 0.13392857741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3872768059372902, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 1718.5648193359375, "epoch": 0.013441863938466135, "grad_norm": 0.10110107809305191, "kl": 3.409385681152344e-05, "learning_rate": 4.029850746268657e-08, "loss": 0.0868, "reward": 0.4654018059372902, "reward_std": 0.1983457375317812, "rewards/accuracy_reward": 0.08482143306173384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380580373108387, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1741.0603332519531, "epoch": 0.013740572025987604, "grad_norm": 0.10929618775844574, "kl": 4.011392593383789e-05, "learning_rate": 4.1194029850746264e-08, "loss": 0.0868, "reward": 0.5050223469734192, "reward_std": 0.19772116094827652, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3666294738650322, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 1685.3639221191406, "epoch": 0.014039280113509073, "grad_norm": 0.11352328211069107, "kl": 3.612041473388672e-05, "learning_rate": 4.208955223880597e-08, "loss": 0.0813, "reward": 0.5122768133878708, "reward_std": 0.18525969237089157, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3939732313156128, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1790.8773193359375, "epoch": 0.014337988201030542, "grad_norm": 0.12127692252397537, "kl": 4.762411117553711e-05, "learning_rate": 4.298507462686567e-08, "loss": 0.0982, "reward": 0.3850446566939354, "reward_std": 0.16141337528824806, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.349330373108387, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 1792.80810546875, "epoch": 0.014636696288552013, "grad_norm": 0.09357032179832458, "kl": 3.4481287002563477e-05, "learning_rate": 4.388059701492538e-08, "loss": 0.0627, "reward": 0.4419643133878708, "reward_std": 0.16455934569239616, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.357142873108387, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 1723.2522888183594, "epoch": 0.014935404376073482, "grad_norm": 0.11818049103021622, "kl": 3.170967102050781e-05, "learning_rate": 4.477611940298507e-08, "loss": 0.0743, "reward": 0.4860491305589676, "reward_std": 0.21108873188495636, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3833705484867096, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1664.9308776855469, "epoch": 0.015234112463594952, "grad_norm": 0.11535719782114029, "kl": 3.954768180847168e-05, "learning_rate": 4.567164179104477e-08, "loss": 0.0967, "reward": 0.5686384290456772, "reward_std": 0.1981690414249897, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669813156128, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 1761.6161499023438, "epoch": 0.01553282055111642, "grad_norm": 0.09796777367591858, "kl": 3.299117088317871e-05, "learning_rate": 4.656716417910447e-08, "loss": 0.0777, "reward": 0.4436384066939354, "reward_std": 0.18245521187782288, "rewards/accuracy_reward": 0.07366071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3699776902794838, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 1767.0625915527344, "epoch": 0.01583152863863789, "grad_norm": 0.12637929618358612, "kl": 4.172325134277344e-05, "learning_rate": 4.7462686567164174e-08, "loss": 0.0916, "reward": 0.5284598469734192, "reward_std": 0.216598492115736, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3722098395228386, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 1724.7813415527344, "epoch": 0.01613023672615936, "grad_norm": 0.09934934973716736, "kl": 3.311038017272949e-05, "learning_rate": 4.835820895522388e-08, "loss": 0.066, "reward": 0.525111623108387, "reward_std": 0.17700908705592155, "rewards/accuracy_reward": 0.13616072060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3889509066939354, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1800.274658203125, "epoch": 0.016428944813680832, "grad_norm": 0.11573201417922974, "kl": 3.6597251892089844e-05, "learning_rate": 4.925373134328358e-08, "loss": 0.0841, "reward": 0.412388414144516, "reward_std": 0.1749095730483532, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.365513414144516, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1688.1005249023438, "epoch": 0.0167276529012023, "grad_norm": 0.10135228931903839, "kl": 3.573298454284668e-05, "learning_rate": 5.014925373134328e-08, "loss": 0.0654, "reward": 0.4140625149011612, "reward_std": 0.16656235232949257, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380580373108387, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1638.1139221191406, "epoch": 0.01702636098872377, "grad_norm": 0.1271073967218399, "kl": 3.388524055480957e-05, "learning_rate": 5.1044776119402985e-08, "loss": 0.0925, "reward": 0.4687500149011612, "reward_std": 0.20705218613147736, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3973214402794838, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1708.2076721191406, "epoch": 0.017325069076245238, "grad_norm": 0.11433330178260803, "kl": 3.2901763916015625e-05, "learning_rate": 5.194029850746269e-08, "loss": 0.0844, "reward": 0.5385044813156128, "reward_std": 0.2572397105395794, "rewards/accuracy_reward": 0.14508929522708058, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3934151977300644, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 1645.7433776855469, "epoch": 0.01762377716376671, "grad_norm": 0.15978080034255981, "kl": 3.921985626220703e-05, "learning_rate": 5.283582089552238e-08, "loss": 0.1125, "reward": 0.4994419887661934, "reward_std": 0.1982460767030716, "rewards/accuracy_reward": 0.10714286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392299123108387, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 1766.1875610351562, "epoch": 0.01792248525128818, "grad_norm": 0.09044642746448517, "kl": 3.5136938095092773e-05, "learning_rate": 5.3731343283582085e-08, "loss": 0.0856, "reward": 0.505580373108387, "reward_std": 0.22715426981449127, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3805803805589676, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1674.8750915527344, "epoch": 0.018221193338809647, "grad_norm": 0.11666932702064514, "kl": 4.4405460357666016e-05, "learning_rate": 5.462686567164179e-08, "loss": 0.0897, "reward": 0.491071455180645, "reward_std": 0.19610097631812096, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3839285895228386, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1547.4687805175781, "epoch": 0.018519901426331118, "grad_norm": 0.12756124138832092, "kl": 4.1961669921875e-05, "learning_rate": 5.5522388059701486e-08, "loss": 0.0879, "reward": 0.4748884066939354, "reward_std": 0.18908410146832466, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4168526977300644, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 1640.4264221191406, "epoch": 0.01881860951385259, "grad_norm": 0.120692677795887, "kl": 3.331899642944336e-05, "learning_rate": 5.641791044776119e-08, "loss": 0.0894, "reward": 0.4760044887661934, "reward_std": 0.20166881009936333, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404575914144516, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1555.0156860351562, "epoch": 0.019117317601374056, "grad_norm": 0.12683378159999847, "kl": 4.595518112182617e-05, "learning_rate": 5.7313432835820895e-08, "loss": 0.0939, "reward": 0.6294643133878708, "reward_std": 0.2216467298567295, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4196428805589676, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 1765.3750915527344, "epoch": 0.019416025688895527, "grad_norm": 0.11793732643127441, "kl": 3.883242607116699e-05, "learning_rate": 5.820895522388059e-08, "loss": 0.0916, "reward": 0.4458705484867096, "reward_std": 0.20984939858317375, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3677455484867096, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1823.9732666015625, "epoch": 0.019714733776416995, "grad_norm": 0.1036859080195427, "kl": 3.758072853088379e-05, "learning_rate": 5.91044776119403e-08, "loss": 0.0842, "reward": 0.447544664144516, "reward_std": 0.17949946410953999, "rewards/accuracy_reward": 0.09598214388825, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3515625223517418, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1809.6786499023438, "epoch": 0.020013441863938466, "grad_norm": 0.11656396090984344, "kl": 3.4809112548828125e-05, "learning_rate": 6e-08, "loss": 0.0929, "reward": 0.435825914144516, "reward_std": 0.1914697103202343, "rewards/accuracy_reward": 0.08705357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3487723395228386, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1627.9308776855469, "epoch": 0.020312149951459937, "grad_norm": 0.0887487605214119, "kl": 3.528594970703125e-05, "learning_rate": 6.089552238805969e-08, "loss": 0.0678, "reward": 0.546316996216774, "reward_std": 0.18256977945566177, "rewards/accuracy_reward": 0.14732143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3989955633878708, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1699.6920166015625, "epoch": 0.020610858038981404, "grad_norm": 0.10336502641439438, "kl": 3.415346145629883e-05, "learning_rate": 6.17910447761194e-08, "loss": 0.0691, "reward": 0.4419643133878708, "reward_std": 0.15564358048141003, "rewards/accuracy_reward": 0.06696428940631449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000149011612, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 1676.0536193847656, "epoch": 0.020909566126502875, "grad_norm": 0.1253899782896042, "kl": 3.0547380447387695e-05, "learning_rate": 6.26865671641791e-08, "loss": 0.0981, "reward": 0.4949777126312256, "reward_std": 0.25214240327477455, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669738650322, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1559.4755554199219, "epoch": 0.021208274214024346, "grad_norm": 0.15824468433856964, "kl": 3.230571746826172e-05, "learning_rate": 6.35820895522388e-08, "loss": 0.1118, "reward": 0.623883955180645, "reward_std": 0.25335272774100304, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4341518059372902, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1562.1272888183594, "epoch": 0.021506982301545814, "grad_norm": 0.13197307288646698, "kl": 3.814697265625e-05, "learning_rate": 6.447761194029851e-08, "loss": 0.1122, "reward": 0.5418526902794838, "reward_std": 0.20856044068932533, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4190848395228386, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1546.2590026855469, "epoch": 0.021805690389067284, "grad_norm": 0.12614215910434723, "kl": 3.6776065826416016e-05, "learning_rate": 6.537313432835821e-08, "loss": 0.1026, "reward": 0.6629464477300644, "reward_std": 0.225127462297678, "rewards/accuracy_reward": 0.2366071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4263393133878708, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 1688.2166137695312, "epoch": 0.022104398476588755, "grad_norm": 0.1191791370511055, "kl": 3.3527612686157227e-05, "learning_rate": 6.62686567164179e-08, "loss": 0.0807, "reward": 0.4347098395228386, "reward_std": 0.1589935217052698, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392299123108387, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1659.7857971191406, "epoch": 0.022403106564110223, "grad_norm": 0.13243959844112396, "kl": 2.86102294921875e-05, "learning_rate": 6.716417910447762e-08, "loss": 0.0807, "reward": 0.478794664144516, "reward_std": 0.1915416121482849, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3872768059372902, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 1666.85498046875, "epoch": 0.022701814651631694, "grad_norm": 0.10998034477233887, "kl": 3.460049629211426e-05, "learning_rate": 6.805970149253731e-08, "loss": 0.0774, "reward": 0.4838169887661934, "reward_std": 0.19064557552337646, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3856026977300644, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 1699.0335693359375, "epoch": 0.02300052273915316, "grad_norm": 0.11343551427125931, "kl": 3.415346145629883e-05, "learning_rate": 6.895522388059701e-08, "loss": 0.0881, "reward": 0.4860491156578064, "reward_std": 0.16760939545929432, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3744419813156128, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1837.3326721191406, "epoch": 0.023299230826674632, "grad_norm": 0.08946239948272705, "kl": 3.713369369506836e-05, "learning_rate": 6.985074626865671e-08, "loss": 0.0621, "reward": 0.5033482387661934, "reward_std": 0.18160117883235216, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3493303656578064, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1807.4375610351562, "epoch": 0.023597938914196103, "grad_norm": 0.11405161023139954, "kl": 4.106760025024414e-05, "learning_rate": 7.07462686567164e-08, "loss": 0.0978, "reward": 0.4263392984867096, "reward_std": 0.14606516808271408, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3571428805589676, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 1678.3348693847656, "epoch": 0.02389664700171757, "grad_norm": 0.12855415046215057, "kl": 3.758072853088379e-05, "learning_rate": 7.164179104477612e-08, "loss": 0.1057, "reward": 0.4698660895228386, "reward_std": 0.19230282306671143, "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.385044664144516, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 1741.5916137695312, "epoch": 0.02419535508923904, "grad_norm": 0.10387001186609268, "kl": 3.471970558166504e-05, "learning_rate": 7.253731343283581e-08, "loss": 0.073, "reward": 0.482700914144516, "reward_std": 0.1434523370116949, "rewards/accuracy_reward": 0.11383929592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.368861623108387, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 1703.5603637695312, "epoch": 0.024494063176760512, "grad_norm": 0.14176154136657715, "kl": 4.3451786041259766e-05, "learning_rate": 7.343283582089553e-08, "loss": 0.0892, "reward": 0.4575893133878708, "reward_std": 0.2170492671430111, "rewards/accuracy_reward": 0.06250000139698386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950892984867096, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1627.35498046875, "epoch": 0.02479277126428198, "grad_norm": 0.10045669227838516, "kl": 3.364682197570801e-05, "learning_rate": 7.432835820895522e-08, "loss": 0.06, "reward": 0.4369419813156128, "reward_std": 0.13767889700829983, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3811384066939354, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 1647.1340026855469, "epoch": 0.02509147935180345, "grad_norm": 0.12319032102823257, "kl": 4.00543212890625e-05, "learning_rate": 7.522388059701492e-08, "loss": 0.1022, "reward": 0.5128348395228386, "reward_std": 0.24691282957792282, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4056919813156128, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1686.1786193847656, "epoch": 0.02539018743932492, "grad_norm": 0.1079145297408104, "kl": 3.4689903259277344e-05, "learning_rate": 7.611940298507463e-08, "loss": 0.083, "reward": 0.5664062798023224, "reward_std": 0.1928981924429536, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3744419813156128, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 1736.4643859863281, "epoch": 0.02568889552684639, "grad_norm": 0.10596531629562378, "kl": 3.6090612411499023e-05, "learning_rate": 7.701492537313432e-08, "loss": 0.0737, "reward": 0.4888393059372902, "reward_std": 0.19498352892696857, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372767873108387, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 1642.6808776855469, "epoch": 0.02598760361436786, "grad_norm": 0.1117667555809021, "kl": 3.522634506225586e-05, "learning_rate": 7.791044776119403e-08, "loss": 0.0665, "reward": 0.6021205708384514, "reward_std": 0.24270454049110413, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4146205559372902, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 1767.1942749023438, "epoch": 0.026286311701889328, "grad_norm": 0.1112813949584961, "kl": 3.790855407714844e-05, "learning_rate": 7.880597014925372e-08, "loss": 0.0834, "reward": 0.4090401902794838, "reward_std": 0.18420863151550293, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3621651977300644, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1656.3505249023438, "epoch": 0.0265850197894108, "grad_norm": 0.1196751669049263, "kl": 3.88026237487793e-05, "learning_rate": 7.970149253731344e-08, "loss": 0.0931, "reward": 0.601004496216774, "reward_std": 0.21271182037889957, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3956473395228386, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 1785.5156860351562, "epoch": 0.02688372787693227, "grad_norm": 0.10713288933038712, "kl": 3.635883331298828e-05, "learning_rate": 8.059701492537313e-08, "loss": 0.0934, "reward": 0.4358259066939354, "reward_std": 0.18155107647180557, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3710937574505806, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 1624.872802734375, "epoch": 0.027182435964453737, "grad_norm": 0.10958074778318405, "kl": 3.5762786865234375e-05, "learning_rate": 8.149253731343282e-08, "loss": 0.0773, "reward": 0.4324776977300644, "reward_std": 0.21120088919997215, "rewards/accuracy_reward": 0.040178574388846755, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392299123108387, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 1561.3884582519531, "epoch": 0.027481144051975208, "grad_norm": 0.11443352699279785, "kl": 3.039836883544922e-05, "learning_rate": 8.238805970149253e-08, "loss": 0.087, "reward": 0.5267857313156128, "reward_std": 0.2160644195973873, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.419642873108387, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1607.9822387695312, "epoch": 0.027779852139496675, "grad_norm": 0.11141274869441986, "kl": 2.5093555450439453e-05, "learning_rate": 8.328358208955223e-08, "loss": 0.1119, "reward": 0.615513414144516, "reward_std": 0.26796576753258705, "rewards/accuracy_reward": 0.2008928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4146205559372902, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 1776.0781860351562, "epoch": 0.028078560227018146, "grad_norm": 0.11773400753736496, "kl": 3.376603126525879e-05, "learning_rate": 8.417910447761194e-08, "loss": 0.1035, "reward": 0.4296875149011612, "reward_std": 0.2233152762055397, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3627232238650322, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1779.7322387695312, "epoch": 0.028377268314539617, "grad_norm": 0.11511088907718658, "kl": 3.612041473388672e-05, "learning_rate": 8.507462686567163e-08, "loss": 0.0943, "reward": 0.4492187798023224, "reward_std": 0.16908852756023407, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3554687574505806, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1721.118408203125, "epoch": 0.028675976402061085, "grad_norm": 0.10883837938308716, "kl": 3.266334533691406e-05, "learning_rate": 8.597014925373135e-08, "loss": 0.0828, "reward": 0.478236623108387, "reward_std": 0.16206570714712143, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3800223395228386, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1756.4554138183594, "epoch": 0.028974684489582556, "grad_norm": 0.12967585027217865, "kl": 2.9474496841430664e-05, "learning_rate": 8.686567164179104e-08, "loss": 0.0953, "reward": 0.3978794887661934, "reward_std": 0.17098122648894787, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3666294738650322, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1746.8884887695312, "epoch": 0.029273392577104027, "grad_norm": 0.11532849073410034, "kl": 3.2901763916015625e-05, "learning_rate": 8.776119402985075e-08, "loss": 0.0875, "reward": 0.572544664144516, "reward_std": 0.2047073133289814, "rewards/accuracy_reward": 0.21205358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3604910895228386, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 1547.5804443359375, "epoch": 0.029572100664625494, "grad_norm": 0.11721282452344894, "kl": 2.8133392333984375e-05, "learning_rate": 8.865671641791044e-08, "loss": 0.0816, "reward": 0.444196455180645, "reward_std": 0.1796233020722866, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607387661934, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1697.2277221679688, "epoch": 0.029870808752146965, "grad_norm": 0.1228611022233963, "kl": 2.9027462005615234e-05, "learning_rate": 8.955223880597014e-08, "loss": 0.1036, "reward": 0.5195312798023224, "reward_std": 0.2567187771201134, "rewards/accuracy_reward": 0.12276786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3967634066939354, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 1669.2054443359375, "epoch": 0.030169516839668432, "grad_norm": 0.10781023651361465, "kl": 2.682209014892578e-05, "learning_rate": 9.044776119402985e-08, "loss": 0.0896, "reward": 0.5591518208384514, "reward_std": 0.25458318181335926, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3939732313156128, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1763.7366943359375, "epoch": 0.030468224927189903, "grad_norm": 0.1091737374663353, "kl": 1.9058585166931152e-05, "learning_rate": 9.134328358208955e-08, "loss": 0.1006, "reward": 0.482142873108387, "reward_std": 0.16501028090715408, "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3683035969734192, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1611.8416137695312, "epoch": 0.030766933014711374, "grad_norm": 0.11308807134628296, "kl": 2.2456049919128418e-05, "learning_rate": 9.223880597014926e-08, "loss": 0.0775, "reward": 0.5535714477300644, "reward_std": 0.23757027462124825, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392857164144516, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1621.6138916015625, "epoch": 0.03106564110223284, "grad_norm": 0.1560889333486557, "kl": 1.8693506717681885e-05, "learning_rate": 9.313432835820894e-08, "loss": 0.1011, "reward": 0.5820312798023224, "reward_std": 0.228149626404047, "rewards/accuracy_reward": 0.1808035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4012276902794838, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1857.8125915527344, "epoch": 0.03136434918975431, "grad_norm": 0.11301186680793762, "kl": 1.9043684005737305e-05, "learning_rate": 9.402985074626865e-08, "loss": 0.1001, "reward": 0.4547991305589676, "reward_std": 0.16691002063453197, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3476562649011612, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 1583.1697387695312, "epoch": 0.03166305727727578, "grad_norm": 0.13703392446041107, "kl": 1.712888479232788e-05, "learning_rate": 9.492537313432835e-08, "loss": 0.1224, "reward": 0.502790205180645, "reward_std": 0.22746310010552406, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4023437649011612, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 1698.83935546875, "epoch": 0.03196176536479725, "grad_norm": 0.1084250956773758, "kl": 2.142786979675293e-05, "learning_rate": 9.582089552238806e-08, "loss": 0.0886, "reward": 0.4441964477300644, "reward_std": 0.15286186896264553, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000149011612, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 1783.4666137695312, "epoch": 0.03226047345231872, "grad_norm": 0.11556082218885422, "kl": 5.620718002319336e-05, "learning_rate": 9.671641791044776e-08, "loss": 0.0883, "reward": 0.4151785895228386, "reward_std": 0.19462410919368267, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3660714477300644, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 1681.0759582519531, "epoch": 0.03255918153984019, "grad_norm": 0.13142815232276917, "kl": 2.3543834686279297e-05, "learning_rate": 9.761194029850746e-08, "loss": 0.0922, "reward": 0.5563616380095482, "reward_std": 0.22507062554359436, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3867187649011612, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 1693.6831359863281, "epoch": 0.032857889627361664, "grad_norm": 0.14225296676158905, "kl": 2.086162567138672e-05, "learning_rate": 9.850746268656717e-08, "loss": 0.1088, "reward": 0.431361623108387, "reward_std": 0.16588864661753178, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3889509066939354, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 1780.6161499023438, "epoch": 0.03315659771488313, "grad_norm": 0.11606969684362411, "kl": 2.0325183868408203e-05, "learning_rate": 9.940298507462685e-08, "loss": 0.0878, "reward": 0.4419643059372902, "reward_std": 0.18684570491313934, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3727678656578064, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 1762.180908203125, "epoch": 0.0334553058024046, "grad_norm": 0.09594735503196716, "kl": 1.800432801246643e-05, "learning_rate": 1.0029850746268656e-07, "loss": 0.0688, "reward": 0.4140625149011612, "reward_std": 0.132491834461689, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3493303656578064, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 1712.7098999023438, "epoch": 0.03375401388992607, "grad_norm": 0.13458725810050964, "kl": 1.913309097290039e-05, "learning_rate": 1.0119402985074626e-07, "loss": 0.1018, "reward": 0.5022321566939354, "reward_std": 0.14312662556767464, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.377232164144516, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 1634.3683776855469, "epoch": 0.03405272197744754, "grad_norm": 0.09432275593280792, "kl": 1.741945743560791e-05, "learning_rate": 1.0208955223880597e-07, "loss": 0.0782, "reward": 0.530133955180645, "reward_std": 0.1790382992476225, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4006696566939354, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 1847.5469665527344, "epoch": 0.03435143006496901, "grad_norm": 0.1180725172162056, "kl": 1.1175870895385742e-05, "learning_rate": 1.0298507462686567e-07, "loss": 0.0901, "reward": 0.4029017984867096, "reward_std": 0.1542592030018568, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.360491082072258, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 1713.74560546875, "epoch": 0.034650138152490476, "grad_norm": 0.1284659206867218, "kl": 1.3888929970562458e-05, "learning_rate": 1.0388059701492538e-07, "loss": 0.0739, "reward": 0.4090401977300644, "reward_std": 0.16860003024339676, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3777901977300644, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1781.9107971191406, "epoch": 0.034948846240011947, "grad_norm": 0.12605510652065277, "kl": 1.7158687114715576e-05, "learning_rate": 1.0477611940298506e-07, "loss": 0.0826, "reward": 0.4547991305589676, "reward_std": 0.13513225689530373, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.365513414144516, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1659.0491943359375, "epoch": 0.03524755432753342, "grad_norm": 0.12102244794368744, "kl": 9.28342342376709e-06, "learning_rate": 1.0567164179104476e-07, "loss": 0.1212, "reward": 0.5887277126312256, "reward_std": 0.22162682935595512, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.403459832072258, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 1585.1138916015625, "epoch": 0.03554626241505489, "grad_norm": 0.10610140860080719, "kl": 1.874566078186035e-05, "learning_rate": 1.0656716417910447e-07, "loss": 0.0786, "reward": 0.5440848469734192, "reward_std": 0.24336539581418037, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4101562649011612, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 1782.0603637695312, "epoch": 0.03584497050257636, "grad_norm": 0.11207772046327591, "kl": 1.280754804611206e-05, "learning_rate": 1.0746268656716417e-07, "loss": 0.0764, "reward": 0.4029017984867096, "reward_std": 0.11240966338664293, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3448660969734192, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 1606.3996276855469, "epoch": 0.03614367859009783, "grad_norm": 0.14121295511722565, "kl": 2.22623348236084e-05, "learning_rate": 1.0835820895522388e-07, "loss": 0.1147, "reward": 0.502232164144516, "reward_std": 0.2250801958143711, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4040178805589676, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 1635.2656860351562, "epoch": 0.036442386677619294, "grad_norm": 0.13871541619300842, "kl": 3.593042492866516e-06, "learning_rate": 1.0925373134328358e-07, "loss": 0.1132, "reward": 0.4966518059372902, "reward_std": 0.21080688759684563, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.411830373108387, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 1675.3996276855469, "epoch": 0.036741094765140765, "grad_norm": 0.10419156402349472, "kl": 6.241723895072937e-06, "learning_rate": 1.1014925373134329e-07, "loss": 0.0651, "reward": 0.486607164144516, "reward_std": 0.16978228837251663, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950893059372902, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 1632.5335388183594, "epoch": 0.037039802852662236, "grad_norm": 0.13395677506923676, "kl": 1.4135031960904598e-05, "learning_rate": 1.1104477611940297e-07, "loss": 0.0893, "reward": 0.5329241380095482, "reward_std": 0.22636568918824196, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3989955559372902, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 1594.9464721679688, "epoch": 0.03733851094018371, "grad_norm": 0.09640159457921982, "kl": 7.88271427154541e-06, "learning_rate": 1.1194029850746268e-07, "loss": 0.0678, "reward": 0.5524553805589676, "reward_std": 0.17246666364371777, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.416294664144516, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1723.2076416015625, "epoch": 0.03763721902770518, "grad_norm": 0.12957879900932312, "kl": 8.415430784225464e-06, "learning_rate": 1.1283582089552238e-07, "loss": 0.0951, "reward": 0.5753348544239998, "reward_std": 0.22099421173334122, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.396763414144516, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 1732.1161499023438, "epoch": 0.03793592711522664, "grad_norm": 0.09386232495307922, "kl": 1.68532133102417e-05, "learning_rate": 1.1373134328358208e-07, "loss": 0.0598, "reward": 0.5669643059372902, "reward_std": 0.21041278168559074, "rewards/accuracy_reward": 0.17633929220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250149011612, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 1636.3326721191406, "epoch": 0.03823463520274811, "grad_norm": 0.11837897449731827, "kl": 2.707540988922119e-05, "learning_rate": 1.1462686567164179e-07, "loss": 0.0962, "reward": 0.4832589402794838, "reward_std": 0.1810510978102684, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4029017984867096, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 1590.9375305175781, "epoch": 0.038533343290269584, "grad_norm": 0.10348327457904816, "kl": 5.804002285003662e-06, "learning_rate": 1.1552238805970147e-07, "loss": 0.0817, "reward": 0.612723246216774, "reward_std": 0.22352634742856026, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4185268059372902, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 1435.8527221679688, "epoch": 0.038832051377791055, "grad_norm": 0.14126534759998322, "kl": 1.7508864402770996e-05, "learning_rate": 1.1641791044776119e-07, "loss": 0.1018, "reward": 0.6188616454601288, "reward_std": 0.2470673769712448, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4559151902794838, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 1785.6406860351562, "epoch": 0.039130759465312526, "grad_norm": 0.10213267058134079, "kl": 9.46037471294403e-06, "learning_rate": 1.1731343283582088e-07, "loss": 0.0963, "reward": 0.4843750223517418, "reward_std": 0.24871062487363815, "rewards/accuracy_reward": 0.12053572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3638392984867096, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1706.2902526855469, "epoch": 0.03942946755283399, "grad_norm": 0.09416219592094421, "kl": 1.1309981346130371e-05, "learning_rate": 1.182089552238806e-07, "loss": 0.0536, "reward": 0.4218750149011612, "reward_std": 0.16857190616428852, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.383928582072258, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 1569.2857666015625, "epoch": 0.03972817564035546, "grad_norm": 0.13773444294929504, "kl": 1.837313175201416e-05, "learning_rate": 1.1910447761194029e-07, "loss": 0.1251, "reward": 0.5742187798023224, "reward_std": 0.20168447494506836, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4045759066939354, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 1570.2500610351562, "epoch": 0.04002688372787693, "grad_norm": 0.13440696895122528, "kl": 1.9595026969909668e-05, "learning_rate": 1.2e-07, "loss": 0.0869, "reward": 0.5111607387661934, "reward_std": 0.17558224871754646, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4151785969734192, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 1649.6540832519531, "epoch": 0.0403255918153984, "grad_norm": 0.11159578710794449, "kl": 2.1811574697494507e-05, "learning_rate": 1.208955223880597e-07, "loss": 0.0727, "reward": 0.5128348469734192, "reward_std": 0.18119722418487072, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3989955559372902, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1839.7723999023438, "epoch": 0.04062429990291987, "grad_norm": 0.11850500106811523, "kl": 1.599639654159546e-05, "learning_rate": 1.2179104477611938e-07, "loss": 0.0896, "reward": 0.4107143059372902, "reward_std": 0.16554657369852066, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.357142873108387, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 1728.3996276855469, "epoch": 0.040923007990441344, "grad_norm": 0.12086506932973862, "kl": 1.617521047592163e-05, "learning_rate": 1.226865671641791e-07, "loss": 0.0889, "reward": 0.4252232313156128, "reward_std": 0.1739024817943573, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3694196566939354, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 1713.0603332519531, "epoch": 0.04122171607796281, "grad_norm": 0.0992528647184372, "kl": 2.9981136322021484e-05, "learning_rate": 1.235820895522388e-07, "loss": 0.0988, "reward": 0.4810267984867096, "reward_std": 0.17086872085928917, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380580373108387, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 1641.3103332519531, "epoch": 0.04152042416548428, "grad_norm": 0.16813141107559204, "kl": 3.769993782043457e-05, "learning_rate": 1.244776119402985e-07, "loss": 0.1278, "reward": 0.6266741454601288, "reward_std": 0.24073298647999763, "rewards/accuracy_reward": 0.2343750186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392299123108387, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 1616.4241943359375, "epoch": 0.04181913225300575, "grad_norm": 0.11267310380935669, "kl": 2.8371810913085938e-05, "learning_rate": 1.253731343283582e-07, "loss": 0.0894, "reward": 0.5206473469734192, "reward_std": 0.22803904488682747, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4090401902794838, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 1691.7947082519531, "epoch": 0.04211784034052722, "grad_norm": 0.10981886088848114, "kl": 4.252791404724121e-05, "learning_rate": 1.262686567164179e-07, "loss": 0.0698, "reward": 0.4977678805589676, "reward_std": 0.23910186812281609, "rewards/accuracy_reward": 0.12053571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.377232164144516, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1520.7076721191406, "epoch": 0.04241654842804869, "grad_norm": 0.11787110567092896, "kl": 5.364418029785156e-05, "learning_rate": 1.271641791044776e-07, "loss": 0.0925, "reward": 0.5150669738650322, "reward_std": 0.2168693859130144, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4414062574505806, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 1659.7165832519531, "epoch": 0.042715256515570156, "grad_norm": 0.14157161116600037, "kl": 6.74128532409668e-05, "learning_rate": 1.2805970149253732e-07, "loss": 0.1137, "reward": 0.4760044887661934, "reward_std": 0.1525522843003273, "rewards/accuracy_reward": 0.09375000675208867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3822544813156128, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1710.2433776855469, "epoch": 0.04301396460309163, "grad_norm": 0.09578654170036316, "kl": 6.985664367675781e-05, "learning_rate": 1.2895522388059702e-07, "loss": 0.0738, "reward": 0.4123884066939354, "reward_std": 0.1878648940473795, "rewards/accuracy_reward": 0.040178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372209832072258, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1473.0782165527344, "epoch": 0.0433126726906131, "grad_norm": 0.10296614468097687, "kl": 5.14984130859375e-05, "learning_rate": 1.298507462686567e-07, "loss": 0.0909, "reward": 0.5491071715950966, "reward_std": 0.21449309214949608, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464286044239998, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 1697.8148498535156, "epoch": 0.04361138077813457, "grad_norm": 0.13878241181373596, "kl": 8.547306060791016e-05, "learning_rate": 1.3074626865671641e-07, "loss": 0.1006, "reward": 0.4419643059372902, "reward_std": 0.16722244210541248, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607313156128, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 1637.6920166015625, "epoch": 0.04391008886565604, "grad_norm": 0.16199275851249695, "kl": 7.420778274536133e-05, "learning_rate": 1.316417910447761e-07, "loss": 0.1289, "reward": 0.5686384290456772, "reward_std": 0.247323889285326, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669813156128, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 1652.80810546875, "epoch": 0.04420879695317751, "grad_norm": 0.1081409603357315, "kl": 9.322166442871094e-05, "learning_rate": 1.325373134328358e-07, "loss": 0.0724, "reward": 0.5078125074505806, "reward_std": 0.18800101801753044, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4073660895228386, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 1608.4844665527344, "epoch": 0.044507505040698975, "grad_norm": 0.11269976198673248, "kl": 9.965896606445312e-05, "learning_rate": 1.334328358208955e-07, "loss": 0.0946, "reward": 0.5351562798023224, "reward_std": 0.24488555639982224, "rewards/accuracy_reward": 0.11607143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4190848395228386, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 1822.1050109863281, "epoch": 0.044806213128220446, "grad_norm": 0.08403084427118301, "kl": 7.194280624389648e-05, "learning_rate": 1.3432835820895523e-07, "loss": 0.063, "reward": 0.4402901977300644, "reward_std": 0.221501886844635, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3643973395228386, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1716.7500610351562, "epoch": 0.04510492121574192, "grad_norm": 0.15295258164405823, "kl": 9.566545486450195e-05, "learning_rate": 1.3522388059701493e-07, "loss": 0.1111, "reward": 0.4135044813156128, "reward_std": 0.18383593298494816, "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.388950914144516, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 1763.7545471191406, "epoch": 0.04540362930326339, "grad_norm": 0.11432671546936035, "kl": 9.512901306152344e-05, "learning_rate": 1.3611940298507463e-07, "loss": 0.0841, "reward": 0.4810268133878708, "reward_std": 0.1877319999039173, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3738839402794838, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 1659.0491638183594, "epoch": 0.04570233739078486, "grad_norm": 0.13194246590137482, "kl": 9.47117805480957e-05, "learning_rate": 1.3701492537313432e-07, "loss": 0.1059, "reward": 0.592633955180645, "reward_std": 0.22044545784592628, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4095982313156128, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 1590.5223999023438, "epoch": 0.04600104547830632, "grad_norm": 0.13896574079990387, "kl": 0.0001392364501953125, "learning_rate": 1.3791044776119402e-07, "loss": 0.0983, "reward": 0.4827009215950966, "reward_std": 0.2207488715648651, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4135044738650322, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 1596.7657165527344, "epoch": 0.04629975356582779, "grad_norm": 0.13482922315597534, "kl": 0.00013327598571777344, "learning_rate": 1.3880597014925372e-07, "loss": 0.0855, "reward": 0.6367187798023224, "reward_std": 0.22360074147582054, "rewards/accuracy_reward": 0.2522321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3844866305589676, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1900.1094360351562, "epoch": 0.046598461653349264, "grad_norm": 0.12370114028453827, "kl": 0.00013577938079833984, "learning_rate": 1.3970149253731342e-07, "loss": 0.0957, "reward": 0.4363839477300644, "reward_std": 0.12708647269755602, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3225446566939354, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 1623.3125915527344, "epoch": 0.046897169740870735, "grad_norm": 0.10108797252178192, "kl": 0.0001418590545654297, "learning_rate": 1.4059701492537314e-07, "loss": 0.0863, "reward": 0.4765625223517418, "reward_std": 0.16676772572100163, "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3783482313156128, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 1643.6630249023438, "epoch": 0.047195877828392206, "grad_norm": 0.16066448390483856, "kl": 0.00013363361358642578, "learning_rate": 1.414925373134328e-07, "loss": 0.129, "reward": 0.4341518133878708, "reward_std": 0.19983043149113655, "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3872767984867096, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 1751.6518859863281, "epoch": 0.04749458591591367, "grad_norm": 0.0947645753622055, "kl": 0.00015664100646972656, "learning_rate": 1.4238805970149254e-07, "loss": 0.0547, "reward": 0.4776785895228386, "reward_std": 0.15380142629146576, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3660714402794838, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 1662.4375610351562, "epoch": 0.04779329400343514, "grad_norm": 0.12727534770965576, "kl": 0.00011730194091796875, "learning_rate": 1.4328358208955223e-07, "loss": 0.1019, "reward": 0.6171875298023224, "reward_std": 0.1878465749323368, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4029017984867096, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 1667.8371276855469, "epoch": 0.04809200209095661, "grad_norm": 0.10900050401687622, "kl": 0.00013267993927001953, "learning_rate": 1.4417910447761193e-07, "loss": 0.097, "reward": 0.4268973395228386, "reward_std": 0.17749951034784317, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3710937723517418, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 1655.1027526855469, "epoch": 0.04839071017847808, "grad_norm": 0.12825830280780792, "kl": 0.00019788742065429688, "learning_rate": 1.4507462686567163e-07, "loss": 0.084, "reward": 0.5094866454601288, "reward_std": 0.19899317249655724, "rewards/accuracy_reward": 0.12946428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380022332072258, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 1713.3884582519531, "epoch": 0.048689418265999554, "grad_norm": 0.10287753492593765, "kl": 0.00013327598571777344, "learning_rate": 1.4597014925373133e-07, "loss": 0.0843, "reward": 0.482142873108387, "reward_std": 0.2155321016907692, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500223517418, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 1691.0737609863281, "epoch": 0.048988126353521025, "grad_norm": 0.11911661177873611, "kl": 0.0001499652862548828, "learning_rate": 1.4686567164179105e-07, "loss": 0.0833, "reward": 0.5418527126312256, "reward_std": 0.20044337958097458, "rewards/accuracy_reward": 0.15401786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3878348395228386, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 1694.3572082519531, "epoch": 0.04928683444104249, "grad_norm": 0.16631388664245605, "kl": 0.000217437744140625, "learning_rate": 1.4776119402985072e-07, "loss": 0.1078, "reward": 0.5273437723517418, "reward_std": 0.28108350560069084, "rewards/accuracy_reward": 0.1294642877765, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3978794813156128, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 1432.7076416015625, "epoch": 0.04958554252856396, "grad_norm": 0.14321938157081604, "kl": 0.0002231597900390625, "learning_rate": 1.4865671641791045e-07, "loss": 0.1078, "reward": 0.6037946790456772, "reward_std": 0.2301327995955944, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4386160895228386, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 1588.47998046875, "epoch": 0.04988425061608543, "grad_norm": 0.14361348748207092, "kl": 0.0002484321594238281, "learning_rate": 1.4955223880597014e-07, "loss": 0.0834, "reward": 0.5770089626312256, "reward_std": 0.19838440604507923, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4140625149011612, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 1672.2188415527344, "epoch": 0.0501829587036069, "grad_norm": 0.11078359186649323, "kl": 0.0001914501190185547, "learning_rate": 1.5044776119402984e-07, "loss": 0.0902, "reward": 0.4458705633878708, "reward_std": 0.15723512135446072, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3856026977300644, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1616.993408203125, "epoch": 0.05048166679112837, "grad_norm": 0.14197629690170288, "kl": 0.00021719932556152344, "learning_rate": 1.5134328358208954e-07, "loss": 0.0863, "reward": 0.5161830708384514, "reward_std": 0.23513133078813553, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4157366305589676, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 1784.0692749023438, "epoch": 0.05078037487864984, "grad_norm": 0.12388847023248672, "kl": 0.00017940998077392578, "learning_rate": 1.5223880597014926e-07, "loss": 0.0897, "reward": 0.514508955180645, "reward_std": 0.2019130140542984, "rewards/accuracy_reward": 0.14955357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.364955373108387, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 1671.1451416015625, "epoch": 0.05107908296617131, "grad_norm": 0.10475897043943405, "kl": 0.00022530555725097656, "learning_rate": 1.5313432835820896e-07, "loss": 0.0812, "reward": 0.4492187649011612, "reward_std": 0.15383503772318363, "rewards/accuracy_reward": 0.07812500093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3710937649011612, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 1707.4978332519531, "epoch": 0.05137779105369278, "grad_norm": 0.15374568104743958, "kl": 0.0002942085266113281, "learning_rate": 1.5402985074626863e-07, "loss": 0.1037, "reward": 0.4313616305589676, "reward_std": 0.16682772897183895, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3733259066939354, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1720.3572387695312, "epoch": 0.05167649914121425, "grad_norm": 0.10549383610486984, "kl": 0.00021076202392578125, "learning_rate": 1.5492537313432836e-07, "loss": 0.0697, "reward": 0.501116082072258, "reward_std": 0.19339246675372124, "rewards/accuracy_reward": 0.11830358067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 1788.6741943359375, "epoch": 0.05197520722873572, "grad_norm": 0.1107393130660057, "kl": 0.00024962425231933594, "learning_rate": 1.5582089552238805e-07, "loss": 0.0821, "reward": 0.4179687798023224, "reward_std": 0.20687466114759445, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3621651977300644, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 1778.1384582519531, "epoch": 0.052273915316257184, "grad_norm": 0.10706379264593124, "kl": 0.00020647048950195312, "learning_rate": 1.5671641791044775e-07, "loss": 0.0765, "reward": 0.4369419813156128, "reward_std": 0.1674421764910221, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.361049123108387, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 1664.8996276855469, "epoch": 0.052572623403778655, "grad_norm": 0.1326288878917694, "kl": 0.0002849102020263672, "learning_rate": 1.5761194029850745e-07, "loss": 0.1052, "reward": 0.5708705484867096, "reward_std": 0.2381768636405468, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4123884215950966, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 1696.3281860351562, "epoch": 0.052871331491300126, "grad_norm": 0.12604905664920807, "kl": 0.00029087066650390625, "learning_rate": 1.5850746268656717e-07, "loss": 0.0965, "reward": 0.572544664144516, "reward_std": 0.22342480346560478, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375149011612, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 1601.0223693847656, "epoch": 0.0531700395788216, "grad_norm": 0.13352543115615845, "kl": 0.00025534629821777344, "learning_rate": 1.5940298507462687e-07, "loss": 0.1039, "reward": 0.5591518133878708, "reward_std": 0.23859144747257233, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3939732313156128, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 1661.4866943359375, "epoch": 0.05346874766634307, "grad_norm": 0.12767590582370758, "kl": 0.0002925395965576172, "learning_rate": 1.6029850746268654e-07, "loss": 0.0814, "reward": 0.5318080484867096, "reward_std": 0.20642055198550224, "rewards/accuracy_reward": 0.14062500977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3911830559372902, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1725.3125915527344, "epoch": 0.05376745575386454, "grad_norm": 0.10698803514242172, "kl": 0.0002601146697998047, "learning_rate": 1.6119402985074627e-07, "loss": 0.0826, "reward": 0.5055803954601288, "reward_std": 0.20414995774626732, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 1684.2590026855469, "epoch": 0.054066163841386, "grad_norm": 0.10296889394521713, "kl": 0.0003018379211425781, "learning_rate": 1.6208955223880596e-07, "loss": 0.0776, "reward": 0.4564732313156128, "reward_std": 0.18584585562348366, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3783482313156128, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 1675.5223999023438, "epoch": 0.054364871928907474, "grad_norm": 0.10351081937551498, "kl": 0.00028824806213378906, "learning_rate": 1.6298507462686564e-07, "loss": 0.0746, "reward": 0.5892857387661934, "reward_std": 0.1971493996679783, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3973214477300644, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 1659.57373046875, "epoch": 0.054663580016428945, "grad_norm": 0.15872013568878174, "kl": 0.00029778480529785156, "learning_rate": 1.6388059701492536e-07, "loss": 0.1273, "reward": 0.5083705559372902, "reward_std": 0.23131391778588295, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3967634066939354, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 1621.5157165527344, "epoch": 0.054962288103950416, "grad_norm": 0.13607467710971832, "kl": 0.00037217140197753906, "learning_rate": 1.6477611940298506e-07, "loss": 0.099, "reward": 0.5301339477300644, "reward_std": 0.16642693430185318, "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375149011612, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1671.0402526855469, "epoch": 0.05526099619147189, "grad_norm": 0.14255857467651367, "kl": 0.0004019737243652344, "learning_rate": 1.6567164179104478e-07, "loss": 0.0968, "reward": 0.4960937723517418, "reward_std": 0.19889061897993088, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.384486623108387, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 1629.32373046875, "epoch": 0.05555970427899335, "grad_norm": 0.13438384234905243, "kl": 0.00033855438232421875, "learning_rate": 1.6656716417910445e-07, "loss": 0.0952, "reward": 0.547991082072258, "reward_std": 0.2060382254421711, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4051339477300644, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 1737.7634582519531, "epoch": 0.05585841236651482, "grad_norm": 0.1286824494600296, "kl": 0.00038552284240722656, "learning_rate": 1.6746268656716418e-07, "loss": 0.0949, "reward": 0.5429687947034836, "reward_std": 0.16317202895879745, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3777901977300644, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 1681.4598999023438, "epoch": 0.05615712045403629, "grad_norm": 0.13256406784057617, "kl": 0.0004925727844238281, "learning_rate": 1.6835820895522387e-07, "loss": 0.094, "reward": 0.5379464477300644, "reward_std": 0.20884386263787746, "rewards/accuracy_reward": 0.14732143585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250223517418, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 1792.0223999023438, "epoch": 0.056455828541557763, "grad_norm": 0.10631029307842255, "kl": 0.00048351287841796875, "learning_rate": 1.692537313432836e-07, "loss": 0.081, "reward": 0.4642857313156128, "reward_std": 0.16078880801796913, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3750000223517418, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 1632.2344665527344, "epoch": 0.056754536629079234, "grad_norm": 0.10183247923851013, "kl": 0.000576019287109375, "learning_rate": 1.7014925373134327e-07, "loss": 0.0794, "reward": 0.5496651977300644, "reward_std": 0.20809292793273926, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4112723469734192, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 1707.6786193847656, "epoch": 0.057053244716600705, "grad_norm": 0.1497078388929367, "kl": 0.000797271728515625, "learning_rate": 1.7104477611940297e-07, "loss": 0.1038, "reward": 0.5111607387661934, "reward_std": 0.22226527333259583, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3816964402794838, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 1754.3616943359375, "epoch": 0.05735195280412217, "grad_norm": 0.13438710570335388, "kl": 0.0007734298706054688, "learning_rate": 1.719402985074627e-07, "loss": 0.0928, "reward": 0.5016741305589676, "reward_std": 0.16846062615513802, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3677455484867096, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 1607.7880249023438, "epoch": 0.05765066089164364, "grad_norm": 0.14446121454238892, "kl": 0.00090789794921875, "learning_rate": 1.7283582089552236e-07, "loss": 0.1262, "reward": 0.5747768133878708, "reward_std": 0.21229331567883492, "rewards/accuracy_reward": 0.17187500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4029018059372902, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1750.9107971191406, "epoch": 0.05794936897916511, "grad_norm": 0.11568669974803925, "kl": 0.0009832382202148438, "learning_rate": 1.737313432835821e-07, "loss": 0.076, "reward": 0.4302455633878708, "reward_std": 0.17481928505003452, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372209832072258, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 1728.0759582519531, "epoch": 0.05824807706668658, "grad_norm": 0.1065167561173439, "kl": 0.0010995864868164062, "learning_rate": 1.7462686567164178e-07, "loss": 0.0792, "reward": 0.4581473469734192, "reward_std": 0.21770770847797394, "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3755580484867096, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 1634.8929138183594, "epoch": 0.05854678515420805, "grad_norm": 0.15402430295944214, "kl": 0.001422882080078125, "learning_rate": 1.755223880597015e-07, "loss": 0.1027, "reward": 0.5234375149011612, "reward_std": 0.20134853944182396, "rewards/accuracy_reward": 0.12276786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4006696566939354, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 1698.3080749511719, "epoch": 0.05884549324172952, "grad_norm": 0.12812164425849915, "kl": 0.0012569427490234375, "learning_rate": 1.7641791044776118e-07, "loss": 0.0688, "reward": 0.4464285895228386, "reward_std": 0.21193958073854446, "rewards/accuracy_reward": 0.05580357485450804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250223517418, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 1728.5134582519531, "epoch": 0.05914420132925099, "grad_norm": 0.11334919184446335, "kl": 0.0011587142944335938, "learning_rate": 1.7731343283582088e-07, "loss": 0.0811, "reward": 0.5039062798023224, "reward_std": 0.2106359638273716, "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.387834832072258, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 1650.8884582519531, "epoch": 0.05944290941677246, "grad_norm": 0.12955929338932037, "kl": 0.0013427734375, "learning_rate": 1.782089552238806e-07, "loss": 0.109, "reward": 0.5055803805589676, "reward_std": 0.22031951323151588, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4162946566939354, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 1672.8795166015625, "epoch": 0.05974161750429393, "grad_norm": 0.1597614288330078, "kl": 0.00148773193359375, "learning_rate": 1.7910447761194027e-07, "loss": 0.1116, "reward": 0.4732143133878708, "reward_std": 0.20719240233302116, "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4107143133878708, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1643.6027221679688, "epoch": 0.0600403255918154, "grad_norm": 0.14362257719039917, "kl": 0.001926422119140625, "learning_rate": 1.8e-07, "loss": 0.1085, "reward": 0.4966518133878708, "reward_std": 0.2112794928252697, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4051339477300644, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1737.0715026855469, "epoch": 0.060339033679336865, "grad_norm": 0.1493038535118103, "kl": 0.0017795562744140625, "learning_rate": 1.808955223880597e-07, "loss": 0.1088, "reward": 0.5100446715950966, "reward_std": 0.1706965770572424, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3694196566939354, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 1764.3639221191406, "epoch": 0.060637741766858336, "grad_norm": 0.12096064537763596, "kl": 0.0017223358154296875, "learning_rate": 1.8179104477611942e-07, "loss": 0.0833, "reward": 0.4408482387661934, "reward_std": 0.20917117223143578, "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3895089477300644, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 1689.5357971191406, "epoch": 0.06093644985437981, "grad_norm": 0.13287681341171265, "kl": 0.00182342529296875, "learning_rate": 1.826865671641791e-07, "loss": 0.0731, "reward": 0.525111623108387, "reward_std": 0.18190494179725647, "rewards/accuracy_reward": 0.14732143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3777901977300644, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 1502.6630249023438, "epoch": 0.06123515794190128, "grad_norm": 0.13710848987102509, "kl": 0.00179290771484375, "learning_rate": 1.835820895522388e-07, "loss": 0.125, "reward": 0.5625000223517418, "reward_std": 0.24395519495010376, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4330357313156128, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 1679.8058776855469, "epoch": 0.06153386602942275, "grad_norm": 0.10821450501680374, "kl": 0.001983642578125, "learning_rate": 1.844776119402985e-07, "loss": 0.0679, "reward": 0.4386160969734192, "reward_std": 0.19054606929421425, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3761160895228386, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 1600.5000915527344, "epoch": 0.06183257411694422, "grad_norm": 0.13245004415512085, "kl": 0.0020542144775390625, "learning_rate": 1.853731343283582e-07, "loss": 0.0759, "reward": 0.6021205633878708, "reward_std": 0.22119785845279694, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4168526977300644, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1804.0290832519531, "epoch": 0.06213128220446568, "grad_norm": 0.1324647068977356, "kl": 0.001983642578125, "learning_rate": 1.8626865671641788e-07, "loss": 0.0869, "reward": 0.3710937649011612, "reward_std": 0.1465673577040434, "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3465401902794838, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 1661.7746276855469, "epoch": 0.062429990291987154, "grad_norm": 0.14967681467533112, "kl": 0.0030364990234375, "learning_rate": 1.871641791044776e-07, "loss": 0.1063, "reward": 0.483816996216774, "reward_std": 0.21914983913302422, "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3789062723517418, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 1696.5201721191406, "epoch": 0.06272869837950862, "grad_norm": 0.09708944708108902, "kl": 0.0021762847900390625, "learning_rate": 1.880597014925373e-07, "loss": 0.0455, "reward": 0.4570312723517418, "reward_std": 0.18494780734181404, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3856026902794838, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1678.0291137695312, "epoch": 0.06302740646703009, "grad_norm": 0.11944709718227386, "kl": 0.002277374267578125, "learning_rate": 1.88955223880597e-07, "loss": 0.0779, "reward": 0.5429687723517418, "reward_std": 0.17690804786980152, "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3889509066939354, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1708.3371276855469, "epoch": 0.06332611455455156, "grad_norm": 0.1300218403339386, "kl": 0.00228118896484375, "learning_rate": 1.898507462686567e-07, "loss": 0.0674, "reward": 0.5217634290456772, "reward_std": 0.17174266278743744, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3677455484867096, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 1697.4107666015625, "epoch": 0.06362482264207303, "grad_norm": 0.12140227109193802, "kl": 0.0022335052490234375, "learning_rate": 1.9074626865671642e-07, "loss": 0.0906, "reward": 0.4994419813156128, "reward_std": 0.23086220771074295, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392299123108387, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 1576.40185546875, "epoch": 0.0639235307295945, "grad_norm": 0.15239672362804413, "kl": 0.002559661865234375, "learning_rate": 1.9164179104477612e-07, "loss": 0.1077, "reward": 0.5926339626312256, "reward_std": 0.2343706674873829, "rewards/accuracy_reward": 0.16741071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4252232387661934, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 1715.2902221679688, "epoch": 0.06422223881711597, "grad_norm": 0.14547939598560333, "kl": 0.00289154052734375, "learning_rate": 1.925373134328358e-07, "loss": 0.0951, "reward": 0.4720982313156128, "reward_std": 0.15328525938093662, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3850446566939354, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1617.7009887695312, "epoch": 0.06452094690463744, "grad_norm": 0.1390671283006668, "kl": 0.00257110595703125, "learning_rate": 1.9343283582089551e-07, "loss": 0.0961, "reward": 0.4676339477300644, "reward_std": 0.21553698182106018, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3962053656578064, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 1656.0313110351562, "epoch": 0.06481965499215891, "grad_norm": 0.14371053874492645, "kl": 0.002681732177734375, "learning_rate": 1.943283582089552e-07, "loss": 0.1185, "reward": 0.5613839626312256, "reward_std": 0.2216217890381813, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375149011612, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 1693.7590026855469, "epoch": 0.06511836307968039, "grad_norm": 0.16356320679187775, "kl": 0.002750396728515625, "learning_rate": 1.952238805970149e-07, "loss": 0.1115, "reward": 0.4375000223517418, "reward_std": 0.18608659878373146, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794643059372902, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1836.27685546875, "epoch": 0.06541707116720186, "grad_norm": 0.12509854137897491, "kl": 0.002498626708984375, "learning_rate": 1.961194029850746e-07, "loss": 0.0758, "reward": 0.4637276977300644, "reward_std": 0.20686499774456024, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3632812649011612, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 1857.4041137695312, "epoch": 0.06571577925472333, "grad_norm": 0.12642747163772583, "kl": 0.002925872802734375, "learning_rate": 1.9701492537313433e-07, "loss": 0.082, "reward": 0.4514509066939354, "reward_std": 0.1434930134564638, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3487723395228386, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1799.2367248535156, "epoch": 0.06601448734224478, "grad_norm": 0.16725115478038788, "kl": 0.003215789794921875, "learning_rate": 1.9791044776119403e-07, "loss": 0.0987, "reward": 0.454799123108387, "reward_std": 0.1908818818628788, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3632812723517418, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1618.8995971679688, "epoch": 0.06631319542976626, "grad_norm": 0.18415889143943787, "kl": 0.0034332275390625, "learning_rate": 1.988059701492537e-07, "loss": 0.1176, "reward": 0.550781287252903, "reward_std": 0.21289395168423653, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4101562649011612, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 1685.5313415527344, "epoch": 0.06661190351728773, "grad_norm": 0.1648326814174652, "kl": 0.003414154052734375, "learning_rate": 1.9970149253731343e-07, "loss": 0.0829, "reward": 0.518973246216774, "reward_std": 0.18380441889166832, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3850446566939354, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1728.3438415527344, "epoch": 0.0669106116048092, "grad_norm": 0.09605135023593903, "kl": 0.00257110595703125, "learning_rate": 2.0059701492537312e-07, "loss": 0.0727, "reward": 0.613839328289032, "reward_std": 0.19651852920651436, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607313156128, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 1771.8036499023438, "epoch": 0.06720931969233067, "grad_norm": 0.14743822813034058, "kl": 0.00336456298828125, "learning_rate": 2.0149253731343285e-07, "loss": 0.1068, "reward": 0.4603794813156128, "reward_std": 0.1802474744617939, "rewards/accuracy_reward": 0.08705357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3733259066939354, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 1674.2522888183594, "epoch": 0.06750802777985214, "grad_norm": 0.12085289508104324, "kl": 0.00276947021484375, "learning_rate": 2.0238805970149252e-07, "loss": 0.0811, "reward": 0.490513414144516, "reward_std": 0.20984778925776482, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669813156128, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 1641.7165832519531, "epoch": 0.06780673586737361, "grad_norm": 0.16235822439193726, "kl": 0.00344085693359375, "learning_rate": 2.0328358208955224e-07, "loss": 0.1119, "reward": 0.5385044813156128, "reward_std": 0.23450100794434547, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4135044813156128, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1667.2433776855469, "epoch": 0.06810544395489508, "grad_norm": 0.1341167688369751, "kl": 0.003353118896484375, "learning_rate": 2.0417910447761194e-07, "loss": 0.0702, "reward": 0.545758955180645, "reward_std": 0.22194916754961014, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4140625074505806, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 1771.1942749023438, "epoch": 0.06840415204241655, "grad_norm": 0.13590477406978607, "kl": 0.002742767333984375, "learning_rate": 2.050746268656716e-07, "loss": 0.0816, "reward": 0.400669664144516, "reward_std": 0.15168441459536552, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3582589477300644, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 1636.3884887695312, "epoch": 0.06870286012993802, "grad_norm": 0.1085042729973793, "kl": 0.003475189208984375, "learning_rate": 2.0597014925373134e-07, "loss": 0.0553, "reward": 0.486607164144516, "reward_std": 0.19710953533649445, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.399553582072258, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 1660.9777221679688, "epoch": 0.0690015682174595, "grad_norm": 0.1421467512845993, "kl": 0.00357818603515625, "learning_rate": 2.0686567164179103e-07, "loss": 0.0741, "reward": 0.5731027126312256, "reward_std": 0.24914058670401573, "rewards/accuracy_reward": 0.16071429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4123883992433548, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 1674.1652526855469, "epoch": 0.06930027630498095, "grad_norm": 0.13840103149414062, "kl": 0.003772735595703125, "learning_rate": 2.0776119402985076e-07, "loss": 0.0951, "reward": 0.5329241305589676, "reward_std": 0.18147192522883415, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3989955484867096, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1576.2745971679688, "epoch": 0.06959898439250242, "grad_norm": 0.1589258909225464, "kl": 0.00388336181640625, "learning_rate": 2.0865671641791043e-07, "loss": 0.0952, "reward": 0.5072544887661934, "reward_std": 0.1656729392707348, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4090401977300644, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 1647.6004943847656, "epoch": 0.06989769248002389, "grad_norm": 0.14736725389957428, "kl": 0.004535675048828125, "learning_rate": 2.0955223880597013e-07, "loss": 0.0897, "reward": 0.541294664144516, "reward_std": 0.21775419265031815, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4095982313156128, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1758.6875610351562, "epoch": 0.07019640056754536, "grad_norm": 0.13103868067264557, "kl": 0.003345489501953125, "learning_rate": 2.1044776119402985e-07, "loss": 0.084, "reward": 0.4771205633878708, "reward_std": 0.212843406945467, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669813156128, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1699.3193054199219, "epoch": 0.07049510865506683, "grad_norm": 0.13174493610858917, "kl": 0.0044403076171875, "learning_rate": 2.1134328358208952e-07, "loss": 0.0738, "reward": 0.5474330633878708, "reward_std": 0.1859729401767254, "rewards/accuracy_reward": 0.16071429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3867187723517418, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1634.7590026855469, "epoch": 0.0707938167425883, "grad_norm": 0.13874451816082, "kl": 0.003925323486328125, "learning_rate": 2.1223880597014925e-07, "loss": 0.0958, "reward": 0.5212053880095482, "reward_std": 0.23695604130625725, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4051339477300644, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 1637.2411804199219, "epoch": 0.07109252483010978, "grad_norm": 0.13351604342460632, "kl": 0.004116058349609375, "learning_rate": 2.1313432835820894e-07, "loss": 0.0861, "reward": 0.522321455180645, "reward_std": 0.18783728405833244, "rewards/accuracy_reward": 0.11607143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500149011612, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 1658.4732666015625, "epoch": 0.07139123291763125, "grad_norm": 0.18188543617725372, "kl": 0.00396728515625, "learning_rate": 2.1402985074626867e-07, "loss": 0.1102, "reward": 0.5563616305589676, "reward_std": 0.22525354847311974, "rewards/accuracy_reward": 0.1495535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4068080559372902, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 1624.1942443847656, "epoch": 0.07168994100515272, "grad_norm": 0.15537305176258087, "kl": 0.00439453125, "learning_rate": 2.1492537313432834e-07, "loss": 0.1026, "reward": 0.5044643059372902, "reward_std": 0.1992409024387598, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500223517418, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 1688.9732666015625, "epoch": 0.07198864909267419, "grad_norm": 0.15165981650352478, "kl": 0.005035400390625, "learning_rate": 2.1582089552238804e-07, "loss": 0.1011, "reward": 0.6289062798023224, "reward_std": 0.2904147170484066, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4012276902794838, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 1670.0781555175781, "epoch": 0.07228735718019566, "grad_norm": 0.17596736550331116, "kl": 0.00519561767578125, "learning_rate": 2.1671641791044776e-07, "loss": 0.1172, "reward": 0.5061383992433548, "reward_std": 0.19540873914957047, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3922991305589676, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 1818.4286499023438, "epoch": 0.07258606526771712, "grad_norm": 0.12452724575996399, "kl": 0.00406646728515625, "learning_rate": 2.1761194029850746e-07, "loss": 0.0725, "reward": 0.478794664144516, "reward_std": 0.19040923938155174, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.364955373108387, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 1743.1161193847656, "epoch": 0.07288477335523859, "grad_norm": 0.16066378355026245, "kl": 0.0052032470703125, "learning_rate": 2.1850746268656716e-07, "loss": 0.1066, "reward": 0.556919664144516, "reward_std": 0.18567639961838722, "rewards/accuracy_reward": 0.1718750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3850446566939354, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 1737.618408203125, "epoch": 0.07318348144276006, "grad_norm": 0.14144359529018402, "kl": 0.005462646484375, "learning_rate": 2.1940298507462685e-07, "loss": 0.0737, "reward": 0.4252232313156128, "reward_std": 0.15796689130365849, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3783482387661934, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 1667.7969360351562, "epoch": 0.07348218953028153, "grad_norm": 0.1984703242778778, "kl": 0.005329132080078125, "learning_rate": 2.2029850746268658e-07, "loss": 0.0951, "reward": 0.6534598618745804, "reward_std": 0.18945389240980148, "rewards/accuracy_reward": 0.2522321604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4012276977300644, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 1624.8170166015625, "epoch": 0.073780897617803, "grad_norm": 0.17586453258991241, "kl": 0.00536346435546875, "learning_rate": 2.2119402985074625e-07, "loss": 0.1235, "reward": 0.5636160969734192, "reward_std": 0.21757451072335243, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4185268059372902, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1621.5379943847656, "epoch": 0.07407960570532447, "grad_norm": 0.16187602281570435, "kl": 0.0064849853515625, "learning_rate": 2.2208955223880595e-07, "loss": 0.0958, "reward": 0.4854910895228386, "reward_std": 0.21597174927592278, "rewards/accuracy_reward": 0.06919643050059676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.416294664144516, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 1703.7679138183594, "epoch": 0.07437831379284594, "grad_norm": 0.14348551630973816, "kl": 0.007354736328125, "learning_rate": 2.2298507462686567e-07, "loss": 0.0911, "reward": 0.5016741454601288, "reward_std": 0.18764624185860157, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.376674123108387, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 1670.2500915527344, "epoch": 0.07467702188036741, "grad_norm": 0.17889803647994995, "kl": 0.007232666015625, "learning_rate": 2.2388059701492537e-07, "loss": 0.1136, "reward": 0.4659598395228386, "reward_std": 0.20607049763202667, "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.407924123108387, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1762.6295776367188, "epoch": 0.07497572996788888, "grad_norm": 0.1282837688922882, "kl": 0.0063323974609375, "learning_rate": 2.2477611940298507e-07, "loss": 0.0856, "reward": 0.4726562723517418, "reward_std": 0.20630904659628868, "rewards/accuracy_reward": 0.1004464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3722098395228386, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 1779.4420166015625, "epoch": 0.07527443805541036, "grad_norm": 0.1903720647096634, "kl": 0.0095977783203125, "learning_rate": 2.2567164179104476e-07, "loss": 0.1074, "reward": 0.4687500223517418, "reward_std": 0.2134157381951809, "rewards/accuracy_reward": 0.08705357438884676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3816964402794838, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 1573.4732666015625, "epoch": 0.07557314614293181, "grad_norm": 0.11845073848962784, "kl": 0.00799560546875, "learning_rate": 2.265671641791045e-07, "loss": 0.0589, "reward": 0.654575914144516, "reward_std": 0.24023927375674248, "rewards/accuracy_reward": 0.2321428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4224330484867096, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 1707.5692443847656, "epoch": 0.07587185423045328, "grad_norm": 0.160665825009346, "kl": 0.0085296630859375, "learning_rate": 2.2746268656716416e-07, "loss": 0.1057, "reward": 0.5167410895228386, "reward_std": 0.23537119291722775, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 1650.7232971191406, "epoch": 0.07617056231797475, "grad_norm": 0.1435379832983017, "kl": 0.0068817138671875, "learning_rate": 2.2835820895522386e-07, "loss": 0.0838, "reward": 0.5831473469734192, "reward_std": 0.269546203315258, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4179687649011612, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 1646.3013916015625, "epoch": 0.07646927040549623, "grad_norm": 0.1408359706401825, "kl": 0.00745391845703125, "learning_rate": 2.2925373134328358e-07, "loss": 0.0987, "reward": 0.5156250223517418, "reward_std": 0.20921886712312698, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500223517418, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 1747.3594665527344, "epoch": 0.0767679784930177, "grad_norm": 0.15240831673145294, "kl": 0.00847625732421875, "learning_rate": 2.3014925373134328e-07, "loss": 0.0807, "reward": 0.4330357238650322, "reward_std": 0.16917020455002785, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3705357238650322, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1595.4888916015625, "epoch": 0.07706668658053917, "grad_norm": 0.1447017639875412, "kl": 0.0087432861328125, "learning_rate": 2.3104477611940295e-07, "loss": 0.1029, "reward": 0.6272321864962578, "reward_std": 0.19604502990841866, "rewards/accuracy_reward": 0.20758929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.419642873108387, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 1672.4911499023438, "epoch": 0.07736539466806064, "grad_norm": 0.1691456139087677, "kl": 0.0113983154296875, "learning_rate": 2.3194029850746267e-07, "loss": 0.1109, "reward": 0.4815848469734192, "reward_std": 0.17533284798264503, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3878348469734192, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 1865.3103332519531, "epoch": 0.07766410275558211, "grad_norm": 0.14553463459014893, "kl": 0.00946807861328125, "learning_rate": 2.3283582089552237e-07, "loss": 0.1036, "reward": 0.3816964402794838, "reward_std": 0.1985895223915577, "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3504464477300644, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 1661.47998046875, "epoch": 0.07796281084310358, "grad_norm": 0.14181146025657654, "kl": 0.0083465576171875, "learning_rate": 2.337313432835821e-07, "loss": 0.0777, "reward": 0.5251116305589676, "reward_std": 0.2368466705083847, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404575914144516, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 1731.571533203125, "epoch": 0.07826151893062505, "grad_norm": 0.13169826567173004, "kl": 0.0081024169921875, "learning_rate": 2.3462686567164177e-07, "loss": 0.0848, "reward": 0.4486607387661934, "reward_std": 0.14242799952626228, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3593750149011612, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 1729.3572082519531, "epoch": 0.07856022701814652, "grad_norm": 0.19655297696590424, "kl": 0.0110321044921875, "learning_rate": 2.355223880597015e-07, "loss": 0.0999, "reward": 0.4748884066939354, "reward_std": 0.1812262386083603, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372209832072258, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 1709.3996276855469, "epoch": 0.07885893510566798, "grad_norm": 0.11960544437170029, "kl": 0.0114593505859375, "learning_rate": 2.364179104477612e-07, "loss": 0.0852, "reward": 0.5228794887661934, "reward_std": 0.20422037690877914, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380022332072258, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 1623.3572387695312, "epoch": 0.07915764319318945, "grad_norm": 0.12931658327579498, "kl": 0.0108489990234375, "learning_rate": 2.3731343283582086e-07, "loss": 0.0745, "reward": 0.4687500149011612, "reward_std": 0.18547334149479866, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3995535969734192, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 1753.7701721191406, "epoch": 0.07945635128071092, "grad_norm": 0.16333754360675812, "kl": 0.0117340087890625, "learning_rate": 2.3820895522388058e-07, "loss": 0.0913, "reward": 0.4799107313156128, "reward_std": 0.2388301081955433, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372767873108387, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1730.1764221191406, "epoch": 0.07975505936823239, "grad_norm": 0.1521817296743393, "kl": 0.01116943359375, "learning_rate": 2.391044776119403e-07, "loss": 0.1099, "reward": 0.471540205180645, "reward_std": 0.214101642370224, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3867187723517418, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1756.4219360351562, "epoch": 0.08005376745575386, "grad_norm": 0.13242104649543762, "kl": 0.012603759765625, "learning_rate": 2.4e-07, "loss": 0.1022, "reward": 0.4966518133878708, "reward_std": 0.19917987287044525, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 1620.83935546875, "epoch": 0.08035247554327533, "grad_norm": 0.15813924372196198, "kl": 0.0114898681640625, "learning_rate": 2.408955223880597e-07, "loss": 0.0973, "reward": 0.5641741305589676, "reward_std": 0.21391383931040764, "rewards/accuracy_reward": 0.16964286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3945312649011612, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 1683.9375915527344, "epoch": 0.0806511836307968, "grad_norm": 0.1461154669523239, "kl": 0.0120697021484375, "learning_rate": 2.417910447761194e-07, "loss": 0.0782, "reward": 0.6305803805589676, "reward_std": 0.2561534717679024, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.396205373108387, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 1605.7411499023438, "epoch": 0.08094989171831828, "grad_norm": 0.14078788459300995, "kl": 0.0100250244140625, "learning_rate": 2.426865671641791e-07, "loss": 0.0706, "reward": 0.5647321566939354, "reward_std": 0.24575885571539402, "rewards/accuracy_reward": 0.17410715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250149011612, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1641.4286499023438, "epoch": 0.08124859980583975, "grad_norm": 0.12922753393650055, "kl": 0.0119171142578125, "learning_rate": 2.4358208955223877e-07, "loss": 0.082, "reward": 0.514508955180645, "reward_std": 0.20477286726236343, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4207589402794838, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1568.4955749511719, "epoch": 0.08154730789336122, "grad_norm": 0.2945692241191864, "kl": 0.0115509033203125, "learning_rate": 2.4447761194029847e-07, "loss": 0.1379, "reward": 0.5373884066939354, "reward_std": 0.24970903247594833, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4190848395228386, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1735.3192749023438, "epoch": 0.08184601598088269, "grad_norm": 0.1841738373041153, "kl": 0.01611328125, "learning_rate": 2.453731343283582e-07, "loss": 0.0873, "reward": 0.607700914144516, "reward_std": 0.18619681894779205, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3666294813156128, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1627.2255249023438, "epoch": 0.08214472406840415, "grad_norm": 0.17926596105098724, "kl": 0.0133514404296875, "learning_rate": 2.462686567164179e-07, "loss": 0.118, "reward": 0.554687537252903, "reward_std": 0.27307654917240143, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4006696566939354, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1659.1741638183594, "epoch": 0.08244343215592562, "grad_norm": 0.15800859034061432, "kl": 0.0154571533203125, "learning_rate": 2.471641791044776e-07, "loss": 0.1111, "reward": 0.4944196566939354, "reward_std": 0.21703854762017727, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4185268059372902, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 1744.8929443359375, "epoch": 0.08274214024344709, "grad_norm": 0.14526693522930145, "kl": 0.0145263671875, "learning_rate": 2.480597014925373e-07, "loss": 0.0979, "reward": 0.5016741380095482, "reward_std": 0.19702809303998947, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3878348395228386, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 1728.6808776855469, "epoch": 0.08304084833096856, "grad_norm": 0.12965428829193115, "kl": 0.0125579833984375, "learning_rate": 2.48955223880597e-07, "loss": 0.0888, "reward": 0.5005580484867096, "reward_std": 0.2360159568488598, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.395647332072258, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1625.9576721191406, "epoch": 0.08333955641849003, "grad_norm": 0.13115423917770386, "kl": 0.0152587890625, "learning_rate": 2.498507462686567e-07, "loss": 0.0951, "reward": 0.4944196715950966, "reward_std": 0.20203901454806328, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4095982387661934, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1771.7433776855469, "epoch": 0.0836382645060115, "grad_norm": 0.16687826812267303, "kl": 0.013824462890625, "learning_rate": 2.507462686567164e-07, "loss": 0.0878, "reward": 0.4581473469734192, "reward_std": 0.19889593683183193, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.368861623108387, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1789.9710693359375, "epoch": 0.08393697259353297, "grad_norm": 0.1582089364528656, "kl": 0.0163421630859375, "learning_rate": 2.516417910447761e-07, "loss": 0.0879, "reward": 0.4369419738650322, "reward_std": 0.1472496148198843, "rewards/accuracy_reward": 0.08258929150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3543526902794838, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 1746.6943054199219, "epoch": 0.08423568068105444, "grad_norm": 0.2485809624195099, "kl": 0.016571044921875, "learning_rate": 2.525373134328358e-07, "loss": 0.0946, "reward": 0.5390625223517418, "reward_std": 0.1920334193855524, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3738839477300644, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 1734.2500610351562, "epoch": 0.08453438876857591, "grad_norm": 0.10918474942445755, "kl": 0.017333984375, "learning_rate": 2.534328358208955e-07, "loss": 0.0608, "reward": 0.502790205180645, "reward_std": 0.18779809027910233, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3889509066939354, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 1701.0380554199219, "epoch": 0.08483309685609738, "grad_norm": 0.16535907983779907, "kl": 0.018402099609375, "learning_rate": 2.543283582089552e-07, "loss": 0.0816, "reward": 0.5379464477300644, "reward_std": 0.21999012306332588, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794643059372902, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 1803.5202026367188, "epoch": 0.08513180494361886, "grad_norm": 0.14454659819602966, "kl": 0.017303466796875, "learning_rate": 2.552238805970149e-07, "loss": 0.0798, "reward": 0.4559151977300644, "reward_std": 0.1861301101744175, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.368861623108387, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 1666.3170166015625, "epoch": 0.08543051303114031, "grad_norm": 0.1478193998336792, "kl": 0.018096923828125, "learning_rate": 2.5611940298507464e-07, "loss": 0.0777, "reward": 0.5658482387661934, "reward_std": 0.19072331674396992, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3962053805589676, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1726.6563415527344, "epoch": 0.08572922111866178, "grad_norm": 0.11328954994678497, "kl": 0.015045166015625, "learning_rate": 2.570149253731343e-07, "loss": 0.0668, "reward": 0.5652901902794838, "reward_std": 0.21457896381616592, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3666294813156128, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 1720.3438415527344, "epoch": 0.08602792920618325, "grad_norm": 0.11135070770978928, "kl": 0.01849365234375, "learning_rate": 2.5791044776119404e-07, "loss": 0.081, "reward": 0.4492187649011612, "reward_std": 0.18345548957586288, "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3956473395228386, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 1806.8460693359375, "epoch": 0.08632663729370472, "grad_norm": 0.1601344347000122, "kl": 0.0174560546875, "learning_rate": 2.5880597014925374e-07, "loss": 0.1618, "reward": 0.5318080633878708, "reward_std": 0.1626906618475914, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3733259066939354, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1687.5067443847656, "epoch": 0.0866253453812262, "grad_norm": 0.14696349203586578, "kl": 0.0171051025390625, "learning_rate": 2.597014925373134e-07, "loss": 0.0887, "reward": 0.6289062723517418, "reward_std": 0.23547784611582756, "rewards/accuracy_reward": 0.2366071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3922991305589676, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 1720.0781860351562, "epoch": 0.08692405346874767, "grad_norm": 0.13937023282051086, "kl": 0.0178070068359375, "learning_rate": 2.6059701492537313e-07, "loss": 0.0896, "reward": 0.5256696715950966, "reward_std": 0.1844266913831234, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3761160895228386, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1605.7411499023438, "epoch": 0.08722276155626914, "grad_norm": 0.14868046343326569, "kl": 0.0153350830078125, "learning_rate": 2.6149253731343283e-07, "loss": 0.0872, "reward": 0.5195312947034836, "reward_std": 0.23007197305560112, "rewards/accuracy_reward": 0.10714285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.412388414144516, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1712.6697387695312, "epoch": 0.08752146964379061, "grad_norm": 0.20771163702011108, "kl": 0.019195556640625, "learning_rate": 2.623880597014925e-07, "loss": 0.1044, "reward": 0.5608259215950966, "reward_std": 0.162094596773386, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3822544887661934, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1771.02685546875, "epoch": 0.08782017773131208, "grad_norm": 0.16314779222011566, "kl": 0.0198974609375, "learning_rate": 2.632835820895522e-07, "loss": 0.0959, "reward": 0.4285714402794838, "reward_std": 0.20358892902731895, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.377232164144516, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 1786.7634887695312, "epoch": 0.08811888581883355, "grad_norm": 0.15111438930034637, "kl": 0.0196990966796875, "learning_rate": 2.641791044776119e-07, "loss": 0.0911, "reward": 0.5094866305589676, "reward_std": 0.19190515018999577, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3733259066939354, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 1761.9576416015625, "epoch": 0.08841759390635502, "grad_norm": 0.18785519897937775, "kl": 0.020355224609375, "learning_rate": 2.650746268656716e-07, "loss": 0.0923, "reward": 0.4330357387661934, "reward_std": 0.1853826940059662, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3772321566939354, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 1794.80810546875, "epoch": 0.08871630199387648, "grad_norm": 0.1847190409898758, "kl": 0.02020263671875, "learning_rate": 2.6597014925373137e-07, "loss": 0.0872, "reward": 0.4966518059372902, "reward_std": 0.18843606673181057, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3716518059372902, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1744.02685546875, "epoch": 0.08901501008139795, "grad_norm": 0.13363026082515717, "kl": 0.021331787109375, "learning_rate": 2.66865671641791e-07, "loss": 0.0717, "reward": 0.4698660895228386, "reward_std": 0.19190073013305664, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3805803656578064, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 1629.9531860351562, "epoch": 0.08931371816891942, "grad_norm": 0.16138088703155518, "kl": 0.02239990234375, "learning_rate": 2.677611940298507e-07, "loss": 0.0988, "reward": 0.5167410895228386, "reward_std": 0.21149498969316483, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4252232387661934, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 1643.8840026855469, "epoch": 0.08961242625644089, "grad_norm": 0.13144133985042572, "kl": 0.020721435546875, "learning_rate": 2.6865671641791046e-07, "loss": 0.0877, "reward": 0.5848214477300644, "reward_std": 0.1698419526219368, "rewards/accuracy_reward": 0.18973214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950892984867096, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1730.7724304199219, "epoch": 0.08991113434396236, "grad_norm": 0.11547870188951492, "kl": 0.02105712890625, "learning_rate": 2.695522388059701e-07, "loss": 0.053, "reward": 0.4654018059372902, "reward_std": 0.18020198121666908, "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380580373108387, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 1718.6451721191406, "epoch": 0.09020984243148383, "grad_norm": 0.1907738596200943, "kl": 0.021453857421875, "learning_rate": 2.7044776119402986e-07, "loss": 0.103, "reward": 0.4654018133878708, "reward_std": 0.17699389159679413, "rewards/accuracy_reward": 0.06473214458674192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.400669664144516, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1785.2255249023438, "epoch": 0.0905085505190053, "grad_norm": 0.16051071882247925, "kl": 0.02215576171875, "learning_rate": 2.7134328358208956e-07, "loss": 0.0882, "reward": 0.4603794887661934, "reward_std": 0.23151732981204987, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3777901977300644, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 1674.9800109863281, "epoch": 0.09080725860652678, "grad_norm": 0.1584753543138504, "kl": 0.01995849609375, "learning_rate": 2.7223880597014925e-07, "loss": 0.0753, "reward": 0.4994419887661934, "reward_std": 0.20632381364703178, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.412388414144516, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 1564.1027526855469, "epoch": 0.09110596669404825, "grad_norm": 0.22058820724487305, "kl": 0.019866943359375, "learning_rate": 2.7313432835820895e-07, "loss": 0.0955, "reward": 0.549107164144516, "reward_std": 0.2444890234619379, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607387661934, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1841.7746276855469, "epoch": 0.09140467478156972, "grad_norm": 0.14712168276309967, "kl": 0.0247802734375, "learning_rate": 2.7402985074626865e-07, "loss": 0.0551, "reward": 0.4609375149011612, "reward_std": 0.1960465181618929, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.356026791036129, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 1735.5023498535156, "epoch": 0.09170338286909117, "grad_norm": 0.17436496913433075, "kl": 0.02410888671875, "learning_rate": 2.7492537313432835e-07, "loss": 0.0971, "reward": 0.5563616305589676, "reward_std": 0.20075469836592674, "rewards/accuracy_reward": 0.1607142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3956473395228386, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 1762.7076416015625, "epoch": 0.09200209095661264, "grad_norm": 0.19251605868339539, "kl": 0.02215576171875, "learning_rate": 2.7582089552238804e-07, "loss": 0.102, "reward": 0.4804687798023224, "reward_std": 0.25986140966415405, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.395647332072258, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 1601.4955749511719, "epoch": 0.09230079904413412, "grad_norm": 0.20661401748657227, "kl": 0.022491455078125, "learning_rate": 2.7671641791044774e-07, "loss": 0.1098, "reward": 0.498325914144516, "reward_std": 0.1971307024359703, "rewards/accuracy_reward": 0.08705357415601611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4112723469734192, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 1480.5938110351562, "epoch": 0.09259950713165559, "grad_norm": 0.25084495544433594, "kl": 0.0222625732421875, "learning_rate": 2.7761194029850744e-07, "loss": 0.1314, "reward": 0.6328125149011612, "reward_std": 0.20056232810020447, "rewards/accuracy_reward": 0.17633928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732313156128, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 1646.8661499023438, "epoch": 0.09289821521917706, "grad_norm": 0.1831519454717636, "kl": 0.022918701171875, "learning_rate": 2.7850746268656714e-07, "loss": 0.0791, "reward": 0.4687500223517418, "reward_std": 0.198662169277668, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404017873108387, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 1701.3058471679688, "epoch": 0.09319692330669853, "grad_norm": 0.1731054037809372, "kl": 0.024749755859375, "learning_rate": 2.7940298507462683e-07, "loss": 0.088, "reward": 0.5412946790456772, "reward_std": 0.20474319346249104, "rewards/accuracy_reward": 0.1651785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3761160895228386, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 1632.2433471679688, "epoch": 0.09349563139422, "grad_norm": 0.24675822257995605, "kl": 0.024658203125, "learning_rate": 2.8029850746268653e-07, "loss": 0.109, "reward": 0.541294664144516, "reward_std": 0.21041417866945267, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4118303805589676, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 1546.65185546875, "epoch": 0.09379433948174147, "grad_norm": 0.4164622724056244, "kl": 0.030487060546875, "learning_rate": 2.811940298507463e-07, "loss": 0.1162, "reward": 0.5792411044239998, "reward_std": 0.2640913836658001, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660969734192, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 1570.6741943359375, "epoch": 0.09409304756926294, "grad_norm": 0.21610848605632782, "kl": 0.025390625, "learning_rate": 2.82089552238806e-07, "loss": 0.0957, "reward": 0.5563616380095482, "reward_std": 0.2341964840888977, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187723517418, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 1637.3817443847656, "epoch": 0.09439175565678441, "grad_norm": 0.26492780447006226, "kl": 0.031005859375, "learning_rate": 2.829850746268656e-07, "loss": 0.1047, "reward": 0.6333705633878708, "reward_std": 0.26227736845612526, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4213169813156128, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 1657.1362609863281, "epoch": 0.09469046374430588, "grad_norm": 0.25361546874046326, "kl": 0.027984619140625, "learning_rate": 2.838805970149254e-07, "loss": 0.1051, "reward": 0.5089285895228386, "reward_std": 0.18195757269859314, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950893059372902, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 1761.9866943359375, "epoch": 0.09498917183182734, "grad_norm": 0.2746720314025879, "kl": 0.03057861328125, "learning_rate": 2.8477611940298507e-07, "loss": 0.0872, "reward": 0.4933035969734192, "reward_std": 0.20656593516469002, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.383928582072258, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 1692.60498046875, "epoch": 0.09528787991934881, "grad_norm": 0.270435631275177, "kl": 0.029876708984375, "learning_rate": 2.8567164179104477e-07, "loss": 0.0854, "reward": 0.5061384215950966, "reward_std": 0.234851386398077, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4034598395228386, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 1687.1004943847656, "epoch": 0.09558658800687028, "grad_norm": 0.27087515592575073, "kl": 0.030914306640625, "learning_rate": 2.8656716417910447e-07, "loss": 0.1055, "reward": 0.4693080559372902, "reward_std": 0.18623634800314903, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4202009215950966, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 1636.7523193359375, "epoch": 0.09588529609439175, "grad_norm": 0.27277451753616333, "kl": 0.027923583984375, "learning_rate": 2.8746268656716417e-07, "loss": 0.1156, "reward": 0.534040205180645, "reward_std": 0.26148390769958496, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4313616305589676, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 1619.0871276855469, "epoch": 0.09618400418191322, "grad_norm": 0.3187912106513977, "kl": 0.032012939453125, "learning_rate": 2.8835820895522386e-07, "loss": 0.0957, "reward": 0.5708705559372902, "reward_std": 0.21944256126880646, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705484867096, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 1736.5068054199219, "epoch": 0.0964827122694347, "grad_norm": 0.33965209126472473, "kl": 0.033233642578125, "learning_rate": 2.8925373134328356e-07, "loss": 0.1234, "reward": 0.4882812574505806, "reward_std": 0.1364483255892992, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3856026902794838, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 1648.62060546875, "epoch": 0.09678142035695617, "grad_norm": 0.3891081213951111, "kl": 0.03326416015625, "learning_rate": 2.9014925373134326e-07, "loss": 0.121, "reward": 0.4838169887661934, "reward_std": 0.23809976130723953, "rewards/accuracy_reward": 0.06696428707800806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.416852705180645, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 1558.3214721679688, "epoch": 0.09708012844447764, "grad_norm": 0.38267526030540466, "kl": 0.0308837890625, "learning_rate": 2.9104477611940296e-07, "loss": 0.1138, "reward": 0.5597098544239998, "reward_std": 0.2379733808338642, "rewards/accuracy_reward": 0.1138392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705559372902, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 1630.4375915527344, "epoch": 0.09737883653199911, "grad_norm": 0.3727235198020935, "kl": 0.03082275390625, "learning_rate": 2.9194029850746265e-07, "loss": 0.0981, "reward": 0.5998884290456772, "reward_std": 0.2406637966632843, "rewards/accuracy_reward": 0.17187500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.428013414144516, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 1622.9822082519531, "epoch": 0.09767754461952058, "grad_norm": 0.35727638006210327, "kl": 0.03424072265625, "learning_rate": 2.9283582089552235e-07, "loss": 0.1119, "reward": 0.5825893059372902, "reward_std": 0.22473907470703125, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4308036044239998, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 1474.0313110351562, "epoch": 0.09797625270704205, "grad_norm": 0.33378317952156067, "kl": 0.030670166015625, "learning_rate": 2.937313432835821e-07, "loss": 0.1415, "reward": 0.6356027126312256, "reward_std": 0.20895413681864738, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4793526902794838, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 1603.8572082519531, "epoch": 0.0982749607945635, "grad_norm": 0.31300511956214905, "kl": 0.042755126953125, "learning_rate": 2.946268656716418e-07, "loss": 0.0628, "reward": 0.5619419887661934, "reward_std": 0.21602148562669754, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4146205559372902, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 1539.3549499511719, "epoch": 0.09857366888208498, "grad_norm": 0.4070514440536499, "kl": 0.033966064453125, "learning_rate": 2.9552238805970145e-07, "loss": 0.0875, "reward": 0.5664062798023224, "reward_std": 0.26943301036953926, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1495.2746276855469, "epoch": 0.09887237696960645, "grad_norm": 0.30025383830070496, "kl": 0.0423583984375, "learning_rate": 2.964179104477612e-07, "loss": 0.0698, "reward": 0.5948661118745804, "reward_std": 0.21006715670228004, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4475446715950966, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1617.9710693359375, "epoch": 0.09917108505712792, "grad_norm": 0.48415082693099976, "kl": 0.044921875, "learning_rate": 2.973134328358209e-07, "loss": 0.1019, "reward": 0.570312537252903, "reward_std": 0.21727300807833672, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4497768133878708, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 1614.6138916015625, "epoch": 0.09946979314464939, "grad_norm": 0.6349298357963562, "kl": 0.04608154296875, "learning_rate": 2.982089552238806e-07, "loss": 0.1226, "reward": 0.5797991305589676, "reward_std": 0.2682628929615021, "rewards/accuracy_reward": 0.11607143562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276977300644, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 1410.7478637695312, "epoch": 0.09976850123217086, "grad_norm": 0.4692821800708771, "kl": 0.041656494140625, "learning_rate": 2.991044776119403e-07, "loss": 0.1094, "reward": 0.6244419813156128, "reward_std": 0.24968943372368813, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241305589676, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 1524.0045471191406, "epoch": 0.10006720931969233, "grad_norm": 0.7896956205368042, "kl": 0.06951904296875, "learning_rate": 3e-07, "loss": 0.1536, "reward": 0.6322544813156128, "reward_std": 0.28275538980960846, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580559372902, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 1488.93310546875, "epoch": 0.1003659174072138, "grad_norm": 0.7389527559280396, "kl": 0.0513916015625, "learning_rate": 2.9999992656661574e-07, "loss": 0.1417, "reward": 0.5786830484867096, "reward_std": 0.2918492667376995, "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467075914144516, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 1555.1183471679688, "epoch": 0.10066462549473527, "grad_norm": 0.9840577840805054, "kl": 0.06317138671875, "learning_rate": 2.999997062665427e-07, "loss": 0.1566, "reward": 0.6333705633878708, "reward_std": 0.2717251442372799, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455708384514, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 1481.0938415527344, "epoch": 0.10096333358225675, "grad_norm": 0.6152579188346863, "kl": 0.0567626953125, "learning_rate": 2.999993391000207e-07, "loss": 0.107, "reward": 0.7438616454601288, "reward_std": 0.2821759395301342, "rewards/accuracy_reward": 0.22767858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5161830559372902, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 1623.0469665527344, "epoch": 0.10126204166977822, "grad_norm": 1.0983754396438599, "kl": 0.07513427734375, "learning_rate": 2.999988250674491e-07, "loss": 0.1603, "reward": 0.561383955180645, "reward_std": 0.27354512363672256, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982387661934, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 1618.4688110351562, "epoch": 0.10156074975729967, "grad_norm": 0.7358133792877197, "kl": 0.0736083984375, "learning_rate": 2.999981641693872e-07, "loss": 0.1241, "reward": 0.5066964477300644, "reward_std": 0.25115587562322617, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500223517418, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1593.8281555175781, "epoch": 0.10185945784482114, "grad_norm": 0.9558663368225098, "kl": 0.0845947265625, "learning_rate": 2.999973564065539e-07, "loss": 0.1248, "reward": 0.6210937798023224, "reward_std": 0.2744237296283245, "rewards/accuracy_reward": 0.12276786146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4983259290456772, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 1531.2634582519531, "epoch": 0.10215816593234262, "grad_norm": 0.9518200159072876, "kl": 0.111572265625, "learning_rate": 2.9999640177982793e-07, "loss": 0.1298, "reward": 0.667410746216774, "reward_std": 0.288241907954216, "rewards/accuracy_reward": 0.16741072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000298023224, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 1630.1473999023438, "epoch": 0.10245687401986409, "grad_norm": 1.0429834127426147, "kl": 0.1077880859375, "learning_rate": 2.9999530029024795e-07, "loss": 0.1538, "reward": 0.5619419887661934, "reward_std": 0.23585882782936096, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598469734192, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 1573.1451721191406, "epoch": 0.10275558210738556, "grad_norm": 1.0917168855667114, "kl": 0.08740234375, "learning_rate": 2.9999405193901215e-07, "loss": 0.1258, "reward": 0.6757812798023224, "reward_std": 0.3352752812206745, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5083705559372902, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1492.5960388183594, "epoch": 0.10305429019490703, "grad_norm": 1.3219847679138184, "kl": 0.1199951171875, "learning_rate": 2.999926567274787e-07, "loss": 0.129, "reward": 0.5496651977300644, "reward_std": 0.2461422197520733, "rewards/accuracy_reward": 0.06473214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330559372902, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 1600.4777221679688, "epoch": 0.1033529982824285, "grad_norm": 1.3846224546432495, "kl": 0.11376953125, "learning_rate": 2.999911146571655e-07, "loss": 0.1488, "reward": 0.5909598544239998, "reward_std": 0.28439684957265854, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949777200818062, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 1772.9107971191406, "epoch": 0.10365170636994997, "grad_norm": 1.4299756288528442, "kl": 0.153564453125, "learning_rate": 2.9998942572975e-07, "loss": 0.1028, "reward": 0.5446428805589676, "reward_std": 0.22860493510961533, "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459821455180645, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 1589.9643249511719, "epoch": 0.10395041445747144, "grad_norm": 1.0333150625228882, "kl": 0.1341552734375, "learning_rate": 2.999875899470698e-07, "loss": 0.1345, "reward": 0.6316964626312256, "reward_std": 0.26746442541480064, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459821455180645, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 1565.138427734375, "epoch": 0.10424912254499291, "grad_norm": 1.3092542886734009, "kl": 0.17236328125, "learning_rate": 2.999856073111219e-07, "loss": 0.1474, "reward": 0.668526828289032, "reward_std": 0.31843802332878113, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854911044239998, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1510.5692749023438, "epoch": 0.10454783063251437, "grad_norm": 1.6648449897766113, "kl": 0.201416015625, "learning_rate": 2.9998347782406324e-07, "loss": 0.1619, "reward": 0.6411830633878708, "reward_std": 0.297934677451849, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044813156128, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 1643.3639221191406, "epoch": 0.10484653872003584, "grad_norm": 1.7489110231399536, "kl": 0.258544921875, "learning_rate": 2.9998120148821055e-07, "loss": 0.1333, "reward": 0.5887277126312256, "reward_std": 0.2324591837823391, "rewards/accuracy_reward": 0.1361607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4525669813156128, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 1678.7210693359375, "epoch": 0.10514524680755731, "grad_norm": 1.587652325630188, "kl": 0.27783203125, "learning_rate": 2.999787783060402e-07, "loss": 0.1412, "reward": 0.5987723469734192, "reward_std": 0.2847166433930397, "rewards/accuracy_reward": 0.15625000791624188, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4425223395228386, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 1537.1808776855469, "epoch": 0.10544395489507878, "grad_norm": 1.4693421125411987, "kl": 0.29150390625, "learning_rate": 2.9997620828018836e-07, "loss": 0.1413, "reward": 0.6305803656578064, "reward_std": 0.3329645209014416, "rewards/accuracy_reward": 0.1517857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 1570.7031860351562, "epoch": 0.10574266298260025, "grad_norm": 1.6804273128509521, "kl": 0.33251953125, "learning_rate": 2.99973491413451e-07, "loss": 0.1592, "reward": 0.6930803805589676, "reward_std": 0.3070681393146515, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125149011612, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 1573.2433776855469, "epoch": 0.10604137107012172, "grad_norm": 1.7745722532272339, "kl": 0.32421875, "learning_rate": 2.999706277087839e-07, "loss": 0.1626, "reward": 0.582031287252903, "reward_std": 0.2830625846982002, "rewards/accuracy_reward": 0.06919642956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5128348395228386, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 1640.5514221191406, "epoch": 0.1063400791576432, "grad_norm": 1.5374826192855835, "kl": 0.35986328125, "learning_rate": 2.999676171693023e-07, "loss": 0.135, "reward": 0.566406287252903, "reward_std": 0.26816776022315025, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919887661934, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 1664.3348999023438, "epoch": 0.10663878724516467, "grad_norm": 1.9020404815673828, "kl": 0.32666015625, "learning_rate": 2.999644597982815e-07, "loss": 0.1556, "reward": 0.5585937798023224, "reward_std": 0.28986701369285583, "rewards/accuracy_reward": 0.10714286146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451450914144516, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 1697.8728332519531, "epoch": 0.10693749533268614, "grad_norm": 1.8729100227355957, "kl": 0.36328125, "learning_rate": 2.999611555991564e-07, "loss": 0.1443, "reward": 0.5502232536673546, "reward_std": 0.2946861907839775, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474330373108387, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1630.2835693359375, "epoch": 0.10723620342020761, "grad_norm": 1.7405922412872314, "kl": 0.34130859375, "learning_rate": 2.9995770457552154e-07, "loss": 0.1074, "reward": 0.5452009215950966, "reward_std": 0.2852553389966488, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482700914144516, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 1537.3460388183594, "epoch": 0.10753491150772908, "grad_norm": 2.2886714935302734, "kl": 0.359375, "learning_rate": 2.9995410673113147e-07, "loss": 0.1921, "reward": 0.6439732536673546, "reward_std": 0.31345657631754875, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160895228386, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1638.8348999023438, "epoch": 0.10783361959525053, "grad_norm": 1.9607813358306885, "kl": 0.56689453125, "learning_rate": 2.999503620699002e-07, "loss": 0.1391, "reward": 0.5145089626312256, "reward_std": 0.259963259100914, "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.485491082072258, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 1526.0157165527344, "epoch": 0.108132327682772, "grad_norm": 2.1960959434509277, "kl": 0.4365234375, "learning_rate": 2.999464705959015e-07, "loss": 0.1325, "reward": 0.612723246216774, "reward_std": 0.3166027218103409, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5301339477300644, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 1429.279052734375, "epoch": 0.10843103577029348, "grad_norm": 2.89924693107605, "kl": 0.5, "learning_rate": 2.9994243231336895e-07, "loss": 0.2206, "reward": 0.7500000149011612, "reward_std": 0.29292990639805794, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669642984867096, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 1542.8683776855469, "epoch": 0.10872974385781495, "grad_norm": 2.6320199966430664, "kl": 0.65576171875, "learning_rate": 2.9993824722669584e-07, "loss": 0.18, "reward": 0.7488839775323868, "reward_std": 0.28335820510983467, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167410969734192, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1532.9487609863281, "epoch": 0.10902845194533642, "grad_norm": 3.0463287830352783, "kl": 0.85302734375, "learning_rate": 2.9993391534043517e-07, "loss": 0.1781, "reward": 0.640066996216774, "reward_std": 0.28407638147473335, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5172991305589676, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 1485.4554443359375, "epoch": 0.10932716003285789, "grad_norm": 2.42756986618042, "kl": 0.7861328125, "learning_rate": 2.9992943665929956e-07, "loss": 0.1554, "reward": 0.6216518133878708, "reward_std": 0.3161746487021446, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5256696492433548, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 1598.0759887695312, "epoch": 0.10962586812037936, "grad_norm": 3.594923734664917, "kl": 0.7568359375, "learning_rate": 2.999248111881613e-07, "loss": 0.1552, "reward": 0.6227678805589676, "reward_std": 0.32291051745414734, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5290178954601288, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 1448.7768249511719, "epoch": 0.10992457620790083, "grad_norm": 3.040289878845215, "kl": 0.7080078125, "learning_rate": 2.999200389320526e-07, "loss": 0.187, "reward": 0.6679687798023224, "reward_std": 0.3253435268998146, "rewards/accuracy_reward": 0.11607143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5518973469734192, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 1613.6451721191406, "epoch": 0.1102232842954223, "grad_norm": 2.753264904022217, "kl": 0.830078125, "learning_rate": 2.9991511989616507e-07, "loss": 0.1877, "reward": 0.5641741305589676, "reward_std": 0.30769117176532745, "rewards/accuracy_reward": 0.04017857229337096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5239955559372902, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 1551.6540832519531, "epoch": 0.11052199238294377, "grad_norm": 2.3896234035491943, "kl": 0.78125, "learning_rate": 2.999100540858502e-07, "loss": 0.1552, "reward": 0.667410746216774, "reward_std": 0.2887098975479603, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5200893133878708, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 1498.5670471191406, "epoch": 0.11082070047046524, "grad_norm": 3.1911563873291016, "kl": 1.0166015625, "learning_rate": 2.999048415066191e-07, "loss": 0.2295, "reward": 0.6690848469734192, "reward_std": 0.3580819442868233, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098469734192, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 1546.4866638183594, "epoch": 0.1111194085579867, "grad_norm": 3.380524158477783, "kl": 1.138671875, "learning_rate": 2.9989948216414255e-07, "loss": 0.229, "reward": 0.6032366305589676, "reward_std": 0.3060474470257759, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5273437798023224, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 1502.1742248535156, "epoch": 0.11141811664550817, "grad_norm": 4.70617151260376, "kl": 1.38671875, "learning_rate": 2.99893976064251e-07, "loss": 0.1929, "reward": 0.7555803954601288, "reward_std": 0.33274828642606735, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5658482313156128, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 1579.0536193847656, "epoch": 0.11171682473302964, "grad_norm": 3.4672365188598633, "kl": 1.431640625, "learning_rate": 2.998883232129344e-07, "loss": 0.1981, "reward": 0.6646205633878708, "reward_std": 0.3603917881846428, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5239955633878708, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 1562.2813415527344, "epoch": 0.11201553282055111, "grad_norm": 5.419432163238525, "kl": 1.888671875, "learning_rate": 2.9988252361634273e-07, "loss": 0.1804, "reward": 0.6662946790456772, "reward_std": 0.34463248401880264, "rewards/accuracy_reward": 0.15625000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5100446715950966, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 1482.7210388183594, "epoch": 0.11231424090807259, "grad_norm": 2.9666051864624023, "kl": 1.615234375, "learning_rate": 2.9987657728078526e-07, "loss": 0.1943, "reward": 0.6735491305589676, "reward_std": 0.3480307459831238, "rewards/accuracy_reward": 0.1361607164144516, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5351562798023224, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 1579.0603637695312, "epoch": 0.11261294899559406, "grad_norm": 3.166823625564575, "kl": 1.580078125, "learning_rate": 2.9987048421273106e-07, "loss": 0.1738, "reward": 0.5915178805589676, "reward_std": 0.28363706916570663, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5245535895228386, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 1517.4152526855469, "epoch": 0.11291165708311553, "grad_norm": 3.627471685409546, "kl": 1.642578125, "learning_rate": 2.9986424441880875e-07, "loss": 0.2255, "reward": 0.6428571790456772, "reward_std": 0.3225829415023327, "rewards/accuracy_reward": 0.09375000721774995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.549107164144516, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 1561.3572082519531, "epoch": 0.113210365170637, "grad_norm": 2.843432903289795, "kl": 1.662109375, "learning_rate": 2.9985785790580655e-07, "loss": 0.197, "reward": 0.5948661044239998, "reward_std": 0.27914076298475266, "rewards/accuracy_reward": 0.07142857811413705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375223517418, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 1517.368408203125, "epoch": 0.11350907325815847, "grad_norm": 3.3939995765686035, "kl": 1.6064453125, "learning_rate": 2.998513246806725e-07, "loss": 0.2214, "reward": 0.667410746216774, "reward_std": 0.32855261862277985, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5580357313156128, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 1440.1786193847656, "epoch": 0.11380778134567994, "grad_norm": 2.6917545795440674, "kl": 1.716796875, "learning_rate": 2.9984464475051405e-07, "loss": 0.2494, "reward": 0.6869419813156128, "reward_std": 0.3111980929970741, "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5373884290456772, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 1387.0000610351562, "epoch": 0.11410648943320141, "grad_norm": 2.950944423675537, "kl": 2.22265625, "learning_rate": 2.9983781812259827e-07, "loss": 0.2471, "reward": 0.6729911118745804, "reward_std": 0.3647177666425705, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5368303805589676, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 1435.1161193847656, "epoch": 0.11440519752072287, "grad_norm": 2.7077085971832275, "kl": 1.986328125, "learning_rate": 2.9983084480435196e-07, "loss": 0.2197, "reward": 0.7137277126312256, "reward_std": 0.36425111442804337, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491305589676, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 1508.1473999023438, "epoch": 0.11470390560824434, "grad_norm": 3.151148796081543, "kl": 1.87890625, "learning_rate": 2.998237248033613e-07, "loss": 0.1489, "reward": 0.7031250298023224, "reward_std": 0.34465036541223526, "rewards/accuracy_reward": 0.16071429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5424107313156128, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 1437.8326416015625, "epoch": 0.11500261369576581, "grad_norm": 2.4404995441436768, "kl": 1.9375, "learning_rate": 2.9981645812737227e-07, "loss": 0.2186, "reward": 0.6623884290456772, "reward_std": 0.34111957997083664, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5396205633878708, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 1519.3661193847656, "epoch": 0.11530132178328728, "grad_norm": 2.5855777263641357, "kl": 2.21484375, "learning_rate": 2.998090447842901e-07, "loss": 0.2217, "reward": 0.6891741454601288, "reward_std": 0.30939196050167084, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.503906287252903, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1434.1183776855469, "epoch": 0.11560002987080875, "grad_norm": 3.036932945251465, "kl": 1.85546875, "learning_rate": 2.9980148478218004e-07, "loss": 0.2324, "reward": 0.6936384290456772, "reward_std": 0.3513480946421623, "rewards/accuracy_reward": 0.13616072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5574776902794838, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1407.8773193359375, "epoch": 0.11589873795833022, "grad_norm": 3.0851759910583496, "kl": 1.486328125, "learning_rate": 2.997937781292664e-07, "loss": 0.2083, "reward": 0.733816996216774, "reward_std": 0.3567340821027756, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5574777126312256, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1358.6719360351562, "epoch": 0.1161974460458517, "grad_norm": 3.8248660564422607, "kl": 1.427734375, "learning_rate": 2.997859248339334e-07, "loss": 0.2196, "reward": 0.7845982611179352, "reward_std": 0.3836229220032692, "rewards/accuracy_reward": 0.2008928693830967, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.581473246216774, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 1388.4219665527344, "epoch": 0.11649615413337316, "grad_norm": 4.422694683074951, "kl": 1.806640625, "learning_rate": 2.9977792490472467e-07, "loss": 0.2372, "reward": 0.7996652126312256, "reward_std": 0.3631875813007355, "rewards/accuracy_reward": 0.2299107201397419, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5675223469734192, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1349.26123046875, "epoch": 0.11679486222089464, "grad_norm": 3.1299147605895996, "kl": 1.763671875, "learning_rate": 2.997697783503433e-07, "loss": 0.2475, "reward": 0.707589328289032, "reward_std": 0.32892274111509323, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6205357313156128, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 1322.5580749511719, "epoch": 0.1170935703084161, "grad_norm": 3.8240954875946045, "kl": 1.3056640625, "learning_rate": 2.997614851796519e-07, "loss": 0.23, "reward": 0.7968750298023224, "reward_std": 0.2829844653606415, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250298023224, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 1491.0804443359375, "epoch": 0.11739227839593756, "grad_norm": 2.774972915649414, "kl": 2.11328125, "learning_rate": 2.997530454016726e-07, "loss": 0.2104, "reward": 0.640066996216774, "reward_std": 0.28809960186481476, "rewards/accuracy_reward": 0.04464285867288709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1358.1920471191406, "epoch": 0.11769098648345903, "grad_norm": 3.9586193561553955, "kl": 1.841796875, "learning_rate": 2.9974445902558726e-07, "loss": 0.2098, "reward": 0.761160746216774, "reward_std": 0.35367975383996964, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6183035969734192, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 1377.1741943359375, "epoch": 0.1179896945709805, "grad_norm": 3.2013566493988037, "kl": 1.498046875, "learning_rate": 2.9973572606073683e-07, "loss": 0.1628, "reward": 0.7706473469734192, "reward_std": 0.2902618646621704, "rewards/accuracy_reward": 0.11383929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6568080484867096, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 1335.0513916015625, "epoch": 0.11828840265850198, "grad_norm": 6.379864692687988, "kl": 2.13671875, "learning_rate": 2.997268465166219e-07, "loss": 0.2435, "reward": 0.8035714775323868, "reward_std": 0.3097067251801491, "rewards/accuracy_reward": 0.17187501024454832, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6316964477300644, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 1311.5000610351562, "epoch": 0.11858711074602345, "grad_norm": 2.713371753692627, "kl": 2.009765625, "learning_rate": 2.9971782040290273e-07, "loss": 0.223, "reward": 0.8191964626312256, "reward_std": 0.36446413397789, "rewards/accuracy_reward": 0.2142857313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 1387.9308776855469, "epoch": 0.11888581883354492, "grad_norm": 3.830652952194214, "kl": 2.380859375, "learning_rate": 2.9970864772939864e-07, "loss": 0.2792, "reward": 0.8270089626312256, "reward_std": 0.3692360520362854, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875298023224, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 1377.3772888183594, "epoch": 0.11918452692106639, "grad_norm": 3.348126173019409, "kl": 2.1953125, "learning_rate": 2.996993285060887e-07, "loss": 0.2505, "reward": 0.7438616454601288, "reward_std": 0.3239649534225464, "rewards/accuracy_reward": 0.13392857508733869, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6077009290456772, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 1428.9532165527344, "epoch": 0.11948323500858786, "grad_norm": 5.2568769454956055, "kl": 2.51953125, "learning_rate": 2.996898627431113e-07, "loss": 0.2504, "reward": 0.7193080633878708, "reward_std": 0.31437747925519943, "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6032366305589676, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 1505.96435546875, "epoch": 0.11978194309610933, "grad_norm": 2.043370485305786, "kl": 2.5390625, "learning_rate": 2.9968025045076425e-07, "loss": 0.2734, "reward": 0.7198661118745804, "reward_std": 0.3374958708882332, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.565848246216774, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 1337.5781860351562, "epoch": 0.1200806511836308, "grad_norm": 3.2711410522460938, "kl": 1.771484375, "learning_rate": 2.996704916395048e-07, "loss": 0.2188, "reward": 0.717075914144516, "reward_std": 0.3519558012485504, "rewards/accuracy_reward": 0.13616072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 1374.5447082519531, "epoch": 0.12037935927115227, "grad_norm": 2.6673266887664795, "kl": 1.5615234375, "learning_rate": 2.996605863199496e-07, "loss": 0.2038, "reward": 0.7232143133878708, "reward_std": 0.3205532133579254, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6227678805589676, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 1518.8103332519531, "epoch": 0.12067806735867373, "grad_norm": 2.293356418609619, "kl": 2.26171875, "learning_rate": 2.996505345028746e-07, "loss": 0.2313, "reward": 0.6495536118745804, "reward_std": 0.3396643251180649, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669642984867096, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 1557.3750610351562, "epoch": 0.1209767754461952, "grad_norm": 3.0904812812805176, "kl": 1.576171875, "learning_rate": 2.9964033619921527e-07, "loss": 0.1927, "reward": 0.6316964477300644, "reward_std": 0.29421214014291763, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321790456772, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 1404.4241943359375, "epoch": 0.12127548353371667, "grad_norm": 2.8422586917877197, "kl": 1.771484375, "learning_rate": 2.9962999142006625e-07, "loss": 0.2376, "reward": 0.727120578289032, "reward_std": 0.3317301347851753, "rewards/accuracy_reward": 0.1339285778813064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.593191996216774, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 1489.4442749023438, "epoch": 0.12157419162123814, "grad_norm": 2.544217109680176, "kl": 2.041015625, "learning_rate": 2.9961950017668183e-07, "loss": 0.2427, "reward": 0.7103795111179352, "reward_std": 0.31089963018894196, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5697544813156128, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 1486.24560546875, "epoch": 0.12187289970875961, "grad_norm": 2.5312297344207764, "kl": 2.18359375, "learning_rate": 2.996088624804753e-07, "loss": 0.2658, "reward": 0.737723246216774, "reward_std": 0.3728252872824669, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5636160969734192, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 1299.4777069091797, "epoch": 0.12217160779628108, "grad_norm": 4.738706588745117, "kl": 1.6337890625, "learning_rate": 2.9959807834301946e-07, "loss": 0.2531, "reward": 0.6914062798023224, "reward_std": 0.30162686482071877, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6065848469734192, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 1332.2076416015625, "epoch": 0.12247031588380256, "grad_norm": 1.7893487215042114, "kl": 1.59375, "learning_rate": 2.995871477760464e-07, "loss": 0.1929, "reward": 0.7405134290456772, "reward_std": 0.3456357754766941, "rewards/accuracy_reward": 0.10714286495931447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.633370578289032, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 1412.3438110351562, "epoch": 0.12276902397132403, "grad_norm": 3.3927247524261475, "kl": 1.49609375, "learning_rate": 2.995760707914476e-07, "loss": 0.2272, "reward": 0.6902902275323868, "reward_std": 0.3199015110731125, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6300223469734192, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 1266.3594360351562, "epoch": 0.1230677320588455, "grad_norm": 3.0701894760131836, "kl": 1.607421875, "learning_rate": 2.9956484740127357e-07, "loss": 0.2289, "reward": 0.8013393133878708, "reward_std": 0.3370378911495209, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.658482164144516, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 1345.1451416015625, "epoch": 0.12336644014636697, "grad_norm": 2.62039852142334, "kl": 1.5322265625, "learning_rate": 2.9955347761773437e-07, "loss": 0.2635, "reward": 0.7165178805589676, "reward_std": 0.3147599399089813, "rewards/accuracy_reward": 0.06473214784637094, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6495535969734192, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 1045.5848541259766, "epoch": 0.12366514823388844, "grad_norm": 6.996425628662109, "kl": 1.765625, "learning_rate": 2.995419614531992e-07, "loss": 0.2644, "reward": 0.9207589626312256, "reward_std": 0.3174304738640785, "rewards/accuracy_reward": 0.2321428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6886160969734192, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 1091.7031860351562, "epoch": 0.1239638563214099, "grad_norm": 4.35817813873291, "kl": 1.185546875, "learning_rate": 2.995302989201965e-07, "loss": 0.1709, "reward": 0.9023438096046448, "reward_std": 0.33790676295757294, "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.694754496216774, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 1338.6942138671875, "epoch": 0.12426256440893137, "grad_norm": 1.7343546152114868, "kl": 1.2275390625, "learning_rate": 2.9951849003141386e-07, "loss": 0.2253, "reward": 0.7790178805589676, "reward_std": 0.32955098897218704, "rewards/accuracy_reward": 0.11830357764847577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6607143133878708, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 1199.5223693847656, "epoch": 0.12456127249645284, "grad_norm": 3.7931525707244873, "kl": 1.568359375, "learning_rate": 2.995065347996984e-07, "loss": 0.1983, "reward": 0.8498884290456772, "reward_std": 0.30018650740385056, "rewards/accuracy_reward": 0.15401786426082253, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6936384439468384, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 1253.3214721679688, "epoch": 0.12485998058397431, "grad_norm": 5.7125325202941895, "kl": 2.126953125, "learning_rate": 2.994944332380561e-07, "loss": 0.263, "reward": 0.7829241454601288, "reward_std": 0.3524812161922455, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.640066996216774, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 1235.5804138183594, "epoch": 0.1251586886714958, "grad_norm": 6.179095268249512, "kl": 2.931640625, "learning_rate": 2.994821853596523e-07, "loss": 0.2533, "reward": 0.7873884290456772, "reward_std": 0.3245573788881302, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6757812798023224, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 1313.2880249023438, "epoch": 0.12545739675901724, "grad_norm": 6.823179244995117, "kl": 2.291015625, "learning_rate": 2.9946979117781154e-07, "loss": 0.2912, "reward": 0.784598246216774, "reward_std": 0.28172187879681587, "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6417411118745804, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 1274.58935546875, "epoch": 0.1257561048465387, "grad_norm": 7.143161773681641, "kl": 2.19921875, "learning_rate": 2.994572507060174e-07, "loss": 0.2526, "reward": 0.8777902126312256, "reward_std": 0.36292172223329544, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.654575914144516, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 1363.7210083007812, "epoch": 0.12605481293406018, "grad_norm": 7.014333248138428, "kl": 2.84765625, "learning_rate": 2.9944456395791276e-07, "loss": 0.3051, "reward": 0.7488839626312256, "reward_std": 0.3381282389163971, "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.606026828289032, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 1318.3527526855469, "epoch": 0.12635352102158165, "grad_norm": 2.9608542919158936, "kl": 1.986328125, "learning_rate": 2.994317309472994e-07, "loss": 0.2421, "reward": 0.7047991454601288, "reward_std": 0.31410888582468033, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 1362.9576721191406, "epoch": 0.12665222910910312, "grad_norm": 3.1645426750183105, "kl": 1.8564453125, "learning_rate": 2.9941875168813866e-07, "loss": 0.2437, "reward": 0.766183078289032, "reward_std": 0.3352186903357506, "rewards/accuracy_reward": 0.15625000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609933078289032, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 1476.6317749023438, "epoch": 0.1269509371966246, "grad_norm": 2.1904468536376953, "kl": 2.0859375, "learning_rate": 2.994056261945504e-07, "loss": 0.2515, "reward": 0.7098214626312256, "reward_std": 0.3141253888607025, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6071428805589676, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 1487.1451416015625, "epoch": 0.12724964528414606, "grad_norm": 1.9513980150222778, "kl": 1.927734375, "learning_rate": 2.993923544808141e-07, "loss": 0.2671, "reward": 0.7667410969734192, "reward_std": 0.3774949535727501, "rewards/accuracy_reward": 0.17857143934816122, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5859375298023224, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 1382.7545166015625, "epoch": 0.12754835337166753, "grad_norm": 2.372096300125122, "kl": 1.638671875, "learning_rate": 2.9937893656136794e-07, "loss": 0.2052, "reward": 0.7555803805589676, "reward_std": 0.34193113446235657, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6194196790456772, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1456.4889221191406, "epoch": 0.127847061459189, "grad_norm": 4.8316545486450195, "kl": 1.591796875, "learning_rate": 2.9936537245080936e-07, "loss": 0.2574, "reward": 0.694754496216774, "reward_std": 0.2907973788678646, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116156578064, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 1487.5000915527344, "epoch": 0.12814576954671048, "grad_norm": 4.3105878829956055, "kl": 1.95703125, "learning_rate": 2.9935166216389477e-07, "loss": 0.2551, "reward": 0.7170759290456772, "reward_std": 0.3577748313546181, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965402126312256, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 1517.12060546875, "epoch": 0.12844447763423195, "grad_norm": 2.8182740211486816, "kl": 2.1953125, "learning_rate": 2.9933780571553967e-07, "loss": 0.2764, "reward": 0.6662946790456772, "reward_std": 0.32121002674102783, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 1433.6072387695312, "epoch": 0.12874318572175342, "grad_norm": 2.858734369277954, "kl": 2.130859375, "learning_rate": 2.9932380312081844e-07, "loss": 0.2799, "reward": 0.7912946939468384, "reward_std": 0.3506431356072426, "rewards/accuracy_reward": 0.17857143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.612723246216774, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 1403.9018249511719, "epoch": 0.1290418938092749, "grad_norm": 1.4947850704193115, "kl": 2.2421875, "learning_rate": 2.9930965439496454e-07, "loss": 0.2622, "reward": 0.7338170111179352, "reward_std": 0.31123989820480347, "rewards/accuracy_reward": 0.13839286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 1473.76123046875, "epoch": 0.12934060189679636, "grad_norm": 2.8177647590637207, "kl": 2.275390625, "learning_rate": 2.992953595533704e-07, "loss": 0.2115, "reward": 0.6545759290456772, "reward_std": 0.32292766124010086, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5943080633878708, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 1491.3638916015625, "epoch": 0.12963930998431783, "grad_norm": 2.598900079727173, "kl": 2.3203125, "learning_rate": 2.9928091861158755e-07, "loss": 0.2579, "reward": 0.7539062798023224, "reward_std": 0.3615180626511574, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6132812798023224, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 1544.49560546875, "epoch": 0.1299380180718393, "grad_norm": 2.0301551818847656, "kl": 2.44921875, "learning_rate": 2.992663315853261e-07, "loss": 0.2667, "reward": 0.7723214626312256, "reward_std": 0.3392820432782173, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5647321790456772, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 1476.0848999023438, "epoch": 0.13023672615936077, "grad_norm": 2.2324626445770264, "kl": 2.27734375, "learning_rate": 2.992515984904554e-07, "loss": 0.2179, "reward": 0.652901828289032, "reward_std": 0.2994166538119316, "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6060268133878708, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1519.4978332519531, "epoch": 0.13053543424688224, "grad_norm": 3.1724884510040283, "kl": 2.93359375, "learning_rate": 2.992367193430036e-07, "loss": 0.2943, "reward": 0.620535746216774, "reward_std": 0.29687266051769257, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678805589676, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 1600.1607971191406, "epoch": 0.1308341423344037, "grad_norm": 4.284750461578369, "kl": 2.66796875, "learning_rate": 2.9922169415915776e-07, "loss": 0.2396, "reward": 0.7594866454601288, "reward_std": 0.3693241477012634, "rewards/accuracy_reward": 0.19419643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 1537.2947082519531, "epoch": 0.13113285042192518, "grad_norm": 1.6038237810134888, "kl": 2.125, "learning_rate": 2.9920652295526393e-07, "loss": 0.2367, "reward": 0.6875000298023224, "reward_std": 0.3186362683773041, "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964477300644, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1492.9732971191406, "epoch": 0.13143155850944666, "grad_norm": 2.247159719467163, "kl": 2.076171875, "learning_rate": 2.9919120574782665e-07, "loss": 0.2215, "reward": 0.723214328289032, "reward_std": 0.30596308410167694, "rewards/accuracy_reward": 0.1629464323632419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678656578064, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 1558.2009582519531, "epoch": 0.13173026659696813, "grad_norm": 2.136239767074585, "kl": 2.083984375, "learning_rate": 2.991757425535097e-07, "loss": 0.2089, "reward": 0.6495536118745804, "reward_std": 0.303979255259037, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000149011612, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 1490.5759887695312, "epoch": 0.13202897468448957, "grad_norm": 5.237728595733643, "kl": 1.80078125, "learning_rate": 2.991601333891355e-07, "loss": 0.2633, "reward": 0.6696428805589676, "reward_std": 0.28168610483407974, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825892984867096, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 1442.1406860351562, "epoch": 0.13232768277201104, "grad_norm": 3.183319091796875, "kl": 1.904296875, "learning_rate": 2.991443782716853e-07, "loss": 0.2371, "reward": 0.7293527126312256, "reward_std": 0.3332124873995781, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241454601288, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 1538.7009582519531, "epoch": 0.1326263908595325, "grad_norm": 4.083923816680908, "kl": 1.703125, "learning_rate": 2.9912847721829906e-07, "loss": 0.2016, "reward": 0.6417410969734192, "reward_std": 0.29859398305416107, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5591518133878708, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 1472.3215026855469, "epoch": 0.13292509894705398, "grad_norm": 4.432039260864258, "kl": 1.810546875, "learning_rate": 2.9911243024627563e-07, "loss": 0.2407, "reward": 0.8097098469734192, "reward_std": 0.31061072275042534, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205633878708, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1472.6540832519531, "epoch": 0.13322380703457545, "grad_norm": 2.0080933570861816, "kl": 2.111328125, "learning_rate": 2.990962373730725e-07, "loss": 0.2583, "reward": 0.734933078289032, "reward_std": 0.3173089399933815, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.592075914144516, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 1365.0425109863281, "epoch": 0.13352251512209692, "grad_norm": 2.7629024982452393, "kl": 1.748046875, "learning_rate": 2.99079898616306e-07, "loss": 0.2234, "reward": 0.8270089626312256, "reward_std": 0.32735301554203033, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.628348246216774, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 1323.1139221191406, "epoch": 0.1338212232096184, "grad_norm": 2.2673027515411377, "kl": 2.021484375, "learning_rate": 2.99063413993751e-07, "loss": 0.2461, "reward": 0.7332589626312256, "reward_std": 0.3513525575399399, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6372768133878708, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 1417.0871276855469, "epoch": 0.13411993129713987, "grad_norm": 1.7159249782562256, "kl": 2.36328125, "learning_rate": 2.9904678352334126e-07, "loss": 0.2742, "reward": 0.7572545111179352, "reward_std": 0.3164460062980652, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.632254496216774, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1237.2053833007812, "epoch": 0.13441863938466134, "grad_norm": 3.790372133255005, "kl": 2.06640625, "learning_rate": 2.99030007223169e-07, "loss": 0.3191, "reward": 0.7773437649011612, "reward_std": 0.3004034608602524, "rewards/accuracy_reward": 0.09598214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6813616454601288, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 1358.3661193847656, "epoch": 0.1347173474721828, "grad_norm": 3.7145116329193115, "kl": 2.537109375, "learning_rate": 2.990130851114852e-07, "loss": 0.2787, "reward": 0.7723214775323868, "reward_std": 0.3282766714692116, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138393133878708, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 1317.5469360351562, "epoch": 0.13501605555970428, "grad_norm": 5.955320358276367, "kl": 2.37890625, "learning_rate": 2.9899601720669957e-07, "loss": 0.2543, "reward": 0.814732164144516, "reward_std": 0.33534230291843414, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.651785746216774, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 1369.7902526855469, "epoch": 0.13531476364722575, "grad_norm": 2.130039930343628, "kl": 2.123046875, "learning_rate": 2.9897880352738023e-07, "loss": 0.2467, "reward": 0.777901828289032, "reward_std": 0.29524267464876175, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6261160969734192, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 1370.79248046875, "epoch": 0.13561347173474722, "grad_norm": 3.1182384490966797, "kl": 1.90234375, "learning_rate": 2.9896144409225393e-07, "loss": 0.225, "reward": 0.7879464626312256, "reward_std": 0.3365172669291496, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6383928954601288, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 1417.43310546875, "epoch": 0.1359121798222687, "grad_norm": 2.6194005012512207, "kl": 2.30078125, "learning_rate": 2.989439389202061e-07, "loss": 0.2548, "reward": 0.9051339626312256, "reward_std": 0.3255029618740082, "rewards/accuracy_reward": 0.2790178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6261160969734192, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1495.43310546875, "epoch": 0.13621088790979016, "grad_norm": 1.9273591041564941, "kl": 2.3671875, "learning_rate": 2.989262880302807e-07, "loss": 0.2027, "reward": 0.7656250447034836, "reward_std": 0.40861862152814865, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.611607164144516, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 1381.2389221191406, "epoch": 0.13650959599731163, "grad_norm": 2.0748836994171143, "kl": 2.3125, "learning_rate": 2.989084914416801e-07, "loss": 0.2673, "reward": 0.7154018133878708, "reward_std": 0.3127022385597229, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6216518133878708, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 1454.3058471679688, "epoch": 0.1368083040848331, "grad_norm": 2.8076279163360596, "kl": 2.046875, "learning_rate": 2.988905491737652e-07, "loss": 0.227, "reward": 0.8482143431901932, "reward_std": 0.3125644214451313, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464328289032, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 1574.5000610351562, "epoch": 0.13710701217235458, "grad_norm": 1.7345948219299316, "kl": 2.4453125, "learning_rate": 2.9887246124605546e-07, "loss": 0.2543, "reward": 0.6729910969734192, "reward_std": 0.35663987696170807, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.588169664144516, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 1377.4643859863281, "epoch": 0.13740572025987605, "grad_norm": 6.331684112548828, "kl": 1.701171875, "learning_rate": 2.988542276782289e-07, "loss": 0.2617, "reward": 0.7600446790456772, "reward_std": 0.3163454309105873, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875298023224, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 1446.9241943359375, "epoch": 0.13770442834739752, "grad_norm": 1.4992908239364624, "kl": 1.994140625, "learning_rate": 2.9883584849012166e-07, "loss": 0.2066, "reward": 0.8783482611179352, "reward_std": 0.33525440841913223, "rewards/accuracy_reward": 0.28125000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5970982313156128, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 1474.3326416015625, "epoch": 0.138003136434919, "grad_norm": 2.6561524868011475, "kl": 2.310546875, "learning_rate": 2.9881732370172863e-07, "loss": 0.2223, "reward": 0.713169664144516, "reward_std": 0.3104434460401535, "rewards/accuracy_reward": 0.13169643562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732313156128, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 1490.7009582519531, "epoch": 0.13830184452244043, "grad_norm": 2.9729437828063965, "kl": 2.083984375, "learning_rate": 2.987986533332029e-07, "loss": 0.2483, "reward": 0.7031250298023224, "reward_std": 0.2886030375957489, "rewards/accuracy_reward": 0.1272321476135403, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 1423.5201416015625, "epoch": 0.1386005526099619, "grad_norm": 1.909359335899353, "kl": 1.75, "learning_rate": 2.987798374048561e-07, "loss": 0.2105, "reward": 0.8515625298023224, "reward_std": 0.3628368452191353, "rewards/accuracy_reward": 0.22991072619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6216517984867096, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 1495.4710693359375, "epoch": 0.13889926069748337, "grad_norm": 2.0928354263305664, "kl": 1.947265625, "learning_rate": 2.9876087593715795e-07, "loss": 0.2294, "reward": 0.7790178954601288, "reward_std": 0.32921967282891273, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678954601288, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 1371.2790832519531, "epoch": 0.13919796878500484, "grad_norm": 3.243940591812134, "kl": 2.35546875, "learning_rate": 2.987417689507368e-07, "loss": 0.2269, "reward": 0.6902901977300644, "reward_std": 0.2845025658607483, "rewards/accuracy_reward": 0.10044642956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 1488.7969360351562, "epoch": 0.13949667687252632, "grad_norm": 3.2148289680480957, "kl": 2.26953125, "learning_rate": 2.987225164663791e-07, "loss": 0.2358, "reward": 0.7020089626312256, "reward_std": 0.33655861020088196, "rewards/accuracy_reward": 0.13169643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125298023224, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 1465.3438110351562, "epoch": 0.13979538496004779, "grad_norm": 3.64693546295166, "kl": 2.265625, "learning_rate": 2.9870311850502966e-07, "loss": 0.2216, "reward": 0.6696428954601288, "reward_std": 0.29757318645715714, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 1432.7500610351562, "epoch": 0.14009409304756926, "grad_norm": 3.8039207458496094, "kl": 2.169921875, "learning_rate": 2.9868357508779165e-07, "loss": 0.2391, "reward": 0.6891741305589676, "reward_std": 0.3356279358267784, "rewards/accuracy_reward": 0.11607143585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731027126312256, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 1415.54248046875, "epoch": 0.14039280113509073, "grad_norm": 4.670017719268799, "kl": 2.326171875, "learning_rate": 2.9866388623592633e-07, "loss": 0.228, "reward": 0.805245578289032, "reward_std": 0.3614969104528427, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6199776977300644, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 1426.4063110351562, "epoch": 0.1406915092226122, "grad_norm": 2.1187686920166016, "kl": 1.921875, "learning_rate": 2.9864405197085324e-07, "loss": 0.2045, "reward": 0.705357164144516, "reward_std": 0.3319389671087265, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 1447.1295471191406, "epoch": 0.14099021731013367, "grad_norm": 2.9662201404571533, "kl": 1.755859375, "learning_rate": 2.986240723141502e-07, "loss": 0.2085, "reward": 0.8370535969734192, "reward_std": 0.33054159581661224, "rewards/accuracy_reward": 0.25892858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 1456.2679138183594, "epoch": 0.14128892539765514, "grad_norm": 3.0304224491119385, "kl": 1.70703125, "learning_rate": 2.98603947287553e-07, "loss": 0.213, "reward": 0.6824777126312256, "reward_std": 0.32545629143714905, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6043527275323868, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 1503.4331359863281, "epoch": 0.1415876334851766, "grad_norm": 1.9852255582809448, "kl": 2.109375, "learning_rate": 2.9858367691295573e-07, "loss": 0.2318, "reward": 0.7098214477300644, "reward_std": 0.3094250075519085, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5825893133878708, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 1431.9710693359375, "epoch": 0.14188634157269808, "grad_norm": 3.0506951808929443, "kl": 1.845703125, "learning_rate": 2.9856326121241063e-07, "loss": 0.2147, "reward": 0.6936384290456772, "reward_std": 0.31963609904050827, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.608816996216774, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 1460.8839721679688, "epoch": 0.14218504966021955, "grad_norm": 1.5436222553253174, "kl": 2.0546875, "learning_rate": 2.9854270020812793e-07, "loss": 0.2148, "reward": 0.7059152275323868, "reward_std": 0.3052345737814903, "rewards/accuracy_reward": 0.10267857881262898, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366305589676, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 1532.8773193359375, "epoch": 0.14248375774774102, "grad_norm": 1.6293562650680542, "kl": 1.947265625, "learning_rate": 2.9852199392247593e-07, "loss": 0.197, "reward": 0.7516741454601288, "reward_std": 0.2945735417306423, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5842634290456772, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 1337.4732666015625, "epoch": 0.1427824658352625, "grad_norm": 2.5717222690582275, "kl": 1.58984375, "learning_rate": 2.9850114237798114e-07, "loss": 0.184, "reward": 0.8175223618745804, "reward_std": 0.3447072058916092, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6344866156578064, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 1343.8572082519531, "epoch": 0.14308117392278397, "grad_norm": 2.003377914428711, "kl": 1.87890625, "learning_rate": 2.98480145597328e-07, "loss": 0.2535, "reward": 0.7092634290456772, "reward_std": 0.299997054040432, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6445312798023224, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 1362.8839721679688, "epoch": 0.14337988201030544, "grad_norm": 1.606672763824463, "kl": 1.923828125, "learning_rate": 2.984590036033589e-07, "loss": 0.2107, "reward": 0.7622768133878708, "reward_std": 0.2912249192595482, "rewards/accuracy_reward": 0.13616072060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6261160969734192, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 1503.2545471191406, "epoch": 0.1436785900978269, "grad_norm": 4.464453220367432, "kl": 2.50390625, "learning_rate": 2.9843771641907425e-07, "loss": 0.2224, "reward": 0.7539062798023224, "reward_std": 0.3230953738093376, "rewards/accuracy_reward": 0.16071429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919813156128, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 1352.1830749511719, "epoch": 0.14397729818534838, "grad_norm": 1.6058191061019897, "kl": 1.689453125, "learning_rate": 2.984162840676324e-07, "loss": 0.1753, "reward": 0.86495541036129, "reward_std": 0.32333026081323624, "rewards/accuracy_reward": 0.2410714440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6238839626312256, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 1357.3884582519531, "epoch": 0.14427600627286985, "grad_norm": 1.9830535650253296, "kl": 1.681640625, "learning_rate": 2.9839470657234973e-07, "loss": 0.1707, "reward": 0.7293527126312256, "reward_std": 0.31283652037382126, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6266741305589676, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 1372.247802734375, "epoch": 0.14457471436039132, "grad_norm": 1.470196008682251, "kl": 1.98046875, "learning_rate": 2.983729839567004e-07, "loss": 0.2035, "reward": 0.8783482611179352, "reward_std": 0.3825588598847389, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.637276828289032, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 1450.2165832519531, "epoch": 0.14487342244791276, "grad_norm": 2.305537700653076, "kl": 1.47265625, "learning_rate": 2.983511162443165e-07, "loss": 0.176, "reward": 0.762276828289032, "reward_std": 0.344062153249979, "rewards/accuracy_reward": 0.14955357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6127232313156128, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 1529.2076416015625, "epoch": 0.14517213053543424, "grad_norm": 2.2607555389404297, "kl": 1.904296875, "learning_rate": 2.9832910345898786e-07, "loss": 0.1843, "reward": 0.6523437649011612, "reward_std": 0.2779325693845749, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6010044813156128, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 1223.8527221679688, "epoch": 0.1454708386229557, "grad_norm": 2.503504514694214, "kl": 1.626953125, "learning_rate": 2.983069456246624e-07, "loss": 0.1947, "reward": 0.7399553954601288, "reward_std": 0.2961529679596424, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6439732611179352, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 1319.1205749511719, "epoch": 0.14576954671047718, "grad_norm": 2.1653828620910645, "kl": 1.740234375, "learning_rate": 2.9828464276544557e-07, "loss": 0.2357, "reward": 0.6863839477300644, "reward_std": 0.30332938581705093, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5970982313156128, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 1324.4063110351562, "epoch": 0.14606825479799865, "grad_norm": 1.760551929473877, "kl": 1.65625, "learning_rate": 2.9826219490560066e-07, "loss": 0.1855, "reward": 0.7784598469734192, "reward_std": 0.33888837695121765, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6356027126312256, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1327.1920166015625, "epoch": 0.14636696288552012, "grad_norm": 3.2158713340759277, "kl": 1.810546875, "learning_rate": 2.9823960206954886e-07, "loss": 0.1979, "reward": 0.714285746216774, "reward_std": 0.3110659345984459, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6383928805589676, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 1379.0090026855469, "epoch": 0.1466656709730416, "grad_norm": 2.3065526485443115, "kl": 1.681640625, "learning_rate": 2.982168642818689e-07, "loss": 0.171, "reward": 0.7299107313156128, "reward_std": 0.3081848993897438, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000447034836, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 1212.9063110351562, "epoch": 0.14696437906056306, "grad_norm": 8.987589836120605, "kl": 1.544921875, "learning_rate": 2.981939815672972e-07, "loss": 0.2898, "reward": 0.8794643133878708, "reward_std": 0.33471082150936127, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6696428805589676, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 1389.4063110351562, "epoch": 0.14726308714808453, "grad_norm": 6.335673809051514, "kl": 2.384765625, "learning_rate": 2.98170953950728e-07, "loss": 0.1985, "reward": 0.7393973469734192, "reward_std": 0.28198250383138657, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6077009290456772, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 1201.5023040771484, "epoch": 0.147561795235606, "grad_norm": 2.852086305618286, "kl": 1.8046875, "learning_rate": 2.9814778145721304e-07, "loss": 0.2072, "reward": 0.7527902275323868, "reward_std": 0.2894596941769123, "rewards/accuracy_reward": 0.12946428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6233258992433548, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 1402.9107971191406, "epoch": 0.14786050332312747, "grad_norm": 4.080699920654297, "kl": 2.23828125, "learning_rate": 2.981244641119617e-07, "loss": 0.2156, "reward": 0.7070312947034836, "reward_std": 0.32959430664777756, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.617745578289032, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 1381.37060546875, "epoch": 0.14815921141064894, "grad_norm": 3.3376662731170654, "kl": 2.119140625, "learning_rate": 2.98101001940341e-07, "loss": 0.2216, "reward": 0.7165178805589676, "reward_std": 0.2980485185980797, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183035969734192, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 1433.62060546875, "epoch": 0.14845791949817042, "grad_norm": 2.714540481567383, "kl": 2.04296875, "learning_rate": 2.9807739496787554e-07, "loss": 0.1976, "reward": 0.7901786118745804, "reward_std": 0.27919070050120354, "rewards/accuracy_reward": 0.19866071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 1389.6942138671875, "epoch": 0.1487566275856919, "grad_norm": 1.7343618869781494, "kl": 1.54296875, "learning_rate": 2.9805364322024734e-07, "loss": 0.1636, "reward": 0.828683078289032, "reward_std": 0.39550189673900604, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.607700914144516, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 1327.3125610351562, "epoch": 0.14905533567321336, "grad_norm": 2.714236259460449, "kl": 1.75390625, "learning_rate": 2.98029746723296e-07, "loss": 0.2149, "reward": 0.7622768133878708, "reward_std": 0.37350621074438095, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303805589676, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1366.40185546875, "epoch": 0.14935404376073483, "grad_norm": 3.591031551361084, "kl": 1.4453125, "learning_rate": 2.980057055030185e-07, "loss": 0.1899, "reward": 0.6841518133878708, "reward_std": 0.3181386925280094, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5970982313156128, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 1367.7835083007812, "epoch": 0.1496527518482563, "grad_norm": 4.561177730560303, "kl": 1.357421875, "learning_rate": 2.9798151958556936e-07, "loss": 0.1986, "reward": 0.679129496216774, "reward_std": 0.3062707781791687, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366454601288, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 1437.9531860351562, "epoch": 0.14995145993577777, "grad_norm": 4.7488298416137695, "kl": 1.599609375, "learning_rate": 2.9795718899726056e-07, "loss": 0.1906, "reward": 0.6434152126312256, "reward_std": 0.29583151638507843, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585379496216774, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 1430.7634582519531, "epoch": 0.15025016802329924, "grad_norm": 1.3406668901443481, "kl": 1.447265625, "learning_rate": 2.979327137645613e-07, "loss": 0.178, "reward": 0.7410714626312256, "reward_std": 0.35077981650829315, "rewards/accuracy_reward": 0.14062500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 1328.85498046875, "epoch": 0.1505488761108207, "grad_norm": 2.184772253036499, "kl": 1.6328125, "learning_rate": 2.9790809391409823e-07, "loss": 0.19, "reward": 0.7393973618745804, "reward_std": 0.2921490743756294, "rewards/accuracy_reward": 0.13616071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366305589676, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 1266.4799499511719, "epoch": 0.15084758419834218, "grad_norm": 1.9945482015609741, "kl": 1.486328125, "learning_rate": 2.9788332947265547e-07, "loss": 0.203, "reward": 0.8694196939468384, "reward_std": 0.41108542680740356, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375298023224, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 1422.8236999511719, "epoch": 0.15114629228586363, "grad_norm": 1.9457911252975464, "kl": 1.810546875, "learning_rate": 2.978584204671741e-07, "loss": 0.1947, "reward": 0.7081473469734192, "reward_std": 0.31472300738096237, "rewards/accuracy_reward": 0.10714286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6010044813156128, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 1362.9219360351562, "epoch": 0.1514450003733851, "grad_norm": 2.030211925506592, "kl": 1.71875, "learning_rate": 2.978333669247529e-07, "loss": 0.1989, "reward": 0.7354910969734192, "reward_std": 0.32736585289239883, "rewards/accuracy_reward": 0.16517857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125298023224, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 1304.8170166015625, "epoch": 0.15174370846090657, "grad_norm": 4.463720321655273, "kl": 2.109375, "learning_rate": 2.978081688726475e-07, "loss": 0.206, "reward": 0.8537946939468384, "reward_std": 0.3218907192349434, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603794664144516, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 1397.4040832519531, "epoch": 0.15204241654842804, "grad_norm": 1.5174646377563477, "kl": 2.0078125, "learning_rate": 2.9778282633827096e-07, "loss": 0.1881, "reward": 0.670200914144516, "reward_std": 0.2925293743610382, "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585379496216774, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 1428.5536499023438, "epoch": 0.1523411246359495, "grad_norm": 1.2803972959518433, "kl": 1.892578125, "learning_rate": 2.977573393491935e-07, "loss": 0.1894, "reward": 0.6707589626312256, "reward_std": 0.3134211078286171, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6060267984867096, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 1344.5603637695312, "epoch": 0.15263983272347098, "grad_norm": 3.6982853412628174, "kl": 1.482421875, "learning_rate": 2.977317079331424e-07, "loss": 0.1964, "reward": 0.8627232611179352, "reward_std": 0.28600042685866356, "rewards/accuracy_reward": 0.2544642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6082589477300644, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 1327.87060546875, "epoch": 0.15293854081099245, "grad_norm": 1.7933217287063599, "kl": 1.826171875, "learning_rate": 2.9770593211800217e-07, "loss": 0.2125, "reward": 0.808035746216774, "reward_std": 0.3185873404145241, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750298023224, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 1423.1161499023438, "epoch": 0.15323724889851392, "grad_norm": 1.6344956159591675, "kl": 1.943359375, "learning_rate": 2.976800119318144e-07, "loss": 0.2071, "reward": 0.6573661118745804, "reward_std": 0.28831013292074203, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.594866082072258, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 1213.0245971679688, "epoch": 0.1535359569860354, "grad_norm": 2.162921905517578, "kl": 1.76953125, "learning_rate": 2.9765394740277776e-07, "loss": 0.215, "reward": 0.7633928954601288, "reward_std": 0.3459802493453026, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138392984867096, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 1362.6808776855469, "epoch": 0.15383466507355686, "grad_norm": 1.8100570440292358, "kl": 1.998046875, "learning_rate": 2.9762773855924773e-07, "loss": 0.2038, "reward": 0.7215401977300644, "reward_std": 0.3487880006432533, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6121652126312256, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 1367.6183471679688, "epoch": 0.15413337316107834, "grad_norm": 1.9195843935012817, "kl": 1.962890625, "learning_rate": 2.976013854297371e-07, "loss": 0.2218, "reward": 0.616629496216774, "reward_std": 0.28701673448085785, "rewards/accuracy_reward": 0.031250001629814506, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5853794813156128, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 1219.5246276855469, "epoch": 0.1544320812485998, "grad_norm": 1.9982043504714966, "kl": 1.78515625, "learning_rate": 2.9757488804291543e-07, "loss": 0.2341, "reward": 0.8097098618745804, "reward_std": 0.3340122327208519, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6132812798023224, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 1239.1250305175781, "epoch": 0.15473078933612128, "grad_norm": 3.315272331237793, "kl": 1.6640625, "learning_rate": 2.9754824642760934e-07, "loss": 0.2096, "reward": 0.7812500298023224, "reward_std": 0.26124628260731697, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.636160746216774, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 1250.2522888183594, "epoch": 0.15502949742364275, "grad_norm": 2.2030951976776123, "kl": 1.671875, "learning_rate": 2.9752146061280227e-07, "loss": 0.1763, "reward": 0.784598246216774, "reward_std": 0.3502102643251419, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603794664144516, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 1420.4085388183594, "epoch": 0.15532820551116422, "grad_norm": 3.437328577041626, "kl": 1.921875, "learning_rate": 2.9749453062763455e-07, "loss": 0.1254, "reward": 0.6484375149011612, "reward_std": 0.2865727096796036, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5725446790456772, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 1293.1920166015625, "epoch": 0.1556269135986857, "grad_norm": 1.4827286005020142, "kl": 1.806640625, "learning_rate": 2.9746745650140334e-07, "loss": 0.1693, "reward": 0.6735491305589676, "reward_std": 0.28130095452070236, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.602120578289032, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 1178.5848999023438, "epoch": 0.15592562168620716, "grad_norm": 1.7537493705749512, "kl": 1.505859375, "learning_rate": 2.974402382635626e-07, "loss": 0.1795, "reward": 0.7198660969734192, "reward_std": 0.2923256456851959, "rewards/accuracy_reward": 0.1227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5970982313156128, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 1236.4866638183594, "epoch": 0.15622432977372863, "grad_norm": 3.3373425006866455, "kl": 1.716796875, "learning_rate": 2.974128759437233e-07, "loss": 0.2361, "reward": 0.765066996216774, "reward_std": 0.3157125487923622, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.602120578289032, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 1259.9777221679688, "epoch": 0.1565230378612501, "grad_norm": 2.638411283493042, "kl": 1.46484375, "learning_rate": 2.9738536957165277e-07, "loss": 0.1795, "reward": 0.7968750447034836, "reward_std": 0.34639694541692734, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.611607164144516, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 1267.997802734375, "epoch": 0.15682174594877157, "grad_norm": 3.3475325107574463, "kl": 1.3984375, "learning_rate": 2.973577191772753e-07, "loss": 0.2072, "reward": 0.7421875298023224, "reward_std": 0.30015380680561066, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603794664144516, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 1372.7523193359375, "epoch": 0.15712045403629304, "grad_norm": 2.1782102584838867, "kl": 1.88671875, "learning_rate": 2.9732992479067187e-07, "loss": 0.1857, "reward": 0.734933078289032, "reward_std": 0.3190823569893837, "rewards/accuracy_reward": 0.13169643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366305589676, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 1337.4152526855469, "epoch": 0.15741916212381452, "grad_norm": 1.9678608179092407, "kl": 1.798828125, "learning_rate": 2.973019864420799e-07, "loss": 0.2078, "reward": 0.6774553954601288, "reward_std": 0.28822486102581024, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5904018133878708, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 1311.7634582519531, "epoch": 0.15771787021133596, "grad_norm": 1.7255126237869263, "kl": 2.1328125, "learning_rate": 2.972739041618937e-07, "loss": 0.2551, "reward": 0.7472098618745804, "reward_std": 0.3405918553471565, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.553013414144516, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 1402.6495666503906, "epoch": 0.15801657829885743, "grad_norm": 1.9471229314804077, "kl": 2.048828125, "learning_rate": 2.972456779806641e-07, "loss": 0.2228, "reward": 0.6953125298023224, "reward_std": 0.31477034091949463, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.559151828289032, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 1369.2813110351562, "epoch": 0.1583152863863789, "grad_norm": 2.490185260772705, "kl": 2.04296875, "learning_rate": 2.972173079290982e-07, "loss": 0.2282, "reward": 0.6640625149011612, "reward_std": 0.3126770183444023, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.565848246216774, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 1250.1607666015625, "epoch": 0.15861399447390037, "grad_norm": 2.333564281463623, "kl": 2.1484375, "learning_rate": 2.9718879403806e-07, "loss": 0.2427, "reward": 0.7360491305589676, "reward_std": 0.3302226811647415, "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 1330.0692749023438, "epoch": 0.15891270256142184, "grad_norm": 4.742910861968994, "kl": 2.541015625, "learning_rate": 2.971601363385698e-07, "loss": 0.2345, "reward": 0.6540178805589676, "reward_std": 0.30243702977895737, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5803571790456772, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 1357.0291137695312, "epoch": 0.1592114106489433, "grad_norm": 2.6041102409362793, "kl": 2.076171875, "learning_rate": 2.9713133486180434e-07, "loss": 0.1983, "reward": 0.722098246216774, "reward_std": 0.27202022820711136, "rewards/accuracy_reward": 0.1361607185099274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 1406.2723693847656, "epoch": 0.15951011873646478, "grad_norm": 5.8214850425720215, "kl": 2.58203125, "learning_rate": 2.971023896390968e-07, "loss": 0.1979, "reward": 0.6138393208384514, "reward_std": 0.30424460768699646, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5312500298023224, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 1381.4219360351562, "epoch": 0.15980882682398626, "grad_norm": 4.708216667175293, "kl": 2.59765625, "learning_rate": 2.970733007019368e-07, "loss": 0.2195, "reward": 0.616629496216774, "reward_std": 0.31651800125837326, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.536272332072258, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 1364.1206359863281, "epoch": 0.16010753491150773, "grad_norm": 3.141550302505493, "kl": 2.046875, "learning_rate": 2.970440680819702e-07, "loss": 0.1927, "reward": 0.7633928954601288, "reward_std": 0.30361153185367584, "rewards/accuracy_reward": 0.2008928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000298023224, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 1351.6340026855469, "epoch": 0.1604062429990292, "grad_norm": 3.0957999229431152, "kl": 1.677734375, "learning_rate": 2.9701469181099917e-07, "loss": 0.2053, "reward": 0.6757812947034836, "reward_std": 0.2997969463467598, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741305589676, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 1388.2545166015625, "epoch": 0.16070495108655067, "grad_norm": 1.6409519910812378, "kl": 1.6640625, "learning_rate": 2.9698517192098233e-07, "loss": 0.1666, "reward": 0.6529018133878708, "reward_std": 0.28466832637786865, "rewards/accuracy_reward": 0.09151786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839477300644, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 1457.6295166015625, "epoch": 0.16100365917407214, "grad_norm": 4.688132286071777, "kl": 1.3515625, "learning_rate": 2.969555084440344e-07, "loss": 0.1857, "reward": 0.7142857313156128, "reward_std": 0.29148516803979874, "rewards/accuracy_reward": 0.1495535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 1341.1942749023438, "epoch": 0.1613023672615936, "grad_norm": 2.9548566341400146, "kl": 1.380859375, "learning_rate": 2.9692570141242624e-07, "loss": 0.1731, "reward": 0.702566996216774, "reward_std": 0.33142734318971634, "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348469734192, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 1341.5714721679688, "epoch": 0.16160107534911508, "grad_norm": 3.3150219917297363, "kl": 1.5703125, "learning_rate": 2.968957508585851e-07, "loss": 0.228, "reward": 0.624441996216774, "reward_std": 0.3048996105790138, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5664062798023224, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 1359.7813110351562, "epoch": 0.16189978343663655, "grad_norm": 3.0899341106414795, "kl": 1.806640625, "learning_rate": 2.968656568150942e-07, "loss": 0.1647, "reward": 0.6796875447034836, "reward_std": 0.33013804256916046, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732313156128, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 1412.6183776855469, "epoch": 0.16219849152415802, "grad_norm": 1.5847591161727905, "kl": 1.740234375, "learning_rate": 2.968354193146929e-07, "loss": 0.1846, "reward": 0.6127232313156128, "reward_std": 0.27792390435934067, "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.559151828289032, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 1302.9040832519531, "epoch": 0.1624971996116795, "grad_norm": 2.6086819171905518, "kl": 1.85546875, "learning_rate": 2.968050383902767e-07, "loss": 0.2478, "reward": 0.828683078289032, "reward_std": 0.36533912271261215, "rewards/accuracy_reward": 0.2410714440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116305589676, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 1343.3103332519531, "epoch": 0.16279590769920096, "grad_norm": 2.7577810287475586, "kl": 2.025390625, "learning_rate": 2.96774514074897e-07, "loss": 0.1955, "reward": 0.6462053954601288, "reward_std": 0.28622812777757645, "rewards/accuracy_reward": 0.08258929150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5636160969734192, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 1309.3884582519531, "epoch": 0.16309461578672244, "grad_norm": 2.3493781089782715, "kl": 1.75390625, "learning_rate": 2.967438464017612e-07, "loss": 0.2046, "reward": 0.6914062947034836, "reward_std": 0.313680537045002, "rewards/accuracy_reward": 0.07812500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6132812649011612, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 1371.1607971191406, "epoch": 0.1633933238742439, "grad_norm": 2.610234498977661, "kl": 1.84375, "learning_rate": 2.9671303540423283e-07, "loss": 0.16, "reward": 0.637276828289032, "reward_std": 0.28983183205127716, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839477300644, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 1422.71435546875, "epoch": 0.16369203196176538, "grad_norm": 3.140031337738037, "kl": 2.041015625, "learning_rate": 2.9668208111583124e-07, "loss": 0.1908, "reward": 0.6662946790456772, "reward_std": 0.30889812856912613, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089328289032, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 1214.9777526855469, "epoch": 0.16399074004928682, "grad_norm": 1.7830718755722046, "kl": 1.921875, "learning_rate": 2.966509835702316e-07, "loss": 0.2201, "reward": 0.7271205633878708, "reward_std": 0.35317590832710266, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919813156128, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 1254.6250915527344, "epoch": 0.1642894481368083, "grad_norm": 2.4523355960845947, "kl": 1.7109375, "learning_rate": 2.9661974280126504e-07, "loss": 0.2229, "reward": 0.692522332072258, "reward_std": 0.2813154309988022, "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5920759290456772, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 1329.6741638183594, "epoch": 0.16458815622432976, "grad_norm": 1.9598231315612793, "kl": 1.94140625, "learning_rate": 2.9658835884291844e-07, "loss": 0.207, "reward": 0.6785714626312256, "reward_std": 0.2546756826341152, "rewards/accuracy_reward": 0.11607143143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000298023224, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 1329.9822082519531, "epoch": 0.16488686431185123, "grad_norm": 2.3984365463256836, "kl": 1.837890625, "learning_rate": 2.9655683172933443e-07, "loss": 0.1823, "reward": 0.6947544813156128, "reward_std": 0.3128894418478012, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 1245.8973693847656, "epoch": 0.1651855723993727, "grad_norm": 1.7619162797927856, "kl": 2.19140625, "learning_rate": 2.9652516149481144e-07, "loss": 0.2289, "reward": 0.6785714626312256, "reward_std": 0.291094034910202, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 1316.85498046875, "epoch": 0.16548428048689418, "grad_norm": 2.386834144592285, "kl": 1.52734375, "learning_rate": 2.964933481738036e-07, "loss": 0.2118, "reward": 0.7148437798023224, "reward_std": 0.36708658188581467, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.592075914144516, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 1358.4598693847656, "epoch": 0.16578298857441565, "grad_norm": 2.6268839836120605, "kl": 1.935546875, "learning_rate": 2.964613918009207e-07, "loss": 0.2283, "reward": 0.7354911118745804, "reward_std": 0.29133735597133636, "rewards/accuracy_reward": 0.17633929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.559151828289032, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 1339.1496276855469, "epoch": 0.16608169666193712, "grad_norm": 1.4101991653442383, "kl": 1.978515625, "learning_rate": 2.964292924109281e-07, "loss": 0.1988, "reward": 0.667410746216774, "reward_std": 0.32595738768577576, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678805589676, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 1337.5513916015625, "epoch": 0.1663804047494586, "grad_norm": 2.8455867767333984, "kl": 1.80859375, "learning_rate": 2.9639705003874686e-07, "loss": 0.2159, "reward": 0.7126116305589676, "reward_std": 0.3327535539865494, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5987723469734192, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 1320.3147888183594, "epoch": 0.16667911283698006, "grad_norm": 3.9756999015808105, "kl": 2.140625, "learning_rate": 2.9636466471945345e-07, "loss": 0.2076, "reward": 0.6724330633878708, "reward_std": 0.3368644490838051, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965401977300644, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 1362.9933776855469, "epoch": 0.16697782092450153, "grad_norm": 5.011478424072266, "kl": 2.48828125, "learning_rate": 2.9633213648827995e-07, "loss": 0.2055, "reward": 0.6657366305589676, "reward_std": 0.34529294073581696, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585379496216774, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 1246.7411193847656, "epoch": 0.167276529012023, "grad_norm": 2.003479480743408, "kl": 2.173828125, "learning_rate": 2.962994653806139e-07, "loss": 0.1963, "reward": 0.7226562947034836, "reward_std": 0.3600875586271286, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731027126312256, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 1230.5379943847656, "epoch": 0.16757523709954447, "grad_norm": 2.812901496887207, "kl": 2.234375, "learning_rate": 2.962666514319982e-07, "loss": 0.2141, "reward": 0.6981026977300644, "reward_std": 0.35064928233623505, "rewards/accuracy_reward": 0.12723215203732252, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705633878708, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 1325.0000610351562, "epoch": 0.16787394518706594, "grad_norm": 2.5071194171905518, "kl": 2.1328125, "learning_rate": 2.9623369467813125e-07, "loss": 0.1974, "reward": 0.6925223469734192, "reward_std": 0.38189415633678436, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937798023224, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 1299.7076416015625, "epoch": 0.1681726532745874, "grad_norm": 2.6967339515686035, "kl": 1.689453125, "learning_rate": 2.9620059515486685e-07, "loss": 0.1891, "reward": 0.765066996216774, "reward_std": 0.29438207298517227, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 1293.1562805175781, "epoch": 0.16847136136210888, "grad_norm": 3.0062568187713623, "kl": 2.24609375, "learning_rate": 2.961673528982139e-07, "loss": 0.2167, "reward": 0.6367187798023224, "reward_std": 0.30704937875270844, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5385044813156128, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 1225.2522888183594, "epoch": 0.16877006944963036, "grad_norm": 3.199305534362793, "kl": 1.501953125, "learning_rate": 2.961339679443368e-07, "loss": 0.2018, "reward": 0.722098246216774, "reward_std": 0.29116223752498627, "rewards/accuracy_reward": 0.13616072433069348, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 1213.7411346435547, "epoch": 0.16906877753715183, "grad_norm": 2.0640358924865723, "kl": 1.703125, "learning_rate": 2.961004403295551e-07, "loss": 0.1961, "reward": 0.7092634290456772, "reward_std": 0.3300848826766014, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5976562649011612, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 1419.2366638183594, "epoch": 0.1693674856246733, "grad_norm": 1.3651434183120728, "kl": 1.75, "learning_rate": 2.9606677009034354e-07, "loss": 0.1949, "reward": 0.6813616305589676, "reward_std": 0.3662364035844803, "rewards/accuracy_reward": 0.11607143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 1294.9286499023438, "epoch": 0.16966619371219477, "grad_norm": 2.7443771362304688, "kl": 1.59375, "learning_rate": 2.96032957263332e-07, "loss": 0.1804, "reward": 0.688616082072258, "reward_std": 0.30449428409338, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.581473246216774, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 1309.185302734375, "epoch": 0.16996490179971624, "grad_norm": 1.6206597089767456, "kl": 1.794921875, "learning_rate": 2.959990018853056e-07, "loss": 0.2016, "reward": 0.676339328289032, "reward_std": 0.3500155955553055, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 1237.8504943847656, "epoch": 0.1702636098872377, "grad_norm": 2.931919574737549, "kl": 1.748046875, "learning_rate": 2.959649039932044e-07, "loss": 0.2337, "reward": 0.7879464626312256, "reward_std": 0.3435285836458206, "rewards/accuracy_reward": 0.216517873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 1265.2857666015625, "epoch": 0.17056231797475915, "grad_norm": 1.6735901832580566, "kl": 1.955078125, "learning_rate": 2.959306636241237e-07, "loss": 0.2217, "reward": 0.655691996216774, "reward_std": 0.3158768117427826, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 1299.4420166015625, "epoch": 0.17086102606228062, "grad_norm": 1.2962442636489868, "kl": 1.8671875, "learning_rate": 2.9589628081531347e-07, "loss": 0.2094, "reward": 0.7438616305589676, "reward_std": 0.3368866816163063, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965401977300644, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 1338.9576110839844, "epoch": 0.1711597341498021, "grad_norm": 2.8994078636169434, "kl": 2.072265625, "learning_rate": 2.9586175560417893e-07, "loss": 0.2075, "reward": 0.6729910969734192, "reward_std": 0.27686845138669014, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572544664144516, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 1213.0916137695312, "epoch": 0.17145844223732357, "grad_norm": 2.712886095046997, "kl": 1.98828125, "learning_rate": 2.9582708802828015e-07, "loss": 0.2116, "reward": 0.6356026977300644, "reward_std": 0.28797560930252075, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741305589676, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 1376.8103332519531, "epoch": 0.17175715032484504, "grad_norm": 1.6688140630722046, "kl": 2.09375, "learning_rate": 2.95792278125332e-07, "loss": 0.2116, "reward": 0.6540178954601288, "reward_std": 0.3373526930809021, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 1282.0893249511719, "epoch": 0.1720558584123665, "grad_norm": 3.410916566848755, "kl": 1.85546875, "learning_rate": 2.9575732593320434e-07, "loss": 0.2428, "reward": 0.6824777126312256, "reward_std": 0.31957997381687164, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5842634290456772, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 1311.9933776855469, "epoch": 0.17235456649988798, "grad_norm": 1.7891688346862793, "kl": 2.025390625, "learning_rate": 2.957222314899216e-07, "loss": 0.2121, "reward": 0.6651785969734192, "reward_std": 0.3332887962460518, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.587053582072258, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 1269.2545166015625, "epoch": 0.17265327458740945, "grad_norm": 1.7266921997070312, "kl": 1.875, "learning_rate": 2.9568699483366314e-07, "loss": 0.1933, "reward": 0.7087053954601288, "reward_std": 0.3430727645754814, "rewards/accuracy_reward": 0.12276786495931447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 1223.2344360351562, "epoch": 0.17295198267493092, "grad_norm": 1.775802731513977, "kl": 1.865234375, "learning_rate": 2.95651616002763e-07, "loss": 0.2124, "reward": 0.66573666036129, "reward_std": 0.31930436939001083, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965402126312256, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 1286.3058776855469, "epoch": 0.1732506907624524, "grad_norm": 1.6123160123825073, "kl": 1.845703125, "learning_rate": 2.956160950357099e-07, "loss": 0.1906, "reward": 0.7455357611179352, "reward_std": 0.3420178070664406, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5892857164144516, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 1311.5357360839844, "epoch": 0.17354939884997386, "grad_norm": 1.3821473121643066, "kl": 1.833984375, "learning_rate": 2.955804319711472e-07, "loss": 0.1854, "reward": 0.6088170111179352, "reward_std": 0.3012676164507866, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.559709832072258, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 1420.0536499023438, "epoch": 0.17384810693749533, "grad_norm": 1.375596046447754, "kl": 1.8671875, "learning_rate": 2.9554462684787274e-07, "loss": 0.1938, "reward": 0.6199776977300644, "reward_std": 0.24796469509601593, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5284598469734192, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 1381.4241638183594, "epoch": 0.1741468150250168, "grad_norm": 1.7386895418167114, "kl": 1.59765625, "learning_rate": 2.95508679704839e-07, "loss": 0.1678, "reward": 0.7137276977300644, "reward_std": 0.29906517267227173, "rewards/accuracy_reward": 0.13616072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.577566996216774, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 1210.7165832519531, "epoch": 0.17444552311253828, "grad_norm": 3.404744863510132, "kl": 1.724609375, "learning_rate": 2.95472590581153e-07, "loss": 0.2208, "reward": 0.7734375298023224, "reward_std": 0.30194444954395294, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6261161118745804, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 1247.69873046875, "epoch": 0.17474423120005975, "grad_norm": 2.3896706104278564, "kl": 1.830078125, "learning_rate": 2.954363595160761e-07, "loss": 0.1952, "reward": 0.666294664144516, "reward_std": 0.2894363962113857, "rewards/accuracy_reward": 0.07366071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339626312256, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 1265.3192443847656, "epoch": 0.17504293928758122, "grad_norm": 3.96429443359375, "kl": 1.876953125, "learning_rate": 2.953999865490242e-07, "loss": 0.25, "reward": 0.7081473618745804, "reward_std": 0.34210988134145737, "rewards/accuracy_reward": 0.11383929057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5943080633878708, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 1283.1384582519531, "epoch": 0.1753416473751027, "grad_norm": 1.8049471378326416, "kl": 2.23828125, "learning_rate": 2.953634717195675e-07, "loss": 0.2719, "reward": 0.6406250298023224, "reward_std": 0.3346988782286644, "rewards/accuracy_reward": 0.08258929057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.558035746216774, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 1234.2790985107422, "epoch": 0.17564035546262416, "grad_norm": 1.9160741567611694, "kl": 2.1328125, "learning_rate": 2.9532681506743057e-07, "loss": 0.2141, "reward": 0.765066996216774, "reward_std": 0.37275809794664383, "rewards/accuracy_reward": 0.17633929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 1320.9152526855469, "epoch": 0.17593906355014563, "grad_norm": 2.3735363483428955, "kl": 2.40234375, "learning_rate": 2.9529001663249226e-07, "loss": 0.2142, "reward": 0.6902901977300644, "reward_std": 0.3071579709649086, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 1329.1719055175781, "epoch": 0.1762377716376671, "grad_norm": 3.35433292388916, "kl": 2.45703125, "learning_rate": 2.9525307645478564e-07, "loss": 0.2143, "reward": 0.646763414144516, "reward_std": 0.2876173406839371, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 1312.5089721679688, "epoch": 0.17653647972518857, "grad_norm": 1.7155447006225586, "kl": 2.34765625, "learning_rate": 2.95215994574498e-07, "loss": 0.2382, "reward": 0.6679687649011612, "reward_std": 0.32674986124038696, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187798023224, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 1328.90185546875, "epoch": 0.17683518781271004, "grad_norm": 1.9275996685028076, "kl": 2.1484375, "learning_rate": 2.951787710319708e-07, "loss": 0.2262, "reward": 0.5859375, "reward_std": 0.27949077636003494, "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839477300644, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 1366.6473693847656, "epoch": 0.17713389590023149, "grad_norm": 2.03189754486084, "kl": 2.546875, "learning_rate": 2.9514140586769965e-07, "loss": 0.2425, "reward": 0.6902902126312256, "reward_std": 0.33959171921014786, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187649011612, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 1253.3616638183594, "epoch": 0.17743260398775296, "grad_norm": 3.7110204696655273, "kl": 1.916015625, "learning_rate": 2.951038991223341e-07, "loss": 0.2275, "reward": 0.8058036118745804, "reward_std": 0.3508753776550293, "rewards/accuracy_reward": 0.2142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178954601288, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 1244.7545166015625, "epoch": 0.17773131207527443, "grad_norm": 1.5904840230941772, "kl": 2.05078125, "learning_rate": 2.950662508366779e-07, "loss": 0.219, "reward": 0.647879496216774, "reward_std": 0.2816258817911148, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.583147332072258, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 1213.2098999023438, "epoch": 0.1780300201627959, "grad_norm": 2.243239641189575, "kl": 2.099609375, "learning_rate": 2.950284610516886e-07, "loss": 0.2249, "reward": 0.6562500298023224, "reward_std": 0.3245512992143631, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678805589676, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 1326.5156860351562, "epoch": 0.17832872825031737, "grad_norm": 1.5034074783325195, "kl": 2.015625, "learning_rate": 2.9499052980847784e-07, "loss": 0.1792, "reward": 0.693638414144516, "reward_std": 0.2796296402812004, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312649011612, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 1170.3929138183594, "epoch": 0.17862743633783884, "grad_norm": 1.6476982831954956, "kl": 2.064453125, "learning_rate": 2.9495245714831107e-07, "loss": 0.2086, "reward": 0.7678571790456772, "reward_std": 0.3478069305419922, "rewards/accuracy_reward": 0.1785714440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 1131.5781555175781, "epoch": 0.1789261444253603, "grad_norm": 2.5105738639831543, "kl": 1.93359375, "learning_rate": 2.949142431126077e-07, "loss": 0.2073, "reward": 0.7421875298023224, "reward_std": 0.31779181957244873, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6104910969734192, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 1257.6585083007812, "epoch": 0.17922485251288178, "grad_norm": 1.820809245109558, "kl": 2.306640625, "learning_rate": 2.9487588774294076e-07, "loss": 0.2383, "reward": 0.6841518133878708, "reward_std": 0.2819531336426735, "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5948660969734192, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 1239.9688110351562, "epoch": 0.17952356060040325, "grad_norm": 2.252384662628174, "kl": 2.318359375, "learning_rate": 2.9483739108103715e-07, "loss": 0.2605, "reward": 0.6601562798023224, "reward_std": 0.34783125668764114, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 1341.44873046875, "epoch": 0.17982226868792472, "grad_norm": 2.106611728668213, "kl": 2.25, "learning_rate": 2.9479875316877745e-07, "loss": 0.2599, "reward": 0.671316996216774, "reward_std": 0.32845066487789154, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5440848469734192, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 1211.7812805175781, "epoch": 0.1801209767754462, "grad_norm": 2.5769903659820557, "kl": 1.64453125, "learning_rate": 2.94759974048196e-07, "loss": 0.2169, "reward": 0.7248884290456772, "reward_std": 0.3184996470808983, "rewards/accuracy_reward": 0.11607143585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.608816996216774, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 1227.83935546875, "epoch": 0.18041968486296767, "grad_norm": 2.0246081352233887, "kl": 2.28515625, "learning_rate": 2.947210537614806e-07, "loss": 0.1889, "reward": 0.6618303954601288, "reward_std": 0.3062736988067627, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089626312256, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 1173.8616638183594, "epoch": 0.18071839295048914, "grad_norm": 2.229285955429077, "kl": 2.66796875, "learning_rate": 2.946819923509727e-07, "loss": 0.2624, "reward": 0.7472098469734192, "reward_std": 0.35145553946495056, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6177455633878708, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 1320.3929138183594, "epoch": 0.1810171010380106, "grad_norm": 3.4890353679656982, "kl": 2.68359375, "learning_rate": 2.946427898591674e-07, "loss": 0.2119, "reward": 0.625558078289032, "reward_std": 0.31609708815813065, "rewards/accuracy_reward": 0.05803571781143546, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 1243.8504943847656, "epoch": 0.18131580912553208, "grad_norm": 4.37901496887207, "kl": 2.61328125, "learning_rate": 2.9460344632871306e-07, "loss": 0.2393, "reward": 0.7511161118745804, "reward_std": 0.34572774171829224, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089477300644, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 1284.669692993164, "epoch": 0.18161451721305355, "grad_norm": 4.008758544921875, "kl": 2.65234375, "learning_rate": 2.945639618024116e-07, "loss": 0.2631, "reward": 0.6032366454601288, "reward_std": 0.3130444772541523, "rewards/accuracy_reward": 0.05580357485450804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5474330484867096, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 1346.7054443359375, "epoch": 0.18191322530057502, "grad_norm": 1.8525586128234863, "kl": 2.6015625, "learning_rate": 2.945243363232184e-07, "loss": 0.2266, "reward": 0.5820312649011612, "reward_std": 0.2890723645687103, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5463169813156128, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 1257.7054138183594, "epoch": 0.1822119333880965, "grad_norm": 2.2907562255859375, "kl": 2.369140625, "learning_rate": 2.94484569934242e-07, "loss": 0.2252, "reward": 0.6361607313156128, "reward_std": 0.30889467895030975, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580357164144516, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 1249.9174499511719, "epoch": 0.18251064147561796, "grad_norm": 3.2086093425750732, "kl": 2.453125, "learning_rate": 2.944446626787443e-07, "loss": 0.2621, "reward": 0.793526828289032, "reward_std": 0.32409538328647614, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803954601288, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 1319.6875610351562, "epoch": 0.18280934956313943, "grad_norm": 3.5112693309783936, "kl": 2.068359375, "learning_rate": 2.9440461460014063e-07, "loss": 0.2541, "reward": 0.5965401977300644, "reward_std": 0.28905726224184036, "rewards/accuracy_reward": 0.03125000069849193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 1222.7991638183594, "epoch": 0.1831080576506609, "grad_norm": 3.296011209487915, "kl": 1.94140625, "learning_rate": 2.943644257419993e-07, "loss": 0.2535, "reward": 0.7723214626312256, "reward_std": 0.32168637961149216, "rewards/accuracy_reward": 0.15401786495931447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183035969734192, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 1250.0938415527344, "epoch": 0.18340676573818235, "grad_norm": 1.938809871673584, "kl": 2.1875, "learning_rate": 2.9432409614804176e-07, "loss": 0.2634, "reward": 0.6495535969734192, "reward_std": 0.335442490875721, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714626312256, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 1262.27685546875, "epoch": 0.18370547382570382, "grad_norm": 2.5088188648223877, "kl": 1.92578125, "learning_rate": 2.942836258621427e-07, "loss": 0.1895, "reward": 0.6372768133878708, "reward_std": 0.32583312690258026, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125298023224, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 1371.0111999511719, "epoch": 0.1840041819132253, "grad_norm": 2.3900699615478516, "kl": 2.44140625, "learning_rate": 2.9424301492832995e-07, "loss": 0.2198, "reward": 0.5987723469734192, "reward_std": 0.28602835536003113, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5429687798023224, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 1195.1875610351562, "epoch": 0.18430289000074676, "grad_norm": 2.358747720718384, "kl": 2.41015625, "learning_rate": 2.942022633907841e-07, "loss": 0.2408, "reward": 0.7193080633878708, "reward_std": 0.32308609783649445, "rewards/accuracy_reward": 0.13839286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 1312.7053833007812, "epoch": 0.18460159808826823, "grad_norm": 2.6020190715789795, "kl": 2.205078125, "learning_rate": 2.941613712938389e-07, "loss": 0.244, "reward": 0.6953125447034836, "reward_std": 0.30200784653425217, "rewards/accuracy_reward": 0.11383929173462093, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732313156128, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 1212.65185546875, "epoch": 0.1849003061757897, "grad_norm": 1.878385066986084, "kl": 2.259765625, "learning_rate": 2.9412033868198085e-07, "loss": 0.2322, "reward": 0.6484375298023224, "reward_std": 0.30947543680667877, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 1264.3438110351562, "epoch": 0.18519901426331117, "grad_norm": 2.87445068359375, "kl": 2.73046875, "learning_rate": 2.940791655998495e-07, "loss": 0.2518, "reward": 0.6785714328289032, "reward_std": 0.31506554037332535, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 1277.4889221191406, "epoch": 0.18549772235083264, "grad_norm": 3.7051515579223633, "kl": 2.78515625, "learning_rate": 2.940378520922371e-07, "loss": 0.2311, "reward": 0.702566996216774, "reward_std": 0.32463347166776657, "rewards/accuracy_reward": 0.12053572130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312947034836, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 1172.9420471191406, "epoch": 0.18579643043835412, "grad_norm": 3.1422135829925537, "kl": 2.59765625, "learning_rate": 2.939963982040887e-07, "loss": 0.3118, "reward": 0.8141741454601288, "reward_std": 0.4180385023355484, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919813156128, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 1161.2076416015625, "epoch": 0.1860951385258756, "grad_norm": 1.6736844778060913, "kl": 2.265625, "learning_rate": 2.93954803980502e-07, "loss": 0.2313, "reward": 0.784598246216774, "reward_std": 0.3786848410964012, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732313156128, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 1234.6540832519531, "epoch": 0.18639384661339706, "grad_norm": 7.128546237945557, "kl": 2.859375, "learning_rate": 2.9391306946672757e-07, "loss": 0.2961, "reward": 0.7232143133878708, "reward_std": 0.3516513928771019, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000298023224, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 1297.0335388183594, "epoch": 0.18669255470091853, "grad_norm": 2.1855826377868652, "kl": 2.625, "learning_rate": 2.9387119470816837e-07, "loss": 0.2224, "reward": 0.6646205633878708, "reward_std": 0.34923846274614334, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705633878708, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 1262.6629943847656, "epoch": 0.18699126278844, "grad_norm": 3.33404541015625, "kl": 2.96484375, "learning_rate": 2.938291797503801e-07, "loss": 0.2381, "reward": 0.6981027126312256, "reward_std": 0.3452201634645462, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098618745804, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 1302.7121276855469, "epoch": 0.18728997087596147, "grad_norm": 3.0033652782440186, "kl": 3.03515625, "learning_rate": 2.937870246390708e-07, "loss": 0.2716, "reward": 0.6517857611179352, "reward_std": 0.32128140330314636, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5446428805589676, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 1296.21435546875, "epoch": 0.18758867896348294, "grad_norm": 2.2228901386260986, "kl": 2.390625, "learning_rate": 2.937447294201012e-07, "loss": 0.2452, "reward": 0.6160714626312256, "reward_std": 0.3093036487698555, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678805589676, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 1321.8014221191406, "epoch": 0.1878873870510044, "grad_norm": 2.386044979095459, "kl": 2.50390625, "learning_rate": 2.937022941394844e-07, "loss": 0.1995, "reward": 0.7594866454601288, "reward_std": 0.3447205573320389, "rewards/accuracy_reward": 0.20982143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5496651977300644, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 1281.1496276855469, "epoch": 0.18818609513852588, "grad_norm": 2.1286404132843018, "kl": 2.53125, "learning_rate": 2.9365971884338566e-07, "loss": 0.2651, "reward": 0.6026785969734192, "reward_std": 0.3113881051540375, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5357143133878708, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 1253.8526916503906, "epoch": 0.18848480322604735, "grad_norm": 4.87906551361084, "kl": 2.115234375, "learning_rate": 2.9361700357812284e-07, "loss": 0.2627, "reward": 0.662388414144516, "reward_std": 0.34899797290563583, "rewards/accuracy_reward": 0.08035714854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312649011612, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 1168.8527069091797, "epoch": 0.18878351131356882, "grad_norm": 3.4788169860839844, "kl": 2.056640625, "learning_rate": 2.9357414839016605e-07, "loss": 0.2698, "reward": 0.8035714626312256, "reward_std": 0.39230917394161224, "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 1257.9754943847656, "epoch": 0.1890822194010903, "grad_norm": 4.169307708740234, "kl": 2.185546875, "learning_rate": 2.9353115332613734e-07, "loss": 0.2504, "reward": 0.6556919813156128, "reward_std": 0.35854168236255646, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 1298.7835693359375, "epoch": 0.18938092748861177, "grad_norm": 2.575434684753418, "kl": 2.1875, "learning_rate": 2.9348801843281126e-07, "loss": 0.2629, "reward": 0.678013414144516, "reward_std": 0.3479675278067589, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568638414144516, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 1303.3482971191406, "epoch": 0.18967963557613324, "grad_norm": 2.3143484592437744, "kl": 2.76171875, "learning_rate": 2.9344474375711427e-07, "loss": 0.2993, "reward": 0.5563616305589676, "reward_std": 0.27797725796699524, "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.540736623108387, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 1265.2411499023438, "epoch": 0.18997834366365468, "grad_norm": 1.4657530784606934, "kl": 2.220703125, "learning_rate": 2.93401329346125e-07, "loss": 0.2078, "reward": 0.6741071790456772, "reward_std": 0.33585933595895767, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 1326.2924499511719, "epoch": 0.19027705175117615, "grad_norm": 2.4700639247894287, "kl": 2.619140625, "learning_rate": 2.933577752470741e-07, "loss": 0.2617, "reward": 0.6450893133878708, "reward_std": 0.34827935695648193, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5558035969734192, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 1326.0067138671875, "epoch": 0.19057575983869762, "grad_norm": 2.264681577682495, "kl": 3.0625, "learning_rate": 2.9331408150734415e-07, "loss": 0.2916, "reward": 0.666294664144516, "reward_std": 0.29776570200920105, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089477300644, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 1235.4732666015625, "epoch": 0.1908744679262191, "grad_norm": 2.723902940750122, "kl": 2.9609375, "learning_rate": 2.9327024817446965e-07, "loss": 0.2933, "reward": 0.6847098618745804, "reward_std": 0.33163949847221375, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741305589676, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 1263.3616638183594, "epoch": 0.19117317601374056, "grad_norm": 4.070285797119141, "kl": 2.99609375, "learning_rate": 2.9322627529613704e-07, "loss": 0.3017, "reward": 0.6791294813156128, "reward_std": 0.30734650045633316, "rewards/accuracy_reward": 0.12723214854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5518973618745804, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 1244.3080749511719, "epoch": 0.19147188410126204, "grad_norm": 3.0691823959350586, "kl": 3.07421875, "learning_rate": 2.9318216292018434e-07, "loss": 0.3094, "reward": 0.6579241305589676, "reward_std": 0.3376459553837776, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098469734192, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 1381.2768249511719, "epoch": 0.1917705921887835, "grad_norm": 2.5195839405059814, "kl": 2.578125, "learning_rate": 2.931379110946016e-07, "loss": 0.2987, "reward": 0.642857164144516, "reward_std": 0.31823672354221344, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.549107164144516, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 1243.1763916015625, "epoch": 0.19206930027630498, "grad_norm": 3.7158408164978027, "kl": 2.5234375, "learning_rate": 2.930935198675305e-07, "loss": 0.2429, "reward": 0.6886161267757416, "reward_std": 0.2941788285970688, "rewards/accuracy_reward": 0.12723215157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839626312256, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 1182.7031860351562, "epoch": 0.19236800836382645, "grad_norm": 3.230837821960449, "kl": 2.240234375, "learning_rate": 2.9304898928726427e-07, "loss": 0.2575, "reward": 0.6858259290456772, "reward_std": 0.28604865819215775, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.592075914144516, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 1357.6429138183594, "epoch": 0.19266671645134792, "grad_norm": 2.3502395153045654, "kl": 2.40234375, "learning_rate": 2.9300431940224783e-07, "loss": 0.2152, "reward": 0.7070312947034836, "reward_std": 0.3772338554263115, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 1370.3013916015625, "epoch": 0.1929654245388694, "grad_norm": 3.957366466522217, "kl": 2.349609375, "learning_rate": 2.929595102610777e-07, "loss": 0.238, "reward": 0.7031250447034836, "reward_std": 0.3582776039838791, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964477300644, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 1297.63623046875, "epoch": 0.19326413262639086, "grad_norm": 4.572798728942871, "kl": 3.3671875, "learning_rate": 2.9291456191250173e-07, "loss": 0.2882, "reward": 0.5758928805589676, "reward_std": 0.2758125215768814, "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5178571790456772, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 1249.3460388183594, "epoch": 0.19356284071391233, "grad_norm": 4.215023517608643, "kl": 2.9609375, "learning_rate": 2.928694744054194e-07, "loss": 0.313, "reward": 0.698660746216774, "reward_std": 0.2965206131339073, "rewards/accuracy_reward": 0.12946429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964477300644, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 1216.8348693847656, "epoch": 0.1938615488014338, "grad_norm": 1.7957091331481934, "kl": 2.564453125, "learning_rate": 2.9282424778888143e-07, "loss": 0.2388, "reward": 0.7628348469734192, "reward_std": 0.36502063274383545, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312649011612, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 1323.5982666015625, "epoch": 0.19416025688895527, "grad_norm": 6.3907976150512695, "kl": 3.109375, "learning_rate": 2.9277888211209007e-07, "loss": 0.2533, "reward": 0.6830357611179352, "reward_std": 0.33605898171663284, "rewards/accuracy_reward": 0.15178571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5312500447034836, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 1386.52685546875, "epoch": 0.19445896497647674, "grad_norm": 4.404731273651123, "kl": 3.25, "learning_rate": 2.9273337742439863e-07, "loss": 0.2591, "reward": 0.643973246216774, "reward_std": 0.33243151009082794, "rewards/accuracy_reward": 0.12723215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167410969734192, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 1341.8817443847656, "epoch": 0.19475767306399822, "grad_norm": 2.919826030731201, "kl": 2.41796875, "learning_rate": 2.926877337753117e-07, "loss": 0.2356, "reward": 0.679129496216774, "reward_std": 0.31360507011413574, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 1323.1361999511719, "epoch": 0.1950563811515197, "grad_norm": 1.3723971843719482, "kl": 2.298828125, "learning_rate": 2.926419512144852e-07, "loss": 0.1973, "reward": 0.6233259290456772, "reward_std": 0.2938479259610176, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5273437798023224, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 1246.6429443359375, "epoch": 0.19535508923904116, "grad_norm": 2.3135921955108643, "kl": 1.8359375, "learning_rate": 2.9259602979172604e-07, "loss": 0.1843, "reward": 0.646763414144516, "reward_std": 0.29186562448740005, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 1451.7679443359375, "epoch": 0.19565379732656263, "grad_norm": 2.12271785736084, "kl": 2.21875, "learning_rate": 2.9254996955699225e-07, "loss": 0.2251, "reward": 0.6088169813156128, "reward_std": 0.33835896849632263, "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5463169738650322, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 1338.1317749023438, "epoch": 0.1959525054140841, "grad_norm": 2.5565922260284424, "kl": 2.302734375, "learning_rate": 2.925037705603928e-07, "loss": 0.236, "reward": 0.715401828289032, "reward_std": 0.3267947658896446, "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 1211.9486846923828, "epoch": 0.19625121350160554, "grad_norm": 1.7352392673492432, "kl": 2.765625, "learning_rate": 2.924574328521877e-07, "loss": 0.2426, "reward": 0.676339328289032, "reward_std": 0.3475376293063164, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5334821715950966, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 1351.05810546875, "epoch": 0.196549921589127, "grad_norm": 2.192871332168579, "kl": 2.4609375, "learning_rate": 2.924109564827878e-07, "loss": 0.2262, "reward": 0.6344866454601288, "reward_std": 0.2854093313217163, "rewards/accuracy_reward": 0.0870535729918629, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5474330633878708, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 1267.8013763427734, "epoch": 0.19684862967664848, "grad_norm": 1.706934928894043, "kl": 2.353515625, "learning_rate": 2.923643415027549e-07, "loss": 0.2338, "reward": 0.6824777126312256, "reward_std": 0.2959539145231247, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 1316.3326721191406, "epoch": 0.19714733776416996, "grad_norm": 4.738879680633545, "kl": 3.02734375, "learning_rate": 2.9231758796280143e-07, "loss": 0.2687, "reward": 0.6562500447034836, "reward_std": 0.3486894965171814, "rewards/accuracy_reward": 0.11383929383009672, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5424107238650322, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 1299.7254943847656, "epoch": 0.19744604585169143, "grad_norm": 2.7236106395721436, "kl": 2.626953125, "learning_rate": 2.922706959137908e-07, "loss": 0.2719, "reward": 0.8454241454601288, "reward_std": 0.3657253161072731, "rewards/accuracy_reward": 0.2834821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.561941996216774, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 1372.1607971191406, "epoch": 0.1977447539392129, "grad_norm": 1.8906363248825073, "kl": 2.22265625, "learning_rate": 2.922236654067368e-07, "loss": 0.1907, "reward": 0.6300223469734192, "reward_std": 0.30794940143823624, "rewards/accuracy_reward": 0.06026786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.569754496216774, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 1326.5804443359375, "epoch": 0.19804346202673437, "grad_norm": 4.556854724884033, "kl": 2.6328125, "learning_rate": 2.921764964928042e-07, "loss": 0.2111, "reward": 0.654575914144516, "reward_std": 0.3282273858785629, "rewards/accuracy_reward": 0.08928572130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 1279.9063110351562, "epoch": 0.19834217011425584, "grad_norm": 3.6396210193634033, "kl": 2.91015625, "learning_rate": 2.92129189223308e-07, "loss": 0.3041, "reward": 0.6054687798023224, "reward_std": 0.2801951467990875, "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5474330633878708, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 1313.7902221679688, "epoch": 0.1986408782017773, "grad_norm": 2.2271804809570312, "kl": 2.6875, "learning_rate": 2.920817436497139e-07, "loss": 0.255, "reward": 0.6819196790456772, "reward_std": 0.3337383419275284, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.581473246216774, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 1248.3170166015625, "epoch": 0.19893958628929878, "grad_norm": 2.1930408477783203, "kl": 2.392578125, "learning_rate": 2.9203415982363806e-07, "loss": 0.2478, "reward": 0.7511160969734192, "reward_std": 0.32682137191295624, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.574776828289032, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 1391.5022888183594, "epoch": 0.19923829437682025, "grad_norm": 1.7815409898757935, "kl": 2.302734375, "learning_rate": 2.919864377968471e-07, "loss": 0.214, "reward": 0.6021205484867096, "reward_std": 0.2901410609483719, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348618745804, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 1113.4665985107422, "epoch": 0.19953700246434172, "grad_norm": 3.720926284790039, "kl": 1.833984375, "learning_rate": 2.919385776212577e-07, "loss": 0.1722, "reward": 0.8297991454601288, "reward_std": 0.3258424922823906, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.608816996216774, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 1305.5915832519531, "epoch": 0.1998357105518632, "grad_norm": 3.0395023822784424, "kl": 2.453125, "learning_rate": 2.918905793489372e-07, "loss": 0.2117, "reward": 0.74776791036129, "reward_std": 0.3380211442708969, "rewards/accuracy_reward": 0.21205357927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5357143133878708, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 1299.0402526855469, "epoch": 0.20013441863938466, "grad_norm": 2.6106832027435303, "kl": 1.9375, "learning_rate": 2.9184244303210304e-07, "loss": 0.248, "reward": 0.6618303805589676, "reward_std": 0.33026666939258575, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732238650322, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 1270.04248046875, "epoch": 0.20043312672690614, "grad_norm": 3.9223427772521973, "kl": 2.142578125, "learning_rate": 2.9179416872312275e-07, "loss": 0.2592, "reward": 0.6685268357396126, "reward_std": 0.2897845506668091, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.545758955180645, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 1186.8013763427734, "epoch": 0.2007318348144276, "grad_norm": 3.2253308296203613, "kl": 1.90234375, "learning_rate": 2.91745756474514e-07, "loss": 0.1833, "reward": 0.7873884439468384, "reward_std": 0.32489408552646637, "rewards/accuracy_reward": 0.18080357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.606584832072258, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 1231.8504943847656, "epoch": 0.20103054290194908, "grad_norm": 3.2266411781311035, "kl": 4.25390625, "learning_rate": 2.9169720633894466e-07, "loss": 0.2487, "reward": 0.7165178954601288, "reward_std": 0.3490903601050377, "rewards/accuracy_reward": 0.14285714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607313156128, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 1105.2589721679688, "epoch": 0.20132925098947055, "grad_norm": 3.7205660343170166, "kl": 1.859375, "learning_rate": 2.9164851836923253e-07, "loss": 0.2076, "reward": 0.746651828289032, "reward_std": 0.3646777346730232, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303656578064, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 1102.9107666015625, "epoch": 0.20162795907699202, "grad_norm": 4.3817853927612305, "kl": 1.970703125, "learning_rate": 2.9159969261834526e-07, "loss": 0.2617, "reward": 0.6529018133878708, "reward_std": 0.28514258563518524, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.594866082072258, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 1256.8482666015625, "epoch": 0.2019266671645135, "grad_norm": 4.202768325805664, "kl": 2.8671875, "learning_rate": 2.915507291394005e-07, "loss": 0.2872, "reward": 0.7265625298023224, "reward_std": 0.3453759178519249, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875298023224, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 1258.8035888671875, "epoch": 0.20222537525203496, "grad_norm": 3.8414745330810547, "kl": 2.41015625, "learning_rate": 2.915016279856658e-07, "loss": 0.2541, "reward": 0.6992187947034836, "reward_std": 0.30503789335489273, "rewards/accuracy_reward": 0.12053571688011289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 1155.6652221679688, "epoch": 0.20252408333955643, "grad_norm": 3.2632977962493896, "kl": 2.3125, "learning_rate": 2.9145238921055827e-07, "loss": 0.2535, "reward": 0.738839328289032, "reward_std": 0.27690907195210457, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 1171.3906860351562, "epoch": 0.20282279142707788, "grad_norm": 6.482316493988037, "kl": 3.53515625, "learning_rate": 2.9140301286764496e-07, "loss": 0.3162, "reward": 0.7265625298023224, "reward_std": 0.3656807541847229, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125149011612, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 1403.8482360839844, "epoch": 0.20312149951459935, "grad_norm": 2.928969383239746, "kl": 2.2265625, "learning_rate": 2.9135349901064245e-07, "loss": 0.2119, "reward": 0.7851562947034836, "reward_std": 0.3520202934741974, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 1257.2255249023438, "epoch": 0.20342020760212082, "grad_norm": 3.279463768005371, "kl": 2.21484375, "learning_rate": 2.9130384769341706e-07, "loss": 0.2067, "reward": 0.7053571790456772, "reward_std": 0.3120860680937767, "rewards/accuracy_reward": 0.13839286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 1295.05810546875, "epoch": 0.2037189156896423, "grad_norm": 2.6953601837158203, "kl": 2.09375, "learning_rate": 2.912540589699845e-07, "loss": 0.2158, "reward": 0.6551339626312256, "reward_std": 0.30731717497110367, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5658482313156128, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 1302.7523193359375, "epoch": 0.20401762377716376, "grad_norm": 2.5531742572784424, "kl": 2.046875, "learning_rate": 2.9120413289450996e-07, "loss": 0.2178, "reward": 0.6662946790456772, "reward_std": 0.30405425652861595, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5435268133878708, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 1262.0223999023438, "epoch": 0.20431633186468523, "grad_norm": 16.76797866821289, "kl": 2.16015625, "learning_rate": 2.911540695213082e-07, "loss": 0.2517, "reward": 0.7059152126312256, "reward_std": 0.3519522249698639, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 1216.2545166015625, "epoch": 0.2046150399522067, "grad_norm": 5.535301208496094, "kl": 1.8203125, "learning_rate": 2.9110386890484317e-07, "loss": 0.2153, "reward": 0.6796875298023224, "reward_std": 0.3473030775785446, "rewards/accuracy_reward": 0.10044643585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5792410969734192, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 1232.026840209961, "epoch": 0.20491374803972817, "grad_norm": 7.485930442810059, "kl": 1.759765625, "learning_rate": 2.910535310997283e-07, "loss": 0.1892, "reward": 0.7633928805589676, "reward_std": 0.3356345072388649, "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982142984867096, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 1306.357177734375, "epoch": 0.20521245612724964, "grad_norm": 5.890832424163818, "kl": 1.947265625, "learning_rate": 2.910030561607262e-07, "loss": 0.2021, "reward": 0.6367187798023224, "reward_std": 0.36643216013908386, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.545200914144516, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 1262.8995971679688, "epoch": 0.2055111642147711, "grad_norm": 5.081296920776367, "kl": 2.22265625, "learning_rate": 2.9095244414274863e-07, "loss": 0.2646, "reward": 0.6724330708384514, "reward_std": 0.3636903166770935, "rewards/accuracy_reward": 0.15625000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5161830708384514, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 1297.966552734375, "epoch": 0.20580987230229258, "grad_norm": 6.698150634765625, "kl": 1.9296875, "learning_rate": 2.909016951008565e-07, "loss": 0.2679, "reward": 0.6183035969734192, "reward_std": 0.348819263279438, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250223517418, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 1307.0982971191406, "epoch": 0.20610858038981406, "grad_norm": 4.224587917327881, "kl": 2.00390625, "learning_rate": 2.9085080909025974e-07, "loss": 0.1824, "reward": 0.6668526977300644, "reward_std": 0.35200344771146774, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5396205559372902, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 1407.6250915527344, "epoch": 0.20640728847733553, "grad_norm": 6.399862766265869, "kl": 3.078125, "learning_rate": 2.907997861663174e-07, "loss": 0.2737, "reward": 0.7248884290456772, "reward_std": 0.29934418201446533, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5507812798023224, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 1262.8504943847656, "epoch": 0.206705996564857, "grad_norm": 8.020212173461914, "kl": 2.9296875, "learning_rate": 2.907486263845374e-07, "loss": 0.2955, "reward": 0.5976562798023224, "reward_std": 0.3283785805106163, "rewards/accuracy_reward": 0.06473214761354029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5329241156578064, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 1271.3192749023438, "epoch": 0.20700470465237847, "grad_norm": 4.261770725250244, "kl": 2.470703125, "learning_rate": 2.9069732980057666e-07, "loss": 0.2355, "reward": 0.6785714477300644, "reward_std": 0.327284500002861, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 1275.5893249511719, "epoch": 0.20730341273989994, "grad_norm": 2.9424190521240234, "kl": 2.61328125, "learning_rate": 2.906458964702407e-07, "loss": 0.2348, "reward": 0.6450893133878708, "reward_std": 0.32026419043540955, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.533482164144516, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 1206.5692749023438, "epoch": 0.2076021208274214, "grad_norm": 2.9270384311676025, "kl": 2.337890625, "learning_rate": 2.9059432644948405e-07, "loss": 0.2527, "reward": 0.723214328289032, "reward_std": 0.2971097081899643, "rewards/accuracy_reward": 0.16964286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714626312256, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 1219.60498046875, "epoch": 0.20790082891494288, "grad_norm": 5.915009498596191, "kl": 2.421875, "learning_rate": 2.9054261979440973e-07, "loss": 0.2456, "reward": 0.6171875298023224, "reward_std": 0.36296629905700684, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.556919664144516, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 1254.466552734375, "epoch": 0.20819953700246435, "grad_norm": 3.8979058265686035, "kl": 2.54296875, "learning_rate": 2.904907765612696e-07, "loss": 0.2066, "reward": 0.7338169813156128, "reward_std": 0.3455209732055664, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491305589676, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 1348.0938110351562, "epoch": 0.20849824508998582, "grad_norm": 2.8345494270324707, "kl": 2.30859375, "learning_rate": 2.90438796806464e-07, "loss": 0.1879, "reward": 0.6188616305589676, "reward_std": 0.33593156933784485, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5362723469734192, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 1270.6629943847656, "epoch": 0.2087969531775073, "grad_norm": 3.520801544189453, "kl": 2.314453125, "learning_rate": 2.9038668058654175e-07, "loss": 0.2253, "reward": 0.6266741380095482, "reward_std": 0.35669560730457306, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741380095482, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 1321.9888916015625, "epoch": 0.20909566126502874, "grad_norm": 3.6579761505126953, "kl": 2.18359375, "learning_rate": 2.903344279582003e-07, "loss": 0.1629, "reward": 0.6473214477300644, "reward_std": 0.3317019045352936, "rewards/accuracy_reward": 0.0714285762514919, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 1090.7523040771484, "epoch": 0.2093943693525502, "grad_norm": 7.778899669647217, "kl": 1.8427734375, "learning_rate": 2.902820389782853e-07, "loss": 0.282, "reward": 0.7527902275323868, "reward_std": 0.32179446518421173, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.601004496216774, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 1297.3370971679688, "epoch": 0.20969307744007168, "grad_norm": 4.107633113861084, "kl": 1.94140625, "learning_rate": 2.902295137037909e-07, "loss": 0.2064, "reward": 0.6021205633878708, "reward_std": 0.2984320670366287, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5507812649011612, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 1233.5558776855469, "epoch": 0.20999178552759315, "grad_norm": 4.558014392852783, "kl": 2.005859375, "learning_rate": 2.901768521918593e-07, "loss": 0.2209, "reward": 0.6830357611179352, "reward_std": 0.3465666025876999, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321939468384, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 1214.0335693359375, "epoch": 0.21029049361511462, "grad_norm": 124.13453674316406, "kl": 5.62109375, "learning_rate": 2.9012405449978125e-07, "loss": 0.505, "reward": 0.5948660969734192, "reward_std": 0.3199402615427971, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5055803805589676, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 1289.9129943847656, "epoch": 0.2105892017026361, "grad_norm": 3.290674924850464, "kl": 1.8359375, "learning_rate": 2.900711206849954e-07, "loss": 0.1219, "reward": 0.6205357313156128, "reward_std": 0.32636844366788864, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.571428582072258, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 1280.3326721191406, "epoch": 0.21088790979015756, "grad_norm": 4.33824348449707, "kl": 2.248046875, "learning_rate": 2.900180508050884e-07, "loss": 0.178, "reward": 0.723214328289032, "reward_std": 0.3735961243510246, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5401785969734192, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 1323.5424499511719, "epoch": 0.21118661787767903, "grad_norm": 3.9503936767578125, "kl": 2.76171875, "learning_rate": 2.8996484491779533e-07, "loss": 0.2437, "reward": 0.5239955484867096, "reward_std": 0.2702138125896454, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5217634066939354, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 1135.6719360351562, "epoch": 0.2114853259652005, "grad_norm": 5.656639099121094, "kl": 2.005859375, "learning_rate": 2.8991150308099883e-07, "loss": 0.2076, "reward": 0.7829241454601288, "reward_std": 0.3528374582529068, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6289062947034836, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 1224.700927734375, "epoch": 0.21178403405272198, "grad_norm": 7.78969669342041, "kl": 2.501953125, "learning_rate": 2.898580253527296e-07, "loss": 0.2474, "reward": 0.754464328289032, "reward_std": 0.33581312745809555, "rewards/accuracy_reward": 0.16071429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500298023224, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 1285.6161193847656, "epoch": 0.21208274214024345, "grad_norm": 3.2791521549224854, "kl": 2.34375, "learning_rate": 2.8980441179116623e-07, "loss": 0.1358, "reward": 0.714285746216774, "reward_std": 0.3498755320906639, "rewards/accuracy_reward": 0.1629464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513393059372902, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 1206.7031860351562, "epoch": 0.21238145022776492, "grad_norm": 4.083518028259277, "kl": 2.314453125, "learning_rate": 2.8975066245463496e-07, "loss": 0.2295, "reward": 0.652901828289032, "reward_std": 0.2841971665620804, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5479911044239998, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 1153.4933471679688, "epoch": 0.2126801583152864, "grad_norm": 7.734899520874023, "kl": 2.912109375, "learning_rate": 2.896967774016098e-07, "loss": 0.2995, "reward": 0.706473246216774, "reward_std": 0.36090564727783203, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5457589477300644, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 1272.6139221191406, "epoch": 0.21297886640280786, "grad_norm": 3.863568067550659, "kl": 2.05078125, "learning_rate": 2.896427566907124e-07, "loss": 0.1739, "reward": 0.7215401977300644, "reward_std": 0.2964620962738991, "rewards/accuracy_reward": 0.14508929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.576450914144516, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 1250.0179138183594, "epoch": 0.21327757449032933, "grad_norm": 5.044826984405518, "kl": 2.484375, "learning_rate": 2.895886003807121e-07, "loss": 0.212, "reward": 0.6434151977300644, "reward_std": 0.34356237947940826, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5563616305589676, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 1287.5514221191406, "epoch": 0.2135762825778508, "grad_norm": 2.9968793392181396, "kl": 2.375, "learning_rate": 2.895343085305255e-07, "loss": 0.1802, "reward": 0.5524553954601288, "reward_std": 0.31824756413698196, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875149011612, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 1195.2812805175781, "epoch": 0.21387499066537227, "grad_norm": 3.958808183670044, "kl": 2.5, "learning_rate": 2.894798811992169e-07, "loss": 0.2164, "reward": 0.6981027275323868, "reward_std": 0.33249687403440475, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 1186.8482971191406, "epoch": 0.21417369875289374, "grad_norm": 3.119706153869629, "kl": 2.33203125, "learning_rate": 2.8942531844599797e-07, "loss": 0.2378, "reward": 0.659598246216774, "reward_std": 0.3342067822813988, "rewards/accuracy_reward": 0.11607143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5435268208384514, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 1136.8839416503906, "epoch": 0.21447240684041521, "grad_norm": 4.308752536773682, "kl": 1.904296875, "learning_rate": 2.893706203302275e-07, "loss": 0.1781, "reward": 0.7393973469734192, "reward_std": 0.33151140064001083, "rewards/accuracy_reward": 0.13169643748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6077009215950966, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 1142.2947082519531, "epoch": 0.21477111492793668, "grad_norm": 5.900983810424805, "kl": 2.017578125, "learning_rate": 2.8931578691141175e-07, "loss": 0.1919, "reward": 0.6707589626312256, "reward_std": 0.32609113305807114, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5524553805589676, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 1234.5848693847656, "epoch": 0.21506982301545816, "grad_norm": 3.5934488773345947, "kl": 1.857421875, "learning_rate": 2.892608182492041e-07, "loss": 0.2203, "reward": 0.604910746216774, "reward_std": 0.34335558861494064, "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5580357387661934, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 1131.8973693847656, "epoch": 0.21536853110297963, "grad_norm": 6.291898250579834, "kl": 2.224609375, "learning_rate": 2.892057144034051e-07, "loss": 0.2313, "reward": 0.6562500298023224, "reward_std": 0.3136945590376854, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5580357313156128, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 1268.1518249511719, "epoch": 0.21566723919050107, "grad_norm": 2.915289878845215, "kl": 2.107421875, "learning_rate": 2.891504754339623e-07, "loss": 0.1842, "reward": 0.6540178805589676, "reward_std": 0.3479708135128021, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000298023224, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 1143.9754943847656, "epoch": 0.21596594727802254, "grad_norm": 25.868432998657227, "kl": 2.66796875, "learning_rate": 2.8909510140097023e-07, "loss": 0.2614, "reward": 0.7645089626312256, "reward_std": 0.32145876437425613, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768059372902, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 1263.8995971679688, "epoch": 0.216264655365544, "grad_norm": 4.4506916999816895, "kl": 2.60546875, "learning_rate": 2.8903959236467045e-07, "loss": 0.2363, "reward": 0.6774553954601288, "reward_std": 0.3394400104880333, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.556919664144516, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 1180.9486999511719, "epoch": 0.21656336345306548, "grad_norm": 8.838371276855469, "kl": 3.375, "learning_rate": 2.889839483854514e-07, "loss": 0.1788, "reward": 0.6880580633878708, "reward_std": 0.2878439500927925, "rewards/accuracy_reward": 0.10491071594879031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5831473469734192, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 1220.5714416503906, "epoch": 0.21686207154058695, "grad_norm": 4.988823890686035, "kl": 2.5625, "learning_rate": 2.889281695238483e-07, "loss": 0.182, "reward": 0.726004496216774, "reward_std": 0.2701692469418049, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6099330633878708, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 1245.29248046875, "epoch": 0.21716077962810842, "grad_norm": 4.1517333984375, "kl": 2.52734375, "learning_rate": 2.888722558405429e-07, "loss": 0.2289, "reward": 0.6517857313156128, "reward_std": 0.3199516087770462, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.575892873108387, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 1255.7254943847656, "epoch": 0.2174594877156299, "grad_norm": 5.1854963302612305, "kl": 2.57421875, "learning_rate": 2.88816207396364e-07, "loss": 0.1882, "reward": 0.633928582072258, "reward_std": 0.3022385910153389, "rewards/accuracy_reward": 0.06919642956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 1218.46435546875, "epoch": 0.21775819580315137, "grad_norm": 30.981033325195312, "kl": 3.677734375, "learning_rate": 2.8876002425228673e-07, "loss": 0.2686, "reward": 0.640066996216774, "reward_std": 0.341957688331604, "rewards/accuracy_reward": 0.1227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517299123108387, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 1153.700942993164, "epoch": 0.21805690389067284, "grad_norm": 7.308539390563965, "kl": 1.634765625, "learning_rate": 2.887037064694329e-07, "loss": 0.1509, "reward": 0.7003348469734192, "reward_std": 0.32961658388376236, "rewards/accuracy_reward": 0.10937500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 1201.2924499511719, "epoch": 0.2183556119781943, "grad_norm": 28.546010971069336, "kl": 2.31640625, "learning_rate": 2.8864725410907066e-07, "loss": 0.2502, "reward": 0.6238839477300644, "reward_std": 0.312300905585289, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5323660969734192, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 1184.8750305175781, "epoch": 0.21865432006571578, "grad_norm": 7.65896463394165, "kl": 1.798828125, "learning_rate": 2.885906672326147e-07, "loss": 0.1978, "reward": 0.6757812798023224, "reward_std": 0.34087974578142166, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.561941996216774, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 1195.3928833007812, "epoch": 0.21895302815323725, "grad_norm": 5.195554256439209, "kl": 2.13671875, "learning_rate": 2.88533945901626e-07, "loss": 0.2369, "reward": 0.678013414144516, "reward_std": 0.37092141807079315, "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5284598469734192, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 1178.107177734375, "epoch": 0.21925173624075872, "grad_norm": 3.5176312923431396, "kl": 1.9140625, "learning_rate": 2.884770901778117e-07, "loss": 0.1774, "reward": 0.6841518133878708, "reward_std": 0.34349704533815384, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5435268133878708, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 1300.6853332519531, "epoch": 0.2195504443282802, "grad_norm": 6.323423862457275, "kl": 1.99609375, "learning_rate": 2.884201001230254e-07, "loss": 0.1737, "reward": 0.6785714626312256, "reward_std": 0.3674309030175209, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5379464626312256, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 1130.9263763427734, "epoch": 0.21984915241580166, "grad_norm": 4.741235256195068, "kl": 2.166015625, "learning_rate": 2.8836297579926666e-07, "loss": 0.2373, "reward": 0.645647332072258, "reward_std": 0.35850806534290314, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5518973469734192, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 1141.3504943847656, "epoch": 0.22014786050332313, "grad_norm": 18.30469512939453, "kl": 2.76953125, "learning_rate": 2.8830571726868107e-07, "loss": 0.2879, "reward": 0.6690848618745804, "reward_std": 0.3449884131550789, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.535156287252903, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 1128.2455749511719, "epoch": 0.2204465685908446, "grad_norm": 6.885038375854492, "kl": 1.849609375, "learning_rate": 2.882483245935603e-07, "loss": 0.2057, "reward": 0.608816996216774, "reward_std": 0.2835880219936371, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991305589676, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 1241.0669860839844, "epoch": 0.22074527667836608, "grad_norm": 5.499398231506348, "kl": 2.66796875, "learning_rate": 2.881907978363422e-07, "loss": 0.2558, "reward": 0.6305803805589676, "reward_std": 0.338662251830101, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.534598246216774, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 1183.747802734375, "epoch": 0.22104398476588755, "grad_norm": 2.767194986343384, "kl": 2.220703125, "learning_rate": 2.8813313705960986e-07, "loss": 0.1779, "reward": 0.7405134439468384, "reward_std": 0.33192896842956543, "rewards/accuracy_reward": 0.17633929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741305589676, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 1074.9710235595703, "epoch": 0.22134269285340902, "grad_norm": 6.2823486328125, "kl": 2.859375, "learning_rate": 2.880753423260928e-07, "loss": 0.2055, "reward": 0.7645089477300644, "reward_std": 0.3841777443885803, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5948661118745804, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 1264.122802734375, "epoch": 0.2216414009409305, "grad_norm": 13.283294677734375, "kl": 3.8125, "learning_rate": 2.88017413698666e-07, "loss": 0.3012, "reward": 0.6261160969734192, "reward_std": 0.33425526320934296, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375298023224, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 1257.6250610351562, "epoch": 0.22194010902845193, "grad_norm": 6.930812835693359, "kl": 2.8984375, "learning_rate": 2.879593512403501e-07, "loss": 0.1705, "reward": 0.6685268133878708, "reward_std": 0.3322274833917618, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6037946492433548, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 1113.7210388183594, "epoch": 0.2222388171159734, "grad_norm": 4.331634044647217, "kl": 2.78515625, "learning_rate": 2.8790115501431135e-07, "loss": 0.2147, "reward": 0.768973246216774, "reward_std": 0.36724256724119186, "rewards/accuracy_reward": 0.19642858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572544664144516, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 1179.589340209961, "epoch": 0.22253752520349487, "grad_norm": 45.59884262084961, "kl": 3.533203125, "learning_rate": 2.878428250838615e-07, "loss": 0.3516, "reward": 0.7031250298023224, "reward_std": 0.3394280672073364, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 1203.7678833007812, "epoch": 0.22283623329101634, "grad_norm": 5.485734462738037, "kl": 2.33984375, "learning_rate": 2.877843615124579e-07, "loss": 0.2449, "reward": 0.6768973618745804, "reward_std": 0.3234754800796509, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5987723469734192, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 1231.0781555175781, "epoch": 0.22313494137853782, "grad_norm": 3.281419277191162, "kl": 2.77734375, "learning_rate": 2.877257643637031e-07, "loss": 0.2172, "reward": 0.6657366156578064, "reward_std": 0.3352634608745575, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5362723469734192, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 1219.7165832519531, "epoch": 0.2234336494660593, "grad_norm": 3.9984264373779297, "kl": 1.99609375, "learning_rate": 2.8766703370134515e-07, "loss": 0.1566, "reward": 0.7159598618745804, "reward_std": 0.2655518390238285, "rewards/accuracy_reward": 0.11160714598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6043527126312256, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 1133.66748046875, "epoch": 0.22373235755358076, "grad_norm": 4.081468105316162, "kl": 2.259765625, "learning_rate": 2.876081695892772e-07, "loss": 0.2087, "reward": 0.6718750298023224, "reward_std": 0.3487987741827965, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 1147.08935546875, "epoch": 0.22403106564110223, "grad_norm": 8.932157516479492, "kl": 2.76171875, "learning_rate": 2.875491720915376e-07, "loss": 0.2902, "reward": 0.6936384290456772, "reward_std": 0.3434918001294136, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 1314.4732666015625, "epoch": 0.2243297737286237, "grad_norm": 3.5573809146881104, "kl": 2.640625, "learning_rate": 2.8749004127230985e-07, "loss": 0.2239, "reward": 0.6004464477300644, "reward_std": 0.3675982654094696, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5089285969734192, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 1194.0826416015625, "epoch": 0.22462848181614517, "grad_norm": 3.936433792114258, "kl": 1.9921875, "learning_rate": 2.874307771959225e-07, "loss": 0.1656, "reward": 0.7087053954601288, "reward_std": 0.3666161820292473, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803805589676, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 1179.5781860351562, "epoch": 0.22492718990366664, "grad_norm": 4.795867919921875, "kl": 2.806640625, "learning_rate": 2.873713799268491e-07, "loss": 0.2022, "reward": 0.6629464477300644, "reward_std": 0.34836381673812866, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5290178805589676, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 1240.7812805175781, "epoch": 0.2252258979911881, "grad_norm": 5.333897113800049, "kl": 2.67578125, "learning_rate": 2.87311849529708e-07, "loss": 0.2938, "reward": 0.670200914144516, "reward_std": 0.3296411335468292, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5474330484867096, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 1236.716552734375, "epoch": 0.22552460607870958, "grad_norm": 7.973015785217285, "kl": 2.91796875, "learning_rate": 2.872521860692625e-07, "loss": 0.2733, "reward": 0.6077009290456772, "reward_std": 0.3136155754327774, "rewards/accuracy_reward": 0.06919643050059676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.538504496216774, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 1116.1250762939453, "epoch": 0.22582331416623105, "grad_norm": 5.3335418701171875, "kl": 2.173828125, "learning_rate": 2.871923896104206e-07, "loss": 0.1872, "reward": 0.7834821790456772, "reward_std": 0.3664005473256111, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 1345.3170166015625, "epoch": 0.22612202225375252, "grad_norm": 7.242931365966797, "kl": 2.1357421875, "learning_rate": 2.871324602182348e-07, "loss": 0.1864, "reward": 0.7527901977300644, "reward_std": 0.3436829224228859, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687798023224, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 1332.544677734375, "epoch": 0.226420730341274, "grad_norm": 4.434770584106445, "kl": 2.41015625, "learning_rate": 2.870723979579027e-07, "loss": 0.1581, "reward": 0.6981026977300644, "reward_std": 0.32088953256607056, "rewards/accuracy_reward": 0.14285715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5552455484867096, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 1287.1295318603516, "epoch": 0.22671943842879547, "grad_norm": 3.5314688682556152, "kl": 2.1640625, "learning_rate": 2.8701220289476587e-07, "loss": 0.1916, "reward": 0.6590402126312256, "reward_std": 0.3501978702843189, "rewards/accuracy_reward": 0.08035714854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 1212.9018249511719, "epoch": 0.22701814651631694, "grad_norm": 11.311164855957031, "kl": 3.0703125, "learning_rate": 2.8695187509431086e-07, "loss": 0.2065, "reward": 0.7109375298023224, "reward_std": 0.2974485978484154, "rewards/accuracy_reward": 0.18303571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5279018133878708, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 1238.8995971679688, "epoch": 0.2273168546038384, "grad_norm": 17.24918556213379, "kl": 3.73828125, "learning_rate": 2.868914146221683e-07, "loss": 0.314, "reward": 0.6227678805589676, "reward_std": 0.34802592545747757, "rewards/accuracy_reward": 0.07366071850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5491071790456772, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 1274.3326416015625, "epoch": 0.22761556269135988, "grad_norm": 5.497211456298828, "kl": 1.7998046875, "learning_rate": 2.8683082154411326e-07, "loss": 0.2369, "reward": 0.728794664144516, "reward_std": 0.31047173589468, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5636160969734192, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 1161.1964721679688, "epoch": 0.22791427077888135, "grad_norm": 4.155698776245117, "kl": 2.162109375, "learning_rate": 2.867700959260651e-07, "loss": 0.2021, "reward": 0.7087053954601288, "reward_std": 0.3384958282113075, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5658482387661934, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 1082.3594207763672, "epoch": 0.22821297886640282, "grad_norm": 6.885112285614014, "kl": 1.931640625, "learning_rate": 2.867092378340873e-07, "loss": 0.2197, "reward": 0.7444196790456772, "reward_std": 0.3298797160387039, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839626312256, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 1198.52685546875, "epoch": 0.22851168695392426, "grad_norm": 3.1637697219848633, "kl": 2.49609375, "learning_rate": 2.866482473343876e-07, "loss": 0.1891, "reward": 0.6573660969734192, "reward_std": 0.30999891832470894, "rewards/accuracy_reward": 0.09151785913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5658482313156128, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 1152.3839721679688, "epoch": 0.22881039504144574, "grad_norm": 8.081725120544434, "kl": 3.46484375, "learning_rate": 2.865871244933175e-07, "loss": 0.2612, "reward": 0.7477678954601288, "reward_std": 0.36652661114931107, "rewards/accuracy_reward": 0.20312501210719347, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5446428805589676, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 1072.1942443847656, "epoch": 0.2291091031289672, "grad_norm": 4.223757266998291, "kl": 2.40234375, "learning_rate": 2.865258693773728e-07, "loss": 0.1982, "reward": 0.7907366305589676, "reward_std": 0.3608318492770195, "rewards/accuracy_reward": 0.20758929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5831473469734192, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 1246.9554138183594, "epoch": 0.22940781121648868, "grad_norm": 12.715611457824707, "kl": 4.08203125, "learning_rate": 2.86464482053193e-07, "loss": 0.3404, "reward": 0.5993303954601288, "reward_std": 0.33288150280714035, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4966517984867096, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 1204.26123046875, "epoch": 0.22970651930401015, "grad_norm": 5.4133453369140625, "kl": 2.5859375, "learning_rate": 2.864029625875617e-07, "loss": 0.1952, "reward": 0.7226562947034836, "reward_std": 0.30622825771570206, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5574777126312256, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 1206.5246276855469, "epoch": 0.23000522739153162, "grad_norm": 10.956332206726074, "kl": 3.515625, "learning_rate": 2.863413110474057e-07, "loss": 0.3352, "reward": 0.6891741454601288, "reward_std": 0.3309524282813072, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5329241454601288, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 1280.8103332519531, "epoch": 0.2303039354790531, "grad_norm": 5.036247253417969, "kl": 2.640625, "learning_rate": 2.8627952749979605e-07, "loss": 0.2923, "reward": 0.671316996216774, "reward_std": 0.35835500806570053, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5507812798023224, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 1318.5714721679688, "epoch": 0.23060264356657456, "grad_norm": 9.328558921813965, "kl": 3.26171875, "learning_rate": 2.8621761201194713e-07, "loss": 0.2132, "reward": 0.6590402126312256, "reward_std": 0.28770100325345993, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5429687798023224, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 1263.0558471679688, "epoch": 0.23090135165409603, "grad_norm": 4.427732944488525, "kl": 2.88671875, "learning_rate": 2.8615556465121695e-07, "loss": 0.2271, "reward": 0.5993303656578064, "reward_std": 0.3341727703809738, "rewards/accuracy_reward": 0.06026785867288709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625298023224, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 1240.6005096435547, "epoch": 0.2312000597416175, "grad_norm": 4.071465492248535, "kl": 2.55859375, "learning_rate": 2.8609338548510703e-07, "loss": 0.2404, "reward": 0.6428571790456772, "reward_std": 0.2654860019683838, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5558035969734192, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 1279.8192749023438, "epoch": 0.23149876782913897, "grad_norm": 5.421728134155273, "kl": 3.09375, "learning_rate": 2.8603107458126207e-07, "loss": 0.2661, "reward": 0.6199776977300644, "reward_std": 0.31942547112703323, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276977300644, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 1238.72998046875, "epoch": 0.23179747591666044, "grad_norm": 6.638443946838379, "kl": 2.099609375, "learning_rate": 2.859686320074703e-07, "loss": 0.2362, "reward": 0.6183035969734192, "reward_std": 0.3629935756325722, "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506696455180645, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 1211.966552734375, "epoch": 0.23209618400418192, "grad_norm": 6.216376781463623, "kl": 1.662109375, "learning_rate": 2.859060578316631e-07, "loss": 0.2187, "reward": 0.6835937798023224, "reward_std": 0.32299667596817017, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5630580633878708, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 1201.1183471679688, "epoch": 0.2323948920917034, "grad_norm": 5.217857837677002, "kl": 2.36328125, "learning_rate": 2.8584335212191505e-07, "loss": 0.276, "reward": 0.6484375298023224, "reward_std": 0.3490828797221184, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375223517418, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 1318.2433471679688, "epoch": 0.23269360017922486, "grad_norm": 9.307173728942871, "kl": 2.98046875, "learning_rate": 2.8578051494644377e-07, "loss": 0.2765, "reward": 0.5094866380095482, "reward_std": 0.3281366676092148, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 1314.200927734375, "epoch": 0.23299230826674633, "grad_norm": 3.6287057399749756, "kl": 2.390625, "learning_rate": 2.8571754637361e-07, "loss": 0.2108, "reward": 0.5602678954601288, "reward_std": 0.326089471578598, "rewards/accuracy_reward": 0.08258929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785895228386, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 1223.9911193847656, "epoch": 0.2332910163542678, "grad_norm": 10.816656112670898, "kl": 3.048828125, "learning_rate": 2.856544464719174e-07, "loss": 0.2819, "reward": 0.6400669813156128, "reward_std": 0.3116523139178753, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517299123108387, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 1332.6406555175781, "epoch": 0.23358972444178927, "grad_norm": 7.468239784240723, "kl": 3.046875, "learning_rate": 2.8559121531001244e-07, "loss": 0.2263, "reward": 0.6556920111179352, "reward_std": 0.34089213609695435, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5172991305589676, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 1305.5692138671875, "epoch": 0.23388843252931074, "grad_norm": 4.426988124847412, "kl": 2.123046875, "learning_rate": 2.8552785295668447e-07, "loss": 0.159, "reward": 0.5909598469734192, "reward_std": 0.3010644093155861, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.492745541036129, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 1191.2902526855469, "epoch": 0.2341871406168322, "grad_norm": 6.330776214599609, "kl": 3.03125, "learning_rate": 2.8546435948086537e-07, "loss": 0.2442, "reward": 0.5703125298023224, "reward_std": 0.3319938853383064, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.514508955180645, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 1236.0536193847656, "epoch": 0.23448584870435368, "grad_norm": 4.292649745941162, "kl": 2.2109375, "learning_rate": 2.8540073495163005e-07, "loss": 0.2076, "reward": 0.667410746216774, "reward_std": 0.3314632549881935, "rewards/accuracy_reward": 0.14285715203732252, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5245535969734192, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 1320.27685546875, "epoch": 0.23478455679187513, "grad_norm": 4.9177632331848145, "kl": 2.244140625, "learning_rate": 2.8533697943819554e-07, "loss": 0.2221, "reward": 0.5859375298023224, "reward_std": 0.2909090965986252, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5189732313156128, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 1283.9754943847656, "epoch": 0.2350832648793966, "grad_norm": 4.365779876708984, "kl": 3.0703125, "learning_rate": 2.8527309300992184e-07, "loss": 0.2502, "reward": 0.558035746216774, "reward_std": 0.3333124667406082, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678805589676, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 1318.3661499023438, "epoch": 0.23538197296691807, "grad_norm": 15.635862350463867, "kl": 3.36328125, "learning_rate": 2.8520907573631084e-07, "loss": 0.3501, "reward": 0.7053571790456772, "reward_std": 0.3337148055434227, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964626312256, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 1345.6004943847656, "epoch": 0.23568068105443954, "grad_norm": 5.143390655517578, "kl": 3.05859375, "learning_rate": 2.8514492768700725e-07, "loss": 0.2521, "reward": 0.5652901977300644, "reward_std": 0.31984613835811615, "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 1229.7701416015625, "epoch": 0.235979389141961, "grad_norm": 9.174274444580078, "kl": 3.19140625, "learning_rate": 2.850806489317978e-07, "loss": 0.283, "reward": 0.6294643133878708, "reward_std": 0.285482257604599, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.515625037252903, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 1315.6919860839844, "epoch": 0.23627809722948248, "grad_norm": 4.6327290534973145, "kl": 2.64453125, "learning_rate": 2.850162395406115e-07, "loss": 0.2614, "reward": 0.6584821790456772, "reward_std": 0.2808911129832268, "rewards/accuracy_reward": 0.16964286798611283, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393133878708, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 1209.1183776855469, "epoch": 0.23657680531700395, "grad_norm": 7.542088031768799, "kl": 2.94140625, "learning_rate": 2.8495169958351935e-07, "loss": 0.2995, "reward": 0.6395089626312256, "reward_std": 0.3628928065299988, "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125149011612, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 1371.7813110351562, "epoch": 0.23687551340452542, "grad_norm": 13.338149070739746, "kl": 2.423828125, "learning_rate": 2.8488702913073464e-07, "loss": 0.1618, "reward": 0.6054687798023224, "reward_std": 0.33822008967399597, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5340401977300644, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 1169.41748046875, "epoch": 0.2371742214920469, "grad_norm": 7.263813018798828, "kl": 2.10546875, "learning_rate": 2.848222282526124e-07, "loss": 0.2547, "reward": 0.7031250447034836, "reward_std": 0.35978198796510696, "rewards/accuracy_reward": 0.18526786798611283, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5178571715950966, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 1227.87060546875, "epoch": 0.23747292957956836, "grad_norm": 8.263843536376953, "kl": 1.94140625, "learning_rate": 2.8475729701964974e-07, "loss": 0.2392, "reward": 0.6367187947034836, "reward_std": 0.32348018139600754, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027901977300644, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 1177.29248046875, "epoch": 0.23777163766708984, "grad_norm": 8.356523513793945, "kl": 2.087890625, "learning_rate": 2.8469223550248535e-07, "loss": 0.2757, "reward": 0.6250000298023224, "reward_std": 0.38262295722961426, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5223214477300644, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 1291.5290832519531, "epoch": 0.2380703457546113, "grad_norm": 4.8000168800354, "kl": 2.51953125, "learning_rate": 2.8462704377189985e-07, "loss": 0.2647, "reward": 0.5424107387661934, "reward_std": 0.30116528272628784, "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393059372902, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 1382.6250305175781, "epoch": 0.23836905384213278, "grad_norm": 5.47354793548584, "kl": 2.3515625, "learning_rate": 2.845617218988154e-07, "loss": 0.2438, "reward": 0.586495578289032, "reward_std": 0.3233739957213402, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5306919813156128, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 1221.2701416015625, "epoch": 0.23866776192965425, "grad_norm": 7.5547356605529785, "kl": 2.474609375, "learning_rate": 2.8449626995429586e-07, "loss": 0.2509, "reward": 0.6344866305589676, "reward_std": 0.31189096719026566, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5228794738650322, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 1193.3147888183594, "epoch": 0.23896647001717572, "grad_norm": 4.990750312805176, "kl": 2.65625, "learning_rate": 2.8443068800954645e-07, "loss": 0.2769, "reward": 0.7176339626312256, "reward_std": 0.40026436001062393, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5435268059372902, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 1276.0469360351562, "epoch": 0.2392651781046972, "grad_norm": 10.731584548950195, "kl": 3.46875, "learning_rate": 2.8436497613591404e-07, "loss": 0.2768, "reward": 0.645089328289032, "reward_std": 0.34839576482772827, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5178571566939354, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 1113.05810546875, "epoch": 0.23956388619221866, "grad_norm": 4.940190315246582, "kl": 2.96484375, "learning_rate": 2.8429913440488657e-07, "loss": 0.2947, "reward": 0.6378348618745804, "reward_std": 0.32658807188272476, "rewards/accuracy_reward": 0.07366071501746774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741454601288, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 1241.4754943847656, "epoch": 0.23986259427974013, "grad_norm": 5.2477641105651855, "kl": 3.48046875, "learning_rate": 2.8423316288809343e-07, "loss": 0.277, "reward": 0.6266741305589676, "reward_std": 0.30622561275959015, "rewards/accuracy_reward": 0.11607143329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5106026902794838, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 1348.9888916015625, "epoch": 0.2401613023672616, "grad_norm": 6.006759166717529, "kl": 2.173828125, "learning_rate": 2.841670616573052e-07, "loss": 0.2, "reward": 0.6456473618745804, "reward_std": 0.3233916237950325, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 1285.1317443847656, "epoch": 0.24046001045478307, "grad_norm": 13.885194778442383, "kl": 3.8671875, "learning_rate": 2.841008307844335e-07, "loss": 0.3105, "reward": 0.6746652126312256, "reward_std": 0.32120588421821594, "rewards/accuracy_reward": 0.17187500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027902126312256, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 1300.7857666015625, "epoch": 0.24075871854230455, "grad_norm": 19.235450744628906, "kl": 4.515625, "learning_rate": 2.8403447034153104e-07, "loss": 0.3386, "reward": 0.6824776977300644, "reward_std": 0.3611825704574585, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5351562723517418, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 1234.9866638183594, "epoch": 0.24105742662982602, "grad_norm": 21.42917251586914, "kl": 4.44921875, "learning_rate": 2.839679804007915e-07, "loss": 0.3764, "reward": 0.635044664144516, "reward_std": 0.3604029044508934, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5100446790456772, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 1385.0469360351562, "epoch": 0.24135613471734746, "grad_norm": 7.732161045074463, "kl": 3.4375, "learning_rate": 2.8390136103454937e-07, "loss": 0.2428, "reward": 0.529575914144516, "reward_std": 0.25557639449834824, "rewards/accuracy_reward": 0.026785715948790312, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502790205180645, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 1330.529052734375, "epoch": 0.24165484280486893, "grad_norm": 5.864535331726074, "kl": 2.80078125, "learning_rate": 2.8383461231528003e-07, "loss": 0.2271, "reward": 0.5306919813156128, "reward_std": 0.27875159680843353, "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.515066996216774, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 1287.2277221679688, "epoch": 0.2419535508923904, "grad_norm": 8.077582359313965, "kl": 2.189453125, "learning_rate": 2.8376773431559954e-07, "loss": 0.1838, "reward": 0.628348246216774, "reward_std": 0.284174345433712, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5145089477300644, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 1187.1853332519531, "epoch": 0.24225225897991187, "grad_norm": 11.42967700958252, "kl": 2.568359375, "learning_rate": 2.8370072710826465e-07, "loss": 0.2706, "reward": 0.6757812798023224, "reward_std": 0.30980367958545685, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5574777126312256, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 1293.4264221191406, "epoch": 0.24255096706743334, "grad_norm": 6.9898223876953125, "kl": 2.091796875, "learning_rate": 2.8363359076617254e-07, "loss": 0.2259, "reward": 0.5792410969734192, "reward_std": 0.3464212641119957, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5033482387661934, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 1347.7522888183594, "epoch": 0.2428496751549548, "grad_norm": 8.35570240020752, "kl": 2.1875, "learning_rate": 2.8356632536236096e-07, "loss": 0.2167, "reward": 0.6534598469734192, "reward_std": 0.32220616936683655, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5530134290456772, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 1272.8572082519531, "epoch": 0.24314838324247628, "grad_norm": 7.225669860839844, "kl": 2.359375, "learning_rate": 2.8349893097000826e-07, "loss": 0.2154, "reward": 0.607700914144516, "reward_std": 0.3503415435552597, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5161830484867096, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 1283.3013610839844, "epoch": 0.24344709132999776, "grad_norm": 6.720060348510742, "kl": 1.892578125, "learning_rate": 2.834314076624327e-07, "loss": 0.2175, "reward": 0.5987723469734192, "reward_std": 0.3119438588619232, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5206473544239998, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 1319.3861999511719, "epoch": 0.24374579941751923, "grad_norm": 5.53410005569458, "kl": 2.7265625, "learning_rate": 2.8336375551309315e-07, "loss": 0.3113, "reward": 0.6305803656578064, "reward_std": 0.3360505923628807, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5100446790456772, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 1342.1697082519531, "epoch": 0.2440445075050407, "grad_norm": 8.228979110717773, "kl": 3.21484375, "learning_rate": 2.832959745955885e-07, "loss": 0.2704, "reward": 0.602120578289032, "reward_std": 0.30556943267583847, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5061384364962578, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 1391.7969360351562, "epoch": 0.24434321559256217, "grad_norm": 7.372379302978516, "kl": 6.10546875, "learning_rate": 2.8322806498365777e-07, "loss": 0.2114, "reward": 0.5446428805589676, "reward_std": 0.2974008545279503, "rewards/accuracy_reward": 0.03348214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607313156128, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 1313.7746276855469, "epoch": 0.24464192368008364, "grad_norm": 3.952873706817627, "kl": 3.3203125, "learning_rate": 2.8316002675117994e-07, "loss": 0.2855, "reward": 0.6568080633878708, "reward_std": 0.3087656497955322, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937649011612, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 1299.3616638183594, "epoch": 0.2449406317676051, "grad_norm": 8.576035499572754, "kl": 3.62109375, "learning_rate": 2.830918599721739e-07, "loss": 0.3071, "reward": 0.6015625149011612, "reward_std": 0.33396805822849274, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125298023224, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 1367.0692749023438, "epoch": 0.24523933985512658, "grad_norm": 4.274262428283691, "kl": 3.28125, "learning_rate": 2.830235647207985e-07, "loss": 0.233, "reward": 0.5864955559372902, "reward_std": 0.3256503865122795, "rewards/accuracy_reward": 0.06919643003493547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517299123108387, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 1301.1273193359375, "epoch": 0.24553804794264805, "grad_norm": 3.9780843257904053, "kl": 3.0625, "learning_rate": 2.829551410713522e-07, "loss": 0.2982, "reward": 0.5781250298023224, "reward_std": 0.3117125928401947, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.504464291036129, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 1209.5245971679688, "epoch": 0.24583675603016952, "grad_norm": 7.420854091644287, "kl": 2.580078125, "learning_rate": 2.8288658909827327e-07, "loss": 0.2889, "reward": 0.641183078289032, "reward_std": 0.3433975800871849, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5273437798023224, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 1353.1072082519531, "epoch": 0.246135464117691, "grad_norm": 3.7962844371795654, "kl": 3.21875, "learning_rate": 2.8281790887613956e-07, "loss": 0.2563, "reward": 0.5652902126312256, "reward_std": 0.3389114886522293, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760045036673546, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 1312.4665832519531, "epoch": 0.24643417220521247, "grad_norm": 6.96976900100708, "kl": 3.38671875, "learning_rate": 2.827491004796683e-07, "loss": 0.2739, "reward": 0.5368303805589676, "reward_std": 0.3019115775823593, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875149011612, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 1357.9688110351562, "epoch": 0.24673288029273394, "grad_norm": 6.055230140686035, "kl": 3.06640625, "learning_rate": 2.826801639837165e-07, "loss": 0.3118, "reward": 0.6819196939468384, "reward_std": 0.3022591136395931, "rewards/accuracy_reward": 0.19196429708972573, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.489955373108387, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 1423.4420471191406, "epoch": 0.2470315883802554, "grad_norm": 4.583759307861328, "kl": 2.2734375, "learning_rate": 2.8261109946328004e-07, "loss": 0.1718, "reward": 0.6383928954601288, "reward_std": 0.3050181567668915, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5178571566939354, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 1337.4665832519531, "epoch": 0.24733029646777688, "grad_norm": 5.01204252243042, "kl": 2.609375, "learning_rate": 2.825419069934946e-07, "loss": 0.2556, "reward": 0.5574776977300644, "reward_std": 0.29394273459911346, "rewards/accuracy_reward": 0.03571428684517741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.521763414144516, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 1387.82373046875, "epoch": 0.24762900455529832, "grad_norm": 5.667433738708496, "kl": 2.654296875, "learning_rate": 2.824725866496346e-07, "loss": 0.2622, "reward": 0.608816996216774, "reward_std": 0.2895742431282997, "rewards/accuracy_reward": 0.10714286239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741305589676, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 1309.9911193847656, "epoch": 0.2479277126428198, "grad_norm": 8.561729431152344, "kl": 3.07421875, "learning_rate": 2.8240313850711396e-07, "loss": 0.2983, "reward": 0.588169664144516, "reward_std": 0.28844039887189865, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5212053880095482, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 1281.5290832519531, "epoch": 0.24822642073034126, "grad_norm": 5.860593318939209, "kl": 2.9140625, "learning_rate": 2.8233356264148533e-07, "loss": 0.2611, "reward": 0.5993303805589676, "reward_std": 0.3253520503640175, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854910895228386, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 1369.57373046875, "epoch": 0.24852512881786273, "grad_norm": 4.06842041015625, "kl": 3.0546875, "learning_rate": 2.822638591284405e-07, "loss": 0.2219, "reward": 0.5591518059372902, "reward_std": 0.29623347893357277, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4787946715950966, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 1291.4933776855469, "epoch": 0.2488238369053842, "grad_norm": 47.73170852661133, "kl": 4.38671875, "learning_rate": 2.8219402804381e-07, "loss": 0.2706, "reward": 0.6127232313156128, "reward_std": 0.3205762207508087, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5412946790456772, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 1418.3572082519531, "epoch": 0.24912254499290568, "grad_norm": 6.2585883140563965, "kl": 3.34765625, "learning_rate": 2.821240694635632e-07, "loss": 0.2968, "reward": 0.5418526977300644, "reward_std": 0.28156135231256485, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205559372902, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 1377.5491333007812, "epoch": 0.24942125308042715, "grad_norm": 8.116395950317383, "kl": 2.59375, "learning_rate": 2.820539834638083e-07, "loss": 0.209, "reward": 0.5853794813156128, "reward_std": 0.30648173391819, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4893973544239998, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 1405.4866638183594, "epoch": 0.24971996116794862, "grad_norm": 7.691073417663574, "kl": 2.96484375, "learning_rate": 2.819837701207919e-07, "loss": 0.265, "reward": 0.5909598544239998, "reward_std": 0.33794164657592773, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169813156128, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 1284.8996276855469, "epoch": 0.2500186692554701, "grad_norm": 6.742989540100098, "kl": 3.25, "learning_rate": 2.819134295108992e-07, "loss": 0.2951, "reward": 0.5686384215950966, "reward_std": 0.31708216667175293, "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483816996216774, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 1413.5067749023438, "epoch": 0.2503173773429916, "grad_norm": 4.414276123046875, "kl": 2.94921875, "learning_rate": 2.81842961710654e-07, "loss": 0.2686, "reward": 0.6132812798023224, "reward_std": 0.3042314723134041, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098395228386, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 1349.1183776855469, "epoch": 0.25061608543051306, "grad_norm": 7.670528411865234, "kl": 3.42578125, "learning_rate": 2.8177236679671826e-07, "loss": 0.2735, "reward": 0.5864955484867096, "reward_std": 0.3005574941635132, "rewards/accuracy_reward": 0.07142857415601611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5150669887661934, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 1290.18310546875, "epoch": 0.2509147935180345, "grad_norm": 4.2574052810668945, "kl": 3.1953125, "learning_rate": 2.817016448458924e-07, "loss": 0.3144, "reward": 0.6579241305589676, "reward_std": 0.33755966275930405, "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4905134066939354, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 1486.2567749023438, "epoch": 0.25121350160555594, "grad_norm": 5.309843063354492, "kl": 3.58203125, "learning_rate": 2.816307959351149e-07, "loss": 0.2647, "reward": 0.5295759066939354, "reward_std": 0.3390566259622574, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4670759066939354, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 1318.6830749511719, "epoch": 0.2515122096930774, "grad_norm": 12.212471961975098, "kl": 2.87890625, "learning_rate": 2.8155982014146247e-07, "loss": 0.2874, "reward": 0.6383928805589676, "reward_std": 0.3132024519145489, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5290178656578064, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 1338.3236999511719, "epoch": 0.2518109177805989, "grad_norm": 7.254854202270508, "kl": 2.82421875, "learning_rate": 2.8148871754214986e-07, "loss": 0.2499, "reward": 0.5636160969734192, "reward_std": 0.3151557594537735, "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.481026828289032, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 1351.8192749023438, "epoch": 0.25210962586812036, "grad_norm": 9.517291069030762, "kl": 2.609375, "learning_rate": 2.814174882145296e-07, "loss": 0.2499, "reward": 0.604910746216774, "reward_std": 0.3428619056940079, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933035895228386, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 1336.3058471679688, "epoch": 0.25240833395564183, "grad_norm": 9.13882064819336, "kl": 3.4453125, "learning_rate": 2.813461322360924e-07, "loss": 0.3315, "reward": 0.6395089477300644, "reward_std": 0.3134279400110245, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4832589626312256, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 1341.6808471679688, "epoch": 0.2527070420431633, "grad_norm": 8.54890251159668, "kl": 3.17578125, "learning_rate": 2.812746496844664e-07, "loss": 0.2803, "reward": 0.6422991454601288, "reward_std": 0.35592351108789444, "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5128348618745804, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 1357.9040832519531, "epoch": 0.25300575013068477, "grad_norm": 6.234477519989014, "kl": 3.05859375, "learning_rate": 2.812030406374177e-07, "loss": 0.274, "reward": 0.565848246216774, "reward_std": 0.295756570994854, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125149011612, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 1469.49560546875, "epoch": 0.25330445821820624, "grad_norm": 4.495426177978516, "kl": 2.75390625, "learning_rate": 2.8113130517284994e-07, "loss": 0.2075, "reward": 0.619419664144516, "reward_std": 0.2936927303671837, "rewards/accuracy_reward": 0.11607143492437899, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.503348246216774, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 1364.591552734375, "epoch": 0.2536031663057277, "grad_norm": 6.72017240524292, "kl": 2.43359375, "learning_rate": 2.8105944336880423e-07, "loss": 0.2006, "reward": 0.577566996216774, "reward_std": 0.3438948839902878, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5061384215950966, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 1435.1986999511719, "epoch": 0.2539018743932492, "grad_norm": 11.09793758392334, "kl": 2.337890625, "learning_rate": 2.809874553034592e-07, "loss": 0.2179, "reward": 0.6540178954601288, "reward_std": 0.3357407972216606, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607313156128, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 1386.6562805175781, "epoch": 0.25420058248077065, "grad_norm": 5.934141635894775, "kl": 2.26953125, "learning_rate": 2.8091534105513077e-07, "loss": 0.2292, "reward": 0.720982164144516, "reward_std": 0.3272845894098282, "rewards/accuracy_reward": 0.1964285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5245536044239998, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 1316.1652221679688, "epoch": 0.2544992905682921, "grad_norm": 6.863644123077393, "kl": 2.90625, "learning_rate": 2.808431007022722e-07, "loss": 0.3134, "reward": 0.6450893133878708, "reward_std": 0.2814142033457756, "rewards/accuracy_reward": 0.1361607233993709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5089286044239998, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 1329.4553833007812, "epoch": 0.2547979986558136, "grad_norm": 8.53454303741455, "kl": 3.50390625, "learning_rate": 2.8077073432347384e-07, "loss": 0.3112, "reward": 0.6679687798023224, "reward_std": 0.31933191418647766, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223469734192, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 1472.7544860839844, "epoch": 0.25509670674333507, "grad_norm": 14.574356079101562, "kl": 4.19921875, "learning_rate": 2.806982419974634e-07, "loss": 0.3073, "reward": 0.616629496216774, "reward_std": 0.3035300374031067, "rewards/accuracy_reward": 0.09151786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5251116156578064, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 1381.6562805175781, "epoch": 0.25539541483085654, "grad_norm": 16.538753509521484, "kl": 4.5703125, "learning_rate": 2.806256238031053e-07, "loss": 0.352, "reward": 0.5731026977300644, "reward_std": 0.31681614369153976, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5106027126312256, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 1423.4465026855469, "epoch": 0.255694122918378, "grad_norm": 12.833476066589355, "kl": 3.66015625, "learning_rate": 2.8055287981940095e-07, "loss": 0.2807, "reward": 0.5323661044239998, "reward_std": 0.2602388896048069, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.510044664144516, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 1322.2857666015625, "epoch": 0.2559928310058995, "grad_norm": 8.539982795715332, "kl": 3.35546875, "learning_rate": 2.804800101254888e-07, "loss": 0.3004, "reward": 0.6099330633878708, "reward_std": 0.31387824937701225, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5161830633878708, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 1354.7277221679688, "epoch": 0.25629153909342095, "grad_norm": 5.464725971221924, "kl": 2.890625, "learning_rate": 2.8040701480064395e-07, "loss": 0.2775, "reward": 0.6311384290456772, "reward_std": 0.33079566061496735, "rewards/accuracy_reward": 0.10937500419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.521763414144516, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 1387.5246276855469, "epoch": 0.2565902471809424, "grad_norm": 9.384422302246094, "kl": 2.96484375, "learning_rate": 2.80333893924278e-07, "loss": 0.2913, "reward": 0.6088169664144516, "reward_std": 0.332396000623703, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098544239998, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 1481.0023193359375, "epoch": 0.2568889552684639, "grad_norm": 6.578606605529785, "kl": 3.033203125, "learning_rate": 2.802606475759395e-07, "loss": 0.2747, "reward": 0.6071428954601288, "reward_std": 0.2850143313407898, "rewards/accuracy_reward": 0.11830357206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393208384514, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 1491.0647583007812, "epoch": 0.25718766335598536, "grad_norm": 6.945028305053711, "kl": 2.703125, "learning_rate": 2.801872758353131e-07, "loss": 0.2023, "reward": 0.6210937798023224, "reward_std": 0.3093777149915695, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5139508992433548, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 1418.2322387695312, "epoch": 0.25748637144350683, "grad_norm": 5.841421604156494, "kl": 2.73828125, "learning_rate": 2.801137787822202e-07, "loss": 0.2815, "reward": 0.683035746216774, "reward_std": 0.303206667304039, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.511160746216774, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 1400.1697082519531, "epoch": 0.2577850795310283, "grad_norm": 5.521731376647949, "kl": 3.107421875, "learning_rate": 2.800401564966183e-07, "loss": 0.3113, "reward": 0.5664062649011612, "reward_std": 0.2803182974457741, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455484867096, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 1347.9598388671875, "epoch": 0.2580837876185498, "grad_norm": 7.443802356719971, "kl": 2.650390625, "learning_rate": 2.799664090586014e-07, "loss": 0.2363, "reward": 0.6411830633878708, "reward_std": 0.292870108038187, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5385044813156128, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 1421.3906860351562, "epoch": 0.25838249570607125, "grad_norm": 3.698399305343628, "kl": 2.78515625, "learning_rate": 2.7989253654839924e-07, "loss": 0.2437, "reward": 0.5703125149011612, "reward_std": 0.27113954722881317, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494419664144516, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 1505.4330749511719, "epoch": 0.2586812037935927, "grad_norm": 23.132648468017578, "kl": 4.59375, "learning_rate": 2.798185390463781e-07, "loss": 0.3348, "reward": 0.5954241305589676, "reward_std": 0.3503289669752121, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506138414144516, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 1329.9911499023438, "epoch": 0.2589799118811142, "grad_norm": 17.73627281188965, "kl": 4.43359375, "learning_rate": 2.797444166330398e-07, "loss": 0.3577, "reward": 0.6177455484867096, "reward_std": 0.2893698588013649, "rewards/accuracy_reward": 0.13839286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.479352705180645, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 1424.9308166503906, "epoch": 0.25927861996863566, "grad_norm": 10.51187801361084, "kl": 3.8046875, "learning_rate": 2.7967016938902243e-07, "loss": 0.2944, "reward": 0.5848214626312256, "reward_std": 0.3200265169143677, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5044643133878708, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 1394.9129943847656, "epoch": 0.25957732805615713, "grad_norm": 16.28541374206543, "kl": 4.23828125, "learning_rate": 2.795957973950996e-07, "loss": 0.3718, "reward": 0.6595982387661934, "reward_std": 0.3363785967230797, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4988839477300644, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 1398.3281555175781, "epoch": 0.2598760361436786, "grad_norm": 3.8285696506500244, "kl": 2.78515625, "learning_rate": 2.795213007321808e-07, "loss": 0.2253, "reward": 0.5892857313156128, "reward_std": 0.3413441702723503, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.511160746216774, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 1367.1585388183594, "epoch": 0.2601747442312001, "grad_norm": 9.68786907196045, "kl": 2.47265625, "learning_rate": 2.794466794813112e-07, "loss": 0.2603, "reward": 0.658482164144516, "reward_std": 0.3356481119990349, "rewards/accuracy_reward": 0.15401786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5044643133878708, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 1367.5000610351562, "epoch": 0.26047345231872154, "grad_norm": 7.535067081451416, "kl": 3.1640625, "learning_rate": 2.793719337236712e-07, "loss": 0.3043, "reward": 0.5239955633878708, "reward_std": 0.2961348667740822, "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490513414144516, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 1332.5938110351562, "epoch": 0.260772160406243, "grad_norm": 15.367390632629395, "kl": 3.37890625, "learning_rate": 2.7929706354057696e-07, "loss": 0.3477, "reward": 0.6021205633878708, "reward_std": 0.25876715779304504, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5061384215950966, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 1488.2344360351562, "epoch": 0.2610708684937645, "grad_norm": 10.985326766967773, "kl": 2.71484375, "learning_rate": 2.7922206901347997e-07, "loss": 0.2431, "reward": 0.5675223469734192, "reward_std": 0.30618931353092194, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5117187649011612, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 1356.7567443847656, "epoch": 0.26136957658128596, "grad_norm": 8.635026931762695, "kl": 2.685546875, "learning_rate": 2.791469502239668e-07, "loss": 0.315, "reward": 0.647879496216774, "reward_std": 0.34306858852505684, "rewards/accuracy_reward": 0.15625000977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4916294813156128, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 1427.8281860351562, "epoch": 0.2616682846688074, "grad_norm": 4.388730049133301, "kl": 3.125, "learning_rate": 2.7907170725375945e-07, "loss": 0.2298, "reward": 0.6093750298023224, "reward_std": 0.333850659430027, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506696455180645, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 1482.2389221191406, "epoch": 0.2619669927563289, "grad_norm": 6.1048102378845215, "kl": 3.94921875, "learning_rate": 2.7899634018471476e-07, "loss": 0.2592, "reward": 0.5954241454601288, "reward_std": 0.31537649035453796, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919887661934, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 1349.7634582519531, "epoch": 0.26226570084385037, "grad_norm": 4.113785266876221, "kl": 3.20703125, "learning_rate": 2.7892084909882484e-07, "loss": 0.2893, "reward": 0.6210937798023224, "reward_std": 0.29919523000717163, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5161830633878708, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 1345.1786499023438, "epoch": 0.26256440893137184, "grad_norm": 9.511211395263672, "kl": 3.0703125, "learning_rate": 2.7884523407821657e-07, "loss": 0.3342, "reward": 0.6679687947034836, "reward_std": 0.2676994912326336, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5228794813156128, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 1439.7188110351562, "epoch": 0.2628631170188933, "grad_norm": 3.7201783657073975, "kl": 2.875, "learning_rate": 2.787694952051516e-07, "loss": 0.2463, "reward": 0.6199777126312256, "reward_std": 0.32747218012809753, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419887661934, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 1352.5915832519531, "epoch": 0.2631618251064148, "grad_norm": 6.517705917358398, "kl": 3.22265625, "learning_rate": 2.786936325620265e-07, "loss": 0.2597, "reward": 0.6065848469734192, "reward_std": 0.30460650473833084, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5262277126312256, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 1368.1250915527344, "epoch": 0.26346053319393625, "grad_norm": 25.649131774902344, "kl": 4.2734375, "learning_rate": 2.786176462313723e-07, "loss": 0.2904, "reward": 0.6495536118745804, "reward_std": 0.34788109362125397, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5089285969734192, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 1374.3929138183594, "epoch": 0.26375924128145767, "grad_norm": 5.783568382263184, "kl": 2.9375, "learning_rate": 2.7854153629585476e-07, "loss": 0.236, "reward": 0.5781250223517418, "reward_std": 0.2821527384221554, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5022321566939354, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 1428.3304138183594, "epoch": 0.26405794936897914, "grad_norm": 3.995025396347046, "kl": 2.46484375, "learning_rate": 2.78465302838274e-07, "loss": 0.1999, "reward": 0.5758928805589676, "reward_std": 0.28347013145685196, "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5290178805589676, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 1378.4442443847656, "epoch": 0.2643566574565006, "grad_norm": 4.430325984954834, "kl": 2.81640625, "learning_rate": 2.783889459415645e-07, "loss": 0.2629, "reward": 0.6796875447034836, "reward_std": 0.3378197103738785, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498883955180645, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 1401.5379943847656, "epoch": 0.2646553655440221, "grad_norm": 4.138897895812988, "kl": 3.126953125, "learning_rate": 2.783124656887952e-07, "loss": 0.2472, "reward": 0.6445312798023224, "reward_std": 0.2888336703181267, "rewards/accuracy_reward": 0.15625000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812723517418, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 1344.4353637695312, "epoch": 0.26495407363154355, "grad_norm": 5.581734657287598, "kl": 3.25, "learning_rate": 2.78235862163169e-07, "loss": 0.3093, "reward": 0.597098246216774, "reward_std": 0.27381882444024086, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4877232313156128, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 1330.0067443847656, "epoch": 0.265252781719065, "grad_norm": 3.819575071334839, "kl": 2.728515625, "learning_rate": 2.781591354480231e-07, "loss": 0.2227, "reward": 0.6305803954601288, "reward_std": 0.32582540810108185, "rewards/accuracy_reward": 0.11830357927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768133878708, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 1404.6451721191406, "epoch": 0.2655514898065865, "grad_norm": 10.274106979370117, "kl": 2.751953125, "learning_rate": 2.7808228562682856e-07, "loss": 0.2771, "reward": 0.5652901977300644, "reward_std": 0.32546598464250565, "rewards/accuracy_reward": 0.06473214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580559372902, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 1381.1161193847656, "epoch": 0.26585019789410796, "grad_norm": 3.8736863136291504, "kl": 3.08984375, "learning_rate": 2.7800531278319057e-07, "loss": 0.3083, "reward": 0.6886160969734192, "reward_std": 0.35241761803627014, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768059372902, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 1340.3861999511719, "epoch": 0.26614890598162944, "grad_norm": 15.187454223632812, "kl": 3.921875, "learning_rate": 2.77928217000848e-07, "loss": 0.3381, "reward": 0.695870578289032, "reward_std": 0.3062834143638611, "rewards/accuracy_reward": 0.17410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.521763414144516, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 1371.93310546875, "epoch": 0.2664476140691509, "grad_norm": 4.139835357666016, "kl": 2.7734375, "learning_rate": 2.778509983636734e-07, "loss": 0.2784, "reward": 0.6629464626312256, "reward_std": 0.30169688165187836, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607313156128, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 1484.77685546875, "epoch": 0.2667463221566724, "grad_norm": 4.0896100997924805, "kl": 3.55078125, "learning_rate": 2.7777365695567324e-07, "loss": 0.323, "reward": 0.623883955180645, "reward_std": 0.2865125313401222, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.489955373108387, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 1436.5045166015625, "epoch": 0.26704503024419385, "grad_norm": 4.081417560577393, "kl": 3.5, "learning_rate": 2.7769619286098724e-07, "loss": 0.3162, "reward": 0.573660746216774, "reward_std": 0.27081097289919853, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5089285969734192, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 1378.4844360351562, "epoch": 0.2673437383317153, "grad_norm": 7.377170562744141, "kl": 3.76953125, "learning_rate": 2.7761860616388896e-07, "loss": 0.3259, "reward": 0.576450914144516, "reward_std": 0.31278030574321747, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580633878708, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 1448.7031860351562, "epoch": 0.2676424464192368, "grad_norm": 6.561638355255127, "kl": 3.09765625, "learning_rate": 2.7754089694878485e-07, "loss": 0.2909, "reward": 0.5848214477300644, "reward_std": 0.35179176926612854, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143208384514, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 1420.5848693847656, "epoch": 0.26794115450675826, "grad_norm": 8.303954124450684, "kl": 2.58203125, "learning_rate": 2.774630653002151e-07, "loss": 0.2294, "reward": 0.6143973618745804, "reward_std": 0.26899826154112816, "rewards/accuracy_reward": 0.10491072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866454601288, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 1399.87060546875, "epoch": 0.26823986259427973, "grad_norm": 4.504161834716797, "kl": 3.10546875, "learning_rate": 2.773851113028529e-07, "loss": 0.253, "reward": 0.5831473469734192, "reward_std": 0.27508341893553734, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487165205180645, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 1359.6652221679688, "epoch": 0.2685385706818012, "grad_norm": 5.397341251373291, "kl": 2.732421875, "learning_rate": 2.7730703504150454e-07, "loss": 0.3473, "reward": 0.647879496216774, "reward_std": 0.32696540653705597, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027901902794838, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 1494.9911193847656, "epoch": 0.2688372787693227, "grad_norm": 6.519049644470215, "kl": 3.2890625, "learning_rate": 2.772288366011093e-07, "loss": 0.3064, "reward": 0.5072545036673546, "reward_std": 0.3061285838484764, "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116305589676, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 1454.9174499511719, "epoch": 0.26913598685684414, "grad_norm": 5.240793704986572, "kl": 2.62109375, "learning_rate": 2.771505160667394e-07, "loss": 0.2491, "reward": 0.5809152126312256, "reward_std": 0.31951793283224106, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616305589676, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 1298.6094360351562, "epoch": 0.2694346949443656, "grad_norm": 6.936910629272461, "kl": 2.890625, "learning_rate": 2.770720735236e-07, "loss": 0.2956, "reward": 0.5318080484867096, "reward_std": 0.30375228077173233, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871651977300644, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 1466.7500305175781, "epoch": 0.2697334030318871, "grad_norm": 13.108356475830078, "kl": 6.861328125, "learning_rate": 2.769935090570288e-07, "loss": 0.1905, "reward": 0.5881696492433548, "reward_std": 0.31211917102336884, "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160895228386, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 1482.6741638183594, "epoch": 0.27003211111940856, "grad_norm": 11.031299591064453, "kl": 2.7890625, "learning_rate": 2.7691482275249637e-07, "loss": 0.2956, "reward": 0.5708705559372902, "reward_std": 0.2689482197165489, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443638414144516, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 1457.72998046875, "epoch": 0.27033081920693003, "grad_norm": 5.434566974639893, "kl": 2.98828125, "learning_rate": 2.7683601469560556e-07, "loss": 0.29, "reward": 0.5496651977300644, "reward_std": 0.30042480677366257, "rewards/accuracy_reward": 0.049107144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580484867096, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 1448.7120971679688, "epoch": 0.2706295272944515, "grad_norm": 7.397217750549316, "kl": 4.04296875, "learning_rate": 2.7675708497209186e-07, "loss": 0.2689, "reward": 0.5976562798023224, "reward_std": 0.3011307679116726, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 1451.5491943359375, "epoch": 0.27092823538197297, "grad_norm": 4.881020545959473, "kl": 2.93359375, "learning_rate": 2.7667803366782314e-07, "loss": 0.2828, "reward": 0.6021205633878708, "reward_std": 0.2835773155093193, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5329241156578064, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 1448.1697082519531, "epoch": 0.27122694346949444, "grad_norm": 4.768532752990723, "kl": 2.91796875, "learning_rate": 2.7659886086879933e-07, "loss": 0.2887, "reward": 0.6099330484867096, "reward_std": 0.31883541867136955, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223469734192, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 1432.0067749023438, "epoch": 0.2715256515570159, "grad_norm": 7.3326497077941895, "kl": 3.99609375, "learning_rate": 2.765195666611528e-07, "loss": 0.3421, "reward": 0.643973246216774, "reward_std": 0.36227934807538986, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.503348246216774, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 1499.6786193847656, "epoch": 0.2718243596445374, "grad_norm": 10.305097579956055, "kl": 3.80078125, "learning_rate": 2.764401511311479e-07, "loss": 0.271, "reward": 0.624441996216774, "reward_std": 0.3277656026184559, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5083705484867096, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 1342.3795166015625, "epoch": 0.27212306773205885, "grad_norm": 9.473349571228027, "kl": 3.9375, "learning_rate": 2.763606143651808e-07, "loss": 0.3429, "reward": 0.674107164144516, "reward_std": 0.35529156774282455, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5468750223517418, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 1448.2031860351562, "epoch": 0.2724217758195803, "grad_norm": 3.524515390396118, "kl": 2.662109375, "learning_rate": 2.762809564497798e-07, "loss": 0.2276, "reward": 0.647879496216774, "reward_std": 0.3030768781900406, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5340402126312256, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 1451.2879943847656, "epoch": 0.2727204839071018, "grad_norm": 5.727113246917725, "kl": 2.826171875, "learning_rate": 2.7620117747160484e-07, "loss": 0.2625, "reward": 0.5809152126312256, "reward_std": 0.2795792259275913, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5072544813156128, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 1409.4732666015625, "epoch": 0.27301919199462327, "grad_norm": 7.37072229385376, "kl": 2.166015625, "learning_rate": 2.7612127751744776e-07, "loss": 0.2262, "reward": 0.6250000298023224, "reward_std": 0.2890464626252651, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250223517418, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 1469.9107971191406, "epoch": 0.27331790008214474, "grad_norm": 4.371983051300049, "kl": 3.0078125, "learning_rate": 2.7604125667423175e-07, "loss": 0.2848, "reward": 0.6149553954601288, "reward_std": 0.3044304922223091, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.503348246216774, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 1372.3482666015625, "epoch": 0.2736166081696662, "grad_norm": 6.481261730194092, "kl": 6.1640625, "learning_rate": 2.7596111502901174e-07, "loss": 0.221, "reward": 0.6434152126312256, "reward_std": 0.3222830891609192, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.507254496216774, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 1384.1361999511719, "epoch": 0.2739153162571877, "grad_norm": 10.593977928161621, "kl": 3.783203125, "learning_rate": 2.758808526689739e-07, "loss": 0.2291, "reward": 0.6428571790456772, "reward_std": 0.3083372339606285, "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250298023224, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 1426.0759582519531, "epoch": 0.27421402434470915, "grad_norm": 5.6162190437316895, "kl": 3.15234375, "learning_rate": 2.7580046968143586e-07, "loss": 0.2977, "reward": 0.5452009215950966, "reward_std": 0.2771916165947914, "rewards/accuracy_reward": 0.04241071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502790205180645, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 1415.2121276855469, "epoch": 0.2745127324322306, "grad_norm": 5.279362678527832, "kl": 2.458984375, "learning_rate": 2.757199661538464e-07, "loss": 0.2329, "reward": 0.699776828289032, "reward_std": 0.32828231155872345, "rewards/accuracy_reward": 0.17857143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5212053805589676, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 1480.3840026855469, "epoch": 0.2748114405197521, "grad_norm": 7.9493303298950195, "kl": 3.01171875, "learning_rate": 2.756393421737855e-07, "loss": 0.2656, "reward": 0.732700914144516, "reward_std": 0.29900649189949036, "rewards/accuracy_reward": 0.2031250149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5295759215950966, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 1484.4219360351562, "epoch": 0.27511014860727356, "grad_norm": 17.76835060119629, "kl": 5.28125, "learning_rate": 2.7555859782896415e-07, "loss": 0.3466, "reward": 0.6099330633878708, "reward_std": 0.27581916004419327, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866305589676, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 1470.1317749023438, "epoch": 0.27540885669479503, "grad_norm": 12.294031143188477, "kl": 4.03125, "learning_rate": 2.754777332072242e-07, "loss": 0.2761, "reward": 0.5915178805589676, "reward_std": 0.2682410515844822, "rewards/accuracy_reward": 0.06026785867288709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5312500298023224, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 1322.0402526855469, "epoch": 0.2757075647823165, "grad_norm": 5.55271577835083, "kl": 3.125, "learning_rate": 2.7539674839653863e-07, "loss": 0.3367, "reward": 0.6383928805589676, "reward_std": 0.28095177188515663, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607387661934, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 1465.419677734375, "epoch": 0.276006272869838, "grad_norm": 15.839284896850586, "kl": 5.2578125, "learning_rate": 2.753156434850107e-07, "loss": 0.3095, "reward": 0.6300223469734192, "reward_std": 0.3170185983181, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5117187723517418, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 1421.0736999511719, "epoch": 0.27630498095735945, "grad_norm": 21.522491455078125, "kl": 4.30078125, "learning_rate": 2.752344185608749e-07, "loss": 0.3235, "reward": 0.5647321566939354, "reward_std": 0.29026636853814125, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517857164144516, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 1445.33935546875, "epoch": 0.27660368904488086, "grad_norm": 4.823523998260498, "kl": 3.33203125, "learning_rate": 2.7515307371249574e-07, "loss": 0.2419, "reward": 0.5926339626312256, "reward_std": 0.3127312958240509, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5189732313156128, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 1378.0603332519531, "epoch": 0.27690239713240233, "grad_norm": 4.585727691650391, "kl": 3.06640625, "learning_rate": 2.7507160902836856e-07, "loss": 0.2894, "reward": 0.6077009290456772, "reward_std": 0.3008413426578045, "rewards/accuracy_reward": 0.08928572130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5184151977300644, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 1302.7143249511719, "epoch": 0.2772011052199238, "grad_norm": 6.833114147186279, "kl": 2.40625, "learning_rate": 2.7499002459711893e-07, "loss": 0.2634, "reward": 0.7215402126312256, "reward_std": 0.35914187133312225, "rewards/accuracy_reward": 0.17857143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5429687798023224, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 1421.4353332519531, "epoch": 0.2774998133074453, "grad_norm": 9.73373794555664, "kl": 2.68359375, "learning_rate": 2.7490832050750277e-07, "loss": 0.2555, "reward": 0.5680803805589676, "reward_std": 0.25897272303700447, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5033482387661934, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 1332.3348999023438, "epoch": 0.27779852139496675, "grad_norm": 9.891474723815918, "kl": 2.630859375, "learning_rate": 2.74826496848406e-07, "loss": 0.2185, "reward": 0.6462053656578064, "reward_std": 0.27672359719872475, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5145089626312256, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 1408.55810546875, "epoch": 0.2780972294824882, "grad_norm": 15.105474472045898, "kl": 4.625, "learning_rate": 2.7474455370884485e-07, "loss": 0.3505, "reward": 0.6021205633878708, "reward_std": 0.3153105527162552, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884066939354, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 1442.30810546875, "epoch": 0.2783959375700097, "grad_norm": 7.938187122344971, "kl": 2.31640625, "learning_rate": 2.746624911779654e-07, "loss": 0.2156, "reward": 0.5859375298023224, "reward_std": 0.2668657600879669, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375149011612, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 1494.7366943359375, "epoch": 0.27869464565753116, "grad_norm": 5.945620059967041, "kl": 2.49609375, "learning_rate": 2.745803093450436e-07, "loss": 0.2444, "reward": 0.6523437798023224, "reward_std": 0.31900499761104584, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866305589676, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 1412.9129943847656, "epoch": 0.27899335374505263, "grad_norm": 3.427300214767456, "kl": 3.12890625, "learning_rate": 2.744980082994853e-07, "loss": 0.309, "reward": 0.6316964626312256, "reward_std": 0.2572196498513222, "rewards/accuracy_reward": 0.13839286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933036118745804, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 1369.3505249023438, "epoch": 0.2792920618325741, "grad_norm": 17.040910720825195, "kl": 4.421875, "learning_rate": 2.744155881308259e-07, "loss": 0.3121, "reward": 0.609933078289032, "reward_std": 0.3335602581501007, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223544239998, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 1452.7545471191406, "epoch": 0.27959076992009557, "grad_norm": 13.026904106140137, "kl": 3.37890625, "learning_rate": 2.743330489287305e-07, "loss": 0.2827, "reward": 0.6350446790456772, "reward_std": 0.3073882833123207, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125298023224, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 1429.2880554199219, "epoch": 0.27988947800761704, "grad_norm": 8.501128196716309, "kl": 3.73828125, "learning_rate": 2.742503907829936e-07, "loss": 0.3107, "reward": 0.5630580484867096, "reward_std": 0.30201485753059387, "rewards/accuracy_reward": 0.09821429406292737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437723517418, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 1396.9197082519531, "epoch": 0.2801881860951385, "grad_norm": 5.724954605102539, "kl": 3.4140625, "learning_rate": 2.741676137835393e-07, "loss": 0.3279, "reward": 0.632254496216774, "reward_std": 0.3344198539853096, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5072544887661934, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 1443.5045471191406, "epoch": 0.28048689418266, "grad_norm": 6.802966594696045, "kl": 2.8671875, "learning_rate": 2.7408471802042074e-07, "loss": 0.2921, "reward": 0.5334821492433548, "reward_std": 0.27302858978509903, "rewards/accuracy_reward": 0.03348214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000149011612, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 1346.560302734375, "epoch": 0.28078560227018146, "grad_norm": 8.737113952636719, "kl": 2.1953125, "learning_rate": 2.7400170358382045e-07, "loss": 0.2399, "reward": 0.6328125149011612, "reward_std": 0.30339301377534866, "rewards/accuracy_reward": 0.09821429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5345982313156128, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 1380.4085388183594, "epoch": 0.2810843103577029, "grad_norm": 14.509044647216797, "kl": 2.53515625, "learning_rate": 2.7391857056404994e-07, "loss": 0.2953, "reward": 0.6333705559372902, "reward_std": 0.3109826222062111, "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949776977300644, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 1482.3840026855469, "epoch": 0.2813830184452244, "grad_norm": 4.6273603439331055, "kl": 3.0703125, "learning_rate": 2.738353190515498e-07, "loss": 0.2957, "reward": 0.5714285969734192, "reward_std": 0.3180937394499779, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4955357313156128, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 1498.2679443359375, "epoch": 0.28168172653274587, "grad_norm": 5.243226051330566, "kl": 3.1171875, "learning_rate": 2.737519491368896e-07, "loss": 0.2767, "reward": 0.572544664144516, "reward_std": 0.2991444617509842, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268133878708, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 1373.8348999023438, "epoch": 0.28198043462026734, "grad_norm": 6.316929340362549, "kl": 3.15625, "learning_rate": 2.7366846091076753e-07, "loss": 0.2619, "reward": 0.542968787252903, "reward_std": 0.29659396037459373, "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4916294813156128, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 1370.4152221679688, "epoch": 0.2822791427077888, "grad_norm": 20.4217586517334, "kl": 3.455078125, "learning_rate": 2.735848544640107e-07, "loss": 0.2909, "reward": 0.5407366156578064, "reward_std": 0.2838025614619255, "rewards/accuracy_reward": 0.04464285890571773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937723517418, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 1345.9933166503906, "epoch": 0.2825778507953103, "grad_norm": 4.118185997009277, "kl": 2.4375, "learning_rate": 2.7350112988757467e-07, "loss": 0.2496, "reward": 0.5468750223517418, "reward_std": 0.30858249962329865, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.497767873108387, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 1426.1340026855469, "epoch": 0.28287655888283175, "grad_norm": 5.580115795135498, "kl": 2.30078125, "learning_rate": 2.7341728727254347e-07, "loss": 0.2467, "reward": 0.5864955633878708, "reward_std": 0.30455219745635986, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812723517418, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 1472.044677734375, "epoch": 0.2831752669703532, "grad_norm": 5.723369121551514, "kl": 2.80859375, "learning_rate": 2.7333332671012984e-07, "loss": 0.2401, "reward": 0.5848214328289032, "reward_std": 0.302290890365839, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000223517418, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 1449.0357971191406, "epoch": 0.2834739750578747, "grad_norm": 4.364274024963379, "kl": 2.59375, "learning_rate": 2.7324924829167454e-07, "loss": 0.1989, "reward": 0.5172991305589676, "reward_std": 0.23525049537420273, "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506138414144516, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 1480.5491638183594, "epoch": 0.28377268314539617, "grad_norm": 4.570127010345459, "kl": 2.125, "learning_rate": 2.731650521086467e-07, "loss": 0.1978, "reward": 0.6049107536673546, "reward_std": 0.26630448922514915, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506696455180645, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 1381.700927734375, "epoch": 0.28407139123291764, "grad_norm": 4.006049156188965, "kl": 2.4453125, "learning_rate": 2.730807382526435e-07, "loss": 0.2406, "reward": 0.6523437798023224, "reward_std": 0.3253670334815979, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5206473469734192, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 1526.8840026855469, "epoch": 0.2843700993204391, "grad_norm": 3.4655749797821045, "kl": 2.6484375, "learning_rate": 2.7299630681539e-07, "loss": 0.2417, "reward": 0.5680803805589676, "reward_std": 0.27517019957304, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5100446566939354, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 1309.3572082519531, "epoch": 0.2846688074079606, "grad_norm": 4.791429042816162, "kl": 3.078125, "learning_rate": 2.7291175788873957e-07, "loss": 0.3459, "reward": 0.666852705180645, "reward_std": 0.36631103605031967, "rewards/accuracy_reward": 0.16294643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062723517418, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 1461.3861999511719, "epoch": 0.28496751549548205, "grad_norm": 5.202968597412109, "kl": 2.013671875, "learning_rate": 2.72827091564673e-07, "loss": 0.2255, "reward": 0.6222098618745804, "reward_std": 0.3748091384768486, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5262277126312256, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 1405.55810546875, "epoch": 0.2852662235830035, "grad_norm": 3.467777729034424, "kl": 2.30859375, "learning_rate": 2.7274230793529907e-07, "loss": 0.2493, "reward": 0.654575914144516, "reward_std": 0.3502536565065384, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5206473469734192, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 1456.4554443359375, "epoch": 0.285564931670525, "grad_norm": 3.611502170562744, "kl": 2.427734375, "learning_rate": 2.726574070928539e-07, "loss": 0.239, "reward": 0.593191996216774, "reward_std": 0.3098136968910694, "rewards/accuracy_reward": 0.08928571688011289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062798023224, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 1453.5513916015625, "epoch": 0.28586363975804646, "grad_norm": 2.3119800090789795, "kl": 2.7421875, "learning_rate": 2.7257238912970144e-07, "loss": 0.2965, "reward": 0.5585937723517418, "reward_std": 0.33214229345321655, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871651977300644, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 1404.3215026855469, "epoch": 0.28616234784556793, "grad_norm": 2.159822702407837, "kl": 2.36328125, "learning_rate": 2.7248725413833283e-07, "loss": 0.2448, "reward": 0.6188616305589676, "reward_std": 0.2968074716627598, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937723517418, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 1523.35498046875, "epoch": 0.2864610559330894, "grad_norm": 3.473541736602783, "kl": 2.291015625, "learning_rate": 2.724020022113665e-07, "loss": 0.2257, "reward": 0.6422991305589676, "reward_std": 0.2849598377943039, "rewards/accuracy_reward": 0.14285715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419738650322, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 1461.8572082519531, "epoch": 0.2867597640206109, "grad_norm": 2.928319215774536, "kl": 2.228515625, "learning_rate": 2.723166334415484e-07, "loss": 0.2208, "reward": 0.558035746216774, "reward_std": 0.28856799006462097, "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466517873108387, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 1370.6898193359375, "epoch": 0.28705847210813235, "grad_norm": 7.979773998260498, "kl": 1.7265625, "learning_rate": 2.7223114792175123e-07, "loss": 0.232, "reward": 0.5792410969734192, "reward_std": 0.29162852093577385, "rewards/accuracy_reward": 0.08035714388825, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498883955180645, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 1414.4241638183594, "epoch": 0.2873571801956538, "grad_norm": 8.977100372314453, "kl": 2.263671875, "learning_rate": 2.721455457449749e-07, "loss": 0.3034, "reward": 0.6171875149011612, "reward_std": 0.31962526962161064, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810267984867096, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 1364.9531860351562, "epoch": 0.2876558882831753, "grad_norm": 6.301506996154785, "kl": 1.609375, "learning_rate": 2.7205982700434626e-07, "loss": 0.2499, "reward": 0.6484375298023224, "reward_std": 0.3562006875872612, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.518973246216774, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 1404.2165832519531, "epoch": 0.28795459637069676, "grad_norm": 3.6643168926239014, "kl": 2.236328125, "learning_rate": 2.719739917931187e-07, "loss": 0.2646, "reward": 0.642857164144516, "reward_std": 0.36324674636125565, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5044643059372902, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 1406.5290832519531, "epoch": 0.28825330445821823, "grad_norm": 10.120776176452637, "kl": 2.32421875, "learning_rate": 2.718880402046727e-07, "loss": 0.3062, "reward": 0.5625000298023224, "reward_std": 0.30147644132375717, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502232164144516, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 1421.3884582519531, "epoch": 0.2885520125457397, "grad_norm": 14.409568786621094, "kl": 2.99609375, "learning_rate": 2.7180197233251516e-07, "loss": 0.2946, "reward": 0.6389509364962578, "reward_std": 0.30793674662709236, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871651977300644, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 1367.6340026855469, "epoch": 0.28885072063326117, "grad_norm": 7.342563152313232, "kl": 3.0859375, "learning_rate": 2.7171578827027945e-07, "loss": 0.3118, "reward": 0.5563616305589676, "reward_std": 0.3051098585128784, "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.507254496216774, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 1540.0491943359375, "epoch": 0.28914942872078264, "grad_norm": 6.503608226776123, "kl": 2.83984375, "learning_rate": 2.7162948811172534e-07, "loss": 0.2421, "reward": 0.536272332072258, "reward_std": 0.2971169874072075, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687649011612, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 1497.0045471191406, "epoch": 0.28944813680830406, "grad_norm": 10.2169189453125, "kl": 2.71484375, "learning_rate": 2.71543071950739e-07, "loss": 0.2567, "reward": 0.6060267984867096, "reward_std": 0.29889698326587677, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5033482238650322, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 1410.9063110351562, "epoch": 0.28974684489582553, "grad_norm": 4.593563079833984, "kl": 2.98828125, "learning_rate": 2.7145653988133266e-07, "loss": 0.2877, "reward": 0.5848214626312256, "reward_std": 0.3099651150405407, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4866071715950966, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 1407.4375610351562, "epoch": 0.290045552983347, "grad_norm": 6.600954532623291, "kl": 2.87890625, "learning_rate": 2.7136989199764483e-07, "loss": 0.3143, "reward": 0.675781287252903, "reward_std": 0.31163667142391205, "rewards/accuracy_reward": 0.16964286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5061384215950966, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 1524.9130249023438, "epoch": 0.29034426107086847, "grad_norm": 3.557403087615967, "kl": 1.962890625, "learning_rate": 2.712831283939399e-07, "loss": 0.1966, "reward": 0.5351562947034836, "reward_std": 0.29134922474622726, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 1478.6005554199219, "epoch": 0.29064296915838994, "grad_norm": 7.469570636749268, "kl": 2.6171875, "learning_rate": 2.7119624916460814e-07, "loss": 0.3175, "reward": 0.5574777126312256, "reward_std": 0.3190249875187874, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.468191996216774, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 1467.102783203125, "epoch": 0.2909416772459114, "grad_norm": 6.424932956695557, "kl": 1.708984375, "learning_rate": 2.7110925440416557e-07, "loss": 0.1637, "reward": 0.6702009290456772, "reward_std": 0.29631808400154114, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.529575914144516, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 1341.0692443847656, "epoch": 0.2912403853334329, "grad_norm": 6.8822021484375, "kl": 2.046875, "learning_rate": 2.710221442072541e-07, "loss": 0.2894, "reward": 0.6540178954601288, "reward_std": 0.32436078786849976, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502232164144516, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 1390.4085693359375, "epoch": 0.29153909342095435, "grad_norm": 9.387606620788574, "kl": 1.921875, "learning_rate": 2.70934918668641e-07, "loss": 0.2442, "reward": 0.6216518133878708, "reward_std": 0.3161216303706169, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5100446715950966, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 1467.5380249023438, "epoch": 0.2918378015084758, "grad_norm": 3.332003116607666, "kl": 2.01953125, "learning_rate": 2.708475778832191e-07, "loss": 0.2341, "reward": 0.5731026977300644, "reward_std": 0.26984647661447525, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5083705633878708, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 1490.3594665527344, "epoch": 0.2921365095959973, "grad_norm": 3.466125249862671, "kl": 1.91015625, "learning_rate": 2.7076012194600667e-07, "loss": 0.1946, "reward": 0.6406250298023224, "reward_std": 0.2916857823729515, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250149011612, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 1517.7300109863281, "epoch": 0.29243521768351877, "grad_norm": 4.047390460968018, "kl": 2.037109375, "learning_rate": 2.7067255095214716e-07, "loss": 0.2377, "reward": 0.6333705633878708, "reward_std": 0.33225350081920624, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5061384215950966, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 1534.4308776855469, "epoch": 0.29273392577104024, "grad_norm": 5.283846855163574, "kl": 2.287109375, "learning_rate": 2.705848649969093e-07, "loss": 0.2327, "reward": 0.5714285969734192, "reward_std": 0.29897549375891685, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482142873108387, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 1442.5000610351562, "epoch": 0.2930326338585617, "grad_norm": 6.715878009796143, "kl": 4.03125, "learning_rate": 2.7049706417568663e-07, "loss": 0.3068, "reward": 0.6350446790456772, "reward_std": 0.296127587556839, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.492187537252903, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 1504.1563110351562, "epoch": 0.2933313419460832, "grad_norm": 7.1804118156433105, "kl": 2.806640625, "learning_rate": 2.7040914858399807e-07, "loss": 0.2221, "reward": 0.534040205180645, "reward_std": 0.25015926361083984, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487165205180645, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 1434.2679443359375, "epoch": 0.29363005003360465, "grad_norm": 8.775562286376953, "kl": 2.625, "learning_rate": 2.70321118317487e-07, "loss": 0.2559, "reward": 0.6311384215950966, "reward_std": 0.3192870020866394, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455559372902, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 1437.8951721191406, "epoch": 0.2939287581211261, "grad_norm": 2.634521245956421, "kl": 2.55078125, "learning_rate": 2.702329734719218e-07, "loss": 0.2639, "reward": 0.6026786118745804, "reward_std": 0.302250437438488, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5089285895228386, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 1472.9732971191406, "epoch": 0.2942274662086476, "grad_norm": 2.6780097484588623, "kl": 1.830078125, "learning_rate": 2.701447141431954e-07, "loss": 0.2185, "reward": 0.5625000298023224, "reward_std": 0.2975355163216591, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 1414.9531860351562, "epoch": 0.29452617429616906, "grad_norm": 5.006982803344727, "kl": 1.8203125, "learning_rate": 2.7005634042732525e-07, "loss": 0.2525, "reward": 0.6210937798023224, "reward_std": 0.3122224174439907, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502790205180645, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 1443.1853332519531, "epoch": 0.29482488238369053, "grad_norm": 2.5495188236236572, "kl": 1.765625, "learning_rate": 2.699678524204534e-07, "loss": 0.201, "reward": 0.655691996216774, "reward_std": 0.3181569315493107, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169813156128, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 1562.6496276855469, "epoch": 0.295123590471212, "grad_norm": 2.8818306922912598, "kl": 1.974609375, "learning_rate": 2.6987925021884606e-07, "loss": 0.2186, "reward": 0.5452009290456772, "reward_std": 0.2844936102628708, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4737723469734192, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 1505.1808776855469, "epoch": 0.2954222985587335, "grad_norm": 7.036995887756348, "kl": 1.5703125, "learning_rate": 2.6979053391889375e-07, "loss": 0.2088, "reward": 0.6099330633878708, "reward_std": 0.29395782947540283, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.493861623108387, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 1413.6116333007812, "epoch": 0.29572100664625495, "grad_norm": 8.649860382080078, "kl": 1.791015625, "learning_rate": 2.697017036171111e-07, "loss": 0.2276, "reward": 0.5457589477300644, "reward_std": 0.31159402802586555, "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4988839477300644, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 1535.1027526855469, "epoch": 0.2960197147337764, "grad_norm": 3.224149227142334, "kl": 1.837890625, "learning_rate": 2.6961275941013684e-07, "loss": 0.258, "reward": 0.5803571790456772, "reward_std": 0.27463657036423683, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107387661934, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 1526.1161193847656, "epoch": 0.2963184228212979, "grad_norm": 6.502218723297119, "kl": 1.384765625, "learning_rate": 2.6952370139473347e-07, "loss": 0.2169, "reward": 0.595982164144516, "reward_std": 0.30814895033836365, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000298023224, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 1489.4174499511719, "epoch": 0.29661713090881936, "grad_norm": 2.7905609607696533, "kl": 8.177734375, "learning_rate": 2.694345296677874e-07, "loss": 0.2064, "reward": 0.5457589477300644, "reward_std": 0.2756493017077446, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854911044239998, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 1558.6942749023438, "epoch": 0.29691583899634083, "grad_norm": 2.369065761566162, "kl": 2.033203125, "learning_rate": 2.693452443263087e-07, "loss": 0.2932, "reward": 0.5998884290456772, "reward_std": 0.35612812638282776, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 1498.5759582519531, "epoch": 0.2972145470838623, "grad_norm": 1.9895102977752686, "kl": 1.99609375, "learning_rate": 2.6925584546743114e-07, "loss": 0.2443, "reward": 0.6227678880095482, "reward_std": 0.3180830478668213, "rewards/accuracy_reward": 0.13392858067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393133878708, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 1477.9375610351562, "epoch": 0.2975132551713838, "grad_norm": 2.312823534011841, "kl": 2.3671875, "learning_rate": 2.691663331884119e-07, "loss": 0.2618, "reward": 0.6841518133878708, "reward_std": 0.2893116809427738, "rewards/accuracy_reward": 0.17633929569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125074505806, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 1472.1986999511719, "epoch": 0.29781196325890524, "grad_norm": 4.005575656890869, "kl": 2.17578125, "learning_rate": 2.690767075866315e-07, "loss": 0.2837, "reward": 0.5345982387661934, "reward_std": 0.28763777017593384, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 1439.7991943359375, "epoch": 0.2981106713464267, "grad_norm": 9.778768539428711, "kl": 2.791015625, "learning_rate": 2.689869687595939e-07, "loss": 0.224, "reward": 0.6540178954601288, "reward_std": 0.30617836117744446, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4866071566939354, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 1480.3772888183594, "epoch": 0.2984093794339482, "grad_norm": 7.044945240020752, "kl": 2.650390625, "learning_rate": 2.688971168049261e-07, "loss": 0.25, "reward": 0.594866082072258, "reward_std": 0.31427397206425667, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854910969734192, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 1453.0982971191406, "epoch": 0.29870808752146966, "grad_norm": 3.3059656620025635, "kl": 2.162109375, "learning_rate": 2.688071518203782e-07, "loss": 0.2195, "reward": 0.607700914144516, "reward_std": 0.2908322736620903, "rewards/accuracy_reward": 0.10714286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580559372902, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 1424.0514221191406, "epoch": 0.2990067956089911, "grad_norm": 3.4244844913482666, "kl": 1.85546875, "learning_rate": 2.6871707390382334e-07, "loss": 0.2392, "reward": 0.6300223544239998, "reward_std": 0.3272586688399315, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498325914144516, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 1388.0781860351562, "epoch": 0.2993055036965126, "grad_norm": 7.104359149932861, "kl": 2.3828125, "learning_rate": 2.6862688315325744e-07, "loss": 0.2262, "reward": 0.5970982611179352, "reward_std": 0.29287082701921463, "rewards/accuracy_reward": 0.07366071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375223517418, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 1454.6138916015625, "epoch": 0.29960421178403407, "grad_norm": 4.774572849273682, "kl": 1.6845703125, "learning_rate": 2.6853657966679913e-07, "loss": 0.2409, "reward": 0.6450893133878708, "reward_std": 0.3057345747947693, "rewards/accuracy_reward": 0.14732143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678880095482, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 1386.1808471679688, "epoch": 0.29990291987155554, "grad_norm": 4.3118510246276855, "kl": 1.61328125, "learning_rate": 2.684461635426899e-07, "loss": 0.1776, "reward": 0.5965401902794838, "reward_std": 0.27304916828870773, "rewards/accuracy_reward": 0.10267857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616305589676, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 1554.3951416015625, "epoch": 0.300201627959077, "grad_norm": 3.4718289375305176, "kl": 1.87890625, "learning_rate": 2.683556348792935e-07, "loss": 0.2292, "reward": 0.6501116454601288, "reward_std": 0.3060283586382866, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4893973469734192, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 1383.7009582519531, "epoch": 0.3005003360465985, "grad_norm": 3.5970728397369385, "kl": 2.47265625, "learning_rate": 2.6826499377509635e-07, "loss": 0.217, "reward": 0.549665205180645, "reward_std": 0.29419688135385513, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5184152126312256, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 1452.0558776855469, "epoch": 0.30079904413411995, "grad_norm": 4.332658290863037, "kl": 2.341796875, "learning_rate": 2.68174240328707e-07, "loss": 0.2341, "reward": 0.6121651977300644, "reward_std": 0.3093794137239456, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.513950914144516, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 1543.1719360351562, "epoch": 0.3010977522216414, "grad_norm": 5.576297760009766, "kl": 2.359375, "learning_rate": 2.6808337463885635e-07, "loss": 0.2856, "reward": 0.606026828289032, "reward_std": 0.29861152172088623, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 1649.0714721679688, "epoch": 0.3013964603091629, "grad_norm": 7.40854549407959, "kl": 2.591796875, "learning_rate": 2.679923968043973e-07, "loss": 0.24, "reward": 0.5496651828289032, "reward_std": 0.3126492351293564, "rewards/accuracy_reward": 0.09598215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4536830559372902, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 1488.2411499023438, "epoch": 0.30169516839668437, "grad_norm": 6.615168571472168, "kl": 2.42578125, "learning_rate": 2.679013069243049e-07, "loss": 0.2172, "reward": 0.655691996216774, "reward_std": 0.3407023623585701, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5150669813156128, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 1575.5692443847656, "epoch": 0.30199387648420584, "grad_norm": 2.524601697921753, "kl": 2.3671875, "learning_rate": 2.6781010509767595e-07, "loss": 0.2418, "reward": 0.5680803805589676, "reward_std": 0.2779093459248543, "rewards/accuracy_reward": 0.10491072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696715950966, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 1443.7679138183594, "epoch": 0.30229258457172725, "grad_norm": 2.493049144744873, "kl": 2.236328125, "learning_rate": 2.677187914237292e-07, "loss": 0.1829, "reward": 0.6149553805589676, "reward_std": 0.3119243048131466, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854910969734192, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 1380.0379943847656, "epoch": 0.3025912926592487, "grad_norm": 3.416057586669922, "kl": 1.5283203125, "learning_rate": 2.676273660018048e-07, "loss": 0.2353, "reward": 0.6116071790456772, "reward_std": 0.3210693597793579, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000298023224, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 1511.4755249023438, "epoch": 0.3028900007467702, "grad_norm": 3.7381417751312256, "kl": 1.986328125, "learning_rate": 2.675358289313649e-07, "loss": 0.2545, "reward": 0.616629496216774, "reward_std": 0.28330057859420776, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 1520.1183471679688, "epoch": 0.30318870883429166, "grad_norm": 2.0628437995910645, "kl": 1.794921875, "learning_rate": 2.6744418031199256e-07, "loss": 0.2323, "reward": 0.593191996216774, "reward_std": 0.31127891689538956, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562723517418, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 1398.1072082519531, "epoch": 0.30348741692181314, "grad_norm": 3.759448289871216, "kl": 1.5390625, "learning_rate": 2.6735242024339276e-07, "loss": 0.2595, "reward": 0.6445312798023224, "reward_std": 0.32022762298583984, "rewards/accuracy_reward": 0.13839286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506138414144516, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 1441.7188110351562, "epoch": 0.3037861250093346, "grad_norm": 4.92402982711792, "kl": 1.373046875, "learning_rate": 2.672605488253913e-07, "loss": 0.2056, "reward": 0.5943080633878708, "reward_std": 0.3006281331181526, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5072545036673546, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 1467.3460388183594, "epoch": 0.3040848330968561, "grad_norm": 3.691633939743042, "kl": 1.984375, "learning_rate": 2.6716856615793534e-07, "loss": 0.2542, "reward": 0.7181920111179352, "reward_std": 0.3296159729361534, "rewards/accuracy_reward": 0.22991071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812723517418, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 1479.1273193359375, "epoch": 0.30438354118437755, "grad_norm": 3.457298517227173, "kl": 1.787109375, "learning_rate": 2.67076472341093e-07, "loss": 0.2717, "reward": 0.6283482313156128, "reward_std": 0.28463688120245934, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268133878708, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 1526.5491943359375, "epoch": 0.304682249271899, "grad_norm": 9.308791160583496, "kl": 2.517578125, "learning_rate": 2.669842674750533e-07, "loss": 0.255, "reward": 0.6166295111179352, "reward_std": 0.2964761555194855, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437723517418, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 1449.3996276855469, "epoch": 0.3049809573594205, "grad_norm": 4.749098300933838, "kl": 2.640625, "learning_rate": 2.668919516601261e-07, "loss": 0.3213, "reward": 0.5864955633878708, "reward_std": 0.31412966549396515, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455484867096, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 1449.9666137695312, "epoch": 0.30527966544694196, "grad_norm": 5.268867492675781, "kl": 1.884765625, "learning_rate": 2.66799524996742e-07, "loss": 0.2221, "reward": 0.585379496216774, "reward_std": 0.341641828417778, "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866454601288, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 1524.5045166015625, "epoch": 0.30557837353446343, "grad_norm": 8.92006778717041, "kl": 2.603515625, "learning_rate": 2.6670698758545214e-07, "loss": 0.3013, "reward": 0.6512276977300644, "reward_std": 0.34058764576911926, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 1568.6607971191406, "epoch": 0.3058770816219849, "grad_norm": 2.7050464153289795, "kl": 2.2421875, "learning_rate": 2.666143395269282e-07, "loss": 0.2249, "reward": 0.4955357313156128, "reward_std": 0.3100148141384125, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.450892873108387, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 1595.4509582519531, "epoch": 0.3061757897095064, "grad_norm": 2.174370288848877, "kl": 1.619140625, "learning_rate": 2.6652158092196206e-07, "loss": 0.1862, "reward": 0.572544664144516, "reward_std": 0.30517198890447617, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.505580373108387, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 1548.8125610351562, "epoch": 0.30647449779702785, "grad_norm": 4.475093364715576, "kl": 1.978515625, "learning_rate": 2.6642871187146606e-07, "loss": 0.2464, "reward": 0.6529018208384514, "reward_std": 0.27156414836645126, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483258955180645, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 1463.7143859863281, "epoch": 0.3067732058845493, "grad_norm": 4.186479568481445, "kl": 1.931640625, "learning_rate": 2.663357324764726e-07, "loss": 0.1992, "reward": 0.4988839626312256, "reward_std": 0.2867153212428093, "rewards/accuracy_reward": 0.03348214388824999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018059372902, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 1480.3795471191406, "epoch": 0.3070719139720708, "grad_norm": 4.017455101013184, "kl": 1.791015625, "learning_rate": 2.6624264283813427e-07, "loss": 0.2406, "reward": 0.5736607313156128, "reward_std": 0.30866797268390656, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933035895228386, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 1535.3773193359375, "epoch": 0.30737062205959226, "grad_norm": 4.8947434425354, "kl": 1.576171875, "learning_rate": 2.661494430577233e-07, "loss": 0.195, "reward": 0.5943080633878708, "reward_std": 0.31089942157268524, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.513950914144516, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 1491.2924499511719, "epoch": 0.30766933014711373, "grad_norm": 4.278429985046387, "kl": 2.037109375, "learning_rate": 2.66056133236632e-07, "loss": 0.2573, "reward": 0.556919664144516, "reward_std": 0.2581605464220047, "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 1396.8215026855469, "epoch": 0.3079680382346352, "grad_norm": 5.656686782836914, "kl": 2.494140625, "learning_rate": 2.659627134763723e-07, "loss": 0.2378, "reward": 0.5703125298023224, "reward_std": 0.3086673319339752, "rewards/accuracy_reward": 0.07366071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4966517984867096, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 1460.05810546875, "epoch": 0.30826674632215667, "grad_norm": 2.973719596862793, "kl": 2.416015625, "learning_rate": 2.658691838785758e-07, "loss": 0.2717, "reward": 0.623883955180645, "reward_std": 0.3271525502204895, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160895228386, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 1437.5313110351562, "epoch": 0.30856545440967814, "grad_norm": 2.2005081176757812, "kl": 2.154296875, "learning_rate": 2.6577554454499346e-07, "loss": 0.3061, "reward": 0.5602678805589676, "reward_std": 0.3195188120007515, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678805589676, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 1514.1897888183594, "epoch": 0.3088641624971996, "grad_norm": 3.586092948913574, "kl": 2.837890625, "learning_rate": 2.656817955774957e-07, "loss": 0.1716, "reward": 0.6294643208384514, "reward_std": 0.2949282377958298, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776786044239998, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 1474.8215026855469, "epoch": 0.3091628705847211, "grad_norm": 4.521512031555176, "kl": 2.552734375, "learning_rate": 2.655879370780722e-07, "loss": 0.2996, "reward": 0.5625000149011612, "reward_std": 0.24957020208239555, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500223517418, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 1524.12060546875, "epoch": 0.30946157867224255, "grad_norm": 7.906954288482666, "kl": 2.646484375, "learning_rate": 2.654939691488319e-07, "loss": 0.2449, "reward": 0.5870535969734192, "reward_std": 0.2960490398108959, "rewards/accuracy_reward": 0.10937500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776786044239998, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 1448.2031860351562, "epoch": 0.309760286759764, "grad_norm": 4.844135284423828, "kl": 2.5, "learning_rate": 2.653998918920026e-07, "loss": 0.2456, "reward": 0.619419664144516, "reward_std": 0.29809461906552315, "rewards/accuracy_reward": 0.1272321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875223517418, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 1414.27685546875, "epoch": 0.3100589948472855, "grad_norm": 3.5934360027313232, "kl": 1.6640625, "learning_rate": 2.653057054099312e-07, "loss": 0.2392, "reward": 0.6551339477300644, "reward_std": 0.34269052743911743, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768059372902, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 1483.5558776855469, "epoch": 0.31035770293480697, "grad_norm": 4.8978729248046875, "kl": 1.724609375, "learning_rate": 2.6521140980508333e-07, "loss": 0.2317, "reward": 0.5703125149011612, "reward_std": 0.30406298488378525, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 1450.3147888183594, "epoch": 0.31065641102232844, "grad_norm": 3.982600688934326, "kl": 1.0458984375, "learning_rate": 2.651170051800433e-07, "loss": 0.1907, "reward": 0.6261160969734192, "reward_std": 0.30675310641527176, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494419664144516, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 1477.44873046875, "epoch": 0.3109551191098499, "grad_norm": 3.1356923580169678, "kl": 1.57421875, "learning_rate": 2.650224916375142e-07, "loss": 0.1853, "reward": 0.526785746216774, "reward_std": 0.26938602328300476, "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000074505806, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 1369.7656555175781, "epoch": 0.3112538271973714, "grad_norm": 3.129868745803833, "kl": 1.96484375, "learning_rate": 2.6492786928031743e-07, "loss": 0.3082, "reward": 0.6300223618745804, "reward_std": 0.2943107411265373, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.493861623108387, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 1489.1719360351562, "epoch": 0.31155253528489285, "grad_norm": 2.0859944820404053, "kl": 1.77734375, "learning_rate": 2.648331382113929e-07, "loss": 0.2649, "reward": 0.5267857313156128, "reward_std": 0.3284180909395218, "rewards/accuracy_reward": 0.05580357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4709821715950966, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 1442.6473693847656, "epoch": 0.3118512433724143, "grad_norm": 4.97849178314209, "kl": 2.080078125, "learning_rate": 2.6473829853379873e-07, "loss": 0.288, "reward": 0.5664062798023224, "reward_std": 0.29543469101190567, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949776902794838, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 1508.3304138183594, "epoch": 0.3121499514599358, "grad_norm": 5.625909805297852, "kl": 2.55859375, "learning_rate": 2.646433503507111e-07, "loss": 0.3075, "reward": 0.5563616380095482, "reward_std": 0.27833162248134613, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330559372902, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 1485.7009582519531, "epoch": 0.31244865954745726, "grad_norm": 3.0781643390655518, "kl": 1.84765625, "learning_rate": 2.645482937654244e-07, "loss": 0.2454, "reward": 0.5948660969734192, "reward_std": 0.29854828119277954, "rewards/accuracy_reward": 0.08482143329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.510044664144516, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 1452.9375915527344, "epoch": 0.31274736763497873, "grad_norm": 4.157790660858154, "kl": 1.85546875, "learning_rate": 2.6445312888135084e-07, "loss": 0.2111, "reward": 0.6356026977300644, "reward_std": 0.31800907850265503, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741305589676, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 1416.6652526855469, "epoch": 0.3130460757225002, "grad_norm": 3.6631932258605957, "kl": 1.712890625, "learning_rate": 2.643578558020206e-07, "loss": 0.2785, "reward": 0.6411830633878708, "reward_std": 0.3117794319987297, "rewards/accuracy_reward": 0.13169643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866305589676, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 1524.5692749023438, "epoch": 0.3133447838100217, "grad_norm": 6.936758518218994, "kl": 2.3095703125, "learning_rate": 2.642624746310813e-07, "loss": 0.2969, "reward": 0.6188616305589676, "reward_std": 0.35744620114564896, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487165205180645, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 1417.3326416015625, "epoch": 0.31364349189754315, "grad_norm": 3.1602208614349365, "kl": 1.638671875, "learning_rate": 2.6416698547229836e-07, "loss": 0.2498, "reward": 0.6110491305589676, "reward_std": 0.33106764405965805, "rewards/accuracy_reward": 0.0937500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5172991305589676, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 1494.2746276855469, "epoch": 0.3139421999850646, "grad_norm": 2.0980777740478516, "kl": 1.6875, "learning_rate": 2.640713884295546e-07, "loss": 0.2105, "reward": 0.5943080633878708, "reward_std": 0.2969750836491585, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 1543.6384887695312, "epoch": 0.3142409080725861, "grad_norm": 3.9006800651550293, "kl": 2.470703125, "learning_rate": 2.6397568360685037e-07, "loss": 0.2008, "reward": 0.5697544738650322, "reward_std": 0.3121168985962868, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616305589676, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 1479.4576110839844, "epoch": 0.31453961616010756, "grad_norm": 2.9607508182525635, "kl": 2.060546875, "learning_rate": 2.638798711083029e-07, "loss": 0.2242, "reward": 0.7154017984867096, "reward_std": 0.3095976747572422, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4832589626312256, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 1578.5603332519531, "epoch": 0.31483832424762903, "grad_norm": 4.910199165344238, "kl": 2.30859375, "learning_rate": 2.63783951038147e-07, "loss": 0.2254, "reward": 0.5262276902794838, "reward_std": 0.27604879811406136, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598469734192, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 1411.5179748535156, "epoch": 0.31513703233515045, "grad_norm": 3.724064350128174, "kl": 1.66015625, "learning_rate": 2.636879235007343e-07, "loss": 0.2597, "reward": 0.5658482611179352, "reward_std": 0.3014937527477741, "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167411044239998, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 1536.4576721191406, "epoch": 0.3154357404226719, "grad_norm": 2.204467296600342, "kl": 1.560546875, "learning_rate": 2.6359178860053325e-07, "loss": 0.206, "reward": 0.6021205484867096, "reward_std": 0.3521214798092842, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506138414144516, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 1502.2746276855469, "epoch": 0.3157344485101934, "grad_norm": 2.1975646018981934, "kl": 2.484375, "learning_rate": 2.634955464421292e-07, "loss": 0.2508, "reward": 0.5641741305589676, "reward_std": 0.3038434460759163, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.479352705180645, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 1542.7009582519531, "epoch": 0.31603315659771486, "grad_norm": 1.131091833114624, "kl": 1.6953125, "learning_rate": 2.633991971302242e-07, "loss": 0.173, "reward": 0.5502232313156128, "reward_std": 0.31378090754151344, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498883955180645, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 1445.1473999023438, "epoch": 0.31633186468523633, "grad_norm": 2.768827438354492, "kl": 2.607421875, "learning_rate": 2.63302740769637e-07, "loss": 0.2815, "reward": 0.600446455180645, "reward_std": 0.26286429166793823, "rewards/accuracy_reward": 0.12276786239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785969734192, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 1534.3750610351562, "epoch": 0.3166305727727578, "grad_norm": 3.51875638961792, "kl": 1.701171875, "learning_rate": 2.6320617746530245e-07, "loss": 0.2131, "reward": 0.5195312723517418, "reward_std": 0.30048269778490067, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4793526977300644, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 1576.0156860351562, "epoch": 0.31692928086027927, "grad_norm": 2.2195017337799072, "kl": 1.51953125, "learning_rate": 2.631095073222721e-07, "loss": 0.236, "reward": 0.5809151977300644, "reward_std": 0.33550655096769333, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482700914144516, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 1490.4286193847656, "epoch": 0.31722798894780074, "grad_norm": 4.558156967163086, "kl": 1.62109375, "learning_rate": 2.6301273044571353e-07, "loss": 0.2296, "reward": 0.5909598469734192, "reward_std": 0.3272646591067314, "rewards/accuracy_reward": 0.07812500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.512834832072258, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 1518.3929138183594, "epoch": 0.3175266970353222, "grad_norm": 1.6785279512405396, "kl": 2.169921875, "learning_rate": 2.629158469409105e-07, "loss": 0.2104, "reward": 0.6199777126312256, "reward_std": 0.2976566106081009, "rewards/accuracy_reward": 0.13616072363220155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483816996216774, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 1494.5313415527344, "epoch": 0.3178254051228437, "grad_norm": 4.1378326416015625, "kl": 2.44921875, "learning_rate": 2.6281885691326277e-07, "loss": 0.2811, "reward": 0.635044664144516, "reward_std": 0.3726443871855736, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982313156128, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 1537.4665832519531, "epoch": 0.31812411321036516, "grad_norm": 5.6254096031188965, "kl": 2.294921875, "learning_rate": 2.627217604682861e-07, "loss": 0.2528, "reward": 0.6110491305589676, "reward_std": 0.3383433297276497, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098469734192, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 1481.6786193847656, "epoch": 0.3184228212978866, "grad_norm": 3.583040714263916, "kl": 2.208984375, "learning_rate": 2.6262455771161167e-07, "loss": 0.3054, "reward": 0.5602678954601288, "reward_std": 0.3304397389292717, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776786044239998, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 1528.1830749511719, "epoch": 0.3187215293854081, "grad_norm": 1.318103313446045, "kl": 1.474609375, "learning_rate": 2.625272487489868e-07, "loss": 0.2056, "reward": 0.6015625223517418, "reward_std": 0.30363646149635315, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 1491.5982971191406, "epoch": 0.31902023747292957, "grad_norm": 3.005110740661621, "kl": 1.8515625, "learning_rate": 2.6242983368627385e-07, "loss": 0.238, "reward": 0.6629464626312256, "reward_std": 0.3196617439389229, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.522321455180645, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 1537.2678833007812, "epoch": 0.31931894556045104, "grad_norm": 4.49307107925415, "kl": 1.8203125, "learning_rate": 2.623323126294511e-07, "loss": 0.2495, "reward": 0.6311384290456772, "reward_std": 0.34769195318222046, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205708384514, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 1493.3371276855469, "epoch": 0.3196176536479725, "grad_norm": 3.0605380535125732, "kl": 1.814453125, "learning_rate": 2.6223468568461177e-07, "loss": 0.2916, "reward": 0.6618303805589676, "reward_std": 0.2920939438045025, "rewards/accuracy_reward": 0.15625001164153218, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5055803805589676, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 1554.6897888183594, "epoch": 0.319916361735494, "grad_norm": 2.6466617584228516, "kl": 1.921875, "learning_rate": 2.6213695295796446e-07, "loss": 0.2427, "reward": 0.5775670036673546, "reward_std": 0.279326967895031, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241380095482, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 1486.3416137695312, "epoch": 0.32021506982301545, "grad_norm": 2.8505313396453857, "kl": 2.251953125, "learning_rate": 2.6203911455583276e-07, "loss": 0.2433, "reward": 0.6082589626312256, "reward_std": 0.3463137149810791, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5100446790456772, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 1605.8103332519531, "epoch": 0.3205137779105369, "grad_norm": 6.167478084564209, "kl": 2.40234375, "learning_rate": 2.619411705846553e-07, "loss": 0.2109, "reward": 0.5636161044239998, "reward_std": 0.27226894721388817, "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375149011612, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 1474.2969665527344, "epoch": 0.3208124859980584, "grad_norm": 1.7242356538772583, "kl": 2.20703125, "learning_rate": 2.6184312115098544e-07, "loss": 0.2769, "reward": 0.6015625298023224, "reward_std": 0.31737685203552246, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5145089626312256, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 1511.0558776855469, "epoch": 0.32111119408557987, "grad_norm": 1.8723255395889282, "kl": 1.712890625, "learning_rate": 2.617449663614915e-07, "loss": 0.2804, "reward": 0.585379496216774, "reward_std": 0.3219613656401634, "rewards/accuracy_reward": 0.11607143562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080559372902, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 1553.3594665527344, "epoch": 0.32140990217310134, "grad_norm": 4.816095352172852, "kl": 2.513671875, "learning_rate": 2.616467063229561e-07, "loss": 0.2265, "reward": 0.5859375223517418, "reward_std": 0.2915578857064247, "rewards/accuracy_reward": 0.11383929150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982313156128, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 1581.7188415527344, "epoch": 0.3217086102606228, "grad_norm": 2.482623338699341, "kl": 2.033203125, "learning_rate": 2.6154834114227673e-07, "loss": 0.21, "reward": 0.5440848544239998, "reward_std": 0.30976933240890503, "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 1464.5357666015625, "epoch": 0.3220073183481443, "grad_norm": 3.4914841651916504, "kl": 2.267578125, "learning_rate": 2.6144987092646485e-07, "loss": 0.2544, "reward": 0.6356026977300644, "reward_std": 0.3485429063439369, "rewards/accuracy_reward": 0.14062501047737896, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 1471.30810546875, "epoch": 0.32230602643566575, "grad_norm": 5.7795634269714355, "kl": 1.537109375, "learning_rate": 2.613512957826465e-07, "loss": 0.2401, "reward": 0.6657366305589676, "reward_std": 0.3483474776148796, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330559372902, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 1464.3795166015625, "epoch": 0.3226047345231872, "grad_norm": 5.2647857666015625, "kl": 1.673828125, "learning_rate": 2.612526158180619e-07, "loss": 0.279, "reward": 0.668526828289032, "reward_std": 0.32395491376519203, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125149011612, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 1404.97998046875, "epoch": 0.3229034426107087, "grad_norm": 4.023599624633789, "kl": 1.736328125, "learning_rate": 2.6115383114006495e-07, "loss": 0.2415, "reward": 0.6255580633878708, "reward_std": 0.32622871920466423, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5429687798023224, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 1467.0692443847656, "epoch": 0.32320215069823016, "grad_norm": 2.3190064430236816, "kl": 2.134765625, "learning_rate": 2.610549418561238e-07, "loss": 0.3037, "reward": 0.6601562798023224, "reward_std": 0.3039412572979927, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4793526977300644, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 1528.9822082519531, "epoch": 0.32350085878575163, "grad_norm": 3.000131368637085, "kl": 1.87890625, "learning_rate": 2.609559480738204e-07, "loss": 0.2336, "reward": 0.6372768133878708, "reward_std": 0.30019962787628174, "rewards/accuracy_reward": 0.14955357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4877232313156128, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 1459.87060546875, "epoch": 0.3237995668732731, "grad_norm": 3.9586360454559326, "kl": 2.1953125, "learning_rate": 2.6085684990085006e-07, "loss": 0.2673, "reward": 0.593191996216774, "reward_std": 0.27781594917178154, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5172991156578064, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 1533.4130554199219, "epoch": 0.3240982749607946, "grad_norm": 6.313206195831299, "kl": 2.58203125, "learning_rate": 2.6075764744502206e-07, "loss": 0.2632, "reward": 0.5552455633878708, "reward_std": 0.3096628524363041, "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463727705180645, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 1504.4442749023438, "epoch": 0.32439698304831605, "grad_norm": 7.544454574584961, "kl": 2.58984375, "learning_rate": 2.6065834081425893e-07, "loss": 0.2726, "reward": 0.578683078289032, "reward_std": 0.3171040415763855, "rewards/accuracy_reward": 0.08928572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4893973469734192, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 1553.4688110351562, "epoch": 0.3246956911358375, "grad_norm": 4.474386215209961, "kl": 2.443359375, "learning_rate": 2.6055893011659656e-07, "loss": 0.2762, "reward": 0.5719866454601288, "reward_std": 0.28948893398046494, "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487165205180645, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 1541.2656860351562, "epoch": 0.324994399223359, "grad_norm": 5.040806770324707, "kl": 2.4765625, "learning_rate": 2.604594154601839e-07, "loss": 0.2494, "reward": 0.6032366454601288, "reward_std": 0.3285473734140396, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4916294738650322, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 1476.5067749023438, "epoch": 0.32529310731088046, "grad_norm": 3.224231481552124, "kl": 2.38671875, "learning_rate": 2.6035979695328326e-07, "loss": 0.2667, "reward": 0.6422991305589676, "reward_std": 0.3085227608680725, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949776902794838, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 1515.46435546875, "epoch": 0.32559181539840193, "grad_norm": 2.6014020442962646, "kl": 1.982421875, "learning_rate": 2.6026007470426986e-07, "loss": 0.2021, "reward": 0.569754496216774, "reward_std": 0.31619132310152054, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866454601288, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 1501.0647888183594, "epoch": 0.3258905234859234, "grad_norm": 4.709430694580078, "kl": 1.8046875, "learning_rate": 2.6016024882163167e-07, "loss": 0.243, "reward": 0.553013414144516, "reward_std": 0.2741779088973999, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4905134215950966, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 1511.7054138183594, "epoch": 0.32618923157344487, "grad_norm": 2.8127620220184326, "kl": 1.70703125, "learning_rate": 2.600603194139694e-07, "loss": 0.2434, "reward": 0.6026785969734192, "reward_std": 0.36505126953125, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428880095482, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 1530.0044860839844, "epoch": 0.32648793966096634, "grad_norm": 2.8638858795166016, "kl": 1.51171875, "learning_rate": 2.599602865899966e-07, "loss": 0.2355, "reward": 0.577008955180645, "reward_std": 0.33273686096072197, "rewards/accuracy_reward": 0.07812500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4988839402794838, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 1429.4620971679688, "epoch": 0.3267866477484878, "grad_norm": 4.860335826873779, "kl": 1.650390625, "learning_rate": 2.5986015045853913e-07, "loss": 0.2672, "reward": 0.6668527275323868, "reward_std": 0.26031871512532234, "rewards/accuracy_reward": 0.15848214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5083705633878708, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 1553.7054443359375, "epoch": 0.3270853558360093, "grad_norm": 2.919179916381836, "kl": 1.763671875, "learning_rate": 2.597599111285351e-07, "loss": 0.213, "reward": 0.6177455633878708, "reward_std": 0.3066631779074669, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490513414144516, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 1552.4531860351562, "epoch": 0.32738406392353075, "grad_norm": 1.2966303825378418, "kl": 1.537109375, "learning_rate": 2.5965956870903535e-07, "loss": 0.2726, "reward": 0.6255580559372902, "reward_std": 0.31674841046333313, "rewards/accuracy_reward": 0.18080357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.444754496216774, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 1461.3929138183594, "epoch": 0.3276827720110522, "grad_norm": 1.2138861417770386, "kl": 2.02734375, "learning_rate": 2.5955912330920247e-07, "loss": 0.2607, "reward": 0.550223246216774, "reward_std": 0.30203256756067276, "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4899553805589676, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 1531.9241943359375, "epoch": 0.32798148009857364, "grad_norm": 2.556658983230591, "kl": 1.677734375, "learning_rate": 2.594585750383112e-07, "loss": 0.2647, "reward": 0.6512276977300644, "reward_std": 0.34733932465314865, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276977300644, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 1608.2478332519531, "epoch": 0.3282801881860951, "grad_norm": 3.9033408164978027, "kl": 2.11328125, "learning_rate": 2.593579240057483e-07, "loss": 0.2525, "reward": 0.5295759290456772, "reward_std": 0.2867263853549957, "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4380580559372902, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 1527.7879943847656, "epoch": 0.3285788962736166, "grad_norm": 1.313635230064392, "kl": 2.0703125, "learning_rate": 2.59257170321012e-07, "loss": 0.2393, "reward": 0.6099330633878708, "reward_std": 0.36040159314870834, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715402126312256, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 1520.2902526855469, "epoch": 0.32887760436113805, "grad_norm": 5.256302833557129, "kl": 2.212890625, "learning_rate": 2.591563140937127e-07, "loss": 0.242, "reward": 0.585937537252903, "reward_std": 0.2435493916273117, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.507812537252903, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 1489.22998046875, "epoch": 0.3291763124486595, "grad_norm": 4.986348628997803, "kl": 2.46484375, "learning_rate": 2.590553554335719e-07, "loss": 0.2684, "reward": 0.5719866305589676, "reward_std": 0.2748001962900162, "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871651977300644, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 1625.9531860351562, "epoch": 0.329475020536181, "grad_norm": 7.1511077880859375, "kl": 2.505859375, "learning_rate": 2.5895429445042283e-07, "loss": 0.2529, "reward": 0.5412946790456772, "reward_std": 0.2794616334140301, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018133878708, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 1498.9197082519531, "epoch": 0.32977372862370247, "grad_norm": 2.173046350479126, "kl": 2.005859375, "learning_rate": 2.5885313125420993e-07, "loss": 0.217, "reward": 0.556919664144516, "reward_std": 0.2893410474061966, "rewards/accuracy_reward": 0.06696429057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.489955373108387, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 1456.9263916015625, "epoch": 0.33007243671122394, "grad_norm": 1.7583085298538208, "kl": 1.86328125, "learning_rate": 2.5875186595498883e-07, "loss": 0.2199, "reward": 0.6367187798023224, "reward_std": 0.2878870964050293, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498325914144516, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 1494.3906860351562, "epoch": 0.3303711447987454, "grad_norm": 1.4153239727020264, "kl": 1.654296875, "learning_rate": 2.586504986629262e-07, "loss": 0.2383, "reward": 0.6160714477300644, "reward_std": 0.2795017696917057, "rewards/accuracy_reward": 0.13616072479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 1476.2121276855469, "epoch": 0.3306698528862669, "grad_norm": 1.7699134349822998, "kl": 1.615234375, "learning_rate": 2.585490294882998e-07, "loss": 0.2506, "reward": 0.658482164144516, "reward_std": 0.2986295633018017, "rewards/accuracy_reward": 0.18750000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 1374.247802734375, "epoch": 0.33096856097378835, "grad_norm": 3.3396546840667725, "kl": 1.677734375, "learning_rate": 2.5844745854149814e-07, "loss": 0.2289, "reward": 0.6690848469734192, "reward_std": 0.29049060866236687, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5128348469734192, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 1325.4398193359375, "epoch": 0.3312672690613098, "grad_norm": 3.819262742996216, "kl": 1.310546875, "learning_rate": 2.583457859330204e-07, "loss": 0.2046, "reward": 0.6852678954601288, "reward_std": 0.2816692814230919, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250223517418, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 1507.430908203125, "epoch": 0.3315659771488313, "grad_norm": 4.437417030334473, "kl": 1.4482421875, "learning_rate": 2.5824401177347635e-07, "loss": 0.1754, "reward": 0.572544664144516, "reward_std": 0.32971565425395966, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483258955180645, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 1586.9554138183594, "epoch": 0.33186468523635276, "grad_norm": 2.8667891025543213, "kl": 1.767578125, "learning_rate": 2.581421361735864e-07, "loss": 0.2297, "reward": 0.585379496216774, "reward_std": 0.2828323319554329, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451450914144516, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 1527.0223693847656, "epoch": 0.33216339332387423, "grad_norm": 4.813324928283691, "kl": 1.7275390625, "learning_rate": 2.580401592441813e-07, "loss": 0.2577, "reward": 0.6277901902794838, "reward_std": 0.30820489674806595, "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580559372902, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 1526.2232971191406, "epoch": 0.3324621014113957, "grad_norm": 2.647437572479248, "kl": 1.74609375, "learning_rate": 2.5793808109620184e-07, "loss": 0.2152, "reward": 0.5753348618745804, "reward_std": 0.2907489724457264, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455633878708, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 1405.5223693847656, "epoch": 0.3327608094989172, "grad_norm": 3.1647727489471436, "kl": 1.884765625, "learning_rate": 2.57835901840699e-07, "loss": 0.2649, "reward": 0.5412946790456772, "reward_std": 0.2881372757256031, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160969734192, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 1605.5603332519531, "epoch": 0.33305951758643865, "grad_norm": 1.948773980140686, "kl": 1.81640625, "learning_rate": 2.577336215888339e-07, "loss": 0.2317, "reward": 0.6227678805589676, "reward_std": 0.3231372609734535, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 1522.4800109863281, "epoch": 0.3333582256739601, "grad_norm": 6.494892597198486, "kl": 2.140625, "learning_rate": 2.576312404518773e-07, "loss": 0.2964, "reward": 0.6729911118745804, "reward_std": 0.3132224418222904, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053805589676, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 1572.3148193359375, "epoch": 0.3336569337614816, "grad_norm": 3.7692909240722656, "kl": 1.861328125, "learning_rate": 2.5752875854121006e-07, "loss": 0.2476, "reward": 0.5027901977300644, "reward_std": 0.2967359721660614, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467075914144516, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 1472.2388916015625, "epoch": 0.33395564184900306, "grad_norm": 5.977867126464844, "kl": 2.2890625, "learning_rate": 2.574261759683222e-07, "loss": 0.3174, "reward": 0.5809151977300644, "reward_std": 0.3023894801735878, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044813156128, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 1521.21435546875, "epoch": 0.33425434993652453, "grad_norm": 4.9648613929748535, "kl": 2.087890625, "learning_rate": 2.573234928448137e-07, "loss": 0.2933, "reward": 0.5524553880095482, "reward_std": 0.27551306784152985, "rewards/accuracy_reward": 0.06473214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4877232387661934, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 1581.1139221191406, "epoch": 0.334553058024046, "grad_norm": 2.9668993949890137, "kl": 1.4931640625, "learning_rate": 2.5722070928239364e-07, "loss": 0.233, "reward": 0.5312500223517418, "reward_std": 0.2740164287388325, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 1575.0603637695312, "epoch": 0.3348517661115675, "grad_norm": 2.1093974113464355, "kl": 1.791015625, "learning_rate": 2.571178253928804e-07, "loss": 0.2577, "reward": 0.5379464402794838, "reward_std": 0.310575932264328, "rewards/accuracy_reward": 0.08035714481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 1506.2277221679688, "epoch": 0.33515047419908894, "grad_norm": 2.6733639240264893, "kl": 1.34765625, "learning_rate": 2.570148412882018e-07, "loss": 0.2334, "reward": 0.581473246216774, "reward_std": 0.3157352805137634, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494419664144516, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 1470.9889221191406, "epoch": 0.3354491822866104, "grad_norm": 1.5879133939743042, "kl": 1.2529296875, "learning_rate": 2.569117570803942e-07, "loss": 0.1868, "reward": 0.601004496216774, "reward_std": 0.31709757447242737, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866305589676, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 1507.5714721679688, "epoch": 0.3357478903741319, "grad_norm": 7.8223185539245605, "kl": 1.4462890625, "learning_rate": 2.5680857288160326e-07, "loss": 0.2323, "reward": 0.5742187798023224, "reward_std": 0.25542283430695534, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330559372902, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 1470.8974304199219, "epoch": 0.33604659846165336, "grad_norm": 4.0536675453186035, "kl": 1.2451171875, "learning_rate": 2.567052888040832e-07, "loss": 0.2788, "reward": 0.5630580633878708, "reward_std": 0.30651840567588806, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.493861623108387, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 1590.6764221191406, "epoch": 0.3363453065491748, "grad_norm": 6.0376105308532715, "kl": 1.7177734375, "learning_rate": 2.56601904960197e-07, "loss": 0.2414, "reward": 0.6439732611179352, "reward_std": 0.383798323571682, "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4899553880095482, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 1613.4018859863281, "epoch": 0.3366440146366963, "grad_norm": 3.071429967880249, "kl": 1.2265625, "learning_rate": 2.564984214624162e-07, "loss": 0.1779, "reward": 0.5279018208384514, "reward_std": 0.3266928941011429, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474330373108387, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 1478.85498046875, "epoch": 0.33694272272421777, "grad_norm": 5.712805271148682, "kl": 1.4052734375, "learning_rate": 2.563948384233206e-07, "loss": 0.2673, "reward": 0.5691964477300644, "reward_std": 0.2845711372792721, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502232164144516, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 1634.5067749023438, "epoch": 0.33724143081173924, "grad_norm": 2.24774432182312, "kl": 1.322265625, "learning_rate": 2.5629115595559857e-07, "loss": 0.2256, "reward": 0.552455373108387, "reward_std": 0.29055969789624214, "rewards/accuracy_reward": 0.09821428940631449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4542410895228386, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 1534.5603332519531, "epoch": 0.3375401388992607, "grad_norm": 2.8800106048583984, "kl": 1.6015625, "learning_rate": 2.5618737417204623e-07, "loss": 0.1967, "reward": 0.6021205633878708, "reward_std": 0.331250362098217, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884066939354, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 1605.5045471191406, "epoch": 0.3378388469867822, "grad_norm": 1.698113203048706, "kl": 1.3701171875, "learning_rate": 2.5608349318556816e-07, "loss": 0.1806, "reward": 0.576450914144516, "reward_std": 0.3062754273414612, "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580633878708, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 1423.1183776855469, "epoch": 0.33813755507430365, "grad_norm": 2.1558098793029785, "kl": 1.96484375, "learning_rate": 2.5597951310917664e-07, "loss": 0.2823, "reward": 0.5524553805589676, "reward_std": 0.30572865903377533, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.510044664144516, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 1539.7634582519531, "epoch": 0.3384362631618251, "grad_norm": 2.358893394470215, "kl": 1.837890625, "learning_rate": 2.558754340559918e-07, "loss": 0.2198, "reward": 0.6590402126312256, "reward_std": 0.3034973293542862, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580559372902, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 1571.060302734375, "epoch": 0.3387349712493466, "grad_norm": 3.1756668090820312, "kl": 1.599609375, "learning_rate": 2.5577125613924144e-07, "loss": 0.1748, "reward": 0.5491071715950966, "reward_std": 0.30035829171538353, "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.495535746216774, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 1530.5647888183594, "epoch": 0.33903367933686807, "grad_norm": 2.1669278144836426, "kl": 1.63671875, "learning_rate": 2.5566697947226096e-07, "loss": 0.2593, "reward": 0.6110491305589676, "reward_std": 0.3301529586315155, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812649011612, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 1507.2902221679688, "epoch": 0.33933238742438954, "grad_norm": 2.9861345291137695, "kl": 2.248046875, "learning_rate": 2.555626041684932e-07, "loss": 0.3116, "reward": 0.526227705180645, "reward_std": 0.3063497841358185, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470424123108387, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 1524.5335388183594, "epoch": 0.339631095511911, "grad_norm": 8.765790939331055, "kl": 2.515625, "learning_rate": 2.554581303414881e-07, "loss": 0.2499, "reward": 0.5128348395228386, "reward_std": 0.26451415568590164, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705559372902, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 1534.8103637695312, "epoch": 0.3399298035994325, "grad_norm": 1.8975735902786255, "kl": 1.73046875, "learning_rate": 2.553535581049031e-07, "loss": 0.1997, "reward": 0.501116082072258, "reward_std": 0.2755190432071686, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474330373108387, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 1527.12060546875, "epoch": 0.34022851168695395, "grad_norm": 4.051145553588867, "kl": 2.19140625, "learning_rate": 2.552488875725024e-07, "loss": 0.2463, "reward": 0.5602678880095482, "reward_std": 0.29411884397268295, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750298023224, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 1533.2322082519531, "epoch": 0.3405272197744754, "grad_norm": 5.274078369140625, "kl": 2.005859375, "learning_rate": 2.5514411885815746e-07, "loss": 0.2442, "reward": 0.570870578289032, "reward_std": 0.29573073238134384, "rewards/accuracy_reward": 0.08035714528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490513414144516, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 1489.0715026855469, "epoch": 0.3408259278619969, "grad_norm": 5.762105464935303, "kl": 2.630859375, "learning_rate": 2.550392520758462e-07, "loss": 0.2365, "reward": 0.6941964626312256, "reward_std": 0.33988116681575775, "rewards/accuracy_reward": 0.16517857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5290178656578064, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 1414.5647583007812, "epoch": 0.3411246359495183, "grad_norm": 5.378069877624512, "kl": 1.7880859375, "learning_rate": 2.549342873396535e-07, "loss": 0.2933, "reward": 0.5172991305589676, "reward_std": 0.2548864260315895, "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741305589676, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 1437.2455749511719, "epoch": 0.3414233440370398, "grad_norm": 5.316827774047852, "kl": 1.689453125, "learning_rate": 2.548292247637707e-07, "loss": 0.2291, "reward": 0.5954241305589676, "reward_std": 0.34368423372507095, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5128348395228386, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 1447.4442443847656, "epoch": 0.34172205212456125, "grad_norm": 8.553762435913086, "kl": 1.46484375, "learning_rate": 2.5472406446249557e-07, "loss": 0.2626, "reward": 0.6199776977300644, "reward_std": 0.3840615078806877, "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812723517418, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 1512.2835693359375, "epoch": 0.3420207602120827, "grad_norm": 6.644222736358643, "kl": 1.49609375, "learning_rate": 2.546188065502322e-07, "loss": 0.1787, "reward": 0.6590402126312256, "reward_std": 0.2855857424438, "rewards/accuracy_reward": 0.14508928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5139509215950966, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 1457.6250305175781, "epoch": 0.3423194682996042, "grad_norm": 1.9739909172058105, "kl": 1.998046875, "learning_rate": 2.5451345114149086e-07, "loss": 0.2037, "reward": 0.5625000298023224, "reward_std": 0.26650097221136093, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5089285969734192, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 1473.1428833007812, "epoch": 0.34261817638712566, "grad_norm": 2.0664994716644287, "kl": 1.755859375, "learning_rate": 2.54407998350888e-07, "loss": 0.2361, "reward": 0.6947545111179352, "reward_std": 0.2783166393637657, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937798023224, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 1532.7589721679688, "epoch": 0.34291688447464713, "grad_norm": 3.3574635982513428, "kl": 1.861328125, "learning_rate": 2.543024482931458e-07, "loss": 0.2511, "reward": 0.6936384290456772, "reward_std": 0.37581767141819, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098469734192, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 1464.0893249511719, "epoch": 0.3432155925621686, "grad_norm": 4.210370063781738, "kl": 1.81640625, "learning_rate": 2.541968010830925e-07, "loss": 0.245, "reward": 0.6395089477300644, "reward_std": 0.34396591037511826, "rewards/accuracy_reward": 0.1272321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122767984867096, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 1436.0736999511719, "epoch": 0.3435143006496901, "grad_norm": 2.534160852432251, "kl": 4.755859375, "learning_rate": 2.540910568356618e-07, "loss": 0.2831, "reward": 0.6484375149011612, "reward_std": 0.3328717313706875, "rewards/accuracy_reward": 0.13616072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768133878708, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 1613.3236999511719, "epoch": 0.34381300873721155, "grad_norm": 5.973161220550537, "kl": 2.6015625, "learning_rate": 2.539852156658931e-07, "loss": 0.2485, "reward": 0.5061384215950966, "reward_std": 0.25693267956376076, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.448102705180645, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 1488.6697082519531, "epoch": 0.344111716824733, "grad_norm": 6.389756679534912, "kl": 2.19140625, "learning_rate": 2.538792776889313e-07, "loss": 0.2905, "reward": 0.5747768059372902, "reward_std": 0.296277791261673, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.485491082072258, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 1596.5513916015625, "epoch": 0.3444104249122545, "grad_norm": 9.070158958435059, "kl": 2.9765625, "learning_rate": 2.5377324302002644e-07, "loss": 0.2816, "reward": 0.650669664144516, "reward_std": 0.268809724599123, "rewards/accuracy_reward": 0.18080357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660969734192, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 1482.9420471191406, "epoch": 0.34470913299977596, "grad_norm": 3.4117143154144287, "kl": 1.94921875, "learning_rate": 2.5366711177453394e-07, "loss": 0.2633, "reward": 0.685825914144516, "reward_std": 0.32386647164821625, "rewards/accuracy_reward": 0.16294643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.522879496216774, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 1377.0156860351562, "epoch": 0.34500784108729743, "grad_norm": 4.5506062507629395, "kl": 2.103515625, "learning_rate": 2.5356088406791413e-07, "loss": 0.2958, "reward": 0.5898437798023224, "reward_std": 0.3296296074986458, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4916294813156128, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 1568.8036499023438, "epoch": 0.3453065491748189, "grad_norm": 1.766747236251831, "kl": 1.849609375, "learning_rate": 2.5345456001573227e-07, "loss": 0.2576, "reward": 0.5993303880095482, "reward_std": 0.3025139383971691, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854910895228386, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 1503.3572082519531, "epoch": 0.34560525726234037, "grad_norm": 2.217923879623413, "kl": 1.716796875, "learning_rate": 2.533481397336587e-07, "loss": 0.2344, "reward": 0.5407366454601288, "reward_std": 0.3023833930492401, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.489397332072258, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 1440.5870971679688, "epoch": 0.34590396534986184, "grad_norm": 2.6557528972625732, "kl": 1.408203125, "learning_rate": 2.532416233374681e-07, "loss": 0.1893, "reward": 0.6389509290456772, "reward_std": 0.346352756023407, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 1417.46435546875, "epoch": 0.3462026734373833, "grad_norm": 4.380662441253662, "kl": 1.623046875, "learning_rate": 2.531350109430399e-07, "loss": 0.2463, "reward": 0.5719866380095482, "reward_std": 0.2871382161974907, "rewards/accuracy_reward": 0.08482142933644354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871651977300644, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 1387.3906860351562, "epoch": 0.3465013815249048, "grad_norm": 2.7009525299072266, "kl": 1.41796875, "learning_rate": 2.53028302666358e-07, "loss": 0.2412, "reward": 0.6339286118745804, "reward_std": 0.2568640075623989, "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964477300644, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 1520.5536499023438, "epoch": 0.34680008961242625, "grad_norm": 3.620098829269409, "kl": 1.498046875, "learning_rate": 2.529214986235105e-07, "loss": 0.2431, "reward": 0.5708705559372902, "reward_std": 0.33712801337242126, "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491454601288, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 1450.9576721191406, "epoch": 0.3470987976999477, "grad_norm": 5.038084506988525, "kl": 1.474609375, "learning_rate": 2.5281459893068963e-07, "loss": 0.2429, "reward": 0.6545759290456772, "reward_std": 0.26497693732380867, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.489397332072258, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 1557.4286499023438, "epoch": 0.3473975057874692, "grad_norm": 2.1680970191955566, "kl": 1.8046875, "learning_rate": 2.52707603704192e-07, "loss": 0.2499, "reward": 0.5714286044239998, "reward_std": 0.27767864614725113, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178880095482, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 1527.5469360351562, "epoch": 0.34769621387499067, "grad_norm": 7.56266975402832, "kl": 1.60546875, "learning_rate": 2.526005130604177e-07, "loss": 0.2465, "reward": 0.5608259066939354, "reward_std": 0.2735726237297058, "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580633878708, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 1491.7500915527344, "epoch": 0.34799492196251214, "grad_norm": 2.051330089569092, "kl": 2.0, "learning_rate": 2.5249332711587104e-07, "loss": 0.2284, "reward": 0.587611623108387, "reward_std": 0.2892594188451767, "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498325914144516, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 1534.7099304199219, "epoch": 0.3482936300500336, "grad_norm": 2.312678098678589, "kl": 2.353515625, "learning_rate": 2.523860459871597e-07, "loss": 0.2762, "reward": 0.6702009290456772, "reward_std": 0.3481372445821762, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044813156128, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 1533.2366638183594, "epoch": 0.3485923381375551, "grad_norm": 4.894167900085449, "kl": 2.13671875, "learning_rate": 2.522786697909951e-07, "loss": 0.2714, "reward": 0.5982142984867096, "reward_std": 0.30134493857622147, "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.493303582072258, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 1448.513427734375, "epoch": 0.34889104622507655, "grad_norm": 4.267093181610107, "kl": 2.216796875, "learning_rate": 2.521711986441921e-07, "loss": 0.2558, "reward": 0.5150669887661934, "reward_std": 0.24334944784641266, "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419887661934, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 1444.1942749023438, "epoch": 0.349189754312598, "grad_norm": 3.483475923538208, "kl": 1.6953125, "learning_rate": 2.520636326636685e-07, "loss": 0.2348, "reward": 0.5373884215950966, "reward_std": 0.26880137249827385, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205559372902, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 1523.4398193359375, "epoch": 0.3494884624001195, "grad_norm": 2.257246732711792, "kl": 2.005859375, "learning_rate": 2.5195597196644584e-07, "loss": 0.2973, "reward": 0.624441996216774, "reward_std": 0.33289800584316254, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4793526902794838, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 1494.9085388183594, "epoch": 0.34978717048764096, "grad_norm": 8.030393600463867, "kl": 2.36328125, "learning_rate": 2.518482166696482e-07, "loss": 0.2527, "reward": 0.5797991454601288, "reward_std": 0.3166400045156479, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5083705559372902, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 1445.07373046875, "epoch": 0.35008587857516243, "grad_norm": 3.4762513637542725, "kl": 2.322265625, "learning_rate": 2.517403668905029e-07, "loss": 0.2546, "reward": 0.6847098618745804, "reward_std": 0.3077287971973419, "rewards/accuracy_reward": 0.17410715413279831, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5106026977300644, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 1597.0692443847656, "epoch": 0.3503845866626839, "grad_norm": 3.1618385314941406, "kl": 1.626953125, "learning_rate": 2.516324227463399e-07, "loss": 0.2368, "reward": 0.565290205180645, "reward_std": 0.30239711701869965, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366305589676, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 1527.7522583007812, "epoch": 0.3506832947502054, "grad_norm": 1.8927603960037231, "kl": 2.013671875, "learning_rate": 2.515243843545918e-07, "loss": 0.2393, "reward": 0.5887276977300644, "reward_std": 0.35698414593935013, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098469734192, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 1525.2188110351562, "epoch": 0.35098200283772685, "grad_norm": 2.6890785694122314, "kl": 2.392578125, "learning_rate": 2.514162518327938e-07, "loss": 0.2489, "reward": 0.6623884290456772, "reward_std": 0.3275613486766815, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419813156128, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 1527.2411193847656, "epoch": 0.3512807109252483, "grad_norm": 4.051823139190674, "kl": 2.291015625, "learning_rate": 2.5130802529858364e-07, "loss": 0.2254, "reward": 0.5636160969734192, "reward_std": 0.2800626643002033, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625223517418, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 1480.4264221191406, "epoch": 0.3515794190127698, "grad_norm": 2.0948331356048584, "kl": 1.8203125, "learning_rate": 2.51199704869701e-07, "loss": 0.2491, "reward": 0.6077009290456772, "reward_std": 0.30335894227027893, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.513950914144516, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 1463.6786193847656, "epoch": 0.35187812710029126, "grad_norm": 2.3860557079315186, "kl": 1.7890625, "learning_rate": 2.510912906639879e-07, "loss": 0.2598, "reward": 0.6724330633878708, "reward_std": 0.2917674444615841, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937723517418, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 1594.5201721191406, "epoch": 0.35217683518781273, "grad_norm": 1.957651138305664, "kl": 1.7578125, "learning_rate": 2.5098278279938837e-07, "loss": 0.1829, "reward": 0.5563616380095482, "reward_std": 0.2988428473472595, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366305589676, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 1504.8996276855469, "epoch": 0.3524755432753342, "grad_norm": 1.6252835988998413, "kl": 1.53515625, "learning_rate": 2.508741813939484e-07, "loss": 0.2456, "reward": 0.6529018208384514, "reward_std": 0.2727416902780533, "rewards/accuracy_reward": 0.1696428677532822, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483258955180645, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 1465.6697387695312, "epoch": 0.3527742513628557, "grad_norm": 3.7602787017822266, "kl": 1.912109375, "learning_rate": 2.5076548656581573e-07, "loss": 0.2327, "reward": 0.6194196790456772, "reward_std": 0.3225301653146744, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122767984867096, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 1500.4710388183594, "epoch": 0.35307295945037714, "grad_norm": 1.1622236967086792, "kl": 1.470703125, "learning_rate": 2.5065669843323955e-07, "loss": 0.2256, "reward": 0.6439732313156128, "reward_std": 0.29667558521032333, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4832589477300644, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 1459.9554138183594, "epoch": 0.3533716675378986, "grad_norm": 2.4832520484924316, "kl": 1.935546875, "learning_rate": 2.505478171145707e-07, "loss": 0.2187, "reward": 0.5747768133878708, "reward_std": 0.3142852410674095, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.510044664144516, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 1468.2277526855469, "epoch": 0.3536703756254201, "grad_norm": 1.5439695119857788, "kl": 1.82421875, "learning_rate": 2.504388427282614e-07, "loss": 0.2366, "reward": 0.6422991305589676, "reward_std": 0.3185621164739132, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419813156128, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 1548.8638916015625, "epoch": 0.3539690837129415, "grad_norm": 2.099919080734253, "kl": 1.884765625, "learning_rate": 2.503297753928652e-07, "loss": 0.2404, "reward": 0.507254496216774, "reward_std": 0.2541923075914383, "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.491629496216774, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 1586.18310546875, "epoch": 0.35426779180046297, "grad_norm": 2.675692081451416, "kl": 1.80859375, "learning_rate": 2.502206152270365e-07, "loss": 0.231, "reward": 0.5228794887661934, "reward_std": 0.2755550630390644, "rewards/accuracy_reward": 0.04241071594879031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687649011612, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 1577.27685546875, "epoch": 0.35456649988798444, "grad_norm": 1.791926383972168, "kl": 1.76953125, "learning_rate": 2.5011136234953103e-07, "loss": 0.24, "reward": 0.5485491305589676, "reward_std": 0.336167111992836, "rewards/accuracy_reward": 0.06473214388825, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169887661934, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 1504.8639221191406, "epoch": 0.3548652079755059, "grad_norm": 2.2314422130584717, "kl": 1.599609375, "learning_rate": 2.5000201687920516e-07, "loss": 0.1888, "reward": 0.5530134215950966, "reward_std": 0.30816755443811417, "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419738650322, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 1419.3817443847656, "epoch": 0.3551639160630274, "grad_norm": 1.8423805236816406, "kl": 1.91015625, "learning_rate": 2.498925789350159e-07, "loss": 0.2471, "reward": 0.686941996216774, "reward_std": 0.31139643490314484, "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.521763414144516, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 1489.1161193847656, "epoch": 0.35546262415054886, "grad_norm": 4.1635870933532715, "kl": 1.990234375, "learning_rate": 2.497830486360212e-07, "loss": 0.2949, "reward": 0.6400670036673546, "reward_std": 0.3156759515404701, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062723517418, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 1522.8728332519531, "epoch": 0.3557613322380703, "grad_norm": 3.6478922367095947, "kl": 2.4140625, "learning_rate": 2.4967342610137923e-07, "loss": 0.2619, "reward": 0.645089328289032, "reward_std": 0.3133271783590317, "rewards/accuracy_reward": 0.14508929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000223517418, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 1593.9710693359375, "epoch": 0.3560600403255918, "grad_norm": 1.9877678155899048, "kl": 1.78125, "learning_rate": 2.4956371145034845e-07, "loss": 0.1961, "reward": 0.6361607313156128, "reward_std": 0.32155022770166397, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5200892984867096, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 1582.2054443359375, "epoch": 0.35635874841311327, "grad_norm": 3.86426043510437, "kl": 2.078125, "learning_rate": 2.494539048022879e-07, "loss": 0.2754, "reward": 0.6110491305589676, "reward_std": 0.2841814383864403, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490513414144516, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 1497.51123046875, "epoch": 0.35665745650063474, "grad_norm": 10.247713088989258, "kl": 3.228515625, "learning_rate": 2.493440062766562e-07, "loss": 0.3249, "reward": 0.620535746216774, "reward_std": 0.3188435584306717, "rewards/accuracy_reward": 0.12946428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4910714477300644, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 1478.5313415527344, "epoch": 0.3569561645881562, "grad_norm": 2.5867576599121094, "kl": 2.11328125, "learning_rate": 2.4923401599301227e-07, "loss": 0.2797, "reward": 0.7075893133878708, "reward_std": 0.2551192045211792, "rewards/accuracy_reward": 0.19196429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250149011612, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 1557.2835693359375, "epoch": 0.3572548726756777, "grad_norm": 1.8213342428207397, "kl": 1.966796875, "learning_rate": 2.491239340710148e-07, "loss": 0.2359, "reward": 0.5926339328289032, "reward_std": 0.2857806794345379, "rewards/accuracy_reward": 0.07812500209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5145089626312256, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 1436.6139221191406, "epoch": 0.35755358076319915, "grad_norm": 6.501287460327148, "kl": 2.263671875, "learning_rate": 2.4901376063042207e-07, "loss": 0.2696, "reward": 0.624441996216774, "reward_std": 0.2983981557190418, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490513414144516, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 1427.6429443359375, "epoch": 0.3578522888507206, "grad_norm": 4.123022556304932, "kl": 1.90234375, "learning_rate": 2.4890349579109196e-07, "loss": 0.2704, "reward": 0.6902902275323868, "reward_std": 0.3200419396162033, "rewards/accuracy_reward": 0.20982144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687574505806, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 1346.8929138183594, "epoch": 0.3581509969382421, "grad_norm": 1.5887442827224731, "kl": 1.349609375, "learning_rate": 2.4879313967298197e-07, "loss": 0.2014, "reward": 0.6640625298023224, "reward_std": 0.34110913425683975, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.541294664144516, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 1543.0803833007812, "epoch": 0.35844970502576357, "grad_norm": 4.0904951095581055, "kl": 1.720703125, "learning_rate": 2.486826923961485e-07, "loss": 0.2294, "reward": 0.569196455180645, "reward_std": 0.2757045663893223, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4754464626312256, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 1490.1875610351562, "epoch": 0.35874841311328504, "grad_norm": 2.938624382019043, "kl": 1.046875, "learning_rate": 2.485721540807476e-07, "loss": 0.24, "reward": 0.6601562798023224, "reward_std": 0.3262035846710205, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455559372902, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 1386.0291137695312, "epoch": 0.3590471212008065, "grad_norm": 2.9639222621917725, "kl": 1.3876953125, "learning_rate": 2.4846152484703397e-07, "loss": 0.2622, "reward": 0.6171875298023224, "reward_std": 0.2614656016230583, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5100446566939354, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 1486.6697082519531, "epoch": 0.359345829288328, "grad_norm": 1.3529603481292725, "kl": 1.609375, "learning_rate": 2.4835080481536135e-07, "loss": 0.1992, "reward": 0.5658482313156128, "reward_std": 0.3169441409409046, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4988839477300644, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 1577.7277221679688, "epoch": 0.35964453737584945, "grad_norm": 2.097174644470215, "kl": 1.2490234375, "learning_rate": 2.4823999410618245e-07, "loss": 0.1874, "reward": 0.5546875298023224, "reward_std": 0.2771466597914696, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4944196715950966, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 1573.5290832519531, "epoch": 0.3599432454633709, "grad_norm": 2.6907882690429688, "kl": 1.30859375, "learning_rate": 2.4812909284004834e-07, "loss": 0.2169, "reward": 0.5613839626312256, "reward_std": 0.32114310562610626, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 1480.1272888183594, "epoch": 0.3602419535508924, "grad_norm": 3.109938859939575, "kl": 1.734375, "learning_rate": 2.4801810113760867e-07, "loss": 0.2161, "reward": 0.5993303805589676, "reward_std": 0.31952914595603943, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5145089477300644, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 1507.6004943847656, "epoch": 0.36054066163841386, "grad_norm": 3.0427961349487305, "kl": 1.55078125, "learning_rate": 2.479070191196117e-07, "loss": 0.2278, "reward": 0.55859375, "reward_std": 0.32313814759254456, "rewards/accuracy_reward": 0.06473214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616454601288, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 1551.9576721191406, "epoch": 0.36083936972593533, "grad_norm": 1.524617075920105, "kl": 1.78125, "learning_rate": 2.4779584690690357e-07, "loss": 0.2366, "reward": 0.516741082072258, "reward_std": 0.2622910551726818, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 1455.4219665527344, "epoch": 0.3611380778134568, "grad_norm": 1.7518419027328491, "kl": 1.89453125, "learning_rate": 2.4768458462042904e-07, "loss": 0.1887, "reward": 0.6501116305589676, "reward_std": 0.34627649188041687, "rewards/accuracy_reward": 0.14508929569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223469734192, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 1471.74560546875, "epoch": 0.3614367859009783, "grad_norm": 2.4804036617279053, "kl": 2.21484375, "learning_rate": 2.4757323238123027e-07, "loss": 0.2555, "reward": 0.5496651977300644, "reward_std": 0.36178968101739883, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366305589676, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 1500.1473999023438, "epoch": 0.36173549398849975, "grad_norm": 2.4131736755371094, "kl": 1.81640625, "learning_rate": 2.4746179031044774e-07, "loss": 0.2045, "reward": 0.6049107313156128, "reward_std": 0.2948494255542755, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5089285895228386, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 1472.07373046875, "epoch": 0.3620342020760212, "grad_norm": 2.2921135425567627, "kl": 2.21484375, "learning_rate": 2.473502585293195e-07, "loss": 0.2353, "reward": 0.6690848469734192, "reward_std": 0.32439498230814934, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.521763414144516, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 1462.4576416015625, "epoch": 0.3623329101635427, "grad_norm": 1.8263497352600098, "kl": 1.923828125, "learning_rate": 2.472386371591811e-07, "loss": 0.278, "reward": 0.6863839477300644, "reward_std": 0.34902092814445496, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5033482387661934, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 1492.0692749023438, "epoch": 0.36263161825106416, "grad_norm": 2.807361125946045, "kl": 1.771484375, "learning_rate": 2.471269263214659e-07, "loss": 0.2507, "reward": 0.5703125223517418, "reward_std": 0.27780384570360184, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4966518133878708, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 1356.6719055175781, "epoch": 0.36293032633858563, "grad_norm": 4.16854190826416, "kl": 2.171875, "learning_rate": 2.47015126137704e-07, "loss": 0.2383, "reward": 0.6445312798023224, "reward_std": 0.360138900578022, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5172991380095482, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 1366.4710388183594, "epoch": 0.3632290344261071, "grad_norm": 2.680048704147339, "kl": 1.6796875, "learning_rate": 2.469032367295233e-07, "loss": 0.27, "reward": 0.6099330633878708, "reward_std": 0.3322392478585243, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5429687649011612, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 1583.4889221191406, "epoch": 0.36352774251362857, "grad_norm": 7.992771148681641, "kl": 2.44921875, "learning_rate": 2.4679125821864833e-07, "loss": 0.2174, "reward": 0.5206473395228386, "reward_std": 0.2992311716079712, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4737723544239998, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 1443.8460388183594, "epoch": 0.36382645060115004, "grad_norm": 2.0823681354522705, "kl": 1.7646484375, "learning_rate": 2.466791907269009e-07, "loss": 0.2447, "reward": 0.6010044887661934, "reward_std": 0.3448963314294815, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482700914144516, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 1491.9777221679688, "epoch": 0.3641251586886715, "grad_norm": 2.6145763397216797, "kl": 2.1171875, "learning_rate": 2.4656703437619936e-07, "loss": 0.2369, "reward": 0.5725446790456772, "reward_std": 0.2798925191164017, "rewards/accuracy_reward": 0.07812500698491931, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4944196715950966, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 1453.7076721191406, "epoch": 0.364423866776193, "grad_norm": 2.691445827484131, "kl": 1.876953125, "learning_rate": 2.4645478928855887e-07, "loss": 0.2451, "reward": 0.6562500298023224, "reward_std": 0.3292275592684746, "rewards/accuracy_reward": 0.15401786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502232164144516, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 1605.4643859863281, "epoch": 0.36472257486371445, "grad_norm": 1.484845757484436, "kl": 2.296875, "learning_rate": 2.46342455586091e-07, "loss": 0.228, "reward": 0.5279017984867096, "reward_std": 0.2826339155435562, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 1587.9576721191406, "epoch": 0.3650212829512359, "grad_norm": 1.8797993659973145, "kl": 1.822265625, "learning_rate": 2.462300333910039e-07, "loss": 0.2037, "reward": 0.555245578289032, "reward_std": 0.29746440425515175, "rewards/accuracy_reward": 0.07812500512227416, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205484867096, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 1448.794677734375, "epoch": 0.3653199910387574, "grad_norm": 2.9802050590515137, "kl": 1.783203125, "learning_rate": 2.461175228256019e-07, "loss": 0.237, "reward": 0.5669643059372902, "reward_std": 0.26084911450743675, "rewards/accuracy_reward": 0.06919643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.497767873108387, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 1502.5581359863281, "epoch": 0.36561869912627887, "grad_norm": 1.60769784450531, "kl": 2.1328125, "learning_rate": 2.4600492401228545e-07, "loss": 0.2307, "reward": 0.6261160969734192, "reward_std": 0.3480650335550308, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875149011612, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 1583.6407165527344, "epoch": 0.36591740721380034, "grad_norm": 2.046940326690674, "kl": 1.76171875, "learning_rate": 2.4589223707355094e-07, "loss": 0.2429, "reward": 0.4927455484867096, "reward_std": 0.24819104373455048, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 1414.5156860351562, "epoch": 0.3662161153013218, "grad_norm": 2.9500179290771484, "kl": 1.1689453125, "learning_rate": 2.457794621319908e-07, "loss": 0.2279, "reward": 0.6010044813156128, "reward_std": 0.3122596964240074, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5117187798023224, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 1494.0291137695312, "epoch": 0.3665148233888433, "grad_norm": 2.3683114051818848, "kl": 1.794921875, "learning_rate": 2.4566659931029316e-07, "loss": 0.2165, "reward": 0.662388414144516, "reward_std": 0.35779009759426117, "rewards/accuracy_reward": 0.16517857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098544239998, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 1492.6697082519531, "epoch": 0.3668135314763647, "grad_norm": 1.7195428609848022, "kl": 1.783203125, "learning_rate": 2.4555364873124155e-07, "loss": 0.2472, "reward": 0.7059152126312256, "reward_std": 0.30543410032987595, "rewards/accuracy_reward": 0.1941964440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5117187723517418, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 1482.1206359863281, "epoch": 0.36711223956388617, "grad_norm": 1.9306803941726685, "kl": 1.5576171875, "learning_rate": 2.454406105177153e-07, "loss": 0.2447, "reward": 0.6065848469734192, "reward_std": 0.3480459451675415, "rewards/accuracy_reward": 0.10044643189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506138414144516, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 1422.8013916015625, "epoch": 0.36741094765140764, "grad_norm": 1.5453073978424072, "kl": 1.9296875, "learning_rate": 2.453274847926888e-07, "loss": 0.2408, "reward": 0.6690848469734192, "reward_std": 0.32652895152568817, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5083705559372902, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 1383.2545166015625, "epoch": 0.3677096557389291, "grad_norm": 3.2709836959838867, "kl": 1.544921875, "learning_rate": 2.4521427167923185e-07, "loss": 0.2083, "reward": 0.6484375298023224, "reward_std": 0.30119774490594864, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125298023224, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 1506.1607666015625, "epoch": 0.3680083638264506, "grad_norm": 2.899773597717285, "kl": 1.455078125, "learning_rate": 2.451009713005091e-07, "loss": 0.1826, "reward": 0.5870535969734192, "reward_std": 0.29108189418911934, "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5200893059372902, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 1442.6250610351562, "epoch": 0.36830707191397205, "grad_norm": 2.0792176723480225, "kl": 1.7578125, "learning_rate": 2.449875837797803e-07, "loss": 0.2551, "reward": 0.6462053954601288, "reward_std": 0.29665814340114594, "rewards/accuracy_reward": 0.16517857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 1468.8371276855469, "epoch": 0.3686057800014935, "grad_norm": 3.9714558124542236, "kl": 2.353515625, "learning_rate": 2.448741092404e-07, "loss": 0.2603, "reward": 0.643973246216774, "reward_std": 0.3595631942152977, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5033482387661934, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 1529.1652526855469, "epoch": 0.368904488089015, "grad_norm": 1.4264919757843018, "kl": 2.041015625, "learning_rate": 2.4476054780581733e-07, "loss": 0.2576, "reward": 0.6132812798023224, "reward_std": 0.2844252549111843, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490513414144516, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 1544.2656860351562, "epoch": 0.36920319617653646, "grad_norm": 1.761064052581787, "kl": 1.716796875, "learning_rate": 2.4464689959957607e-07, "loss": 0.2185, "reward": 0.514508955180645, "reward_std": 0.286401703953743, "rewards/accuracy_reward": 0.03348214435391128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268133878708, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 1521.7255249023438, "epoch": 0.36950190426405793, "grad_norm": 1.863895297050476, "kl": 2.10546875, "learning_rate": 2.445331647453142e-07, "loss": 0.2332, "reward": 0.597098246216774, "reward_std": 0.2869054600596428, "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.485491082072258, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 1526.9532165527344, "epoch": 0.3698006123515794, "grad_norm": 2.182443857192993, "kl": 1.703125, "learning_rate": 2.444193433667642e-07, "loss": 0.2176, "reward": 0.6015625298023224, "reward_std": 0.2735052816569805, "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160969734192, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 1341.2545166015625, "epoch": 0.3700993204391009, "grad_norm": 1.4195258617401123, "kl": 1.76953125, "learning_rate": 2.443054355877526e-07, "loss": 0.2313, "reward": 0.6914062798023224, "reward_std": 0.3236931711435318, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5306919813156128, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 1474.0179138183594, "epoch": 0.37039802852662235, "grad_norm": 2.315263509750366, "kl": 1.99609375, "learning_rate": 2.441914415321998e-07, "loss": 0.2474, "reward": 0.654575914144516, "reward_std": 0.28184138238430023, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027901902794838, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 1497.4822082519531, "epoch": 0.3706967366141438, "grad_norm": 4.038098335266113, "kl": 2.232421875, "learning_rate": 2.4407736132412024e-07, "loss": 0.2298, "reward": 0.5474330484867096, "reward_std": 0.25405143201351166, "rewards/accuracy_reward": 0.051339287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937798023224, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 1554.0022888183594, "epoch": 0.3709954447016653, "grad_norm": 2.37475848197937, "kl": 1.7578125, "learning_rate": 2.43963195087622e-07, "loss": 0.2194, "reward": 0.5351562798023224, "reward_std": 0.3039686307311058, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491380095482, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 1427.2567443847656, "epoch": 0.37129415278918676, "grad_norm": 1.4071630239486694, "kl": 1.880859375, "learning_rate": 2.4384894294690663e-07, "loss": 0.2312, "reward": 0.5664062798023224, "reward_std": 0.29921112954616547, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 1482.54248046875, "epoch": 0.37159286087670823, "grad_norm": 2.403430938720703, "kl": 1.505859375, "learning_rate": 2.4373460502626946e-07, "loss": 0.1911, "reward": 0.607700914144516, "reward_std": 0.3298124149441719, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5474330484867096, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 1529.7188415527344, "epoch": 0.3718915689642297, "grad_norm": 2.002354383468628, "kl": 1.83984375, "learning_rate": 2.4362018145009883e-07, "loss": 0.2247, "reward": 0.6155134215950966, "reward_std": 0.2978413291275501, "rewards/accuracy_reward": 0.12946428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486049123108387, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 1503.4554138183594, "epoch": 0.3721902770517512, "grad_norm": 1.4948707818984985, "kl": 1.490234375, "learning_rate": 2.435056723428763e-07, "loss": 0.246, "reward": 0.6132812798023224, "reward_std": 0.31870296597480774, "rewards/accuracy_reward": 0.12276785913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4905134215950966, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 1542.8304138183594, "epoch": 0.37248898513927264, "grad_norm": 1.8296422958374023, "kl": 1.767578125, "learning_rate": 2.4339107782917675e-07, "loss": 0.2437, "reward": 0.5731027126312256, "reward_std": 0.33566949516534805, "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812723517418, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 1483.0625610351562, "epoch": 0.3727876932267941, "grad_norm": 1.382575273513794, "kl": 1.462890625, "learning_rate": 2.4327639803366766e-07, "loss": 0.2447, "reward": 0.595424123108387, "reward_std": 0.3149113431572914, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169887661934, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 1489.8594665527344, "epoch": 0.3730864013143156, "grad_norm": 1.9360064268112183, "kl": 1.666015625, "learning_rate": 2.4316163308110934e-07, "loss": 0.2315, "reward": 0.662388414144516, "reward_std": 0.3074152432382107, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5217634290456772, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 1490.1763916015625, "epoch": 0.37338510940183706, "grad_norm": 3.994459629058838, "kl": 1.9921875, "learning_rate": 2.4304678309635495e-07, "loss": 0.2544, "reward": 0.5686384290456772, "reward_std": 0.2980843782424927, "rewards/accuracy_reward": 0.08482143003493547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169813156128, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 1562.1875610351562, "epoch": 0.3736838174893585, "grad_norm": 5.308684825897217, "kl": 2.44921875, "learning_rate": 2.4293184820434993e-07, "loss": 0.2612, "reward": 0.5122767984867096, "reward_std": 0.26824886724352837, "rewards/accuracy_reward": 0.035714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625149011612, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 1444.1094360351562, "epoch": 0.37398252557688, "grad_norm": 2.035405158996582, "kl": 1.908203125, "learning_rate": 2.4281682853013223e-07, "loss": 0.254, "reward": 0.628348246216774, "reward_std": 0.3101775199174881, "rewards/accuracy_reward": 0.11607143701985478, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.512276828289032, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 1450.0491943359375, "epoch": 0.37428123366440147, "grad_norm": 1.3829121589660645, "kl": 1.89453125, "learning_rate": 2.4270172419883196e-07, "loss": 0.2617, "reward": 0.5887276977300644, "reward_std": 0.3132634088397026, "rewards/accuracy_reward": 0.08705357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501674123108387, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 1557.0759582519531, "epoch": 0.37457994175192294, "grad_norm": 3.3813443183898926, "kl": 2.009765625, "learning_rate": 2.425865353356713e-07, "loss": 0.2729, "reward": 0.5792410895228386, "reward_std": 0.30570945143699646, "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 1592.102783203125, "epoch": 0.3748786498394444, "grad_norm": 2.281146287918091, "kl": 2.251953125, "learning_rate": 2.4247126206596454e-07, "loss": 0.2521, "reward": 0.4893973544239998, "reward_std": 0.2751332148909569, "rewards/accuracy_reward": 0.03125000046566129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473395228386, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 1501.8460693359375, "epoch": 0.3751773579269659, "grad_norm": 4.913239002227783, "kl": 1.640625, "learning_rate": 2.4235590451511766e-07, "loss": 0.2605, "reward": 0.6489955633878708, "reward_std": 0.30599241703748703, "rewards/accuracy_reward": 0.12946429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5195312798023224, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 1523.5982666015625, "epoch": 0.37547606601448735, "grad_norm": 3.8847358226776123, "kl": 1.83203125, "learning_rate": 2.422404628086284e-07, "loss": 0.2365, "reward": 0.529575914144516, "reward_std": 0.2655566520988941, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580633878708, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 1469.9175109863281, "epoch": 0.3757747741020088, "grad_norm": 2.9202418327331543, "kl": 2.15625, "learning_rate": 2.421249370720859e-07, "loss": 0.2557, "reward": 0.6043526977300644, "reward_std": 0.2799018621444702, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098469734192, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 1526.9844665527344, "epoch": 0.3760734821895303, "grad_norm": 4.90028190612793, "kl": 1.685546875, "learning_rate": 2.42009327431171e-07, "loss": 0.2603, "reward": 0.5848214477300644, "reward_std": 0.30018316581845284, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.491071455180645, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 1473.9085083007812, "epoch": 0.37637219027705177, "grad_norm": 3.036393880844116, "kl": 1.62109375, "learning_rate": 2.418936340116555e-07, "loss": 0.188, "reward": 0.6233259066939354, "reward_std": 0.2958284616470337, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937798023224, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 1337.5670471191406, "epoch": 0.37667089836457324, "grad_norm": 3.2065677642822266, "kl": 2.1328125, "learning_rate": 2.4177785693940257e-07, "loss": 0.1838, "reward": 0.6579241305589676, "reward_std": 0.30801471322774887, "rewards/accuracy_reward": 0.12946429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5284598469734192, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 1492.1116943359375, "epoch": 0.3769696064520947, "grad_norm": 2.9118001461029053, "kl": 1.85546875, "learning_rate": 2.416619963403664e-07, "loss": 0.2321, "reward": 0.6484375447034836, "reward_std": 0.28725626692175865, "rewards/accuracy_reward": 0.15401786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4944196790456772, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 1477.7232666015625, "epoch": 0.3772683145396162, "grad_norm": 4.679891109466553, "kl": 1.919921875, "learning_rate": 2.4154605234059186e-07, "loss": 0.2342, "reward": 0.626116082072258, "reward_std": 0.3011905252933502, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501116082072258, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 1468.4710693359375, "epoch": 0.37756702262713765, "grad_norm": 2.3571865558624268, "kl": 2.3359375, "learning_rate": 2.4143002506621473e-07, "loss": 0.2985, "reward": 0.6210937798023224, "reward_std": 0.3641302138566971, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.489397332072258, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 1577.52685546875, "epoch": 0.3778657307146591, "grad_norm": 2.266418695449829, "kl": 2.16015625, "learning_rate": 2.413139146434612e-07, "loss": 0.2216, "reward": 0.5541294887661934, "reward_std": 0.25416944921016693, "rewards/accuracy_reward": 0.055803572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498325914144516, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 1506.9933471679688, "epoch": 0.3781644388021806, "grad_norm": 1.5541986227035522, "kl": 1.73828125, "learning_rate": 2.411977211986482e-07, "loss": 0.2153, "reward": 0.5809151977300644, "reward_std": 0.3670194670557976, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871652126312256, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 1461.24560546875, "epoch": 0.37846314688970206, "grad_norm": 1.9761892557144165, "kl": 1.87890625, "learning_rate": 2.4108144485818264e-07, "loss": 0.2795, "reward": 0.619977705180645, "reward_std": 0.26424113288521767, "rewards/accuracy_reward": 0.10491071967408061, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5150669887661934, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 1405.700927734375, "epoch": 0.37876185497722353, "grad_norm": 2.8279030323028564, "kl": 2.03125, "learning_rate": 2.409650857485619e-07, "loss": 0.3444, "reward": 0.6250000298023224, "reward_std": 0.295755747705698, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 1576.0134582519531, "epoch": 0.379060563064745, "grad_norm": 1.9728672504425049, "kl": 1.400390625, "learning_rate": 2.408486439963732e-07, "loss": 0.2204, "reward": 0.545200914144516, "reward_std": 0.3036728724837303, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616305589676, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 1546.0357666015625, "epoch": 0.3793592711522665, "grad_norm": 1.5471223592758179, "kl": 1.65625, "learning_rate": 2.4073211972829383e-07, "loss": 0.1735, "reward": 0.6478794887661934, "reward_std": 0.3189416043460369, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487165205180645, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 1415.0670471191406, "epoch": 0.3796579792397879, "grad_norm": 2.693434238433838, "kl": 1.4453125, "learning_rate": 2.4061551307109076e-07, "loss": 0.2802, "reward": 0.6289062798023224, "reward_std": 0.32683180272579193, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 1562.1406860351562, "epoch": 0.37995668732730936, "grad_norm": 3.0423424243927, "kl": 2.078125, "learning_rate": 2.404988241516205e-07, "loss": 0.2614, "reward": 0.5524553954601288, "reward_std": 0.3087272122502327, "rewards/accuracy_reward": 0.08482143329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 1512.1406860351562, "epoch": 0.38025539541483083, "grad_norm": 2.494155168533325, "kl": 1.72265625, "learning_rate": 2.4038205309682933e-07, "loss": 0.238, "reward": 0.6462053805589676, "reward_std": 0.3095675855875015, "rewards/accuracy_reward": 0.14732143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498883955180645, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 1371.8728332519531, "epoch": 0.3805541035023523, "grad_norm": 2.8709495067596436, "kl": 1.466796875, "learning_rate": 2.4026520003375265e-07, "loss": 0.1663, "reward": 0.660714328289032, "reward_std": 0.3215274028480053, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517857164144516, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 1519.6049499511719, "epoch": 0.3808528115898738, "grad_norm": 1.8223265409469604, "kl": 1.662109375, "learning_rate": 2.40148265089515e-07, "loss": 0.2057, "reward": 0.6082589626312256, "reward_std": 0.2729850187897682, "rewards/accuracy_reward": 0.11607143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875223517418, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 1465.1942749023438, "epoch": 0.38115151967739525, "grad_norm": 2.1371803283691406, "kl": 1.6220703125, "learning_rate": 2.4003124839133037e-07, "loss": 0.2396, "reward": 0.5803571715950966, "reward_std": 0.2915102317929268, "rewards/accuracy_reward": 0.09598214877769351, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750223517418, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 1466.2813110351562, "epoch": 0.3814502277649167, "grad_norm": 2.057072401046753, "kl": 2.224609375, "learning_rate": 2.399141500665013e-07, "loss": 0.2476, "reward": 0.6316964626312256, "reward_std": 0.25517718121409416, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000298023224, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 1564.6094360351562, "epoch": 0.3817489358524382, "grad_norm": 3.65620756149292, "kl": 1.5419921875, "learning_rate": 2.3979697024241933e-07, "loss": 0.2424, "reward": 0.609933078289032, "reward_std": 0.32663219422101974, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462611623108387, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 1456.9822082519531, "epoch": 0.38204764393995966, "grad_norm": 3.2446060180664062, "kl": 1.3369140625, "learning_rate": 2.396797090465646e-07, "loss": 0.2172, "reward": 0.6333705484867096, "reward_std": 0.3046021834015846, "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455484867096, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 1390.5982666015625, "epoch": 0.38234635202748113, "grad_norm": 3.687739610671997, "kl": 1.52734375, "learning_rate": 2.3956236660650583e-07, "loss": 0.2558, "reward": 0.6356026977300644, "reward_std": 0.34671296924352646, "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5306920036673546, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 1540.24560546875, "epoch": 0.3826450601150026, "grad_norm": 1.8305350542068481, "kl": 1.994140625, "learning_rate": 2.394449430499001e-07, "loss": 0.2438, "reward": 0.549665205180645, "reward_std": 0.30963101610541344, "rewards/accuracy_reward": 0.05580357485450804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616380095482, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 1467.4442443847656, "epoch": 0.38294376820252407, "grad_norm": 4.607903480529785, "kl": 2.353515625, "learning_rate": 2.3932743850449275e-07, "loss": 0.2587, "reward": 0.6395089477300644, "reward_std": 0.3116210699081421, "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768059372902, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 1554.0759582519531, "epoch": 0.38324247629004554, "grad_norm": 3.10245418548584, "kl": 2.33203125, "learning_rate": 2.3920985309811715e-07, "loss": 0.2283, "reward": 0.6054687798023224, "reward_std": 0.2900001257658005, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330559372902, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 1516.7366638183594, "epoch": 0.383541184377567, "grad_norm": 2.0437254905700684, "kl": 1.6640625, "learning_rate": 2.3909218695869475e-07, "loss": 0.2307, "reward": 0.624441996216774, "reward_std": 0.24613111838698387, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5061384290456772, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 1446.8549499511719, "epoch": 0.3838398924650885, "grad_norm": 1.9493842124938965, "kl": 1.720703125, "learning_rate": 2.389744402142348e-07, "loss": 0.2549, "reward": 0.6356027126312256, "reward_std": 0.3185468167066574, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741305589676, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 1339.65185546875, "epoch": 0.38413860055260995, "grad_norm": 1.7999439239501953, "kl": 1.748046875, "learning_rate": 2.3885661299283405e-07, "loss": 0.2404, "reward": 0.5301339477300644, "reward_std": 0.2737067975103855, "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5033482313156128, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 1442.274658203125, "epoch": 0.3844373086401314, "grad_norm": 2.484783887863159, "kl": 1.994140625, "learning_rate": 2.3873870542267716e-07, "loss": 0.2324, "reward": 0.5714285895228386, "reward_std": 0.27848711982369423, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678805589676, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 1532.2143859863281, "epoch": 0.3847360167276529, "grad_norm": 2.2035045623779297, "kl": 1.939453125, "learning_rate": 2.3862071763203593e-07, "loss": 0.2515, "reward": 0.6551339626312256, "reward_std": 0.30718832090497017, "rewards/accuracy_reward": 0.1897321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018133878708, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 1374.497802734375, "epoch": 0.38503472481517437, "grad_norm": 2.287745714187622, "kl": 1.87890625, "learning_rate": 2.385026497492695e-07, "loss": 0.2612, "reward": 0.6328125298023224, "reward_std": 0.29733770340681076, "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5345982313156128, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 1452.9286193847656, "epoch": 0.38533343290269584, "grad_norm": 2.2124204635620117, "kl": 1.57421875, "learning_rate": 2.3838450190282422e-07, "loss": 0.2679, "reward": 0.5580357238650322, "reward_std": 0.27205609157681465, "rewards/accuracy_reward": 0.05803571664728224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000298023224, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 1488.16748046875, "epoch": 0.3856321409902173, "grad_norm": 7.452094554901123, "kl": 1.8671875, "learning_rate": 2.382662742212333e-07, "loss": 0.2341, "reward": 0.5625000298023224, "reward_std": 0.3083209693431854, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5044643059372902, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 1366.46435546875, "epoch": 0.3859308490777388, "grad_norm": 4.018179416656494, "kl": 1.908203125, "learning_rate": 2.3814796683311692e-07, "loss": 0.2523, "reward": 0.6847098469734192, "reward_std": 0.3034178800880909, "rewards/accuracy_reward": 0.17187500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5128348469734192, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 1466.6541137695312, "epoch": 0.38622955716526025, "grad_norm": 1.6041358709335327, "kl": 1.8642578125, "learning_rate": 2.3802957986718185e-07, "loss": 0.2348, "reward": 0.5708705484867096, "reward_std": 0.2554754540324211, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5106026977300644, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 1521.1563415527344, "epoch": 0.3865282652527817, "grad_norm": 2.0297884941101074, "kl": 1.83984375, "learning_rate": 2.3791111345222158e-07, "loss": 0.2316, "reward": 0.588727705180645, "reward_std": 0.33402233198285103, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4815848544239998, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 1468.2232666015625, "epoch": 0.3868269733403032, "grad_norm": 1.2934504747390747, "kl": 1.55859375, "learning_rate": 2.3779256771711592e-07, "loss": 0.2198, "reward": 0.7020089775323868, "reward_std": 0.31132350862026215, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5145089477300644, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 1432.8772888183594, "epoch": 0.38712568142782466, "grad_norm": 2.894510507583618, "kl": 1.7578125, "learning_rate": 2.3767394279083106e-07, "loss": 0.2362, "reward": 0.610491082072258, "reward_std": 0.30643635243177414, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5256696566939354, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 1506.1161499023438, "epoch": 0.38742438951534613, "grad_norm": 2.5384390354156494, "kl": 1.48828125, "learning_rate": 2.3755523880241922e-07, "loss": 0.2454, "reward": 0.5580357313156128, "reward_std": 0.3345911428332329, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393133878708, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 1589.2880249023438, "epoch": 0.3877230976028676, "grad_norm": 5.006552219390869, "kl": 1.2900390625, "learning_rate": 2.3743645588101873e-07, "loss": 0.2384, "reward": 0.5574777126312256, "reward_std": 0.3290984332561493, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491305589676, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 1517.62060546875, "epoch": 0.3880218056903891, "grad_norm": 1.7683541774749756, "kl": 1.3623046875, "learning_rate": 2.373175941558538e-07, "loss": 0.2182, "reward": 0.5513393208384514, "reward_std": 0.3027558848261833, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785969734192, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 1455.8706359863281, "epoch": 0.38832051377791055, "grad_norm": 3.2112693786621094, "kl": 1.515625, "learning_rate": 2.3719865375623431e-07, "loss": 0.2618, "reward": 0.5959821715950966, "reward_std": 0.35355228930711746, "rewards/accuracy_reward": 0.11607143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 1473.0737609863281, "epoch": 0.388619221865432, "grad_norm": 2.768212080001831, "kl": 1.318359375, "learning_rate": 2.3707963481155576e-07, "loss": 0.2468, "reward": 0.6657366454601288, "reward_std": 0.3485056608915329, "rewards/accuracy_reward": 0.15848215040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5072544738650322, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 1473.962158203125, "epoch": 0.3889179299529535, "grad_norm": 1.4899438619613647, "kl": 1.392578125, "learning_rate": 2.3696053745129906e-07, "loss": 0.2531, "reward": 0.6210937649011612, "reward_std": 0.30635225027799606, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937723517418, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 1412.591552734375, "epoch": 0.38921663804047496, "grad_norm": 3.7333476543426514, "kl": 1.96484375, "learning_rate": 2.3684136180503055e-07, "loss": 0.2831, "reward": 0.6010045111179352, "reward_std": 0.32788751274347305, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5206473395228386, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 1401.3795471191406, "epoch": 0.38951534612799643, "grad_norm": 1.699856162071228, "kl": 1.86328125, "learning_rate": 2.3672210800240164e-07, "loss": 0.2586, "reward": 0.5703125298023224, "reward_std": 0.2592441849410534, "rewards/accuracy_reward": 0.06026785867288709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.510044664144516, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 1439.8505249023438, "epoch": 0.3898140542155179, "grad_norm": 3.2342801094055176, "kl": 1.4765625, "learning_rate": 2.366027761731487e-07, "loss": 0.2493, "reward": 0.557477705180645, "reward_std": 0.2981431260704994, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419887661934, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 1603.9353637695312, "epoch": 0.3901127623030394, "grad_norm": 7.115179061889648, "kl": 2.18359375, "learning_rate": 2.3648336644709307e-07, "loss": 0.2377, "reward": 0.5909598618745804, "reward_std": 0.3172552362084389, "rewards/accuracy_reward": 0.11160715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4793527126312256, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 1431.9063110351562, "epoch": 0.39041147039056084, "grad_norm": 2.179759979248047, "kl": 1.7890625, "learning_rate": 2.3636387895414097e-07, "loss": 0.2738, "reward": 0.6478794887661934, "reward_std": 0.2623223662376404, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866380095482, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 1544.950927734375, "epoch": 0.3907101784780823, "grad_norm": 2.535006284713745, "kl": 2.4765625, "learning_rate": 2.3624431382428286e-07, "loss": 0.2162, "reward": 0.5485491305589676, "reward_std": 0.3212612047791481, "rewards/accuracy_reward": 0.05133928684517741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098395228386, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 1411.1808471679688, "epoch": 0.3910088865656038, "grad_norm": 1.4235376119613647, "kl": 1.2490234375, "learning_rate": 2.3612467118759406e-07, "loss": 0.2145, "reward": 0.6222098469734192, "reward_std": 0.3182403966784477, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062723517418, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 1539.6139221191406, "epoch": 0.39130759465312526, "grad_norm": 2.151952028274536, "kl": 1.7255859375, "learning_rate": 2.3600495117423402e-07, "loss": 0.2407, "reward": 0.5820312649011612, "reward_std": 0.26702695339918137, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455708384514, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 1492.9040832519531, "epoch": 0.39160630274064673, "grad_norm": 4.754322052001953, "kl": 2.16015625, "learning_rate": 2.3588515391444628e-07, "loss": 0.2791, "reward": 0.5479910895228386, "reward_std": 0.29419346153736115, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982313156128, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 1529.4129943847656, "epoch": 0.3919050108281682, "grad_norm": 1.6570690870285034, "kl": 1.794921875, "learning_rate": 2.357652795385586e-07, "loss": 0.1995, "reward": 0.5848214477300644, "reward_std": 0.3011120557785034, "rewards/accuracy_reward": 0.10267857741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482142873108387, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 1501.1920166015625, "epoch": 0.39220371891568967, "grad_norm": 2.303438186645508, "kl": 1.580078125, "learning_rate": 2.3564532817698247e-07, "loss": 0.219, "reward": 0.6266741305589676, "reward_std": 0.330192007124424, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470424123108387, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 1328.8661804199219, "epoch": 0.3925024270032111, "grad_norm": 4.025355339050293, "kl": 1.314453125, "learning_rate": 2.3552529996021334e-07, "loss": 0.2683, "reward": 0.6534598469734192, "reward_std": 0.3071616366505623, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5262276977300644, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 1572.02685546875, "epoch": 0.39280113509073256, "grad_norm": 2.2215347290039062, "kl": 1.072265625, "learning_rate": 2.3540519501883007e-07, "loss": 0.1708, "reward": 0.6501116454601288, "reward_std": 0.3163149133324623, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794887661934, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 1562.5782165527344, "epoch": 0.393099843178254, "grad_norm": 1.3468217849731445, "kl": 1.814453125, "learning_rate": 2.35285013483495e-07, "loss": 0.2608, "reward": 0.6160714626312256, "reward_std": 0.3374079167842865, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893133878708, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 1507.7009887695312, "epoch": 0.3933985512657755, "grad_norm": 3.027459144592285, "kl": 1.72265625, "learning_rate": 2.3516475548495398e-07, "loss": 0.2106, "reward": 0.5117187798023224, "reward_std": 0.2667274624109268, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044887661934, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 1488.5782165527344, "epoch": 0.39369725935329697, "grad_norm": 3.2718305587768555, "kl": 1.533203125, "learning_rate": 2.3504442115403592e-07, "loss": 0.2339, "reward": 0.5876116156578064, "reward_std": 0.3446732312440872, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937723517418, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 1447.9777526855469, "epoch": 0.39399596744081844, "grad_norm": 1.7121366262435913, "kl": 1.794921875, "learning_rate": 2.3492401062165265e-07, "loss": 0.1972, "reward": 0.612723246216774, "reward_std": 0.23867544531822205, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160969734192, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 1439.7701416015625, "epoch": 0.3942946755283399, "grad_norm": 1.7903056144714355, "kl": 1.337890625, "learning_rate": 2.3480352401879914e-07, "loss": 0.2163, "reward": 0.6071428954601288, "reward_std": 0.2684115469455719, "rewards/accuracy_reward": 0.11830357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393208384514, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 1547.5893249511719, "epoch": 0.3945933836158614, "grad_norm": 4.8330512046813965, "kl": 1.708984375, "learning_rate": 2.3468296147655302e-07, "loss": 0.2164, "reward": 0.5797991380095482, "reward_std": 0.3066110759973526, "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.526227705180645, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 1589.5223999023438, "epoch": 0.39489209170338285, "grad_norm": 12.461237907409668, "kl": 2.689453125, "learning_rate": 2.345623231260745e-07, "loss": 0.2669, "reward": 0.5418527126312256, "reward_std": 0.31575093418359756, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486049123108387, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 1424.2500610351562, "epoch": 0.3951907997909043, "grad_norm": 2.2202341556549072, "kl": 1.69921875, "learning_rate": 2.3444160909860614e-07, "loss": 0.2236, "reward": 0.6255580633878708, "reward_std": 0.30325376242399216, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5318080708384514, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 1443.9844665527344, "epoch": 0.3954895078784258, "grad_norm": 6.449258804321289, "kl": 2.21484375, "learning_rate": 2.3432081952547317e-07, "loss": 0.249, "reward": 0.6250000447034836, "reward_std": 0.284861221909523, "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964328289032, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 1511.2656860351562, "epoch": 0.39578821596594727, "grad_norm": 2.3954172134399414, "kl": 1.9453125, "learning_rate": 2.3419995453808272e-07, "loss": 0.2385, "reward": 0.620535746216774, "reward_std": 0.2780555337667465, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 1404.72998046875, "epoch": 0.39608692405346874, "grad_norm": 1.8016321659088135, "kl": 1.830078125, "learning_rate": 2.3407901426792405e-07, "loss": 0.1775, "reward": 0.6372768133878708, "reward_std": 0.25850120186805725, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768133878708, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 1490.80810546875, "epoch": 0.3963856321409902, "grad_norm": 2.1939308643341064, "kl": 1.703125, "learning_rate": 2.339579988465683e-07, "loss": 0.2517, "reward": 0.6138393133878708, "reward_std": 0.27671192958950996, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933035895228386, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 1507.4353332519531, "epoch": 0.3966843402285117, "grad_norm": 3.378203868865967, "kl": 1.603515625, "learning_rate": 2.338369084056685e-07, "loss": 0.2263, "reward": 0.6757812798023224, "reward_std": 0.31222494691610336, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517299123108387, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 1486.91748046875, "epoch": 0.39698304831603315, "grad_norm": 3.0243966579437256, "kl": 1.71484375, "learning_rate": 2.3371574307695905e-07, "loss": 0.1993, "reward": 0.6523437798023224, "reward_std": 0.36396361887454987, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027902126312256, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 1400.5000305175781, "epoch": 0.3972817564035546, "grad_norm": 3.0767662525177, "kl": 1.73828125, "learning_rate": 2.33594502992256e-07, "loss": 0.2357, "reward": 0.5636160969734192, "reward_std": 0.27953004091978073, "rewards/accuracy_reward": 0.04687500046566129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167410969734192, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 1390.5156860351562, "epoch": 0.3975804644910761, "grad_norm": 2.1466262340545654, "kl": 1.904296875, "learning_rate": 2.3347318828345667e-07, "loss": 0.1668, "reward": 0.7187500447034836, "reward_std": 0.28335879370570183, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5133928805589676, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 1399.2120971679688, "epoch": 0.39787917257859756, "grad_norm": 1.9269047975540161, "kl": 2.091796875, "learning_rate": 2.3335179908253955e-07, "loss": 0.2406, "reward": 0.6489955633878708, "reward_std": 0.27536167204380035, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062649011612, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 1511.0045471191406, "epoch": 0.39817788066611903, "grad_norm": 3.4110898971557617, "kl": 2.185546875, "learning_rate": 2.3323033552156427e-07, "loss": 0.2307, "reward": 0.6110491305589676, "reward_std": 0.3134899325668812, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.497209832072258, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 1409.6451416015625, "epoch": 0.3984765887536405, "grad_norm": 3.0618834495544434, "kl": 2.220703125, "learning_rate": 2.331087977326712e-07, "loss": 0.2459, "reward": 0.6718750298023224, "reward_std": 0.32810966297984123, "rewards/accuracy_reward": 0.15178572479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5200893133878708, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 1518.6897888183594, "epoch": 0.398775296841162, "grad_norm": 2.292217254638672, "kl": 1.658203125, "learning_rate": 2.329871858480816e-07, "loss": 0.207, "reward": 0.6344866454601288, "reward_std": 0.3200658969581127, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4916294813156128, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 1492.1273193359375, "epoch": 0.39907400492868345, "grad_norm": 2.390524387359619, "kl": 1.7734375, "learning_rate": 2.3286550000009728e-07, "loss": 0.2287, "reward": 0.6925223618745804, "reward_std": 0.3389147110283375, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498325914144516, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 1583.9420471191406, "epoch": 0.3993727130162049, "grad_norm": 2.319023609161377, "kl": 2.1875, "learning_rate": 2.3274374032110042e-07, "loss": 0.2517, "reward": 0.6344866305589676, "reward_std": 0.29727640002965927, "rewards/accuracy_reward": 0.15178572502918541, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4827009215950966, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 1326.4286499023438, "epoch": 0.3996714211037264, "grad_norm": 2.1679024696350098, "kl": 1.931640625, "learning_rate": 2.3262190694355373e-07, "loss": 0.2605, "reward": 0.7315848618745804, "reward_std": 0.3491140231490135, "rewards/accuracy_reward": 0.21205358672887087, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5195312798023224, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 1547.74560546875, "epoch": 0.39997012919124786, "grad_norm": 3.2778496742248535, "kl": 1.736328125, "learning_rate": 2.325e-07, "loss": 0.2194, "reward": 0.6523437649011612, "reward_std": 0.31633260846138, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 1547.7277221679688, "epoch": 0.40026883727876933, "grad_norm": 2.3986005783081055, "kl": 2.248046875, "learning_rate": 2.3237801962306193e-07, "loss": 0.2501, "reward": 0.6367187798023224, "reward_std": 0.32730938494205475, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027901977300644, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 1587.0268249511719, "epoch": 0.4005675453662908, "grad_norm": 4.027761936187744, "kl": 1.865234375, "learning_rate": 2.3225596594544228e-07, "loss": 0.1676, "reward": 0.5809151977300644, "reward_std": 0.31631167232990265, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871651977300644, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 1399.8527221679688, "epoch": 0.40086625345381227, "grad_norm": 1.2273012399673462, "kl": 1.439453125, "learning_rate": 2.3213383909992348e-07, "loss": 0.1961, "reward": 0.65011166036129, "reward_std": 0.2840423434972763, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5318080484867096, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 1520.88623046875, "epoch": 0.40116496154133374, "grad_norm": 1.8300904035568237, "kl": 1.826171875, "learning_rate": 2.3201163921936752e-07, "loss": 0.2195, "reward": 0.5753348469734192, "reward_std": 0.28419889509677887, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.510602705180645, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 1617.8817443847656, "epoch": 0.4014636696288552, "grad_norm": 1.7778493165969849, "kl": 1.923828125, "learning_rate": 2.31889366436716e-07, "loss": 0.2193, "reward": 0.5463169887661934, "reward_std": 0.2810414843261242, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884215950966, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 1557.4241943359375, "epoch": 0.4017623777163767, "grad_norm": 1.7421088218688965, "kl": 1.2626953125, "learning_rate": 2.3176702088498959e-07, "loss": 0.1749, "reward": 0.5786830559372902, "reward_std": 0.30764780938625336, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223469734192, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 1465.2701721191406, "epoch": 0.40206108580389815, "grad_norm": 7.954113006591797, "kl": 1.287109375, "learning_rate": 2.3164460269728837e-07, "loss": 0.23, "reward": 0.612723246216774, "reward_std": 0.3194785639643669, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.503348246216774, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 1529.2254943847656, "epoch": 0.4023597938914196, "grad_norm": 4.647814750671387, "kl": 1.451171875, "learning_rate": 2.3152211200679128e-07, "loss": 0.198, "reward": 0.5507812798023224, "reward_std": 0.2603357955813408, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5239955484867096, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 1514.2947082519531, "epoch": 0.4026585019789411, "grad_norm": 4.018387317657471, "kl": 1.48828125, "learning_rate": 2.313995489467562e-07, "loss": 0.2003, "reward": 0.5797991305589676, "reward_std": 0.287667378783226, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5061384066939354, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 1516.3348999023438, "epoch": 0.40295721006646257, "grad_norm": 5.7713727951049805, "kl": 1.587890625, "learning_rate": 2.3127691365051968e-07, "loss": 0.2098, "reward": 0.6763393133878708, "reward_std": 0.3098171688616276, "rewards/accuracy_reward": 0.19196429592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750298023224, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 1464.5447387695312, "epoch": 0.40325591815398404, "grad_norm": 2.5591115951538086, "kl": 1.5859375, "learning_rate": 2.31154206251497e-07, "loss": 0.2455, "reward": 0.6383928880095482, "reward_std": 0.25259875133633614, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4955357387661934, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 1558.2478332519531, "epoch": 0.4035546262415055, "grad_norm": 1.1749528646469116, "kl": 1.8125, "learning_rate": 2.3103142688318173e-07, "loss": 0.2324, "reward": 0.5401785969734192, "reward_std": 0.31044530123472214, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 1411.99560546875, "epoch": 0.403853334329027, "grad_norm": 3.161705255508423, "kl": 1.6044921875, "learning_rate": 2.3090857567914574e-07, "loss": 0.2735, "reward": 0.5691964626312256, "reward_std": 0.30655307322740555, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5200892984867096, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 1499.9621276855469, "epoch": 0.40415204241654845, "grad_norm": 3.374631404876709, "kl": 1.912109375, "learning_rate": 2.307856527730392e-07, "loss": 0.265, "reward": 0.5669643133878708, "reward_std": 0.2969013750553131, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4955357313156128, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 1535.243408203125, "epoch": 0.4044507505040699, "grad_norm": 1.8125622272491455, "kl": 1.76953125, "learning_rate": 2.3066265829859008e-07, "loss": 0.2371, "reward": 0.6473214477300644, "reward_std": 0.31106048449873924, "rewards/accuracy_reward": 0.16517857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 1441.0647888183594, "epoch": 0.4047494585915914, "grad_norm": 5.13922119140625, "kl": 2.41796875, "learning_rate": 2.3053959238960435e-07, "loss": 0.3049, "reward": 0.6344866305589676, "reward_std": 0.3032919019460678, "rewards/accuracy_reward": 0.12500000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866305589676, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 1459.9710693359375, "epoch": 0.40504816667911286, "grad_norm": 3.681689500808716, "kl": 1.453125, "learning_rate": 2.3041645517996573e-07, "loss": 0.2512, "reward": 0.5636160969734192, "reward_std": 0.30768243968486786, "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167411118745804, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 1452.1697082519531, "epoch": 0.4053468747666343, "grad_norm": 2.5586564540863037, "kl": 1.638671875, "learning_rate": 2.3029324680363535e-07, "loss": 0.2367, "reward": 0.655691996216774, "reward_std": 0.33466949313879013, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5106026977300644, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 1369.1295166015625, "epoch": 0.40564558285415575, "grad_norm": 3.663208246231079, "kl": 1.67578125, "learning_rate": 2.3016996739465186e-07, "loss": 0.3027, "reward": 0.646763414144516, "reward_std": 0.30267178267240524, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5329241305589676, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 1527.2032165527344, "epoch": 0.4059442909416772, "grad_norm": 2.05086350440979, "kl": 1.5, "learning_rate": 2.300466170871312e-07, "loss": 0.2389, "reward": 0.6177455633878708, "reward_std": 0.3157784268260002, "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562798023224, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 1584.5513916015625, "epoch": 0.4062429990291987, "grad_norm": 1.7659022808074951, "kl": 1.501953125, "learning_rate": 2.2992319601526643e-07, "loss": 0.2073, "reward": 0.6356026828289032, "reward_std": 0.28906021267175674, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.497209832072258, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 1506.4420166015625, "epoch": 0.40654170711672016, "grad_norm": 1.6853842735290527, "kl": 1.53515625, "learning_rate": 2.297997043133275e-07, "loss": 0.1753, "reward": 0.6824776977300644, "reward_std": 0.2616593763232231, "rewards/accuracy_reward": 0.17187500977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.510602705180645, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 1547.8147888183594, "epoch": 0.40684041520424163, "grad_norm": 3.3365561962127686, "kl": 1.8232421875, "learning_rate": 2.2967614211566135e-07, "loss": 0.2292, "reward": 0.5898437649011612, "reward_std": 0.3061774671077728, "rewards/accuracy_reward": 0.11830357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 1475.0402221679688, "epoch": 0.4071391232917631, "grad_norm": 2.6852316856384277, "kl": 1.984375, "learning_rate": 2.2955250955669153e-07, "loss": 0.1843, "reward": 0.6104910969734192, "reward_std": 0.29983172565698624, "rewards/accuracy_reward": 0.09821429313160479, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768133878708, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 1519.3906860351562, "epoch": 0.4074378313792846, "grad_norm": 1.2451227903366089, "kl": 1.640625, "learning_rate": 2.2942880677091814e-07, "loss": 0.2063, "reward": 0.616071455180645, "reward_std": 0.27619507908821106, "rewards/accuracy_reward": 0.11383929243311286, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502232164144516, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 1526.1340026855469, "epoch": 0.40773653946680605, "grad_norm": 1.6941393613815308, "kl": 1.494140625, "learning_rate": 2.293050338929177e-07, "loss": 0.2179, "reward": 0.5920759439468384, "reward_std": 0.3432261496782303, "rewards/accuracy_reward": 0.08482143329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5072544887661934, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 1526.2879943847656, "epoch": 0.4080352475543275, "grad_norm": 4.490591049194336, "kl": 1.62109375, "learning_rate": 2.291811910573429e-07, "loss": 0.2135, "reward": 0.6925223469734192, "reward_std": 0.36918432265520096, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223395228386, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 1496.07373046875, "epoch": 0.408333955641849, "grad_norm": 2.2814910411834717, "kl": 1.736328125, "learning_rate": 2.290572783989227e-07, "loss": 0.2249, "reward": 0.6629464626312256, "reward_std": 0.3226710148155689, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393059372902, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 1515.0491638183594, "epoch": 0.40863266372937046, "grad_norm": 2.5673232078552246, "kl": 1.27734375, "learning_rate": 2.2893329605246193e-07, "loss": 0.2135, "reward": 0.6132812798023224, "reward_std": 0.3250419646501541, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5284598469734192, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 1457.4420166015625, "epoch": 0.40893137181689193, "grad_norm": 6.037413120269775, "kl": 1.33203125, "learning_rate": 2.2880924415284112e-07, "loss": 0.2326, "reward": 0.6506696790456772, "reward_std": 0.29560771211981773, "rewards/accuracy_reward": 0.13392858067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167410969734192, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 1494.0514221191406, "epoch": 0.4092300799044134, "grad_norm": 2.512235164642334, "kl": 1.6181640625, "learning_rate": 2.286851228350167e-07, "loss": 0.2295, "reward": 0.6110491305589676, "reward_std": 0.3530833497643471, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506138414144516, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 1573.9107971191406, "epoch": 0.4095287879919349, "grad_norm": 1.9404128789901733, "kl": 1.455078125, "learning_rate": 2.2856093223402038e-07, "loss": 0.2296, "reward": 0.517857164144516, "reward_std": 0.24971888959407806, "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482142873108387, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 1496.3147888183594, "epoch": 0.40982749607945634, "grad_norm": 2.4331183433532715, "kl": 1.89453125, "learning_rate": 2.284366724849595e-07, "loss": 0.2585, "reward": 0.5825893208384514, "reward_std": 0.27872735261917114, "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964477300644, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 1404.5290832519531, "epoch": 0.4101262041669778, "grad_norm": 1.828360676765442, "kl": 1.96484375, "learning_rate": 2.2831234372301635e-07, "loss": 0.2298, "reward": 0.6222098469734192, "reward_std": 0.29864081367850304, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5351562798023224, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 1529.8661499023438, "epoch": 0.4104249122544993, "grad_norm": 6.725151538848877, "kl": 2.75, "learning_rate": 2.2818794608344857e-07, "loss": 0.2373, "reward": 0.5675223469734192, "reward_std": 0.32140346616506577, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580633878708, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 1535.4442749023438, "epoch": 0.41072362034202076, "grad_norm": 3.1825602054595947, "kl": 1.998046875, "learning_rate": 2.2806347970158856e-07, "loss": 0.236, "reward": 0.5234375298023224, "reward_std": 0.2762548699975014, "rewards/accuracy_reward": 0.02901785750873387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4944196566939354, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 1444.5447082519531, "epoch": 0.4110223284295422, "grad_norm": 3.8846609592437744, "kl": 3.123046875, "learning_rate": 2.2793894471284347e-07, "loss": 0.2834, "reward": 0.7003348469734192, "reward_std": 0.2894907295703888, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5195312723517418, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 1396.1674499511719, "epoch": 0.4113210365170637, "grad_norm": 2.647737979888916, "kl": 2.560546875, "learning_rate": 2.2781434125269518e-07, "loss": 0.3021, "reward": 0.6802455633878708, "reward_std": 0.28072065114974976, "rewards/accuracy_reward": 0.16741072479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5128348469734192, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 1440.7188110351562, "epoch": 0.41161974460458517, "grad_norm": 4.8616623878479, "kl": 2.091796875, "learning_rate": 2.2768966945670014e-07, "loss": 0.258, "reward": 0.667410746216774, "reward_std": 0.25503166392445564, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678656578064, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 1549.2411499023438, "epoch": 0.41191845269210664, "grad_norm": 2.869443655014038, "kl": 2.580078125, "learning_rate": 2.2756492946048895e-07, "loss": 0.2585, "reward": 0.6177455633878708, "reward_std": 0.31544870883226395, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205633878708, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 1465.6563110351562, "epoch": 0.4122171607796281, "grad_norm": 5.193906784057617, "kl": 2.671875, "learning_rate": 2.2744012139976654e-07, "loss": 0.3072, "reward": 0.5680803880095482, "reward_std": 0.26802071928977966, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167411044239998, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 1431.4040832519531, "epoch": 0.4125158688671496, "grad_norm": 1.8440731763839722, "kl": 1.9140625, "learning_rate": 2.273152454103118e-07, "loss": 0.2586, "reward": 0.6981027126312256, "reward_std": 0.3413487449288368, "rewards/accuracy_reward": 0.16294643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5351562649011612, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 1617.3973693847656, "epoch": 0.41281457695467105, "grad_norm": 1.9364107847213745, "kl": 1.8203125, "learning_rate": 2.2719030162797765e-07, "loss": 0.2067, "reward": 0.569754496216774, "reward_std": 0.2816745713353157, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4737723395228386, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 1540.7969055175781, "epoch": 0.4131132850421925, "grad_norm": 2.6085987091064453, "kl": 2.16015625, "learning_rate": 2.270652901886906e-07, "loss": 0.2396, "reward": 0.592633955180645, "reward_std": 0.3205285146832466, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303656578064, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 1489.7902526855469, "epoch": 0.413411993129714, "grad_norm": 2.1002349853515625, "kl": 1.771484375, "learning_rate": 2.2694021122845086e-07, "loss": 0.2447, "reward": 0.654575914144516, "reward_std": 0.3092617504298687, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.505022332072258, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 1642.1763916015625, "epoch": 0.41371070121723547, "grad_norm": 4.572430610656738, "kl": 1.73046875, "learning_rate": 2.2681506488333215e-07, "loss": 0.192, "reward": 0.5379464402794838, "reward_std": 0.2779473848640919, "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250223517418, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 1428.2322387695312, "epoch": 0.41400940930475694, "grad_norm": 4.263123035430908, "kl": 1.259765625, "learning_rate": 2.2668985128948141e-07, "loss": 0.2118, "reward": 0.6707589477300644, "reward_std": 0.2923332005739212, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5323660969734192, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 1559.9822082519531, "epoch": 0.4143081173922784, "grad_norm": 2.386319398880005, "kl": 1.423828125, "learning_rate": 2.2656457058311868e-07, "loss": 0.2004, "reward": 0.5513393208384514, "reward_std": 0.2819220796227455, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4754464477300644, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 1464.7523193359375, "epoch": 0.4146068254797999, "grad_norm": 5.471649646759033, "kl": 1.75, "learning_rate": 2.2643922290053705e-07, "loss": 0.2159, "reward": 0.6077009290456772, "reward_std": 0.3284735158085823, "rewards/accuracy_reward": 0.09375000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.513950914144516, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 1473.4755249023438, "epoch": 0.41490553356732135, "grad_norm": 7.029452800750732, "kl": 1.767578125, "learning_rate": 2.2631380837810269e-07, "loss": 0.2544, "reward": 0.550223246216774, "reward_std": 0.3142516687512398, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487723246216774, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 1416.2947082519531, "epoch": 0.4152042416548428, "grad_norm": 3.4260995388031006, "kl": 1.5625, "learning_rate": 2.261883271522542e-07, "loss": 0.2077, "reward": 0.6026785969734192, "reward_std": 0.27834780141711235, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517857164144516, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 1462.825927734375, "epoch": 0.4155029497423643, "grad_norm": 5.152620792388916, "kl": 1.419921875, "learning_rate": 2.2606277935950286e-07, "loss": 0.2294, "reward": 0.5552455484867096, "reward_std": 0.2902812026441097, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.481584832072258, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 1411.1741638183594, "epoch": 0.41580165782988576, "grad_norm": 5.130824089050293, "kl": 1.509765625, "learning_rate": 2.2593716513643237e-07, "loss": 0.2445, "reward": 0.6400669813156128, "reward_std": 0.35861486196517944, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5306919738650322, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 1581.0916137695312, "epoch": 0.41610036591740723, "grad_norm": 2.6898276805877686, "kl": 1.794921875, "learning_rate": 2.2581148461969867e-07, "loss": 0.2029, "reward": 0.6489955633878708, "reward_std": 0.24781298264861107, "rewards/accuracy_reward": 0.12946429150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.519531287252903, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 1505.3125610351562, "epoch": 0.4163990740049287, "grad_norm": 3.1344404220581055, "kl": 1.40625, "learning_rate": 2.2568573794602992e-07, "loss": 0.204, "reward": 0.6043526977300644, "reward_std": 0.30210689455270767, "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5061384066939354, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 1485.1317443847656, "epoch": 0.4166977820924502, "grad_norm": 3.1377854347229004, "kl": 1.8046875, "learning_rate": 2.2555992525222607e-07, "loss": 0.2552, "reward": 0.5223214626312256, "reward_std": 0.2820385582745075, "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486607164144516, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 1677.1138916015625, "epoch": 0.41699649017997165, "grad_norm": 3.928460121154785, "kl": 1.56640625, "learning_rate": 2.2543404667515907e-07, "loss": 0.1946, "reward": 0.5518973469734192, "reward_std": 0.324357233941555, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366380095482, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 1550.2389221191406, "epoch": 0.4172951982674931, "grad_norm": 4.0114898681640625, "kl": 1.671875, "learning_rate": 2.2530810235177254e-07, "loss": 0.2271, "reward": 0.5602678954601288, "reward_std": 0.2854892760515213, "rewards/accuracy_reward": 0.05580357485450804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5044643208384514, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 1577.44873046875, "epoch": 0.4175939063550146, "grad_norm": 6.225615501403809, "kl": 2.171875, "learning_rate": 2.2518209241908138e-07, "loss": 0.2344, "reward": 0.5122768059372902, "reward_std": 0.2711348161101341, "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875149011612, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 1516.9219665527344, "epoch": 0.41789261444253606, "grad_norm": 5.171302318572998, "kl": 1.865234375, "learning_rate": 2.2505601701417217e-07, "loss": 0.2448, "reward": 0.5915178880095482, "reward_std": 0.3024270609021187, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393059372902, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 1425.2679138183594, "epoch": 0.4181913225300575, "grad_norm": 7.671365261077881, "kl": 1.333984375, "learning_rate": 2.2492987627420258e-07, "loss": 0.2778, "reward": 0.5792410969734192, "reward_std": 0.336541585624218, "rewards/accuracy_reward": 0.07812500093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160895228386, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 1506.368408203125, "epoch": 0.41849003061757895, "grad_norm": 8.1069917678833, "kl": 1.482421875, "learning_rate": 2.2480367033640138e-07, "loss": 0.2247, "reward": 0.5558035969734192, "reward_std": 0.29732149094343185, "rewards/accuracy_reward": 0.03794643119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5178571566939354, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 1530.96435546875, "epoch": 0.4187887387051004, "grad_norm": 10.70815372467041, "kl": 2.068359375, "learning_rate": 2.2467739933806823e-07, "loss": 0.2579, "reward": 0.571428582072258, "reward_std": 0.3141334317624569, "rewards/accuracy_reward": 0.0736607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678880095482, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 1483.0603637695312, "epoch": 0.4190874467926219, "grad_norm": 12.774983406066895, "kl": 1.751953125, "learning_rate": 2.2455106341657364e-07, "loss": 0.2713, "reward": 0.6082589477300644, "reward_std": 0.3159748762845993, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125298023224, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 1565.1965026855469, "epoch": 0.41938615488014336, "grad_norm": 10.930985450744629, "kl": 2.291015625, "learning_rate": 2.2442466270935866e-07, "loss": 0.2652, "reward": 0.5915178805589676, "reward_std": 0.3528621643781662, "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143133878708, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 1535.4598999023438, "epoch": 0.41968486296766483, "grad_norm": 14.090312957763672, "kl": 2.029296875, "learning_rate": 2.2429819735393488e-07, "loss": 0.2668, "reward": 0.6166294813156128, "reward_std": 0.35974276810884476, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.471540205180645, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 1551.1741943359375, "epoch": 0.4199835710551863, "grad_norm": 14.804902076721191, "kl": 2.05859375, "learning_rate": 2.2417166748788424e-07, "loss": 0.2322, "reward": 0.6417410969734192, "reward_std": 0.30567821115255356, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494419664144516, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 1543.9263916015625, "epoch": 0.42028227914270777, "grad_norm": 19.548954010009766, "kl": 2.580078125, "learning_rate": 2.2404507324885883e-07, "loss": 0.2646, "reward": 0.6010044813156128, "reward_std": 0.308273009955883, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937723517418, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 1531.7411193847656, "epoch": 0.42058098723022924, "grad_norm": 16.485963821411133, "kl": 2.47265625, "learning_rate": 2.2391841477458077e-07, "loss": 0.2665, "reward": 0.5803571566939354, "reward_std": 0.27184155583381653, "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933035969734192, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 1640.4754943847656, "epoch": 0.4208796953177507, "grad_norm": 18.83064842224121, "kl": 2.625, "learning_rate": 2.2379169220284201e-07, "loss": 0.2969, "reward": 0.549107164144516, "reward_std": 0.31563691049814224, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4598214477300644, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 1483.8281555175781, "epoch": 0.4211784034052722, "grad_norm": 21.424755096435547, "kl": 2.302734375, "learning_rate": 2.2366490567150439e-07, "loss": 0.2919, "reward": 0.5764509290456772, "reward_std": 0.3031199835240841, "rewards/accuracy_reward": 0.08258929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616305589676, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 1545.74560546875, "epoch": 0.42147711149279365, "grad_norm": 17.94525909423828, "kl": 2.8359375, "learning_rate": 2.235380553184992e-07, "loss": 0.3081, "reward": 0.6049107313156128, "reward_std": 0.30865485966205597, "rewards/accuracy_reward": 0.11383929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4910714328289032, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 1579.6964721679688, "epoch": 0.4217758195803151, "grad_norm": 18.799821853637695, "kl": 3.3046875, "learning_rate": 2.234111412818271e-07, "loss": 0.3005, "reward": 0.588169664144516, "reward_std": 0.29999156296253204, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053880095482, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 1469.6942749023438, "epoch": 0.4220745276678366, "grad_norm": 18.733200073242188, "kl": 3.18359375, "learning_rate": 2.2328416369955822e-07, "loss": 0.2868, "reward": 0.6406250223517418, "reward_std": 0.3157069534063339, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607387661934, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 1553.1407165527344, "epoch": 0.42237323575535807, "grad_norm": 14.482266426086426, "kl": 3.421875, "learning_rate": 2.2315712270983173e-07, "loss": 0.3137, "reward": 0.533482164144516, "reward_std": 0.30118706822395325, "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393059372902, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 1440.2322082519531, "epoch": 0.42267194384287954, "grad_norm": 16.6065673828125, "kl": 3.69140625, "learning_rate": 2.2303001845085572e-07, "loss": 0.3594, "reward": 0.5625000149011612, "reward_std": 0.3000834956765175, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607387661934, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 1512.1295471191406, "epoch": 0.422970651930401, "grad_norm": 12.6604585647583, "kl": 4.53125, "learning_rate": 2.2290285106090718e-07, "loss": 0.3933, "reward": 0.615513414144516, "reward_std": 0.24142476171255112, "rewards/accuracy_reward": 0.12053571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949777126312256, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 1534.77685546875, "epoch": 0.4232693600179225, "grad_norm": 12.469084739685059, "kl": 5.234375, "learning_rate": 2.2277562067833176e-07, "loss": 0.4064, "reward": 0.6774553954601288, "reward_std": 0.2860117107629776, "rewards/accuracy_reward": 0.17633929150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011161044239998, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 1545.2880249023438, "epoch": 0.42356806810544395, "grad_norm": 11.00802993774414, "kl": 10.9296875, "learning_rate": 2.2264832744154368e-07, "loss": 0.4705, "reward": 0.511160746216774, "reward_std": 0.27420518174767494, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933035895228386, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 1539.5848693847656, "epoch": 0.4238667761929654, "grad_norm": 22.093597412109375, "kl": 6.59375, "learning_rate": 2.2252097148902548e-07, "loss": 0.486, "reward": 0.6216518133878708, "reward_std": 0.35221047699451447, "rewards/accuracy_reward": 0.12723214854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494419664144516, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 1498.2031860351562, "epoch": 0.4241654842804869, "grad_norm": 10.509690284729004, "kl": 5.76953125, "learning_rate": 2.2239355295932796e-07, "loss": 0.4621, "reward": 0.5641741454601288, "reward_std": 0.28776853531599045, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812723517418, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 1546.6005249023438, "epoch": 0.42446419236800836, "grad_norm": 5.257259845733643, "kl": 5.04296875, "learning_rate": 2.2226607199107e-07, "loss": 0.4001, "reward": 0.6250000298023224, "reward_std": 0.3314121663570404, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607313156128, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 1573.888427734375, "epoch": 0.42476290045552983, "grad_norm": 8.944177627563477, "kl": 4.34375, "learning_rate": 2.2213852872293842e-07, "loss": 0.342, "reward": 0.6199776977300644, "reward_std": 0.38196495920419693, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501674123108387, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 1461.8348999023438, "epoch": 0.4250616085430513, "grad_norm": 39.04438781738281, "kl": 2.935546875, "learning_rate": 2.220109232936877e-07, "loss": 0.3278, "reward": 0.6300223469734192, "reward_std": 0.3230581730604172, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5340401977300644, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 1437.6875610351562, "epoch": 0.4253603166305728, "grad_norm": 37.089576721191406, "kl": 2.2265625, "learning_rate": 2.218832558421402e-07, "loss": 0.2409, "reward": 0.6629464477300644, "reward_std": 0.30686983838677406, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5401785969734192, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 1529.6942749023438, "epoch": 0.42565902471809425, "grad_norm": 28.48572540283203, "kl": 3.35546875, "learning_rate": 2.2175552650718545e-07, "loss": 0.3245, "reward": 0.6316964775323868, "reward_std": 0.3014398664236069, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964477300644, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 1476.9130249023438, "epoch": 0.4259577328056157, "grad_norm": 28.88487434387207, "kl": 3.06640625, "learning_rate": 2.2162773542778046e-07, "loss": 0.3247, "reward": 0.6406250298023224, "reward_std": 0.34660089761018753, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607238650322, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 1506.6250610351562, "epoch": 0.4262564408931372, "grad_norm": 7.69644021987915, "kl": 4.35546875, "learning_rate": 2.2149988274294947e-07, "loss": 0.384, "reward": 0.6250000298023224, "reward_std": 0.3388647362589836, "rewards/accuracy_reward": 0.1294642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.495535746216774, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 1607.7121276855469, "epoch": 0.42655514898065866, "grad_norm": 23.905540466308594, "kl": 6.40625, "learning_rate": 2.2137196859178367e-07, "loss": 0.4343, "reward": 0.6199777126312256, "reward_std": 0.3498629480600357, "rewards/accuracy_reward": 0.1339285778813064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491305589676, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 1587.9888916015625, "epoch": 0.42685385706818013, "grad_norm": 30.582571029663086, "kl": 6.3125, "learning_rate": 2.2124399311344108e-07, "loss": 0.4696, "reward": 0.640066996216774, "reward_std": 0.3433000445365906, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 1555.8594665527344, "epoch": 0.4271525651557016, "grad_norm": 23.91868019104004, "kl": 6.265625, "learning_rate": 2.2111595644714642e-07, "loss": 0.4508, "reward": 0.5446428805589676, "reward_std": 0.3307347968220711, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 1636.7009887695312, "epoch": 0.4274512732432231, "grad_norm": 24.500635147094727, "kl": 6.3203125, "learning_rate": 2.2098785873219117e-07, "loss": 0.4565, "reward": 0.538504496216774, "reward_std": 0.2742152139544487, "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4559151977300644, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 1509.1942138671875, "epoch": 0.42774998133074454, "grad_norm": 7.035635471343994, "kl": 4.36328125, "learning_rate": 2.2085970010793304e-07, "loss": 0.3782, "reward": 0.6026786118745804, "reward_std": 0.3258751481771469, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.508928582072258, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 1567.2433471679688, "epoch": 0.428048689418266, "grad_norm": 14.29142951965332, "kl": 4.02734375, "learning_rate": 2.2073148071379598e-07, "loss": 0.3531, "reward": 0.6004464477300644, "reward_std": 0.28920628130435944, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393208384514, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 1650.5067443847656, "epoch": 0.4283473975057875, "grad_norm": 8.707704544067383, "kl": 4.4453125, "learning_rate": 2.206032006892702e-07, "loss": 0.3531, "reward": 0.5758928805589676, "reward_std": 0.3189225047826767, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 1549.3259582519531, "epoch": 0.42864610559330896, "grad_norm": 6.156910419464111, "kl": 4.9375, "learning_rate": 2.2047486017391173e-07, "loss": 0.4124, "reward": 0.608816996216774, "reward_std": 0.2825680524110794, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490513414144516, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 1524.3772888183594, "epoch": 0.42894481368083043, "grad_norm": 22.051424026489258, "kl": 3.8984375, "learning_rate": 2.2034645930734257e-07, "loss": 0.4002, "reward": 0.5931919813156128, "reward_std": 0.3234848380088806, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491305589676, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 1572.0000915527344, "epoch": 0.4292435217683519, "grad_norm": 6.034488201141357, "kl": 5.05859375, "learning_rate": 2.2021799822925017e-07, "loss": 0.407, "reward": 0.547991082072258, "reward_std": 0.29328303039073944, "rewards/accuracy_reward": 0.08035714691504836, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 1580.8371276855469, "epoch": 0.42954222985587337, "grad_norm": 6.3467278480529785, "kl": 4.56640625, "learning_rate": 2.2008947707938765e-07, "loss": 0.3894, "reward": 0.5418526977300644, "reward_std": 0.30673257261514664, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455559372902, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 1544.6719360351562, "epoch": 0.42984093794339484, "grad_norm": 4.059657096862793, "kl": 4.55078125, "learning_rate": 2.1996089599757345e-07, "loss": 0.3636, "reward": 0.6690848618745804, "reward_std": 0.26352088525891304, "rewards/accuracy_reward": 0.17410715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 1449.88623046875, "epoch": 0.4301396460309163, "grad_norm": 10.829463958740234, "kl": 4.41015625, "learning_rate": 2.1983225512369111e-07, "loss": 0.394, "reward": 0.6367187947034836, "reward_std": 0.34332918375730515, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5340402126312256, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 1514.4130249023438, "epoch": 0.4304383541184378, "grad_norm": 5.735431671142578, "kl": 5.5078125, "learning_rate": 2.1970355459768936e-07, "loss": 0.4416, "reward": 0.593191996216774, "reward_std": 0.2803773283958435, "rewards/accuracy_reward": 0.09598214947618544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098395228386, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 1436.5558776855469, "epoch": 0.43073706220595925, "grad_norm": 10.761423110961914, "kl": 4.5390625, "learning_rate": 2.1957479455958175e-07, "loss": 0.4658, "reward": 0.607700914144516, "reward_std": 0.32539260387420654, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5161830484867096, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 1575.15185546875, "epoch": 0.43103577029348067, "grad_norm": 15.745744705200195, "kl": 6.453125, "learning_rate": 2.1944597514944653e-07, "loss": 0.4818, "reward": 0.5106026902794838, "reward_std": 0.2850605249404907, "rewards/accuracy_reward": 0.03348214435391128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205559372902, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 1549.7433776855469, "epoch": 0.43133447838100214, "grad_norm": 19.444753646850586, "kl": 6.640625, "learning_rate": 2.193170965074266e-07, "loss": 0.5144, "reward": 0.5200892984867096, "reward_std": 0.29917027056217194, "rewards/accuracy_reward": 0.03125000069849193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393133878708, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 1560.6495971679688, "epoch": 0.4316331864685236, "grad_norm": 22.39396095275879, "kl": 7.2265625, "learning_rate": 2.1918815877372937e-07, "loss": 0.5554, "reward": 0.537388414144516, "reward_std": 0.31372538954019547, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884066939354, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 1491.3906860351562, "epoch": 0.4319318945560451, "grad_norm": 6.108685493469238, "kl": 5.625, "learning_rate": 2.1905916208862642e-07, "loss": 0.4797, "reward": 0.6127232387661934, "reward_std": 0.3230596147477627, "rewards/accuracy_reward": 0.12276786239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4899553880095482, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 1589.6808776855469, "epoch": 0.43223060264356655, "grad_norm": 4.523467063903809, "kl": 5.578125, "learning_rate": 2.189301065924534e-07, "loss": 0.4387, "reward": 0.6015625298023224, "reward_std": 0.3173196576535702, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854910969734192, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 1443.05810546875, "epoch": 0.432529310731088, "grad_norm": 4.728201866149902, "kl": 4.66796875, "learning_rate": 2.1880099242561023e-07, "loss": 0.3704, "reward": 0.6026785969734192, "reward_std": 0.33310266584157944, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5223214626312256, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 1414.8326721191406, "epoch": 0.4328280188186095, "grad_norm": 10.509032249450684, "kl": 4.5546875, "learning_rate": 2.1867181972856032e-07, "loss": 0.4661, "reward": 0.6255580633878708, "reward_std": 0.3155180439352989, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487165205180645, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 1634.5157165527344, "epoch": 0.43312672690613097, "grad_norm": 6.528981685638428, "kl": 5.7734375, "learning_rate": 2.1854258864183092e-07, "loss": 0.4606, "reward": 0.5904018208384514, "reward_std": 0.2718006744980812, "rewards/accuracy_reward": 0.14062500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4497768059372902, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 1515.1473693847656, "epoch": 0.43342543499365244, "grad_norm": 22.28665542602539, "kl": 3.83203125, "learning_rate": 2.1841329930601284e-07, "loss": 0.394, "reward": 0.6685268133878708, "reward_std": 0.35350401699543, "rewards/accuracy_reward": 0.15401786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.514508955180645, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 1511.82373046875, "epoch": 0.4337241430811739, "grad_norm": 15.132950782775879, "kl": 4.4921875, "learning_rate": 2.182839518617602e-07, "loss": 0.4627, "reward": 0.6869419887661934, "reward_std": 0.30150584876537323, "rewards/accuracy_reward": 0.1852678705472499, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741380095482, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 1590.7568054199219, "epoch": 0.4340228511686954, "grad_norm": 7.31808614730835, "kl": 6.1484375, "learning_rate": 2.1815454644979033e-07, "loss": 0.5046, "reward": 0.5518973469734192, "reward_std": 0.3235413730144501, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330708384514, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 1562.41748046875, "epoch": 0.43432155925621685, "grad_norm": 16.652753829956055, "kl": 6.3359375, "learning_rate": 2.1802508321088355e-07, "loss": 0.4881, "reward": 0.565848246216774, "reward_std": 0.32887278497219086, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875149011612, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 1481.5692749023438, "epoch": 0.4346202673437383, "grad_norm": 23.766000747680664, "kl": 6.5078125, "learning_rate": 2.1789556228588331e-07, "loss": 0.5361, "reward": 0.5418527126312256, "reward_std": 0.29790544509887695, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455559372902, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 1508.8728332519531, "epoch": 0.4349189754312598, "grad_norm": 16.168472290039062, "kl": 6.2734375, "learning_rate": 2.1776598381569562e-07, "loss": 0.4962, "reward": 0.5675223469734192, "reward_std": 0.3474701792001724, "rewards/accuracy_reward": 0.06919643003493547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4983259290456772, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 1508.9085388183594, "epoch": 0.43521768351878126, "grad_norm": 6.491587162017822, "kl": 5.25, "learning_rate": 2.1763634794128915e-07, "loss": 0.4514, "reward": 0.621651828289032, "reward_std": 0.31574078649282455, "rewards/accuracy_reward": 0.12276786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498883955180645, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 1477.2255249023438, "epoch": 0.43551639160630273, "grad_norm": 14.592218399047852, "kl": 5.78125, "learning_rate": 2.1750665480369507e-07, "loss": 0.4553, "reward": 0.6679687798023224, "reward_std": 0.33357036858797073, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4827009215950966, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 1504.7277526855469, "epoch": 0.4358150996938242, "grad_norm": 11.388388633728027, "kl": 5.0390625, "learning_rate": 2.1737690454400672e-07, "loss": 0.4695, "reward": 0.6707589626312256, "reward_std": 0.32592596113681793, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122767984867096, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 1524.04248046875, "epoch": 0.4361138077813457, "grad_norm": 16.14168357849121, "kl": 4.33203125, "learning_rate": 2.1724709730337975e-07, "loss": 0.3768, "reward": 0.5613839626312256, "reward_std": 0.3298700414597988, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5033482387661934, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 1522.8482666015625, "epoch": 0.43641251586886715, "grad_norm": 20.175941467285156, "kl": 4.3046875, "learning_rate": 2.1711723322303166e-07, "loss": 0.4025, "reward": 0.621651828289032, "reward_std": 0.3106336072087288, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768133878708, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 1547.5291137695312, "epoch": 0.4367112239563886, "grad_norm": 10.53950023651123, "kl": 5.1015625, "learning_rate": 2.169873124442419e-07, "loss": 0.4114, "reward": 0.5703125298023224, "reward_std": 0.31497129797935486, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.518973246216774, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 1561.44873046875, "epoch": 0.4370099320439101, "grad_norm": 7.596487522125244, "kl": 5.671875, "learning_rate": 2.1685733510835147e-07, "loss": 0.4487, "reward": 0.5530134215950966, "reward_std": 0.3208700902760029, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4905134215950966, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 1569.21435546875, "epoch": 0.43730864013143156, "grad_norm": 17.02894401550293, "kl": 6.265625, "learning_rate": 2.1672730135676304e-07, "loss": 0.4477, "reward": 0.5407366454601288, "reward_std": 0.30761151760816574, "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866380095482, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 1553.0246276855469, "epoch": 0.43760734821895303, "grad_norm": 11.24913215637207, "kl": 5.9140625, "learning_rate": 2.1659721133094047e-07, "loss": 0.4941, "reward": 0.5797991305589676, "reward_std": 0.29499755799770355, "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949776977300644, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 1518.5871276855469, "epoch": 0.4379060563064745, "grad_norm": 9.695063591003418, "kl": 5.46875, "learning_rate": 2.164670651724091e-07, "loss": 0.4399, "reward": 0.584263414144516, "reward_std": 0.3032407648861408, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5396205633878708, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 1489.5804138183594, "epoch": 0.43820476439399597, "grad_norm": 16.47775650024414, "kl": 5.9609375, "learning_rate": 2.1633686302275514e-07, "loss": 0.5389, "reward": 0.6054687798023224, "reward_std": 0.30784136801958084, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.509486623108387, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 1395.7812805175781, "epoch": 0.43850347248151744, "grad_norm": 15.513894081115723, "kl": 4.55078125, "learning_rate": 2.162066050236258e-07, "loss": 0.4434, "reward": 0.6612723469734192, "reward_std": 0.3396308049559593, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5340401902794838, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 1612.3527526855469, "epoch": 0.4388021805690389, "grad_norm": 17.13524627685547, "kl": 6.4140625, "learning_rate": 2.1607629131672892e-07, "loss": 0.4226, "reward": 0.5318080484867096, "reward_std": 0.2927812226116657, "rewards/accuracy_reward": 0.03571428754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937798023224, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 1363.3996276855469, "epoch": 0.4391008886565604, "grad_norm": 10.29346752166748, "kl": 4.7109375, "learning_rate": 2.159459220438331e-07, "loss": 0.4316, "reward": 0.6545759290456772, "reward_std": 0.3183325082063675, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5429687798023224, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 1438.5357971191406, "epoch": 0.43939959674408186, "grad_norm": 9.512896537780762, "kl": 4.69140625, "learning_rate": 2.1581549734676738e-07, "loss": 0.3632, "reward": 0.6640625298023224, "reward_std": 0.2761295326054096, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167410895228386, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 1379.3281860351562, "epoch": 0.4396983048316033, "grad_norm": 11.603841781616211, "kl": 5.1796875, "learning_rate": 2.1568501736742097e-07, "loss": 0.4559, "reward": 0.6573660969734192, "reward_std": 0.3174849562346935, "rewards/accuracy_reward": 0.11160714598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5457589626312256, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 1416.6473999023438, "epoch": 0.4399970129191248, "grad_norm": 14.191446304321289, "kl": 4.9375, "learning_rate": 2.1555448224774337e-07, "loss": 0.4352, "reward": 0.6378348618745804, "reward_std": 0.26475993171334267, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5440848469734192, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 1553.71435546875, "epoch": 0.44029572100664627, "grad_norm": 15.507113456726074, "kl": 6.125, "learning_rate": 2.1542389212974392e-07, "loss": 0.451, "reward": 0.6160714626312256, "reward_std": 0.31666115671396255, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250298023224, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 1355.8370971679688, "epoch": 0.44059442909416774, "grad_norm": 11.472373962402344, "kl": 5.09375, "learning_rate": 2.1529324715549192e-07, "loss": 0.4714, "reward": 0.6925223469734192, "reward_std": 0.2783847413957119, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5496651977300644, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 1436.8929443359375, "epoch": 0.4408931371816892, "grad_norm": 7.5864410400390625, "kl": 4.640625, "learning_rate": 2.1516254746711623e-07, "loss": 0.3783, "reward": 0.6813616454601288, "reward_std": 0.3882969468832016, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.554129496216774, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 1446.0380249023438, "epoch": 0.4411918452692107, "grad_norm": 14.457064628601074, "kl": 4.56640625, "learning_rate": 2.150317932068054e-07, "loss": 0.3742, "reward": 0.619419664144516, "reward_std": 0.27981849387288094, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768133878708, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 1425.6875305175781, "epoch": 0.44149055335673215, "grad_norm": 20.20216941833496, "kl": 4.4296875, "learning_rate": 2.1490098451680722e-07, "loss": 0.4068, "reward": 0.6473214477300644, "reward_std": 0.3198374733328819, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714626312256, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 1351.8259582519531, "epoch": 0.4417892614442536, "grad_norm": 22.43639373779297, "kl": 4.16015625, "learning_rate": 2.1477012153942867e-07, "loss": 0.3936, "reward": 0.6473214477300644, "reward_std": 0.269289068877697, "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 1438.8996276855469, "epoch": 0.4420879695317751, "grad_norm": 22.698286056518555, "kl": 6.390625, "learning_rate": 2.146392044170359e-07, "loss": 0.4968, "reward": 0.714285746216774, "reward_std": 0.3279765695333481, "rewards/accuracy_reward": 0.17187500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.542410746216774, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 1357.5759582519531, "epoch": 0.44238667761929656, "grad_norm": 20.755939483642578, "kl": 5.9375, "learning_rate": 2.1450823329205392e-07, "loss": 0.5348, "reward": 0.6551339477300644, "reward_std": 0.3115290328860283, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5412946790456772, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 1283.6272888183594, "epoch": 0.44268538570681804, "grad_norm": 11.414862632751465, "kl": 5.078125, "learning_rate": 2.1437720830696638e-07, "loss": 0.4216, "reward": 0.7544643133878708, "reward_std": 0.27426209300756454, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870535969734192, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 1367.1518249511719, "epoch": 0.4429840937943395, "grad_norm": 6.142383098602295, "kl": 3.900390625, "learning_rate": 2.142461296043157e-07, "loss": 0.3146, "reward": 0.7170759290456772, "reward_std": 0.3253011256456375, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687649011612, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 1466.8326721191406, "epoch": 0.443282801881861, "grad_norm": 8.421957969665527, "kl": 5.35546875, "learning_rate": 2.141149973267027e-07, "loss": 0.4002, "reward": 0.6400670111179352, "reward_std": 0.30382438004016876, "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491156578064, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 1331.4889221191406, "epoch": 0.44358150996938245, "grad_norm": 10.892145156860352, "kl": 4.27734375, "learning_rate": 2.139838116167864e-07, "loss": 0.3885, "reward": 0.6718750298023224, "reward_std": 0.3290696442127228, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714477300644, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 1331.2947082519531, "epoch": 0.44388021805690386, "grad_norm": 12.450241088867188, "kl": 4.01171875, "learning_rate": 2.1385257261728393e-07, "loss": 0.3738, "reward": 0.7360491454601288, "reward_std": 0.3417762294411659, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 1359.5848693847656, "epoch": 0.44417892614442533, "grad_norm": 16.763866424560547, "kl": 4.15234375, "learning_rate": 2.1372128047097056e-07, "loss": 0.343, "reward": 0.6941964626312256, "reward_std": 0.29103484004735947, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825892984867096, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 1270.6965026855469, "epoch": 0.4444776342319468, "grad_norm": 10.934659004211426, "kl": 4.046875, "learning_rate": 2.135899353206792e-07, "loss": 0.3785, "reward": 0.742745578289032, "reward_std": 0.33738449588418007, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.584263414144516, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 1374.1585388183594, "epoch": 0.4447763423194683, "grad_norm": 7.330787658691406, "kl": 5.25, "learning_rate": 2.1345853730930056e-07, "loss": 0.4307, "reward": 0.5803571790456772, "reward_std": 0.2664579339325428, "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5580357313156128, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 1331.7411193847656, "epoch": 0.44507505040698975, "grad_norm": 7.661723613739014, "kl": 5.1171875, "learning_rate": 2.1332708657978276e-07, "loss": 0.4258, "reward": 0.667410746216774, "reward_std": 0.26560812070965767, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 1341.7277221679688, "epoch": 0.4453737584945112, "grad_norm": 29.43506622314453, "kl": 6.1328125, "learning_rate": 2.1319558327513138e-07, "loss": 0.4656, "reward": 0.7042410969734192, "reward_std": 0.3297179937362671, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375149011612, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 1340.6228637695312, "epoch": 0.4456724665820327, "grad_norm": 11.11806583404541, "kl": 4.9609375, "learning_rate": 2.1306402753840905e-07, "loss": 0.4196, "reward": 0.7087053805589676, "reward_std": 0.30499985441565514, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875298023224, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 1266.8973999023438, "epoch": 0.44597117466955416, "grad_norm": 22.535297393798828, "kl": 5.7265625, "learning_rate": 2.129324195127355e-07, "loss": 0.474, "reward": 0.7159598618745804, "reward_std": 0.30225304141640663, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6177455633878708, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 1300.6830749511719, "epoch": 0.44626988275707563, "grad_norm": 11.502291679382324, "kl": 3.94921875, "learning_rate": 2.1280075934128742e-07, "loss": 0.2491, "reward": 0.6707589477300644, "reward_std": 0.32806679606437683, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603794664144516, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 1401.352783203125, "epoch": 0.4465685908445971, "grad_norm": 8.887372016906738, "kl": 4.9375, "learning_rate": 2.126690471672982e-07, "loss": 0.3353, "reward": 0.6785714626312256, "reward_std": 0.29318634793162346, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 1282.1607666015625, "epoch": 0.4468672989321186, "grad_norm": 9.010900497436523, "kl": 4.93359375, "learning_rate": 2.1253728313405764e-07, "loss": 0.3611, "reward": 0.7343750149011612, "reward_std": 0.28309790417551994, "rewards/accuracy_reward": 0.14285714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 1328.6495971679688, "epoch": 0.44716600701964004, "grad_norm": 22.767288208007812, "kl": 3.099609375, "learning_rate": 2.124054673849122e-07, "loss": 0.3258, "reward": 0.663504496216774, "reward_std": 0.2800235040485859, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6121652126312256, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 1378.5357666015625, "epoch": 0.4474647151071615, "grad_norm": 26.498620986938477, "kl": 2.57421875, "learning_rate": 2.1227360006326456e-07, "loss": 0.2212, "reward": 0.6925223618745804, "reward_std": 0.2916315570473671, "rewards/accuracy_reward": 0.08482143422588706, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.607700914144516, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 1375.8839721679688, "epoch": 0.447763423194683, "grad_norm": 13.936447143554688, "kl": 3.54296875, "learning_rate": 2.1214168131257332e-07, "loss": 0.3092, "reward": 0.754464328289032, "reward_std": 0.34689556062221527, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500447034836, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 1220.5201110839844, "epoch": 0.44806213128220446, "grad_norm": 21.7470645904541, "kl": 3.11328125, "learning_rate": 2.1200971127635317e-07, "loss": 0.3209, "reward": 0.7360491454601288, "reward_std": 0.35004279017448425, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 1311.5826416015625, "epoch": 0.4483608393697259, "grad_norm": 15.641504287719727, "kl": 3.58203125, "learning_rate": 2.118776900981746e-07, "loss": 0.3644, "reward": 0.8191964477300644, "reward_std": 0.3326188251376152, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 1321.8304138183594, "epoch": 0.4486595474572474, "grad_norm": 13.975750923156738, "kl": 3.5078125, "learning_rate": 2.1174561792166376e-07, "loss": 0.3017, "reward": 0.7042410969734192, "reward_std": 0.32157452404499054, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303954601288, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 1313.6563110351562, "epoch": 0.44895825554476887, "grad_norm": 4.1443190574646, "kl": 4.34765625, "learning_rate": 2.1161349489050217e-07, "loss": 0.3395, "reward": 0.686941996216774, "reward_std": 0.3039984107017517, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6177455484867096, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 1377.3773193359375, "epoch": 0.44925696363229034, "grad_norm": 13.932790756225586, "kl": 4.90234375, "learning_rate": 2.114813211484267e-07, "loss": 0.3378, "reward": 0.6713169813156128, "reward_std": 0.2921220548450947, "rewards/accuracy_reward": 0.0691964307334274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205633878708, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 1326.8840026855469, "epoch": 0.4495556717198118, "grad_norm": 17.658716201782227, "kl": 5.546875, "learning_rate": 2.1134909683922952e-07, "loss": 0.4633, "reward": 0.76339291036129, "reward_std": 0.29427943378686905, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183036118745804, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 1402.9710388183594, "epoch": 0.4498543798073333, "grad_norm": 15.483263969421387, "kl": 5.203125, "learning_rate": 2.112168221067577e-07, "loss": 0.3864, "reward": 0.6774553805589676, "reward_std": 0.2967297360301018, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5881696790456772, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 1308.1607666015625, "epoch": 0.45015308789485475, "grad_norm": 12.22896957397461, "kl": 4.66015625, "learning_rate": 2.1108449709491313e-07, "loss": 0.3625, "reward": 0.7137276977300644, "reward_std": 0.2887205593287945, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6065848469734192, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 1344.2344360351562, "epoch": 0.4504517959823762, "grad_norm": 7.802123546600342, "kl": 4.6171875, "learning_rate": 2.109521219476525e-07, "loss": 0.3662, "reward": 0.8164062947034836, "reward_std": 0.3399123251438141, "rewards/accuracy_reward": 0.20089286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.615513414144516, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 1305.4643249511719, "epoch": 0.4507505040698977, "grad_norm": 9.446602821350098, "kl": 3.59375, "learning_rate": 2.1081969680898697e-07, "loss": 0.3333, "reward": 0.7042411118745804, "reward_std": 0.3086544945836067, "rewards/accuracy_reward": 0.07812500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6261160969734192, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 1283.0067749023438, "epoch": 0.45104921215741917, "grad_norm": 12.11299991607666, "kl": 3.0859375, "learning_rate": 2.1068722182298213e-07, "loss": 0.2891, "reward": 0.8292411118745804, "reward_std": 0.32303134724497795, "rewards/accuracy_reward": 0.2165178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.612723246216774, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 1191.9754943847656, "epoch": 0.45134792024494064, "grad_norm": 11.89132022857666, "kl": 3.1953125, "learning_rate": 2.1055469713375776e-07, "loss": 0.3945, "reward": 0.7170759290456772, "reward_std": 0.33207443356513977, "rewards/accuracy_reward": 0.09151785913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.625558078289032, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 1395.4665832519531, "epoch": 0.4516466283324621, "grad_norm": 10.127398490905762, "kl": 3.62109375, "learning_rate": 2.1042212288548776e-07, "loss": 0.3475, "reward": 0.6997768133878708, "reward_std": 0.29021503403782845, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.610491082072258, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 1209.76123046875, "epoch": 0.4519453364199836, "grad_norm": 6.651552677154541, "kl": 3.90625, "learning_rate": 2.1028949922239993e-07, "loss": 0.4153, "reward": 0.8822545111179352, "reward_std": 0.33628325909376144, "rewards/accuracy_reward": 0.2678571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.614397332072258, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 1302.5804138183594, "epoch": 0.45224404450750505, "grad_norm": 10.218255043029785, "kl": 4.5078125, "learning_rate": 2.1015682628877576e-07, "loss": 0.3731, "reward": 0.7723214775323868, "reward_std": 0.2558363936841488, "rewards/accuracy_reward": 0.17633929592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 1261.745590209961, "epoch": 0.4525427525950265, "grad_norm": 14.672829627990723, "kl": 4.9296875, "learning_rate": 2.1002410422895047e-07, "loss": 0.4363, "reward": 0.6919643133878708, "reward_std": 0.3010734096169472, "rewards/accuracy_reward": 0.08035714575089514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.611607164144516, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 1238.10498046875, "epoch": 0.452841460682548, "grad_norm": 12.46191120147705, "kl": 4.9375, "learning_rate": 2.098913331873126e-07, "loss": 0.4319, "reward": 0.7723214775323868, "reward_std": 0.3222237601876259, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6071428805589676, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 1289.4621276855469, "epoch": 0.45314016877006946, "grad_norm": 14.77263069152832, "kl": 4.53515625, "learning_rate": 2.0975851330830403e-07, "loss": 0.3805, "reward": 0.7360491454601288, "reward_std": 0.2809465266764164, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205633878708, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 1384.8326416015625, "epoch": 0.45343887685759093, "grad_norm": 7.389474391937256, "kl": 4.8828125, "learning_rate": 2.0962564473641977e-07, "loss": 0.3562, "reward": 0.7483259290456772, "reward_std": 0.28044645115733147, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437947034836, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 1386.6920166015625, "epoch": 0.4537375849451124, "grad_norm": 15.44855785369873, "kl": 4.30859375, "learning_rate": 2.0949272761620784e-07, "loss": 0.2906, "reward": 0.7572544813156128, "reward_std": 0.2962959408760071, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6456473469734192, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 1326.2210083007812, "epoch": 0.4540362930326339, "grad_norm": 6.657374858856201, "kl": 3.4765625, "learning_rate": 2.0935976209226907e-07, "loss": 0.3093, "reward": 0.702566996216774, "reward_std": 0.2944677099585533, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6199777126312256, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 1306.4107971191406, "epoch": 0.45433500112015535, "grad_norm": 12.530574798583984, "kl": 3.35546875, "learning_rate": 2.092267483092568e-07, "loss": 0.3232, "reward": 0.701450914144516, "reward_std": 0.30656586587429047, "rewards/accuracy_reward": 0.11383929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116305589676, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 1367.7567443847656, "epoch": 0.4546337092076768, "grad_norm": 6.076208591461182, "kl": 3.13671875, "learning_rate": 2.0909368641187706e-07, "loss": 0.2395, "reward": 0.7031250298023224, "reward_std": 0.28041596710681915, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750298023224, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 1356.9822082519531, "epoch": 0.4549324172951983, "grad_norm": 7.142483234405518, "kl": 3.59765625, "learning_rate": 2.0896057654488818e-07, "loss": 0.2604, "reward": 0.6389509290456772, "reward_std": 0.28811776265501976, "rewards/accuracy_reward": 0.046875001629814506, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.592075914144516, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 1365.0647583007812, "epoch": 0.45523112538271976, "grad_norm": 7.258602142333984, "kl": 3.48046875, "learning_rate": 2.0882741885310062e-07, "loss": 0.2702, "reward": 0.7109375298023224, "reward_std": 0.29094772785902023, "rewards/accuracy_reward": 0.0870535729918629, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6238839626312256, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 1320.7254943847656, "epoch": 0.45552983347024123, "grad_norm": 3.703352689743042, "kl": 3.52734375, "learning_rate": 2.0869421348137685e-07, "loss": 0.3123, "reward": 0.7527902126312256, "reward_std": 0.32056353241205215, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6121651977300644, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 1356.90185546875, "epoch": 0.4558285415577627, "grad_norm": 3.993206739425659, "kl": 4.4609375, "learning_rate": 2.0856096057463133e-07, "loss": 0.3319, "reward": 0.766183078289032, "reward_std": 0.3542278781533241, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6143973469734192, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 1389.4419860839844, "epoch": 0.45612724964528417, "grad_norm": 5.412484169006348, "kl": 4.796875, "learning_rate": 2.0842766027783017e-07, "loss": 0.3489, "reward": 0.8337053954601288, "reward_std": 0.28610809892416, "rewards/accuracy_reward": 0.2254464451689273, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6082589477300644, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 1230.2902221679688, "epoch": 0.45642595773280564, "grad_norm": 10.951200485229492, "kl": 4.546875, "learning_rate": 2.0829431273599097e-07, "loss": 0.3724, "reward": 0.7159598618745804, "reward_std": 0.3168816938996315, "rewards/accuracy_reward": 0.1205357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 1370.1429443359375, "epoch": 0.45672466582032706, "grad_norm": 2.8691258430480957, "kl": 4.140625, "learning_rate": 2.081609180941828e-07, "loss": 0.2761, "reward": 0.7003348469734192, "reward_std": 0.3136804774403572, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.608816996216774, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 1301.69873046875, "epoch": 0.45702337390784853, "grad_norm": 3.6688928604125977, "kl": 3.5859375, "learning_rate": 2.0802747649752605e-07, "loss": 0.3203, "reward": 0.792410746216774, "reward_std": 0.35875802114605904, "rewards/accuracy_reward": 0.16294643748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294643133878708, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 1331.5335388183594, "epoch": 0.45732208199537, "grad_norm": 5.070215702056885, "kl": 4.26171875, "learning_rate": 2.0789398809119194e-07, "loss": 0.32, "reward": 0.650669664144516, "reward_std": 0.25989116355776787, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 1298.7969360351562, "epoch": 0.45762079008289147, "grad_norm": 4.309478282928467, "kl": 4.5, "learning_rate": 2.077604530204028e-07, "loss": 0.3984, "reward": 0.8147321790456772, "reward_std": 0.34832043945789337, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026785969734192, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 1337.3058776855469, "epoch": 0.45791949817041294, "grad_norm": 3.557746410369873, "kl": 3.5703125, "learning_rate": 2.0762687143043185e-07, "loss": 0.2428, "reward": 0.7137277126312256, "reward_std": 0.2566774860024452, "rewards/accuracy_reward": 0.09151785913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.622209832072258, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 1313.3125610351562, "epoch": 0.4582182062579344, "grad_norm": 3.2612788677215576, "kl": 4.13671875, "learning_rate": 2.0749324346660263e-07, "loss": 0.3335, "reward": 0.729910746216774, "reward_std": 0.3572607859969139, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 1432.91748046875, "epoch": 0.4585169143454559, "grad_norm": 6.090360164642334, "kl": 4.078125, "learning_rate": 2.0735956927428928e-07, "loss": 0.2793, "reward": 0.733816996216774, "reward_std": 0.2814171835780144, "rewards/accuracy_reward": 0.15178571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312798023224, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 1259.4129638671875, "epoch": 0.45881562243297735, "grad_norm": 7.18500280380249, "kl": 2.703125, "learning_rate": 2.0722584899891626e-07, "loss": 0.1951, "reward": 0.7672991454601288, "reward_std": 0.2617420181632042, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6289062798023224, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 1401.4911193847656, "epoch": 0.4591143305204988, "grad_norm": 5.592360019683838, "kl": 3.7109375, "learning_rate": 2.0709208278595813e-07, "loss": 0.2881, "reward": 0.758370578289032, "reward_std": 0.34868670254945755, "rewards/accuracy_reward": 0.16741072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 1431.6116943359375, "epoch": 0.4594130386080203, "grad_norm": 6.922409534454346, "kl": 3.3515625, "learning_rate": 2.0695827078093938e-07, "loss": 0.248, "reward": 0.787388414144516, "reward_std": 0.2927350103855133, "rewards/accuracy_reward": 0.17410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6132812649011612, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 1360.2076416015625, "epoch": 0.45971174669554177, "grad_norm": 5.051283836364746, "kl": 4.1796875, "learning_rate": 2.0682441312943435e-07, "loss": 0.3405, "reward": 0.8136160969734192, "reward_std": 0.3322000503540039, "rewards/accuracy_reward": 0.21428571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303954601288, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 1351.5759582519531, "epoch": 0.46001045478306324, "grad_norm": 5.562852382659912, "kl": 4.1015625, "learning_rate": 2.0669050997706712e-07, "loss": 0.2978, "reward": 0.6875000298023224, "reward_std": 0.30264315754175186, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183035969734192, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 1347.9866638183594, "epoch": 0.4603091628705847, "grad_norm": 206.3802947998047, "kl": 6.69921875, "learning_rate": 2.065565614695111e-07, "loss": 0.4551, "reward": 0.680245578289032, "reward_std": 0.2839187681674957, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991156578064, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 1542.7478332519531, "epoch": 0.4606078709581062, "grad_norm": 11.724173545837402, "kl": 4.88671875, "learning_rate": 2.0642256775248918e-07, "loss": 0.32, "reward": 0.7243303954601288, "reward_std": 0.29550880193710327, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5792411044239998, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 1430.4777526855469, "epoch": 0.46090657904562765, "grad_norm": 8.777519226074219, "kl": 4.046875, "learning_rate": 2.0628852897177338e-07, "loss": 0.2318, "reward": 0.6205357313156128, "reward_std": 0.2714385576546192, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 1369.52685546875, "epoch": 0.4612052871331491, "grad_norm": 4.630526542663574, "kl": 3.86328125, "learning_rate": 2.061544452731848e-07, "loss": 0.2805, "reward": 0.7433036118745804, "reward_std": 0.3114032596349716, "rewards/accuracy_reward": 0.1205357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6227678805589676, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 1258.6920471191406, "epoch": 0.4615039952206706, "grad_norm": 4.465635776519775, "kl": 4.10546875, "learning_rate": 2.0602031680259333e-07, "loss": 0.3444, "reward": 0.7639509290456772, "reward_std": 0.33353014290332794, "rewards/accuracy_reward": 0.15848215343430638, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687649011612, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 1329.8750915527344, "epoch": 0.46180270330819206, "grad_norm": 3.941086530685425, "kl": 3.44921875, "learning_rate": 2.058861437059176e-07, "loss": 0.2819, "reward": 0.6930803954601288, "reward_std": 0.2698259651660919, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6149553805589676, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 1471.7745971679688, "epoch": 0.46210141139571353, "grad_norm": 9.58068561553955, "kl": 2.9453125, "learning_rate": 2.0575192612912477e-07, "loss": 0.1919, "reward": 0.6534598618745804, "reward_std": 0.26042795926332474, "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919813156128, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 1442.8638916015625, "epoch": 0.462400119483235, "grad_norm": 7.732929229736328, "kl": 3.56640625, "learning_rate": 2.056176642182305e-07, "loss": 0.2405, "reward": 0.7382812798023224, "reward_std": 0.2979823686182499, "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.590959832072258, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 1257.669677734375, "epoch": 0.4626988275707565, "grad_norm": 11.746439933776855, "kl": 3.13671875, "learning_rate": 2.0548335811929844e-07, "loss": 0.2972, "reward": 0.6640625149011612, "reward_std": 0.2771879881620407, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6127232313156128, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 1354.3861999511719, "epoch": 0.46299753565827795, "grad_norm": 6.83600378036499, "kl": 3.1875, "learning_rate": 2.053490079784406e-07, "loss": 0.2935, "reward": 0.731026828289032, "reward_std": 0.3165281191468239, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603794664144516, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 1407.9308471679688, "epoch": 0.4632962437457994, "grad_norm": 3.1341280937194824, "kl": 3.75390625, "learning_rate": 2.0521461394181665e-07, "loss": 0.2487, "reward": 0.7600446790456772, "reward_std": 0.2839038819074631, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6082589626312256, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 1318.6853332519531, "epoch": 0.4635949518333209, "grad_norm": 3.8533098697662354, "kl": 4.36328125, "learning_rate": 2.0508017615563424e-07, "loss": 0.3614, "reward": 0.7433035969734192, "reward_std": 0.3292200081050396, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464477300644, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 1339.7076416015625, "epoch": 0.46389365992084236, "grad_norm": 2.6789896488189697, "kl": 4.0546875, "learning_rate": 2.0494569476614838e-07, "loss": 0.3313, "reward": 0.6824777126312256, "reward_std": 0.28370238840579987, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 1449.2388916015625, "epoch": 0.46419236800836383, "grad_norm": 6.035990238189697, "kl": 5.1640625, "learning_rate": 2.0481116991966179e-07, "loss": 0.3478, "reward": 0.650669664144516, "reward_std": 0.2517826110124588, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.594866082072258, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 1310.8215026855469, "epoch": 0.4644910760958853, "grad_norm": 10.14941692352295, "kl": 4.98828125, "learning_rate": 2.0467660176252421e-07, "loss": 0.3712, "reward": 0.694754496216774, "reward_std": 0.28745853528380394, "rewards/accuracy_reward": 0.13169643585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5630580559372902, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 1306.5804138183594, "epoch": 0.4647897841834068, "grad_norm": 10.084206581115723, "kl": 4.17578125, "learning_rate": 2.0454199044113266e-07, "loss": 0.2885, "reward": 0.703683078289032, "reward_std": 0.29172926396131516, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 1363.5893249511719, "epoch": 0.46508849227092824, "grad_norm": 16.821989059448242, "kl": 4.66015625, "learning_rate": 2.04407336101931e-07, "loss": 0.3045, "reward": 0.6768973618745804, "reward_std": 0.27323097735643387, "rewards/accuracy_reward": 0.09151786239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585379496216774, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 1438.6340026855469, "epoch": 0.4653872003584497, "grad_norm": 4.545848846435547, "kl": 3.57421875, "learning_rate": 2.0427263889140999e-07, "loss": 0.2774, "reward": 0.7304687798023224, "reward_std": 0.2766968756914139, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6077009290456772, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 1377.96435546875, "epoch": 0.4656859084459712, "grad_norm": 3.8287994861602783, "kl": 3.91796875, "learning_rate": 2.0413789895610702e-07, "loss": 0.3178, "reward": 0.6512276977300644, "reward_std": 0.30488792434334755, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312798023224, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 1493.4308776855469, "epoch": 0.46598461653349266, "grad_norm": 7.401872158050537, "kl": 4.09375, "learning_rate": 2.0400311644260595e-07, "loss": 0.2968, "reward": 0.6406250298023224, "reward_std": 0.24092628806829453, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.573660746216774, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 1383.3572387695312, "epoch": 0.46628332462101413, "grad_norm": 4.847527980804443, "kl": 3.58203125, "learning_rate": 2.0386829149753684e-07, "loss": 0.2533, "reward": 0.6992187798023224, "reward_std": 0.31821486726403236, "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5831473469734192, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 1373.54248046875, "epoch": 0.4665820327085356, "grad_norm": 5.568328380584717, "kl": 4.09765625, "learning_rate": 2.0373342426757614e-07, "loss": 0.3167, "reward": 0.6540178805589676, "reward_std": 0.3039088100194931, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 1325.7947082519531, "epoch": 0.46688074079605707, "grad_norm": 6.191868782043457, "kl": 4.984375, "learning_rate": 2.0359851489944608e-07, "loss": 0.4121, "reward": 0.6824776977300644, "reward_std": 0.3005826584994793, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.577566996216774, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 1396.1563110351562, "epoch": 0.46717944888357854, "grad_norm": 7.215513229370117, "kl": 3.8125, "learning_rate": 2.0346356353991487e-07, "loss": 0.3309, "reward": 0.620535746216774, "reward_std": 0.2726714685559273, "rewards/accuracy_reward": 0.04464285960420966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 1395.88623046875, "epoch": 0.4674781569711, "grad_norm": 4.31882905960083, "kl": 4.2890625, "learning_rate": 2.033285703357964e-07, "loss": 0.3569, "reward": 0.7215402126312256, "reward_std": 0.295431450009346, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 1389.8371276855469, "epoch": 0.4677768650586215, "grad_norm": 7.806412696838379, "kl": 3.9609375, "learning_rate": 2.0319353543394998e-07, "loss": 0.2611, "reward": 0.6662946790456772, "reward_std": 0.2709098570048809, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375149011612, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 1169.6674499511719, "epoch": 0.46807557314614295, "grad_norm": 5.595086574554443, "kl": 4.2734375, "learning_rate": 2.030584589812804e-07, "loss": 0.4313, "reward": 0.6880580484867096, "reward_std": 0.3056786209344864, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116305589676, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 1263.3504943847656, "epoch": 0.4683742812336644, "grad_norm": 7.08564567565918, "kl": 3.171875, "learning_rate": 2.0292334112473753e-07, "loss": 0.2985, "reward": 0.8381696790456772, "reward_std": 0.30178751796483994, "rewards/accuracy_reward": 0.23437500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603794664144516, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 1308.9554138183594, "epoch": 0.4686729893211859, "grad_norm": 3.4572083950042725, "kl": 3.77734375, "learning_rate": 2.0278818201131644e-07, "loss": 0.2462, "reward": 0.6774553954601288, "reward_std": 0.27913698554039, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6082589626312256, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 1380.22998046875, "epoch": 0.46897169740870737, "grad_norm": 5.091702461242676, "kl": 4.046875, "learning_rate": 2.0265298178805695e-07, "loss": 0.288, "reward": 0.7187500149011612, "reward_std": 0.2832820415496826, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6138393133878708, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 1318.0357666015625, "epoch": 0.46927040549622884, "grad_norm": 6.641470909118652, "kl": 4.609375, "learning_rate": 2.0251774060204367e-07, "loss": 0.3381, "reward": 0.6975446790456772, "reward_std": 0.2789897285401821, "rewards/accuracy_reward": 0.13839286798611283, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.559151828289032, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 1366.4822082519531, "epoch": 0.46956911358375025, "grad_norm": 4.358578205108643, "kl": 3.96875, "learning_rate": 2.0238245860040574e-07, "loss": 0.2916, "reward": 0.664620578289032, "reward_std": 0.24115866795182228, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205633878708, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 1327.0558776855469, "epoch": 0.4698678216712717, "grad_norm": 1.8272216320037842, "kl": 3.24609375, "learning_rate": 2.0224713593031676e-07, "loss": 0.2219, "reward": 0.693638414144516, "reward_std": 0.26886867731809616, "rewards/accuracy_reward": 0.11830357369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348469734192, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 1302.3817443847656, "epoch": 0.4701665297587932, "grad_norm": 3.0165092945098877, "kl": 3.67578125, "learning_rate": 2.021117727389945e-07, "loss": 0.3, "reward": 0.7421875298023224, "reward_std": 0.2772320620715618, "rewards/accuracy_reward": 0.1651785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089626312256, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 1338.4844055175781, "epoch": 0.47046523784631467, "grad_norm": 4.224653720855713, "kl": 3.484375, "learning_rate": 2.019763691737008e-07, "loss": 0.2787, "reward": 0.6886161118745804, "reward_std": 0.25775570794939995, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303805589676, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 1305.6965026855469, "epoch": 0.47076394593383614, "grad_norm": 5.991189956665039, "kl": 3.76953125, "learning_rate": 2.0184092538174156e-07, "loss": 0.3185, "reward": 0.676339328289032, "reward_std": 0.27102702111005783, "rewards/accuracy_reward": 0.08928572130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.587053582072258, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 1303.0335388183594, "epoch": 0.4710626540213576, "grad_norm": 8.91067886352539, "kl": 4.078125, "learning_rate": 2.017054415104663e-07, "loss": 0.3255, "reward": 0.6969866454601288, "reward_std": 0.2913122773170471, "rewards/accuracy_reward": 0.09375000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366454601288, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 1283.6830749511719, "epoch": 0.4713613621088791, "grad_norm": 14.39980411529541, "kl": 4.75390625, "learning_rate": 2.0156991770726813e-07, "loss": 0.3368, "reward": 0.6774553805589676, "reward_std": 0.2707214020192623, "rewards/accuracy_reward": 0.09151786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375149011612, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 1361.5558471679688, "epoch": 0.47166007019640055, "grad_norm": 5.865981101989746, "kl": 3.96484375, "learning_rate": 2.0143435411958378e-07, "loss": 0.2925, "reward": 0.6406250298023224, "reward_std": 0.21658537536859512, "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 1239.7277221679688, "epoch": 0.471958778283922, "grad_norm": 2.691100835800171, "kl": 3.9375, "learning_rate": 2.0129875089489312e-07, "loss": 0.3139, "reward": 0.704241082072258, "reward_std": 0.3036893457174301, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732313156128, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 1265.0915832519531, "epoch": 0.4722574863714435, "grad_norm": 3.5948033332824707, "kl": 4.0, "learning_rate": 2.0116310818071912e-07, "loss": 0.3457, "reward": 0.6947544813156128, "reward_std": 0.27341486886143684, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965402126312256, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 1287.5558776855469, "epoch": 0.47255619445896496, "grad_norm": 3.4976773262023926, "kl": 3.52734375, "learning_rate": 2.0102742612462772e-07, "loss": 0.3038, "reward": 0.7142857313156128, "reward_std": 0.24362030625343323, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 1314.3594055175781, "epoch": 0.47285490254648643, "grad_norm": 2.196894884109497, "kl": 4.05078125, "learning_rate": 2.0089170487422785e-07, "loss": 0.3559, "reward": 0.7282366305589676, "reward_std": 0.3162112310528755, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187798023224, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 1305.9442443847656, "epoch": 0.4731536106340079, "grad_norm": 2.869126796722412, "kl": 3.75390625, "learning_rate": 2.0075594457717085e-07, "loss": 0.259, "reward": 0.7393973469734192, "reward_std": 0.33464112877845764, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116156578064, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 1320.3371276855469, "epoch": 0.4734523187215294, "grad_norm": 6.299445629119873, "kl": 3.6640625, "learning_rate": 2.0062014538115056e-07, "loss": 0.2607, "reward": 0.7243303954601288, "reward_std": 0.263735756278038, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 1215.7344055175781, "epoch": 0.47375102680905085, "grad_norm": 11.297211647033691, "kl": 3.28125, "learning_rate": 2.0048430743390328e-07, "loss": 0.3125, "reward": 0.757254496216774, "reward_std": 0.34123445302248, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.576450914144516, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 1165.4219055175781, "epoch": 0.4740497348965723, "grad_norm": 6.674849510192871, "kl": 2.828125, "learning_rate": 2.003484308832073e-07, "loss": 0.1632, "reward": 0.7399553954601288, "reward_std": 0.26977359876036644, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6149553805589676, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 1282.4219055175781, "epoch": 0.4743484429840938, "grad_norm": 11.699036598205566, "kl": 2.97265625, "learning_rate": 2.002125158768831e-07, "loss": 0.2609, "reward": 0.7433036118745804, "reward_std": 0.31349047273397446, "rewards/accuracy_reward": 0.15401786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5892857313156128, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 1301.2879943847656, "epoch": 0.47464715107161526, "grad_norm": 4.8200554847717285, "kl": 3.8203125, "learning_rate": 2.000765625627928e-07, "loss": 0.3169, "reward": 0.6891741305589676, "reward_std": 0.2520902678370476, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.561941996216774, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 1191.4754943847656, "epoch": 0.47494585915913673, "grad_norm": 5.148187637329102, "kl": 3.037109375, "learning_rate": 1.999405710888403e-07, "loss": 0.1819, "reward": 0.6981027126312256, "reward_std": 0.2788524739444256, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6043526977300644, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 1313.5871276855469, "epoch": 0.4752445672466582, "grad_norm": 7.8769402503967285, "kl": 2.99609375, "learning_rate": 1.99804541602971e-07, "loss": 0.2805, "reward": 0.7628348469734192, "reward_std": 0.30619092285633087, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6110491305589676, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 1294.9264221191406, "epoch": 0.47554327533417967, "grad_norm": 4.239384651184082, "kl": 3.4453125, "learning_rate": 1.9966847425317165e-07, "loss": 0.3083, "reward": 0.7717634439468384, "reward_std": 0.28684114292263985, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 1222.7478637695312, "epoch": 0.47584198342170114, "grad_norm": 3.8260843753814697, "kl": 4.0390625, "learning_rate": 1.9953236918747018e-07, "loss": 0.3303, "reward": 0.7248884290456772, "reward_std": 0.33784889429807663, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 1424.0402526855469, "epoch": 0.4761406915092226, "grad_norm": 10.830255508422852, "kl": 4.01171875, "learning_rate": 1.9939622655393556e-07, "loss": 0.2019, "reward": 0.6724330633878708, "reward_std": 0.22467462718486786, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187947034836, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 1276.2545166015625, "epoch": 0.4764393995967441, "grad_norm": 6.514368057250977, "kl": 3.77734375, "learning_rate": 1.9926004650067766e-07, "loss": 0.2422, "reward": 0.6835937798023224, "reward_std": 0.2981753870844841, "rewards/accuracy_reward": 0.09375000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 1332.3259582519531, "epoch": 0.47673810768426556, "grad_norm": 5.622742652893066, "kl": 4.0234375, "learning_rate": 1.9912382917584702e-07, "loss": 0.2605, "reward": 0.706473246216774, "reward_std": 0.2743474766612053, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303805589676, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 1338.8661499023438, "epoch": 0.477036815771787, "grad_norm": 3.7814133167266846, "kl": 3.5234375, "learning_rate": 1.9898757472763473e-07, "loss": 0.2303, "reward": 0.7332589626312256, "reward_std": 0.2876005358994007, "rewards/accuracy_reward": 0.12053571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6127232611179352, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 1263.9353332519531, "epoch": 0.4773355238593085, "grad_norm": 4.02673864364624, "kl": 3.76171875, "learning_rate": 1.9885128330427221e-07, "loss": 0.347, "reward": 0.6964286118745804, "reward_std": 0.31361495703458786, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 1415.0536193847656, "epoch": 0.47763423194682997, "grad_norm": 3.48429012298584, "kl": 3.64453125, "learning_rate": 1.9871495505403132e-07, "loss": 0.1773, "reward": 0.6512277126312256, "reward_std": 0.28336452692747116, "rewards/accuracy_reward": 0.05580357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241454601288, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 1340.5804443359375, "epoch": 0.47793294003435144, "grad_norm": 4.386862754821777, "kl": 3.21875, "learning_rate": 1.985785901252237e-07, "loss": 0.2504, "reward": 0.685825914144516, "reward_std": 0.2823125869035721, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5786830484867096, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 1300.8861999511719, "epoch": 0.4782316481218729, "grad_norm": 9.642070770263672, "kl": 3.18359375, "learning_rate": 1.9844218866620112e-07, "loss": 0.2947, "reward": 0.7020089626312256, "reward_std": 0.25788886845111847, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.594866082072258, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 1265.87060546875, "epoch": 0.4785303562093944, "grad_norm": 18.53834342956543, "kl": 2.876953125, "learning_rate": 1.9830575082535496e-07, "loss": 0.3437, "reward": 0.6562500298023224, "reward_std": 0.23521753400564194, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5803571790456772, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 1344.4554138183594, "epoch": 0.47882906429691585, "grad_norm": 3.382558822631836, "kl": 3.9609375, "learning_rate": 1.9816927675111616e-07, "loss": 0.2821, "reward": 0.6149553805589676, "reward_std": 0.2741096690297127, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5479910969734192, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 1367.2389221191406, "epoch": 0.4791277723844373, "grad_norm": 14.220845222473145, "kl": 3.765625, "learning_rate": 1.9803276659195526e-07, "loss": 0.2418, "reward": 0.5948660969734192, "reward_std": 0.2604578621685505, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.556919664144516, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 1390.9710388183594, "epoch": 0.4794264804719588, "grad_norm": 8.53323745727539, "kl": 3.0, "learning_rate": 1.9789622049638184e-07, "loss": 0.2152, "reward": 0.6975446790456772, "reward_std": 0.26376328617334366, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5948660969734192, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 1422.9465026855469, "epoch": 0.47972518855948026, "grad_norm": 4.706604957580566, "kl": 4.0859375, "learning_rate": 1.977596386129447e-07, "loss": 0.2881, "reward": 0.7237723469734192, "reward_std": 0.28905392810702324, "rewards/accuracy_reward": 0.16741072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5563616305589676, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 1320.7232666015625, "epoch": 0.48002389664700174, "grad_norm": 11.463300704956055, "kl": 3.16796875, "learning_rate": 1.9762302109023153e-07, "loss": 0.256, "reward": 0.7438616454601288, "reward_std": 0.304403156042099, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965401977300644, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 1425.7098999023438, "epoch": 0.4803226047345232, "grad_norm": 3.5947554111480713, "kl": 3.8125, "learning_rate": 1.974863680768688e-07, "loss": 0.2456, "reward": 0.667410746216774, "reward_std": 0.2553550563752651, "rewards/accuracy_reward": 0.06919643119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 1285.0201416015625, "epoch": 0.4806213128220447, "grad_norm": 8.42138957977295, "kl": 4.3984375, "learning_rate": 1.9734967972152167e-07, "loss": 0.3886, "reward": 0.6863839775323868, "reward_std": 0.29749029874801636, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.588169664144516, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 1295.3192749023438, "epoch": 0.48092002090956615, "grad_norm": 10.56811237335205, "kl": 4.671875, "learning_rate": 1.9721295617289354e-07, "loss": 0.3318, "reward": 0.6400669813156128, "reward_std": 0.28265146538615227, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241454601288, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 1301.8370971679688, "epoch": 0.4812187289970876, "grad_norm": 14.143131256103516, "kl": 4.5234375, "learning_rate": 1.9707619757972633e-07, "loss": 0.2903, "reward": 0.744419664144516, "reward_std": 0.3248135596513748, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.581473246216774, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 1372.6674499511719, "epoch": 0.4815174370846091, "grad_norm": 4.436441421508789, "kl": 3.73828125, "learning_rate": 1.9693940409079997e-07, "loss": 0.2739, "reward": 0.7460937798023224, "reward_std": 0.2916469983756542, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.623325914144516, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 1366.0670471191406, "epoch": 0.48181614517213056, "grad_norm": 7.0892791748046875, "kl": 4.3203125, "learning_rate": 1.9680257585493236e-07, "loss": 0.305, "reward": 0.679129496216774, "reward_std": 0.3327193595468998, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830484867096, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 1347.7991943359375, "epoch": 0.48211485325965203, "grad_norm": 2.976555824279785, "kl": 3.265625, "learning_rate": 1.9666571302097917e-07, "loss": 0.199, "reward": 0.7717634290456772, "reward_std": 0.31505087018013, "rewards/accuracy_reward": 0.16517857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6065848469734192, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 1306.7813110351562, "epoch": 0.48241356134717345, "grad_norm": 5.865286827087402, "kl": 3.36328125, "learning_rate": 1.965288157378338e-07, "loss": 0.287, "reward": 0.6925223469734192, "reward_std": 0.31032050400972366, "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 1303.8192749023438, "epoch": 0.4827122694346949, "grad_norm": 6.01124382019043, "kl": 2.859375, "learning_rate": 1.9639188415442713e-07, "loss": 0.217, "reward": 0.6802455633878708, "reward_std": 0.3030710816383362, "rewards/accuracy_reward": 0.08482143003493547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241454601288, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 1244.6719055175781, "epoch": 0.4830109775222164, "grad_norm": 5.97653865814209, "kl": 3.05859375, "learning_rate": 1.962549184197271e-07, "loss": 0.2272, "reward": 0.7483259439468384, "reward_std": 0.2620849907398224, "rewards/accuracy_reward": 0.16964286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 1371.8326416015625, "epoch": 0.48330968560973786, "grad_norm": 8.681578636169434, "kl": 3.33203125, "learning_rate": 1.9611791868273926e-07, "loss": 0.3017, "reward": 0.7265625298023224, "reward_std": 0.2915649712085724, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5792410969734192, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 1434.0424499511719, "epoch": 0.48360839369725933, "grad_norm": 2.9661567211151123, "kl": 4.07421875, "learning_rate": 1.9598088509250573e-07, "loss": 0.2789, "reward": 0.7421875298023224, "reward_std": 0.30048616975545883, "rewards/accuracy_reward": 0.19196429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5502232313156128, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 1387.1853637695312, "epoch": 0.4839071017847808, "grad_norm": 8.77545166015625, "kl": 4.828125, "learning_rate": 1.958438177981057e-07, "loss": 0.3729, "reward": 0.6445312798023224, "reward_std": 0.3116954304277897, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5418527126312256, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 1398.8236999511719, "epoch": 0.4842058098723023, "grad_norm": 3.1613850593566895, "kl": 3.82421875, "learning_rate": 1.957067169486549e-07, "loss": 0.2822, "reward": 0.5998884290456772, "reward_std": 0.2882663160562515, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546316996216774, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 1440.9487609863281, "epoch": 0.48450451795982374, "grad_norm": 5.933197975158691, "kl": 4.0390625, "learning_rate": 1.9556958269330564e-07, "loss": 0.2526, "reward": 0.664620578289032, "reward_std": 0.2739277631044388, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.561941996216774, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 1269.372802734375, "epoch": 0.4848032260473452, "grad_norm": 3.2034926414489746, "kl": 4.328125, "learning_rate": 1.9543241518124657e-07, "loss": 0.3916, "reward": 0.7405134290456772, "reward_std": 0.2988819479942322, "rewards/accuracy_reward": 0.19642858393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5440848469734192, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 1274.529052734375, "epoch": 0.4851019341348667, "grad_norm": 3.065988063812256, "kl": 3.51953125, "learning_rate": 1.9529521456170245e-07, "loss": 0.2754, "reward": 0.6802455633878708, "reward_std": 0.2914046421647072, "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991305589676, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 1364.5067749023438, "epoch": 0.48540064222238816, "grad_norm": 2.06795597076416, "kl": 2.79296875, "learning_rate": 1.9515798098393416e-07, "loss": 0.1884, "reward": 0.6110491305589676, "reward_std": 0.2474736087024212, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919813156128, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 1291.5156860351562, "epoch": 0.4856993503099096, "grad_norm": 5.0915656089782715, "kl": 3.22265625, "learning_rate": 1.9502071459723836e-07, "loss": 0.2953, "reward": 0.7578125447034836, "reward_std": 0.31021758913993835, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339626312256, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 1364.49560546875, "epoch": 0.4859980583974311, "grad_norm": 4.461061477661133, "kl": 3.68359375, "learning_rate": 1.9488341555094743e-07, "loss": 0.2808, "reward": 0.6668527126312256, "reward_std": 0.323155902326107, "rewards/accuracy_reward": 0.06919643329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5976562798023224, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 1365.622802734375, "epoch": 0.48629676648495257, "grad_norm": 5.668583393096924, "kl": 3.9140625, "learning_rate": 1.947460839944292e-07, "loss": 0.3211, "reward": 0.6640625298023224, "reward_std": 0.2973475530743599, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125149011612, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 1233.4732666015625, "epoch": 0.48659547457247404, "grad_norm": 5.397857189178467, "kl": 3.59765625, "learning_rate": 1.9460872007708713e-07, "loss": 0.2747, "reward": 0.767857164144516, "reward_std": 0.3381780683994293, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 1314.1004943847656, "epoch": 0.4868941826599955, "grad_norm": 3.260645627975464, "kl": 3.2890625, "learning_rate": 1.944713239483595e-07, "loss": 0.2344, "reward": 0.6640625298023224, "reward_std": 0.26548292487859726, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375149011612, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 1265.8080749511719, "epoch": 0.487192890747517, "grad_norm": 8.839542388916016, "kl": 3.43359375, "learning_rate": 1.9433389575771993e-07, "loss": 0.2691, "reward": 0.6534598618745804, "reward_std": 0.2714804671704769, "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 1368.6406555175781, "epoch": 0.48749159883503845, "grad_norm": 2.9078683853149414, "kl": 3.3515625, "learning_rate": 1.9419643565467684e-07, "loss": 0.2061, "reward": 0.7304687798023224, "reward_std": 0.2875344827771187, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965402126312256, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 1290.0022888183594, "epoch": 0.4877903069225599, "grad_norm": 3.814577341079712, "kl": 3.08203125, "learning_rate": 1.9405894378877326e-07, "loss": 0.1841, "reward": 0.6679687947034836, "reward_std": 0.27827034145593643, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687649011612, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 1270.4911193847656, "epoch": 0.4880890150100814, "grad_norm": 8.052262306213379, "kl": 3.1953125, "learning_rate": 1.9392142030958696e-07, "loss": 0.2554, "reward": 0.7477678954601288, "reward_std": 0.3319152817130089, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 1330.4219665527344, "epoch": 0.48838772309760287, "grad_norm": 5.5437164306640625, "kl": 2.982421875, "learning_rate": 1.9378386536672993e-07, "loss": 0.2082, "reward": 0.670200914144516, "reward_std": 0.2934141233563423, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 1208.8750305175781, "epoch": 0.48868643118512434, "grad_norm": 4.245074272155762, "kl": 3.28125, "learning_rate": 1.9364627910984855e-07, "loss": 0.2273, "reward": 0.7338169813156128, "reward_std": 0.33132336661219597, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5842634290456772, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 1327.6965026855469, "epoch": 0.4889851392726458, "grad_norm": 4.54076623916626, "kl": 3.38671875, "learning_rate": 1.935086616886231e-07, "loss": 0.2749, "reward": 0.6852678805589676, "reward_std": 0.2907552160322666, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 1306.8550109863281, "epoch": 0.4892838473601673, "grad_norm": 5.2605509757995605, "kl": 3.7890625, "learning_rate": 1.933710132527679e-07, "loss": 0.2853, "reward": 0.6651785969734192, "reward_std": 0.3161527290940285, "rewards/accuracy_reward": 0.06026786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6049107313156128, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 1336.7322387695312, "epoch": 0.48958255544768875, "grad_norm": 5.806676864624023, "kl": 3.42578125, "learning_rate": 1.9323333395203095e-07, "loss": 0.2343, "reward": 0.746651828289032, "reward_std": 0.2894095629453659, "rewards/accuracy_reward": 0.14062500838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6060268133878708, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 1321.4085388183594, "epoch": 0.4898812635352102, "grad_norm": 15.283336639404297, "kl": 4.8046875, "learning_rate": 1.930956239361938e-07, "loss": 0.3615, "reward": 0.643973246216774, "reward_std": 0.3129674047231674, "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 1295.7098388671875, "epoch": 0.4901799716227317, "grad_norm": 4.887364387512207, "kl": 3.60546875, "learning_rate": 1.9295788335507153e-07, "loss": 0.2519, "reward": 0.7187500298023224, "reward_std": 0.2866996005177498, "rewards/accuracy_reward": 0.12946429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 1327.2589721679688, "epoch": 0.49047867971025316, "grad_norm": 5.061673164367676, "kl": 4.16015625, "learning_rate": 1.9282011235851226e-07, "loss": 0.3625, "reward": 0.7293527126312256, "reward_std": 0.3156285770237446, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5998884290456772, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 1331.7523193359375, "epoch": 0.49077738779777463, "grad_norm": 3.97023606300354, "kl": 3.46484375, "learning_rate": 1.9268231109639745e-07, "loss": 0.2389, "reward": 0.7550223618745804, "reward_std": 0.28403887525200844, "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5920758992433548, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 1380.7902526855469, "epoch": 0.4910760958852961, "grad_norm": 3.144777297973633, "kl": 3.80859375, "learning_rate": 1.925444797186413e-07, "loss": 0.3284, "reward": 0.631138414144516, "reward_std": 0.30842068046331406, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5619419813156128, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 1363.7634582519531, "epoch": 0.4913748039728176, "grad_norm": 12.9495210647583, "kl": 2.95703125, "learning_rate": 1.924066183751909e-07, "loss": 0.253, "reward": 0.6623884290456772, "reward_std": 0.26319481432437897, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6199776977300644, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 1245.5848999023438, "epoch": 0.49167351206033905, "grad_norm": 3.7254228591918945, "kl": 3.5390625, "learning_rate": 1.9226872721602583e-07, "loss": 0.3219, "reward": 0.6958705633878708, "reward_std": 0.26219572871923447, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.584263414144516, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 1227.7634582519531, "epoch": 0.4919722201478605, "grad_norm": 6.18304443359375, "kl": 3.609375, "learning_rate": 1.9213080639115817e-07, "loss": 0.3144, "reward": 0.7539062798023224, "reward_std": 0.26042570173740387, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 1230.7188415527344, "epoch": 0.492270928235382, "grad_norm": 4.208085060119629, "kl": 3.25, "learning_rate": 1.9199285605063227e-07, "loss": 0.2416, "reward": 0.7672991305589676, "reward_std": 0.32996129989624023, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205633878708, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 1276.6786193847656, "epoch": 0.49256963632290346, "grad_norm": 3.4972572326660156, "kl": 4.19140625, "learning_rate": 1.9185487634452453e-07, "loss": 0.2722, "reward": 0.7109375298023224, "reward_std": 0.33162321150302887, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 1335.0201416015625, "epoch": 0.49286834441042493, "grad_norm": 13.204781532287598, "kl": 4.47265625, "learning_rate": 1.9171686742294345e-07, "loss": 0.271, "reward": 0.6294643133878708, "reward_std": 0.3035174459218979, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825892984867096, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 1285.6428833007812, "epoch": 0.4931670524979464, "grad_norm": 8.63597297668457, "kl": 4.19921875, "learning_rate": 1.915788294360291e-07, "loss": 0.308, "reward": 0.6774553954601288, "reward_std": 0.3137855678796768, "rewards/accuracy_reward": 0.11160715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.565848246216774, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 1358.2813110351562, "epoch": 0.49346576058546787, "grad_norm": 4.065438747406006, "kl": 4.4140625, "learning_rate": 1.9144076253395333e-07, "loss": 0.3643, "reward": 0.6919643133878708, "reward_std": 0.2954140976071358, "rewards/accuracy_reward": 0.13839286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714626312256, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 1337.82373046875, "epoch": 0.49376446867298934, "grad_norm": 5.9848222732543945, "kl": 3.703125, "learning_rate": 1.913026668669194e-07, "loss": 0.2757, "reward": 0.6768973469734192, "reward_std": 0.3604113236069679, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 1352.4308471679688, "epoch": 0.4940631767605108, "grad_norm": 3.3374857902526855, "kl": 3.2421875, "learning_rate": 1.9116454258516192e-07, "loss": 0.1491, "reward": 0.6601562798023224, "reward_std": 0.27676670625805855, "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348618745804, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 1257.8638916015625, "epoch": 0.4943618848480323, "grad_norm": 4.832503318786621, "kl": 4.00390625, "learning_rate": 1.9102638983894647e-07, "loss": 0.3317, "reward": 0.6395089626312256, "reward_std": 0.2716721184551716, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839477300644, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 1255.9688110351562, "epoch": 0.49466059293555376, "grad_norm": 7.443584442138672, "kl": 2.890625, "learning_rate": 1.9088820877856968e-07, "loss": 0.2353, "reward": 0.7165178954601288, "reward_std": 0.319161593914032, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6071428954601288, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 1300.8616638183594, "epoch": 0.4949593010230752, "grad_norm": 4.46483039855957, "kl": 3.51171875, "learning_rate": 1.9074999955435912e-07, "loss": 0.2482, "reward": 0.6690848618745804, "reward_std": 0.2626349553465843, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741454601288, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 1231.9063110351562, "epoch": 0.49525800911059664, "grad_norm": 5.725292205810547, "kl": 3.4453125, "learning_rate": 1.9061176231667277e-07, "loss": 0.3346, "reward": 0.7449777126312256, "reward_std": 0.3177124410867691, "rewards/accuracy_reward": 0.1718750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 1349.5000610351562, "epoch": 0.4955567171981181, "grad_norm": 4.2036824226379395, "kl": 3.6171875, "learning_rate": 1.9047349721589927e-07, "loss": 0.2141, "reward": 0.5770089477300644, "reward_std": 0.2834569625556469, "rewards/accuracy_reward": 0.03794642980210483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625149011612, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 1259.4398040771484, "epoch": 0.4958554252856396, "grad_norm": 6.104475975036621, "kl": 3.5546875, "learning_rate": 1.9033520440245748e-07, "loss": 0.2822, "reward": 0.714285746216774, "reward_std": 0.31260767206549644, "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870536118745804, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 1342.5648193359375, "epoch": 0.49615413337316105, "grad_norm": 6.7528300285339355, "kl": 4.125, "learning_rate": 1.901968840267964e-07, "loss": 0.2806, "reward": 0.6177455633878708, "reward_std": 0.2496688924729824, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.553013414144516, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 1445.0201416015625, "epoch": 0.4964528414606825, "grad_norm": 4.064300060272217, "kl": 4.1640625, "learning_rate": 1.9005853623939505e-07, "loss": 0.2461, "reward": 0.6847098469734192, "reward_std": 0.2771996296942234, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 1286.99560546875, "epoch": 0.496751549548204, "grad_norm": 4.640791893005371, "kl": 3.484375, "learning_rate": 1.899201611907623e-07, "loss": 0.2581, "reward": 0.6551339626312256, "reward_std": 0.3098566383123398, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 1338.7545166015625, "epoch": 0.49705025763572547, "grad_norm": 2.781301975250244, "kl": 3.8046875, "learning_rate": 1.897817590314366e-07, "loss": 0.2888, "reward": 0.5954241156578064, "reward_std": 0.2862516716122627, "rewards/accuracy_reward": 0.037946431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5574776977300644, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 1340.4286193847656, "epoch": 0.49734896572324694, "grad_norm": 4.8232855796813965, "kl": 3.62109375, "learning_rate": 1.8964332991198595e-07, "loss": 0.2647, "reward": 0.6674107611179352, "reward_std": 0.32756802439689636, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669642984867096, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 1369.3170471191406, "epoch": 0.4976476738107684, "grad_norm": 4.806682109832764, "kl": 3.58984375, "learning_rate": 1.8950487398300764e-07, "loss": 0.2234, "reward": 0.6863839626312256, "reward_std": 0.29525822401046753, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339626312256, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 1366.8638916015625, "epoch": 0.4979463818982899, "grad_norm": 7.60567045211792, "kl": 3.59765625, "learning_rate": 1.8936639139512816e-07, "loss": 0.2536, "reward": 0.670200914144516, "reward_std": 0.3220164626836777, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5518973618745804, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 1347.7344360351562, "epoch": 0.49824508998581135, "grad_norm": 3.6842169761657715, "kl": 3.78515625, "learning_rate": 1.8922788229900303e-07, "loss": 0.254, "reward": 0.6132812798023224, "reward_std": 0.25151074305176735, "rewards/accuracy_reward": 0.024553572991862893, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 1288.8705749511719, "epoch": 0.4985437980733328, "grad_norm": 5.45065975189209, "kl": 3.40234375, "learning_rate": 1.890893468453166e-07, "loss": 0.3053, "reward": 0.6724330633878708, "reward_std": 0.29521162807941437, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187649011612, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 1346.9241638183594, "epoch": 0.4988425061608543, "grad_norm": 5.944262981414795, "kl": 3.078125, "learning_rate": 1.8895078518478177e-07, "loss": 0.1928, "reward": 0.6852678805589676, "reward_std": 0.25897427648305893, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250149011612, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 1361.4040832519531, "epoch": 0.49914121424837576, "grad_norm": 5.730788707733154, "kl": 4.53125, "learning_rate": 1.8881219746814003e-07, "loss": 0.3045, "reward": 0.6450893133878708, "reward_std": 0.3190178722143173, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000149011612, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 1330.8906860351562, "epoch": 0.49943992233589724, "grad_norm": 6.90684700012207, "kl": 4.28515625, "learning_rate": 1.8867358384616136e-07, "loss": 0.2848, "reward": 0.6210937798023224, "reward_std": 0.3201490566134453, "rewards/accuracy_reward": 0.0691964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5518973618745804, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 1387.7902221679688, "epoch": 0.4997386304234187, "grad_norm": 11.459232330322266, "kl": 4.22265625, "learning_rate": 1.885349444696437e-07, "loss": 0.2891, "reward": 0.5976562798023224, "reward_std": 0.27550605684518814, "rewards/accuracy_reward": 0.02678571525029838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705633878708, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 1350.6451416015625, "epoch": 0.5000373385109402, "grad_norm": 5.764761447906494, "kl": 3.77734375, "learning_rate": 1.8839627948941314e-07, "loss": 0.2488, "reward": 0.6835937798023224, "reward_std": 0.3133853152394295, "rewards/accuracy_reward": 0.11830357881262898, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 1389.7723999023438, "epoch": 0.5003360465984616, "grad_norm": 4.096139430999756, "kl": 4.07421875, "learning_rate": 1.8825758905632357e-07, "loss": 0.2518, "reward": 0.6690848469734192, "reward_std": 0.332134947180748, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955484867096, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 1368.2947387695312, "epoch": 0.5006347546859832, "grad_norm": 3.3785245418548584, "kl": 4.2109375, "learning_rate": 1.8811887332125663e-07, "loss": 0.3432, "reward": 0.7393973469734192, "reward_std": 0.31782878935337067, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.576450914144516, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 1288.8683471679688, "epoch": 0.5009334627735046, "grad_norm": 13.820820808410645, "kl": 4.01171875, "learning_rate": 1.8798013243512135e-07, "loss": 0.3419, "reward": 0.649553582072258, "reward_std": 0.3185456320643425, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513393133878708, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 1329.3036193847656, "epoch": 0.5012321708610261, "grad_norm": 6.6286516189575195, "kl": 3.1640625, "learning_rate": 1.8784136654885432e-07, "loss": 0.2078, "reward": 0.6729910969734192, "reward_std": 0.2690846398472786, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053656578064, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 1295.0938110351562, "epoch": 0.5015308789485475, "grad_norm": 6.860150337219238, "kl": 4.11328125, "learning_rate": 1.8770257581341915e-07, "loss": 0.3532, "reward": 0.6367187649011612, "reward_std": 0.27577510103583336, "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5719866454601288, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 1368.9866638183594, "epoch": 0.501829587036069, "grad_norm": 7.633252143859863, "kl": 3.78515625, "learning_rate": 1.8756376037980666e-07, "loss": 0.2311, "reward": 0.628348246216774, "reward_std": 0.314268134534359, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839328289032, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 1269.3638916015625, "epoch": 0.5021282951235905, "grad_norm": 4.807422637939453, "kl": 3.3125, "learning_rate": 1.8742492039903426e-07, "loss": 0.2251, "reward": 0.7366071790456772, "reward_std": 0.3088779076933861, "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 1405.5513916015625, "epoch": 0.5024270032111119, "grad_norm": 4.4754414558410645, "kl": 4.19140625, "learning_rate": 1.872860560221464e-07, "loss": 0.2717, "reward": 0.6512276977300644, "reward_std": 0.3001725897192955, "rewards/accuracy_reward": 0.08705357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741305589676, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 1268.7589721679688, "epoch": 0.5027257112986334, "grad_norm": 3.453320026397705, "kl": 3.63671875, "learning_rate": 1.8714716740021383e-07, "loss": 0.3186, "reward": 0.7109375298023224, "reward_std": 0.3276364877820015, "rewards/accuracy_reward": 0.12500000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375447034836, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 1386.7344055175781, "epoch": 0.5030244193861548, "grad_norm": 6.000179290771484, "kl": 4.3671875, "learning_rate": 1.870082546843338e-07, "loss": 0.2922, "reward": 0.7327009290456772, "reward_std": 0.2984233945608139, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187947034836, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 1336.9040832519531, "epoch": 0.5033231274736764, "grad_norm": 5.75900936126709, "kl": 4.01171875, "learning_rate": 1.8686931802562973e-07, "loss": 0.3477, "reward": 0.5976562798023224, "reward_std": 0.3132771775126457, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5440848469734192, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 1294.8995971679688, "epoch": 0.5036218355611978, "grad_norm": 5.241732120513916, "kl": 2.921875, "learning_rate": 1.8673035757525105e-07, "loss": 0.2231, "reward": 0.6891741454601288, "reward_std": 0.2943219915032387, "rewards/accuracy_reward": 0.10267857811413705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 1422.8326416015625, "epoch": 0.5039205436487193, "grad_norm": 7.607037544250488, "kl": 3.7265625, "learning_rate": 1.8659137348437313e-07, "loss": 0.2519, "reward": 0.628348246216774, "reward_std": 0.2972921319305897, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803805589676, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 1191.4263763427734, "epoch": 0.5042192517362407, "grad_norm": 8.610334396362305, "kl": 3.21875, "learning_rate": 1.8645236590419704e-07, "loss": 0.2755, "reward": 0.6824777126312256, "reward_std": 0.29513928666710854, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312649011612, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 1316.4353637695312, "epoch": 0.5045179598237622, "grad_norm": 5.006099224090576, "kl": 3.70703125, "learning_rate": 1.8631333498594942e-07, "loss": 0.3139, "reward": 0.6339285969734192, "reward_std": 0.27830062434077263, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513392984867096, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 1337.0804138183594, "epoch": 0.5048166679112837, "grad_norm": 4.500782012939453, "kl": 3.08984375, "learning_rate": 1.8617428088088228e-07, "loss": 0.2267, "reward": 0.7070312798023224, "reward_std": 0.26753486692905426, "rewards/accuracy_reward": 0.11607143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 1392.3638916015625, "epoch": 0.5051153759988052, "grad_norm": 8.240315437316895, "kl": 3.65234375, "learning_rate": 1.8603520374027276e-07, "loss": 0.2838, "reward": 0.6601562798023224, "reward_std": 0.2580473944544792, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348469734192, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 1407.2790832519531, "epoch": 0.5054140840863266, "grad_norm": 10.019281387329102, "kl": 4.1171875, "learning_rate": 1.8589610371542337e-07, "loss": 0.287, "reward": 0.648995578289032, "reward_std": 0.28489091247320175, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5664062649011612, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 1344.7701416015625, "epoch": 0.5057127921738481, "grad_norm": 3.88565993309021, "kl": 3.4453125, "learning_rate": 1.8575698095766108e-07, "loss": 0.233, "reward": 0.5965401977300644, "reward_std": 0.27811185643076897, "rewards/accuracy_reward": 0.040178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5563616305589676, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 1210.6808624267578, "epoch": 0.5060115002613695, "grad_norm": 7.5183281898498535, "kl": 4.015625, "learning_rate": 1.8561783561833794e-07, "loss": 0.3545, "reward": 0.7717634439468384, "reward_std": 0.29582830518484116, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 1316.4353332519531, "epoch": 0.5063102083488911, "grad_norm": 3.7517998218536377, "kl": 3.7109375, "learning_rate": 1.854786678488304e-07, "loss": 0.2724, "reward": 0.7109375298023224, "reward_std": 0.30919060856103897, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5904018133878708, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 1385.9398193359375, "epoch": 0.5066089164364125, "grad_norm": 9.990367889404297, "kl": 4.875, "learning_rate": 1.853394778005394e-07, "loss": 0.367, "reward": 0.631138414144516, "reward_std": 0.29289767146110535, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491305589676, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 1226.1786499023438, "epoch": 0.506907624523934, "grad_norm": 7.634828567504883, "kl": 3.93359375, "learning_rate": 1.8520026562489e-07, "loss": 0.3536, "reward": 0.7488839477300644, "reward_std": 0.34994780272245407, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339477300644, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 1225.1027221679688, "epoch": 0.5072063326114554, "grad_norm": 7.085205078125, "kl": 2.9140625, "learning_rate": 1.8506103147333143e-07, "loss": 0.1955, "reward": 0.6819196790456772, "reward_std": 0.29513339698314667, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.597098246216774, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 1269.5268249511719, "epoch": 0.507505040698977, "grad_norm": 7.684974193572998, "kl": 3.93359375, "learning_rate": 1.8492177549733687e-07, "loss": 0.2777, "reward": 0.664620578289032, "reward_std": 0.314199835062027, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5664062798023224, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 1354.3817749023438, "epoch": 0.5078037487864984, "grad_norm": 7.288438320159912, "kl": 4.11328125, "learning_rate": 1.8478249784840307e-07, "loss": 0.246, "reward": 0.6300223469734192, "reward_std": 0.2940448969602585, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 1408.4130249023438, "epoch": 0.5081024568740199, "grad_norm": 4.526577949523926, "kl": 4.25, "learning_rate": 1.846431986780505e-07, "loss": 0.3321, "reward": 0.7165178954601288, "reward_std": 0.2985009290277958, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 1253.9085388183594, "epoch": 0.5084011649615413, "grad_norm": 4.863806247711182, "kl": 3.84765625, "learning_rate": 1.8450387813782304e-07, "loss": 0.3561, "reward": 0.6568080633878708, "reward_std": 0.2998911216855049, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 1378.99560546875, "epoch": 0.5086998730490628, "grad_norm": 4.519865989685059, "kl": 3.71875, "learning_rate": 1.843645363792877e-07, "loss": 0.2905, "reward": 0.6941964626312256, "reward_std": 0.2575160600244999, "rewards/accuracy_reward": 0.12946428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 1309.2054138183594, "epoch": 0.5089985811365842, "grad_norm": 10.018928527832031, "kl": 4.28515625, "learning_rate": 1.8422517355403476e-07, "loss": 0.314, "reward": 0.6333705484867096, "reward_std": 0.2778964340686798, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5530134066939354, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 1324.0848999023438, "epoch": 0.5092972892241058, "grad_norm": 6.818155288696289, "kl": 3.08984375, "learning_rate": 1.840857898136772e-07, "loss": 0.1933, "reward": 0.7003348469734192, "reward_std": 0.28164853900671005, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205633878708, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 1366.3504943847656, "epoch": 0.5095959973116272, "grad_norm": 5.426451206207275, "kl": 4.0, "learning_rate": 1.8394638530985102e-07, "loss": 0.2837, "reward": 0.615513414144516, "reward_std": 0.30057111382484436, "rewards/accuracy_reward": 0.04464285960420966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705633878708, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 1216.8839721679688, "epoch": 0.5098947053991487, "grad_norm": 8.467202186584473, "kl": 4.0, "learning_rate": 1.838069601942145e-07, "loss": 0.2713, "reward": 0.685825914144516, "reward_std": 0.3083635941147804, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.592075914144516, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 1260.3192443847656, "epoch": 0.5101934134866701, "grad_norm": 7.291774272918701, "kl": 3.37109375, "learning_rate": 1.8366751461844862e-07, "loss": 0.2653, "reward": 0.6886161118745804, "reward_std": 0.29138703644275665, "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.597098246216774, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 1359.279052734375, "epoch": 0.5104921215741917, "grad_norm": 7.63773775100708, "kl": 3.5, "learning_rate": 1.835280487342564e-07, "loss": 0.2631, "reward": 0.7321428954601288, "reward_std": 0.3200082518160343, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643208384514, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 1358.2411499023438, "epoch": 0.5107908296617131, "grad_norm": 9.471477508544922, "kl": 3.8203125, "learning_rate": 1.833885626933631e-07, "loss": 0.3434, "reward": 0.7198661118745804, "reward_std": 0.30859699100255966, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053954601288, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 1272.8438110351562, "epoch": 0.5110895377492346, "grad_norm": 5.437255382537842, "kl": 3.90234375, "learning_rate": 1.832490566475159e-07, "loss": 0.3642, "reward": 0.723214328289032, "reward_std": 0.349441297352314, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714286118745804, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 1302.9397888183594, "epoch": 0.511388245836756, "grad_norm": 4.26542854309082, "kl": 3.66796875, "learning_rate": 1.8310953074848362e-07, "loss": 0.2679, "reward": 0.7187500298023224, "reward_std": 0.2828754708170891, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5892857313156128, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 1281.8683776855469, "epoch": 0.5116869539242775, "grad_norm": 4.3277411460876465, "kl": 4.0625, "learning_rate": 1.8296998514805686e-07, "loss": 0.2963, "reward": 0.6266741156578064, "reward_std": 0.263213936239481, "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741305589676, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 1297.8639221191406, "epoch": 0.511985662011799, "grad_norm": 7.954524040222168, "kl": 3.76953125, "learning_rate": 1.828304199980475e-07, "loss": 0.3036, "reward": 0.6841518133878708, "reward_std": 0.30367495119571686, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 1268.9107666015625, "epoch": 0.5122843700993205, "grad_norm": 6.322964668273926, "kl": 3.61328125, "learning_rate": 1.8269083545028874e-07, "loss": 0.3341, "reward": 0.6501116305589676, "reward_std": 0.303517110645771, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.560825914144516, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 1266.18310546875, "epoch": 0.5125830781868419, "grad_norm": 9.485877990722656, "kl": 4.0234375, "learning_rate": 1.8255123165663487e-07, "loss": 0.3532, "reward": 0.6651785969734192, "reward_std": 0.33130914717912674, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928954601288, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 1303.7210388183594, "epoch": 0.5128817862743634, "grad_norm": 13.055898666381836, "kl": 3.84765625, "learning_rate": 1.824116087689612e-07, "loss": 0.3505, "reward": 0.7293527126312256, "reward_std": 0.3238983750343323, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 1389.2590026855469, "epoch": 0.5131804943618848, "grad_norm": 9.741665840148926, "kl": 4.015625, "learning_rate": 1.822719669391637e-07, "loss": 0.2961, "reward": 0.6763393133878708, "reward_std": 0.28875813633203506, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870535969734192, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 1270.0179138183594, "epoch": 0.5134792024494064, "grad_norm": 35.37015914916992, "kl": 4.9140625, "learning_rate": 1.8213230631915897e-07, "loss": 0.3301, "reward": 0.7399553954601288, "reward_std": 0.28919221460819244, "rewards/accuracy_reward": 0.15401786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 1264.04248046875, "epoch": 0.5137779105369278, "grad_norm": 5.555796146392822, "kl": 3.38671875, "learning_rate": 1.8199262706088415e-07, "loss": 0.3008, "reward": 0.7070312798023224, "reward_std": 0.3292572572827339, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6088169813156128, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 1436.1875610351562, "epoch": 0.5140766186244493, "grad_norm": 13.752983093261719, "kl": 4.984375, "learning_rate": 1.8185292931629657e-07, "loss": 0.3349, "reward": 0.6015625149011612, "reward_std": 0.29197610914707184, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5435268133878708, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 1391.0491638183594, "epoch": 0.5143753267119707, "grad_norm": 13.572246551513672, "kl": 4.47265625, "learning_rate": 1.817132132373736e-07, "loss": 0.3017, "reward": 0.6735491454601288, "reward_std": 0.32946552708745, "rewards/accuracy_reward": 0.1004464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731027126312256, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 1356.8415832519531, "epoch": 0.5146740347994921, "grad_norm": 3.7880406379699707, "kl": 3.98046875, "learning_rate": 1.8157347897611274e-07, "loss": 0.3103, "reward": 0.6707589477300644, "reward_std": 0.2637282684445381, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 1312.6160888671875, "epoch": 0.5149727428870137, "grad_norm": 8.263078689575195, "kl": 4.73828125, "learning_rate": 1.814337266845311e-07, "loss": 0.3358, "reward": 0.7343750298023224, "reward_std": 0.3444240018725395, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607313156128, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 1371.12060546875, "epoch": 0.5152714509745351, "grad_norm": 5.65316104888916, "kl": 3.4296875, "learning_rate": 1.8129395651466554e-07, "loss": 0.2078, "reward": 0.6573660969734192, "reward_std": 0.2862193435430527, "rewards/accuracy_reward": 0.08258929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 1365.8504943847656, "epoch": 0.5155701590620566, "grad_norm": 7.109509468078613, "kl": 3.53125, "learning_rate": 1.8115416861857224e-07, "loss": 0.3232, "reward": 0.642857164144516, "reward_std": 0.32258113101124763, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714626312256, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 1328.9264221191406, "epoch": 0.515868867149578, "grad_norm": 7.9608964920043945, "kl": 3.33203125, "learning_rate": 1.810143631483268e-07, "loss": 0.2965, "reward": 0.6344866454601288, "reward_std": 0.276231050491333, "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 1338.6875915527344, "epoch": 0.5161675752370996, "grad_norm": 8.807537078857422, "kl": 2.53125, "learning_rate": 1.808745402560238e-07, "loss": 0.2047, "reward": 0.7433035969734192, "reward_std": 0.3107919469475746, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 1324.94873046875, "epoch": 0.516466283324621, "grad_norm": 10.643682479858398, "kl": 3.03125, "learning_rate": 1.8073470009377688e-07, "loss": 0.2781, "reward": 0.6768973618745804, "reward_std": 0.2994657978415489, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 1284.0335693359375, "epoch": 0.5167649914121425, "grad_norm": 12.239896774291992, "kl": 3.1171875, "learning_rate": 1.805948428137184e-07, "loss": 0.3215, "reward": 0.7924107611179352, "reward_std": 0.2973739728331566, "rewards/accuracy_reward": 0.2366071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5558035969734192, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 1365.6965026855469, "epoch": 0.5170636994996639, "grad_norm": 5.212847709655762, "kl": 3.671875, "learning_rate": 1.8045496856799944e-07, "loss": 0.2879, "reward": 0.6540178805589676, "reward_std": 0.3062277063727379, "rewards/accuracy_reward": 0.0870535729918629, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669642984867096, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 1339.0870971679688, "epoch": 0.5173624075871854, "grad_norm": 6.394562721252441, "kl": 3.72265625, "learning_rate": 1.803150775087894e-07, "loss": 0.278, "reward": 0.6780134290456772, "reward_std": 0.2888672426342964, "rewards/accuracy_reward": 0.08035714458674192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5976562798023224, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 1331.5156860351562, "epoch": 0.5176611156747069, "grad_norm": 4.407042503356934, "kl": 4.046875, "learning_rate": 1.8017516978827606e-07, "loss": 0.3629, "reward": 0.7204241454601288, "reward_std": 0.28363989293575287, "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 1282.9085388183594, "epoch": 0.5179598237622284, "grad_norm": 10.373388290405273, "kl": 3.8984375, "learning_rate": 1.8003524555866534e-07, "loss": 0.2533, "reward": 0.7762277126312256, "reward_std": 0.265471737831831, "rewards/accuracy_reward": 0.16517858393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6110491454601288, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 1326.0134887695312, "epoch": 0.5182585318497498, "grad_norm": 9.215019226074219, "kl": 4.54296875, "learning_rate": 1.7989530497218106e-07, "loss": 0.3177, "reward": 0.6785714626312256, "reward_std": 0.2561635822057724, "rewards/accuracy_reward": 0.09375000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 1359.3147888183594, "epoch": 0.5185572399372713, "grad_norm": 5.574534893035889, "kl": 4.21484375, "learning_rate": 1.797553481810649e-07, "loss": 0.2802, "reward": 0.6835937649011612, "reward_std": 0.35762544721364975, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5719866305589676, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 1385.4598693847656, "epoch": 0.5188559480247927, "grad_norm": 3.6656112670898438, "kl": 3.7109375, "learning_rate": 1.7961537533757606e-07, "loss": 0.2636, "reward": 0.6166294813156128, "reward_std": 0.28284890204668045, "rewards/accuracy_reward": 0.05133928684517741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 1423.2746276855469, "epoch": 0.5191546561123143, "grad_norm": 5.517726898193359, "kl": 3.76953125, "learning_rate": 1.7947538659399134e-07, "loss": 0.2901, "reward": 0.622209832072258, "reward_std": 0.27824532613158226, "rewards/accuracy_reward": 0.03571428684517741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 1378.3594665527344, "epoch": 0.5194533641998357, "grad_norm": 11.72478199005127, "kl": 4.50390625, "learning_rate": 1.793353821026047e-07, "loss": 0.3705, "reward": 0.640066996216774, "reward_std": 0.24731917679309845, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.544084832072258, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 1375.6406860351562, "epoch": 0.5197520722873572, "grad_norm": 7.281454086303711, "kl": 4.42578125, "learning_rate": 1.7919536201572734e-07, "loss": 0.3616, "reward": 0.676339328289032, "reward_std": 0.30283934623003006, "rewards/accuracy_reward": 0.12500000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513393133878708, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 1347.9130249023438, "epoch": 0.5200507803748786, "grad_norm": 5.687786102294922, "kl": 3.89453125, "learning_rate": 1.7905532648568748e-07, "loss": 0.3539, "reward": 0.6852678805589676, "reward_std": 0.28852370753884315, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714286118745804, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 1384.513427734375, "epoch": 0.5203494884624001, "grad_norm": 6.75421667098999, "kl": 3.37109375, "learning_rate": 1.789152756648299e-07, "loss": 0.26, "reward": 0.6032366305589676, "reward_std": 0.27696172520518303, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937649011612, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 1300.2366943359375, "epoch": 0.5206481965499216, "grad_norm": 9.429688453674316, "kl": 3.359375, "learning_rate": 1.7877520970551627e-07, "loss": 0.2385, "reward": 0.632254496216774, "reward_std": 0.2509811818599701, "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965401977300644, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 1316.7411193847656, "epoch": 0.5209469046374431, "grad_norm": 8.773137092590332, "kl": 2.79296875, "learning_rate": 1.7863512876012464e-07, "loss": 0.2011, "reward": 0.670200914144516, "reward_std": 0.2637033350765705, "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5943080633878708, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 1304.3303833007812, "epoch": 0.5212456127249645, "grad_norm": 9.603525161743164, "kl": 2.619140625, "learning_rate": 1.7849503298104925e-07, "loss": 0.1785, "reward": 0.7103794813156128, "reward_std": 0.3121807351708412, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5987723469734192, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 1339.4241638183594, "epoch": 0.521544320812486, "grad_norm": 6.220691680908203, "kl": 3.71484375, "learning_rate": 1.7835492252070063e-07, "loss": 0.3231, "reward": 0.7723214775323868, "reward_std": 0.2769378200173378, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178954601288, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 1317.8750610351562, "epoch": 0.5218430289000074, "grad_norm": 8.0018310546875, "kl": 3.34765625, "learning_rate": 1.7821479753150522e-07, "loss": 0.2739, "reward": 0.6250000149011612, "reward_std": 0.24866241216659546, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607313156128, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 1351.8326110839844, "epoch": 0.522141736987529, "grad_norm": 9.401191711425781, "kl": 2.94140625, "learning_rate": 1.780746581659053e-07, "loss": 0.2542, "reward": 0.6579241305589676, "reward_std": 0.28038696199655533, "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6110491156578064, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 1378.7143249511719, "epoch": 0.5224404450750504, "grad_norm": 6.892897129058838, "kl": 4.2265625, "learning_rate": 1.7793450457635876e-07, "loss": 0.3564, "reward": 0.6930803805589676, "reward_std": 0.28513022884726524, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303805589676, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 1306.9978332519531, "epoch": 0.5227391531625719, "grad_norm": 15.897427558898926, "kl": 4.875, "learning_rate": 1.77794336915339e-07, "loss": 0.3876, "reward": 0.6032366454601288, "reward_std": 0.2862699143588543, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.560825914144516, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 1304.0535888671875, "epoch": 0.5230378612500933, "grad_norm": 6.889565467834473, "kl": 3.8828125, "learning_rate": 1.776541553353346e-07, "loss": 0.3482, "reward": 0.6160714477300644, "reward_std": 0.24973992258310318, "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 1370.7701416015625, "epoch": 0.5233365693376149, "grad_norm": 13.940754890441895, "kl": 4.203125, "learning_rate": 1.7751395998884942e-07, "loss": 0.269, "reward": 0.7366071790456772, "reward_std": 0.32817402482032776, "rewards/accuracy_reward": 0.15401786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 1356.1518249511719, "epoch": 0.5236352774251363, "grad_norm": 10.366837501525879, "kl": 4.2421875, "learning_rate": 1.773737510284023e-07, "loss": 0.3479, "reward": 0.7293526977300644, "reward_std": 0.3550317883491516, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5954241454601288, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 1401.8906860351562, "epoch": 0.5239339855126578, "grad_norm": 6.6068267822265625, "kl": 4.05078125, "learning_rate": 1.7723352860652683e-07, "loss": 0.3563, "reward": 0.6657366454601288, "reward_std": 0.3204226419329643, "rewards/accuracy_reward": 0.08928572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5764509290456772, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 1396.3750610351562, "epoch": 0.5242326936001792, "grad_norm": 5.559892654418945, "kl": 3.87109375, "learning_rate": 1.7709329287577122e-07, "loss": 0.3072, "reward": 0.6724330633878708, "reward_std": 0.329147607088089, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5630580633878708, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 1369.5982971191406, "epoch": 0.5245314016877007, "grad_norm": 3.68656325340271, "kl": 4.3203125, "learning_rate": 1.769530439886983e-07, "loss": 0.289, "reward": 0.6395089477300644, "reward_std": 0.24796005338430405, "rewards/accuracy_reward": 0.05580357206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053954601288, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 1403.0491638183594, "epoch": 0.5248301097752222, "grad_norm": 5.520887851715088, "kl": 3.46875, "learning_rate": 1.7681278209788497e-07, "loss": 0.3057, "reward": 0.6841518133878708, "reward_std": 0.307666152715683, "rewards/accuracy_reward": 0.10267858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.581473246216774, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 1306.5335388183594, "epoch": 0.5251288178627437, "grad_norm": 9.492051124572754, "kl": 2.880859375, "learning_rate": 1.7667250735592255e-07, "loss": 0.2195, "reward": 0.7204241305589676, "reward_std": 0.29264552891254425, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 1402.58935546875, "epoch": 0.5254275259502651, "grad_norm": 10.880304336547852, "kl": 3.2578125, "learning_rate": 1.765322199154161e-07, "loss": 0.2745, "reward": 0.6774553805589676, "reward_std": 0.3136075362563133, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839477300644, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 1288.7411193847656, "epoch": 0.5257262340377866, "grad_norm": 4.56883430480957, "kl": 3.6875, "learning_rate": 1.7639191992898471e-07, "loss": 0.3227, "reward": 0.68526791036129, "reward_std": 0.3190123811364174, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870536118745804, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 1353.7991638183594, "epoch": 0.526024942125308, "grad_norm": 10.096705436706543, "kl": 4.23828125, "learning_rate": 1.7625160754926094e-07, "loss": 0.2978, "reward": 0.6813616305589676, "reward_std": 0.29836834967136383, "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 1265.6964721679688, "epoch": 0.5263236502128296, "grad_norm": 9.820947647094727, "kl": 3.40234375, "learning_rate": 1.7611128292889092e-07, "loss": 0.2452, "reward": 0.7433036118745804, "reward_std": 0.2539111375808716, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 1367.6116638183594, "epoch": 0.526622358300351, "grad_norm": 7.1331987380981445, "kl": 3.6484375, "learning_rate": 1.7597094622053413e-07, "loss": 0.2435, "reward": 0.728794664144516, "reward_std": 0.2894943244755268, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5904017984867096, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 1346.9732666015625, "epoch": 0.5269210663878725, "grad_norm": 6.641334056854248, "kl": 3.5234375, "learning_rate": 1.7583059757686308e-07, "loss": 0.2923, "reward": 0.6411830633878708, "reward_std": 0.2748975604772568, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 1334.9152526855469, "epoch": 0.5272197744753939, "grad_norm": 5.512397289276123, "kl": 3.3984375, "learning_rate": 1.756902371505634e-07, "loss": 0.2409, "reward": 0.674107164144516, "reward_std": 0.279889315366745, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 1275.9822082519531, "epoch": 0.5275184825629153, "grad_norm": 8.112975120544434, "kl": 4.2265625, "learning_rate": 1.7554986509433347e-07, "loss": 0.3836, "reward": 0.6707589626312256, "reward_std": 0.30727599933743477, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 1439.7813415527344, "epoch": 0.5278171906504369, "grad_norm": 6.137165069580078, "kl": 5.109375, "learning_rate": 1.7540948156088428e-07, "loss": 0.2983, "reward": 0.718191996216774, "reward_std": 0.3085430674254894, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 1468.2634582519531, "epoch": 0.5281158987379583, "grad_norm": 7.333249568939209, "kl": 3.36328125, "learning_rate": 1.7526908670293943e-07, "loss": 0.2265, "reward": 0.6640625298023224, "reward_std": 0.2755220904946327, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5948660969734192, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 1386.2746276855469, "epoch": 0.5284146068254798, "grad_norm": 5.694908618927002, "kl": 3.49609375, "learning_rate": 1.751286806732347e-07, "loss": 0.2427, "reward": 0.6891741305589676, "reward_std": 0.2739315554499626, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598618745804, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 1284.841552734375, "epoch": 0.5287133149130012, "grad_norm": 10.0690336227417, "kl": 3.80078125, "learning_rate": 1.749882636245181e-07, "loss": 0.3378, "reward": 0.6880580633878708, "reward_std": 0.32736557722091675, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 1443.9129943847656, "epoch": 0.5290120230005227, "grad_norm": 6.75313138961792, "kl": 4.1171875, "learning_rate": 1.748478357095497e-07, "loss": 0.2884, "reward": 0.706473246216774, "reward_std": 0.25867390632629395, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5725446790456772, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 1318.7969360351562, "epoch": 0.5293107310880442, "grad_norm": 7.452897548675537, "kl": 3.36328125, "learning_rate": 1.7470739708110122e-07, "loss": 0.2399, "reward": 0.7862723469734192, "reward_std": 0.2931477017700672, "rewards/accuracy_reward": 0.17187500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6143973469734192, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 1251.6005249023438, "epoch": 0.5296094391755657, "grad_norm": 5.869091987609863, "kl": 3.40234375, "learning_rate": 1.7456694789195608e-07, "loss": 0.245, "reward": 0.7092634290456772, "reward_std": 0.33310258388519287, "rewards/accuracy_reward": 0.1495535816065967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098469734192, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 1244.0625610351562, "epoch": 0.5299081472630871, "grad_norm": 4.629581451416016, "kl": 3.10546875, "learning_rate": 1.7442648829490934e-07, "loss": 0.2603, "reward": 0.6819196790456772, "reward_std": 0.30857954919338226, "rewards/accuracy_reward": 0.09375000791624188, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5881696790456772, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 1310.4085388183594, "epoch": 0.5302068553506086, "grad_norm": 15.370144844055176, "kl": 5.015625, "learning_rate": 1.7428601844276725e-07, "loss": 0.4269, "reward": 0.6668527126312256, "reward_std": 0.2793399840593338, "rewards/accuracy_reward": 0.0825892926659435, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5842634290456772, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 1409.7098999023438, "epoch": 0.53050556343813, "grad_norm": 18.95305824279785, "kl": 4.55078125, "learning_rate": 1.7414553848834715e-07, "loss": 0.2985, "reward": 0.6713170111179352, "reward_std": 0.28812141716480255, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312798023224, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 1237.5022888183594, "epoch": 0.5308042715256516, "grad_norm": 5.990171432495117, "kl": 3.71484375, "learning_rate": 1.7400504858447763e-07, "loss": 0.3245, "reward": 0.772879496216774, "reward_std": 0.33807648718357086, "rewards/accuracy_reward": 0.19419643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 1364.5469360351562, "epoch": 0.531102979613173, "grad_norm": 4.7632622718811035, "kl": 3.796875, "learning_rate": 1.7386454888399773e-07, "loss": 0.3197, "reward": 0.7159598618745804, "reward_std": 0.3185453563928604, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312798023224, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 1300.3772888183594, "epoch": 0.5314016877006945, "grad_norm": 9.618247985839844, "kl": 3.53125, "learning_rate": 1.7372403953975757e-07, "loss": 0.3254, "reward": 0.7762277126312256, "reward_std": 0.30974604189395905, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6065848469734192, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 1349.0558776855469, "epoch": 0.5317003957882159, "grad_norm": 7.678754806518555, "kl": 3.30078125, "learning_rate": 1.7358352070461736e-07, "loss": 0.2565, "reward": 0.7198661118745804, "reward_std": 0.31995009630918503, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6216518133878708, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 1290.7366638183594, "epoch": 0.5319991038757375, "grad_norm": 6.14072322845459, "kl": 3.8671875, "learning_rate": 1.7344299253144795e-07, "loss": 0.3451, "reward": 0.689732164144516, "reward_std": 0.3102748766541481, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 1406.3884582519531, "epoch": 0.5322978119632589, "grad_norm": 4.427496433258057, "kl": 3.81640625, "learning_rate": 1.7330245517313015e-07, "loss": 0.317, "reward": 0.7444196790456772, "reward_std": 0.29852067679166794, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.579241082072258, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 1366.0692749023438, "epoch": 0.5325965200507804, "grad_norm": 20.720237731933594, "kl": 4.97265625, "learning_rate": 1.7316190878255482e-07, "loss": 0.3036, "reward": 0.6149553954601288, "reward_std": 0.2791655883193016, "rewards/accuracy_reward": 0.042410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572544664144516, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 1420.0781555175781, "epoch": 0.5328952281383018, "grad_norm": 6.537832736968994, "kl": 3.71484375, "learning_rate": 1.7302135351262273e-07, "loss": 0.2524, "reward": 0.6880580633878708, "reward_std": 0.29507848992943764, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5764509290456772, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 1352.4420318603516, "epoch": 0.5331939362258233, "grad_norm": 8.454792976379395, "kl": 4.0546875, "learning_rate": 1.728807895162442e-07, "loss": 0.3666, "reward": 0.8694196790456772, "reward_std": 0.38637878745794296, "rewards/accuracy_reward": 0.274553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5948660969734192, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 1282.0871276855469, "epoch": 0.5334926443133448, "grad_norm": 5.439426898956299, "kl": 3.76171875, "learning_rate": 1.7274021694633908e-07, "loss": 0.2572, "reward": 0.6847098618745804, "reward_std": 0.2664589062333107, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731027126312256, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 1353.2433471679688, "epoch": 0.5337913524008663, "grad_norm": 9.380500793457031, "kl": 4.12109375, "learning_rate": 1.725996359558365e-07, "loss": 0.3313, "reward": 0.733816996216774, "reward_std": 0.2591885104775429, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919813156128, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 1309.138427734375, "epoch": 0.5340900604883877, "grad_norm": 15.470882415771484, "kl": 3.6640625, "learning_rate": 1.7245904669767484e-07, "loss": 0.3342, "reward": 0.7187500298023224, "reward_std": 0.27300335839390755, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026785969734192, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 1375.46435546875, "epoch": 0.5343887685759092, "grad_norm": 5.562556743621826, "kl": 3.125, "learning_rate": 1.723184493248015e-07, "loss": 0.2125, "reward": 0.7020089626312256, "reward_std": 0.2771356925368309, "rewards/accuracy_reward": 0.10937500279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339626312256, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 1318.0469360351562, "epoch": 0.5346874766634306, "grad_norm": 16.622604370117188, "kl": 2.693359375, "learning_rate": 1.7217784399017252e-07, "loss": 0.2769, "reward": 0.748325914144516, "reward_std": 0.3137582316994667, "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.607700914144516, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 1335.7344055175781, "epoch": 0.5349861847509522, "grad_norm": 13.579619407653809, "kl": 3.44140625, "learning_rate": 1.7203723084675283e-07, "loss": 0.2942, "reward": 0.7148437798023224, "reward_std": 0.28933244943618774, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6121652126312256, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 1303.6027221679688, "epoch": 0.5352848928384736, "grad_norm": 6.560168743133545, "kl": 3.98046875, "learning_rate": 1.718966100475157e-07, "loss": 0.3987, "reward": 0.675223246216774, "reward_std": 0.32556214183568954, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5948660969734192, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 1367.997802734375, "epoch": 0.5355836009259951, "grad_norm": 5.029166221618652, "kl": 4.015625, "learning_rate": 1.717559817454427e-07, "loss": 0.2836, "reward": 0.7399553954601288, "reward_std": 0.2845252864062786, "rewards/accuracy_reward": 0.15178572130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.588169664144516, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 1316.7723999023438, "epoch": 0.5358823090135165, "grad_norm": 9.988751411437988, "kl": 4.37109375, "learning_rate": 1.716153460935238e-07, "loss": 0.3473, "reward": 0.620535746216774, "reward_std": 0.24743155762553215, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500447034836, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 1380.8884582519531, "epoch": 0.536181017101038, "grad_norm": 26.352659225463867, "kl": 4.43359375, "learning_rate": 1.7147470324475668e-07, "loss": 0.3439, "reward": 0.810825914144516, "reward_std": 0.33591924607753754, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366305589676, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 1212.7544860839844, "epoch": 0.5364797251885595, "grad_norm": 13.284770011901855, "kl": 3.810546875, "learning_rate": 1.7133405335214706e-07, "loss": 0.2702, "reward": 0.6735491454601288, "reward_std": 0.2882115840911865, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 1356.55810546875, "epoch": 0.536778433276081, "grad_norm": 21.825998306274414, "kl": 4.59765625, "learning_rate": 1.7119339656870814e-07, "loss": 0.3619, "reward": 0.7544643133878708, "reward_std": 0.3415154591202736, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 1398.9040832519531, "epoch": 0.5370771413636024, "grad_norm": 6.8752546310424805, "kl": 4.0078125, "learning_rate": 1.7105273304746082e-07, "loss": 0.3327, "reward": 0.699776828289032, "reward_std": 0.2849023900926113, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6104911118745804, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 1301.1027221679688, "epoch": 0.5373758494511239, "grad_norm": 10.250410079956055, "kl": 4.0390625, "learning_rate": 1.7091206294143316e-07, "loss": 0.3493, "reward": 0.6768973469734192, "reward_std": 0.28143156319856644, "rewards/accuracy_reward": 0.07812500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.598772332072258, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 1338.5357666015625, "epoch": 0.5376745575386453, "grad_norm": 8.740803718566895, "kl": 3.27734375, "learning_rate": 1.7077138640366052e-07, "loss": 0.2442, "reward": 0.6573661118745804, "reward_std": 0.31743423640727997, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339626312256, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 1362.2835388183594, "epoch": 0.5379732656261669, "grad_norm": 5.368032932281494, "kl": 3.099609375, "learning_rate": 1.7063070358718502e-07, "loss": 0.2063, "reward": 0.6981026977300644, "reward_std": 0.28207625448703766, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.584263414144516, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 1378.3639221191406, "epoch": 0.5382719737136883, "grad_norm": 15.866640090942383, "kl": 3.23046875, "learning_rate": 1.7049001464505597e-07, "loss": 0.2905, "reward": 0.6958705633878708, "reward_std": 0.2753642797470093, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955484867096, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 1303.6183776855469, "epoch": 0.5385706818012098, "grad_norm": 6.568786144256592, "kl": 3.40234375, "learning_rate": 1.7034931973032902e-07, "loss": 0.2814, "reward": 0.6863839626312256, "reward_std": 0.28821950778365135, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5881696492433548, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 1297.7746276855469, "epoch": 0.5388693898887312, "grad_norm": 14.643795013427734, "kl": 4.01953125, "learning_rate": 1.7020861899606652e-07, "loss": 0.3264, "reward": 0.6199776977300644, "reward_std": 0.25983677059412, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.584263414144516, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 1320.7254943847656, "epoch": 0.5391680979762528, "grad_norm": 5.969735622406006, "kl": 3.51953125, "learning_rate": 1.7006791259533706e-07, "loss": 0.2846, "reward": 0.5926339626312256, "reward_std": 0.24839221686124802, "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089626312256, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 1369.1340026855469, "epoch": 0.5394668060637742, "grad_norm": 5.746124744415283, "kl": 3.39453125, "learning_rate": 1.6992720068121536e-07, "loss": 0.2421, "reward": 0.6997768133878708, "reward_std": 0.2850535288453102, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6082589477300644, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 1423.6250610351562, "epoch": 0.5397655141512957, "grad_norm": 8.337557792663574, "kl": 4.0703125, "learning_rate": 1.697864834067823e-07, "loss": 0.3078, "reward": 0.6573661118745804, "reward_std": 0.3011219948530197, "rewards/accuracy_reward": 0.08928572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803805589676, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 1256.15185546875, "epoch": 0.5400642222388171, "grad_norm": 24.62969398498535, "kl": 5.171875, "learning_rate": 1.6964576092512434e-07, "loss": 0.4222, "reward": 0.7047991454601288, "reward_std": 0.30592329055070877, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241454601288, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 1342.1630249023438, "epoch": 0.5403629303263385, "grad_norm": 12.503030776977539, "kl": 4.9375, "learning_rate": 1.695050333893339e-07, "loss": 0.4337, "reward": 0.660714328289032, "reward_std": 0.2860044948756695, "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928954601288, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 1272.5335388183594, "epoch": 0.5406616384138601, "grad_norm": 11.839045524597168, "kl": 4.265625, "learning_rate": 1.6936430095250862e-07, "loss": 0.371, "reward": 0.6875000298023224, "reward_std": 0.29236023128032684, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026785969734192, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 1337.0580749511719, "epoch": 0.5409603465013815, "grad_norm": 7.7815961837768555, "kl": 4.40625, "learning_rate": 1.692235637677517e-07, "loss": 0.3635, "reward": 0.6707589477300644, "reward_std": 0.2666308917105198, "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5792410969734192, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 1303.8415832519531, "epoch": 0.541259054588903, "grad_norm": 8.508535385131836, "kl": 3.484375, "learning_rate": 1.6908282198817129e-07, "loss": 0.3412, "reward": 0.6724330633878708, "reward_std": 0.3177808150649071, "rewards/accuracy_reward": 0.06696428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687798023224, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 1250.0357666015625, "epoch": 0.5415577626764244, "grad_norm": 8.96481704711914, "kl": 3.56640625, "learning_rate": 1.689420757668808e-07, "loss": 0.3333, "reward": 0.7087053805589676, "reward_std": 0.3243592269718647, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5970982611179352, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 1318.5067443847656, "epoch": 0.5418564707639459, "grad_norm": 23.01692771911621, "kl": 2.87109375, "learning_rate": 1.688013252569982e-07, "loss": 0.2615, "reward": 0.6975446790456772, "reward_std": 0.31613466888666153, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5904018133878708, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 1265.6072082519531, "epoch": 0.5421551788514674, "grad_norm": 7.538328647613525, "kl": 4.02734375, "learning_rate": 1.6866057061164623e-07, "loss": 0.3387, "reward": 0.6953125149011612, "reward_std": 0.3343024328351021, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 1380.12060546875, "epoch": 0.5424538869389889, "grad_norm": 10.284692764282227, "kl": 4.34765625, "learning_rate": 1.685198119839523e-07, "loss": 0.3286, "reward": 0.631138414144516, "reward_std": 0.28192076086997986, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991454601288, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 1421.4911193847656, "epoch": 0.5427525950265103, "grad_norm": 6.956277847290039, "kl": 3.56640625, "learning_rate": 1.683790495270479e-07, "loss": 0.2678, "reward": 0.7003348469734192, "reward_std": 0.3064187988638878, "rewards/accuracy_reward": 0.10267857741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5976562798023224, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 1400.4844665527344, "epoch": 0.5430513031140318, "grad_norm": 5.587220668792725, "kl": 3.4375, "learning_rate": 1.6823828339406876e-07, "loss": 0.2567, "reward": 0.7031250298023224, "reward_std": 0.2740012817084789, "rewards/accuracy_reward": 0.13839285797439516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321492433548, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 1337.49560546875, "epoch": 0.5433500112015532, "grad_norm": 7.042967319488525, "kl": 3.4609375, "learning_rate": 1.6809751373815465e-07, "loss": 0.2836, "reward": 0.7159598469734192, "reward_std": 0.31968826055526733, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 1388.4040832519531, "epoch": 0.5436487192890748, "grad_norm": 9.334644317626953, "kl": 4.08984375, "learning_rate": 1.6795674071244924e-07, "loss": 0.3616, "reward": 0.663504496216774, "reward_std": 0.27335813269019127, "rewards/accuracy_reward": 0.06696429080329835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965401977300644, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 1427.3951416015625, "epoch": 0.5439474273765962, "grad_norm": 13.223341941833496, "kl": 3.08203125, "learning_rate": 1.678159644700997e-07, "loss": 0.2451, "reward": 0.731026828289032, "reward_std": 0.2942696884274483, "rewards/accuracy_reward": 0.16294643748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803954601288, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 1416.7768859863281, "epoch": 0.5442461354641177, "grad_norm": 10.674654006958008, "kl": 3.28125, "learning_rate": 1.6767518516425685e-07, "loss": 0.2647, "reward": 0.6936384290456772, "reward_std": 0.2945929802954197, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 1382.4822082519531, "epoch": 0.5445448435516391, "grad_norm": 7.89846134185791, "kl": 4.0078125, "learning_rate": 1.6753440294807466e-07, "loss": 0.3563, "reward": 0.703683078289032, "reward_std": 0.30358660593628883, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366454601288, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 1262.9063110351562, "epoch": 0.5448435516391607, "grad_norm": 9.696975708007812, "kl": 4.0625, "learning_rate": 1.6739361797471052e-07, "loss": 0.3671, "reward": 0.7047991305589676, "reward_std": 0.25882915779948235, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.599888414144516, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 1315.6853332519531, "epoch": 0.5451422597266821, "grad_norm": 11.101248741149902, "kl": 3.203125, "learning_rate": 1.6725283039732461e-07, "loss": 0.2809, "reward": 0.7338169813156128, "reward_std": 0.3154509738087654, "rewards/accuracy_reward": 0.18080357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.553013414144516, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 1307.3125305175781, "epoch": 0.5454409678142036, "grad_norm": 12.441606521606445, "kl": 3.828125, "learning_rate": 1.6711204036908e-07, "loss": 0.2717, "reward": 0.6439732313156128, "reward_std": 0.28180330991744995, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603794664144516, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 1348.7857360839844, "epoch": 0.545739675901725, "grad_norm": 46.06245422363281, "kl": 6.30078125, "learning_rate": 1.6697124804314253e-07, "loss": 0.4707, "reward": 0.706473246216774, "reward_std": 0.2981967553496361, "rewards/accuracy_reward": 0.12053571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375149011612, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 1379.9822082519531, "epoch": 0.5460383839892465, "grad_norm": 15.112664222717285, "kl": 5.00390625, "learning_rate": 1.6683045357268035e-07, "loss": 0.3779, "reward": 0.717075914144516, "reward_std": 0.2863999418914318, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937649011612, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 1352.2388916015625, "epoch": 0.546337092076768, "grad_norm": 15.728433609008789, "kl": 4.78515625, "learning_rate": 1.6668965711086406e-07, "loss": 0.3657, "reward": 0.622209832072258, "reward_std": 0.26343002915382385, "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 1394.6897888183594, "epoch": 0.5466358001642895, "grad_norm": 8.302680015563965, "kl": 4.07421875, "learning_rate": 1.665488588108665e-07, "loss": 0.3644, "reward": 0.7550223618745804, "reward_std": 0.28796323388814926, "rewards/accuracy_reward": 0.16741071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116305589676, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 1341.5045166015625, "epoch": 0.5469345082518109, "grad_norm": 10.011714935302734, "kl": 4.33984375, "learning_rate": 1.6640805882586235e-07, "loss": 0.3435, "reward": 0.7159598618745804, "reward_std": 0.3083478733897209, "rewards/accuracy_reward": 0.1272321459837258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 1418.82373046875, "epoch": 0.5472332163393324, "grad_norm": 9.381051063537598, "kl": 3.3046875, "learning_rate": 1.6626725730902816e-07, "loss": 0.218, "reward": 0.7243303954601288, "reward_std": 0.30463917553424835, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6060267984867096, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 1322.0111999511719, "epoch": 0.5475319244268538, "grad_norm": 11.611653327941895, "kl": 3.66015625, "learning_rate": 1.6612645441354226e-07, "loss": 0.3014, "reward": 0.6713169813156128, "reward_std": 0.2970084920525551, "rewards/accuracy_reward": 0.08482143329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 1403.7389221191406, "epoch": 0.5478306325143754, "grad_norm": 10.367018699645996, "kl": 4.45703125, "learning_rate": 1.659856502925843e-07, "loss": 0.3655, "reward": 0.6914062798023224, "reward_std": 0.3093472346663475, "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205484867096, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 1372.60498046875, "epoch": 0.5481293406018968, "grad_norm": 6.635549068450928, "kl": 3.52734375, "learning_rate": 1.658448450993355e-07, "loss": 0.2782, "reward": 0.607700914144516, "reward_std": 0.2774813584983349, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5474330633878708, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 1284.4866638183594, "epoch": 0.5484280486894183, "grad_norm": 10.344928741455078, "kl": 3.52734375, "learning_rate": 1.65704038986978e-07, "loss": 0.2996, "reward": 0.6791294813156128, "reward_std": 0.32816046476364136, "rewards/accuracy_reward": 0.1138392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652901977300644, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 1427.15185546875, "epoch": 0.5487267567769397, "grad_norm": 64.29132843017578, "kl": 5.57421875, "learning_rate": 1.6556323210869509e-07, "loss": 0.397, "reward": 0.6690848469734192, "reward_std": 0.2647186480462551, "rewards/accuracy_reward": 0.09598214575089514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731027126312256, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 1431.7053833007812, "epoch": 0.5490254648644612, "grad_norm": 14.03317928314209, "kl": 2.4140625, "learning_rate": 1.6542242461767086e-07, "loss": 0.165, "reward": 0.6573660969734192, "reward_std": 0.2974407412111759, "rewards/accuracy_reward": 0.07589286239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732313156128, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 1436.4353332519531, "epoch": 0.5493241729519827, "grad_norm": 67.4900131225586, "kl": 4.23828125, "learning_rate": 1.6528161666709008e-07, "loss": 0.3143, "reward": 0.6450893133878708, "reward_std": 0.28389234095811844, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 1396.5045166015625, "epoch": 0.5496228810395042, "grad_norm": 7.329573154449463, "kl": 3.671875, "learning_rate": 1.6514080841013788e-07, "loss": 0.266, "reward": 0.6618303954601288, "reward_std": 0.28251851722598076, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5435267984867096, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 1359.9889221191406, "epoch": 0.5499215891270256, "grad_norm": 7.053776264190674, "kl": 3.34765625, "learning_rate": 1.65e-07, "loss": 0.2599, "reward": 0.6690848618745804, "reward_std": 0.25529273971915245, "rewards/accuracy_reward": 0.09151785750873387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.577566996216774, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 1352.5179138183594, "epoch": 0.5502202972145471, "grad_norm": 11.710762023925781, "kl": 3.890625, "learning_rate": 1.6485919158986214e-07, "loss": 0.3306, "reward": 0.6540178805589676, "reward_std": 0.2657124139368534, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 1389.8795166015625, "epoch": 0.5505190053020685, "grad_norm": 10.120984077453613, "kl": 4.375, "learning_rate": 1.6471838333290993e-07, "loss": 0.3674, "reward": 0.684151828289032, "reward_std": 0.2729986608028412, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 1288.4063110351562, "epoch": 0.5508177133895901, "grad_norm": 6.827935695648193, "kl": 3.91796875, "learning_rate": 1.6457757538232913e-07, "loss": 0.3287, "reward": 0.690848246216774, "reward_std": 0.3063231185078621, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.594866082072258, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 1422.7232360839844, "epoch": 0.5511164214771115, "grad_norm": 30.53774070739746, "kl": 6.3984375, "learning_rate": 1.644367678913049e-07, "loss": 0.4835, "reward": 0.6646205633878708, "reward_std": 0.2895274832844734, "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731027126312256, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 1375.3170166015625, "epoch": 0.551415129564633, "grad_norm": 24.625476837158203, "kl": 4.7578125, "learning_rate": 1.6429596101302201e-07, "loss": 0.3503, "reward": 0.7165178954601288, "reward_std": 0.294022798538208, "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.602678582072258, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 1414.27685546875, "epoch": 0.5517138376521544, "grad_norm": 5.970858573913574, "kl": 4.1640625, "learning_rate": 1.6415515490066449e-07, "loss": 0.3004, "reward": 0.6679687947034836, "reward_std": 0.24580786377191544, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5697545111179352, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 1350.5692749023438, "epoch": 0.552012545739676, "grad_norm": 17.858951568603516, "kl": 4.609375, "learning_rate": 1.6401434970741566e-07, "loss": 0.3945, "reward": 0.6383928805589676, "reward_std": 0.2726961709558964, "rewards/accuracy_reward": 0.037946428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464477300644, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 1424.26123046875, "epoch": 0.5523112538271974, "grad_norm": 10.058762550354004, "kl": 4.8203125, "learning_rate": 1.6387354558645776e-07, "loss": 0.4353, "reward": 0.6339286118745804, "reward_std": 0.2725955992937088, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321790456772, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 1399.0335388183594, "epoch": 0.5526099619147189, "grad_norm": 12.776554107666016, "kl": 4.578125, "learning_rate": 1.637327426909718e-07, "loss": 0.3483, "reward": 0.6914062947034836, "reward_std": 0.31423961371183395, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991156578064, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 1433.0938110351562, "epoch": 0.5529086700022403, "grad_norm": 21.250253677368164, "kl": 3.15234375, "learning_rate": 1.6359194117413766e-07, "loss": 0.293, "reward": 0.6969866454601288, "reward_std": 0.2816440276801586, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 1486.263427734375, "epoch": 0.5532073780897617, "grad_norm": 13.457307815551758, "kl": 3.0, "learning_rate": 1.634511411891335e-07, "loss": 0.2437, "reward": 0.7287946790456772, "reward_std": 0.3007345348596573, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5904018133878708, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 1383.5692443847656, "epoch": 0.5535060861772833, "grad_norm": 21.413915634155273, "kl": 2.74609375, "learning_rate": 1.633103428891359e-07, "loss": 0.2551, "reward": 0.720982164144516, "reward_std": 0.3091169744729996, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 1286.2991638183594, "epoch": 0.5538047942648047, "grad_norm": 22.73155975341797, "kl": 3.21484375, "learning_rate": 1.6316954642731967e-07, "loss": 0.2464, "reward": 0.8141741454601288, "reward_std": 0.28752677515149117, "rewards/accuracy_reward": 0.2008928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6132812798023224, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 1411.1027526855469, "epoch": 0.5541035023523262, "grad_norm": 10.088276863098145, "kl": 3.453125, "learning_rate": 1.6302875195685749e-07, "loss": 0.257, "reward": 0.6808035969734192, "reward_std": 0.24437502771615982, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5892857313156128, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 1370.6964721679688, "epoch": 0.5544022104398476, "grad_norm": 10.712175369262695, "kl": 6.86328125, "learning_rate": 1.6288795963092e-07, "loss": 0.2632, "reward": 0.6060267984867096, "reward_std": 0.2658748887479305, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 1354.91748046875, "epoch": 0.5547009185273691, "grad_norm": 12.01828670501709, "kl": 4.62890625, "learning_rate": 1.627471696026754e-07, "loss": 0.3854, "reward": 0.6657366305589676, "reward_std": 0.307402141392231, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965401977300644, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 1368.2143249511719, "epoch": 0.5549996266148906, "grad_norm": 25.36585807800293, "kl": 5.0859375, "learning_rate": 1.6260638202528947e-07, "loss": 0.3576, "reward": 0.7606027126312256, "reward_std": 0.27870112657546997, "rewards/accuracy_reward": 0.16294643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5976562649011612, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 1375.3884582519531, "epoch": 0.5552983347024121, "grad_norm": 10.057307243347168, "kl": 4.68359375, "learning_rate": 1.6246559705192536e-07, "loss": 0.379, "reward": 0.651785746216774, "reward_std": 0.3087032586336136, "rewards/accuracy_reward": 0.06919643003493547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825892984867096, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 1397.3549499511719, "epoch": 0.5555970427899335, "grad_norm": 20.265117645263672, "kl": 5.2109375, "learning_rate": 1.623248148357432e-07, "loss": 0.3938, "reward": 0.6227678805589676, "reward_std": 0.288337878882885, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321492433548, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 1333.6518249511719, "epoch": 0.555895750877455, "grad_norm": 17.345556259155273, "kl": 4.67578125, "learning_rate": 1.621840355299003e-07, "loss": 0.3516, "reward": 0.7148437947034836, "reward_std": 0.3085753917694092, "rewards/accuracy_reward": 0.13839286472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.576450914144516, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 1322.3348693847656, "epoch": 0.5561944589649764, "grad_norm": 7.198326587677002, "kl": 3.90625, "learning_rate": 1.6204325928755075e-07, "loss": 0.3066, "reward": 0.7416294813156128, "reward_std": 0.30330733954906464, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965401977300644, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 1425.1585388183594, "epoch": 0.556493167052498, "grad_norm": 10.70687484741211, "kl": 4.30859375, "learning_rate": 1.6190248626184534e-07, "loss": 0.3914, "reward": 0.6540178805589676, "reward_std": 0.3107154443860054, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870535969734192, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 1447.685302734375, "epoch": 0.5567918751400194, "grad_norm": 8.575031280517578, "kl": 4.01171875, "learning_rate": 1.6176171660593123e-07, "loss": 0.2601, "reward": 0.7014509290456772, "reward_std": 0.2860442325472832, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5853795111179352, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 1347.3036499023438, "epoch": 0.5570905832275409, "grad_norm": 16.527585983276367, "kl": 3.50390625, "learning_rate": 1.6162095047295211e-07, "loss": 0.2982, "reward": 0.7388393133878708, "reward_std": 0.25839030742645264, "rewards/accuracy_reward": 0.13839285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464477300644, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 1389.52685546875, "epoch": 0.5573892913150623, "grad_norm": 7.605131149291992, "kl": 3.6640625, "learning_rate": 1.6148018801604774e-07, "loss": 0.3469, "reward": 0.6489955633878708, "reward_std": 0.28549107536673546, "rewards/accuracy_reward": 0.07589286146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 1410.1295471191406, "epoch": 0.5576879994025838, "grad_norm": 7.936020374298096, "kl": 4.36328125, "learning_rate": 1.6133942938835373e-07, "loss": 0.3374, "reward": 0.710379496216774, "reward_std": 0.3042495585978031, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.554129496216774, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 1279.8058471679688, "epoch": 0.5579867074901053, "grad_norm": 10.291875839233398, "kl": 3.75, "learning_rate": 1.611986747430018e-07, "loss": 0.2824, "reward": 0.6702009290456772, "reward_std": 0.25702940300107, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687798023224, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 1315.5045013427734, "epoch": 0.5582854155776268, "grad_norm": 10.761310577392578, "kl": 3.328125, "learning_rate": 1.6105792423311922e-07, "loss": 0.3041, "reward": 0.6925223469734192, "reward_std": 0.26111481338739395, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116454601288, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 1475.7746276855469, "epoch": 0.5585841236651482, "grad_norm": 11.359289169311523, "kl": 4.5, "learning_rate": 1.609171780118287e-07, "loss": 0.3252, "reward": 0.6227678805589676, "reward_std": 0.2544337585568428, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 1437.6675109863281, "epoch": 0.5588828317526697, "grad_norm": 20.473417282104492, "kl": 5.0234375, "learning_rate": 1.607764362322483e-07, "loss": 0.3653, "reward": 0.6121652126312256, "reward_std": 0.2940291166305542, "rewards/accuracy_reward": 0.04241071594879031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5697544813156128, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 1413.19873046875, "epoch": 0.5591815398401911, "grad_norm": 8.011659622192383, "kl": 4.1796875, "learning_rate": 1.6063569904749137e-07, "loss": 0.3307, "reward": 0.6210937798023224, "reward_std": 0.22539325803518295, "rewards/accuracy_reward": 0.044642857974395156, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.576450914144516, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 1338.5268249511719, "epoch": 0.5594802479277127, "grad_norm": 9.977227210998535, "kl": 4.42578125, "learning_rate": 1.604949666106661e-07, "loss": 0.3492, "reward": 0.684151828289032, "reward_std": 0.27899813279509544, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5658482313156128, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 1345.6250305175781, "epoch": 0.5597789560152341, "grad_norm": 6.7840704917907715, "kl": 3.6171875, "learning_rate": 1.6035423907487565e-07, "loss": 0.3053, "reward": 0.718191996216774, "reward_std": 0.28159430623054504, "rewards/accuracy_reward": 0.14062500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 1424.60498046875, "epoch": 0.5600776641027556, "grad_norm": 8.577967643737793, "kl": 4.03125, "learning_rate": 1.6021351659321772e-07, "loss": 0.3383, "reward": 0.6718750298023224, "reward_std": 0.31771160289645195, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 1435.4665832519531, "epoch": 0.560376372190277, "grad_norm": 12.478219985961914, "kl": 3.10546875, "learning_rate": 1.6007279931878463e-07, "loss": 0.275, "reward": 0.7366071790456772, "reward_std": 0.3168102391064167, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 1391.91748046875, "epoch": 0.5606750802777986, "grad_norm": 12.850930213928223, "kl": 3.30078125, "learning_rate": 1.5993208740466295e-07, "loss": 0.3139, "reward": 0.6969866454601288, "reward_std": 0.25338756665587425, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.569754496216774, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 1433.5647888183594, "epoch": 0.56097378836532, "grad_norm": 6.718024730682373, "kl": 4.41796875, "learning_rate": 1.5979138100393345e-07, "loss": 0.3421, "reward": 0.636160746216774, "reward_std": 0.2744355536997318, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678805589676, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 1327.2053833007812, "epoch": 0.5612724964528415, "grad_norm": 9.475250244140625, "kl": 3.828125, "learning_rate": 1.5965068026967095e-07, "loss": 0.3125, "reward": 0.6835937798023224, "reward_std": 0.2797829769551754, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687798023224, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 1371.6719360351562, "epoch": 0.5615712045403629, "grad_norm": 12.463448524475098, "kl": 4.53125, "learning_rate": 1.5950998535494405e-07, "loss": 0.3734, "reward": 0.6847098469734192, "reward_std": 0.30426161736249924, "rewards/accuracy_reward": 0.09151786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.593191996216774, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 1352.6228332519531, "epoch": 0.5618699126278844, "grad_norm": 16.644023895263672, "kl": 5.8046875, "learning_rate": 1.5936929641281497e-07, "loss": 0.4041, "reward": 0.6540178805589676, "reward_std": 0.2767050825059414, "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714477300644, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 1477.5647888183594, "epoch": 0.5621686207154059, "grad_norm": 8.648137092590332, "kl": 4.14453125, "learning_rate": 1.592286135963395e-07, "loss": 0.2462, "reward": 0.718191996216774, "reward_std": 0.3114876039326191, "rewards/accuracy_reward": 0.12276786402799189, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241454601288, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 1474.6786499023438, "epoch": 0.5624673288029274, "grad_norm": 7.946831703186035, "kl": 4.75, "learning_rate": 1.590879370585668e-07, "loss": 0.3623, "reward": 0.7198660969734192, "reward_std": 0.30389414727687836, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5636161118745804, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 1351.2478332519531, "epoch": 0.5627660368904488, "grad_norm": 5.698549270629883, "kl": 4.51171875, "learning_rate": 1.5894726695253917e-07, "loss": 0.3788, "reward": 0.7371652126312256, "reward_std": 0.2931933179497719, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5987723618745804, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 1442.5045166015625, "epoch": 0.5630647449779703, "grad_norm": 8.737481117248535, "kl": 3.375, "learning_rate": 1.5880660343129186e-07, "loss": 0.2464, "reward": 0.6450893133878708, "reward_std": 0.31790120154619217, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 1427.3795166015625, "epoch": 0.5633634530654917, "grad_norm": 19.727590560913086, "kl": 3.9765625, "learning_rate": 1.5866594664785295e-07, "loss": 0.3289, "reward": 0.7248884290456772, "reward_std": 0.2937803231179714, "rewards/accuracy_reward": 0.15848215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5664062649011612, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 1381.4955749511719, "epoch": 0.5636621611530133, "grad_norm": 12.120770454406738, "kl": 3.59765625, "learning_rate": 1.585252967552433e-07, "loss": 0.3267, "reward": 0.658482164144516, "reward_std": 0.3108033016324043, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5468750298023224, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 1408.4866943359375, "epoch": 0.5639608692405347, "grad_norm": 6.821669101715088, "kl": 3.421875, "learning_rate": 1.5838465390647622e-07, "loss": 0.2626, "reward": 0.623325914144516, "reward_std": 0.29980961233377457, "rewards/accuracy_reward": 0.049107145983725786, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187798023224, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 1351.1406860351562, "epoch": 0.5642595773280562, "grad_norm": 7.972107887268066, "kl": 4.25, "learning_rate": 1.5824401825455729e-07, "loss": 0.3139, "reward": 0.7109375298023224, "reward_std": 0.277482595294714, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732313156128, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 1348.4040832519531, "epoch": 0.5645582854155776, "grad_norm": 15.238397598266602, "kl": 4.2421875, "learning_rate": 1.5810338995248435e-07, "loss": 0.4617, "reward": 0.7779018133878708, "reward_std": 0.28258268162608147, "rewards/accuracy_reward": 0.18303571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5948660969734192, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 1409.6473693847656, "epoch": 0.5648569935030991, "grad_norm": 18.133298873901367, "kl": 4.38671875, "learning_rate": 1.5796276915324722e-07, "loss": 0.3062, "reward": 0.6356026977300644, "reward_std": 0.28336698934435844, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098469734192, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 1486.5045166015625, "epoch": 0.5651557015906206, "grad_norm": 5.50261926651001, "kl": 4.1484375, "learning_rate": 1.578221560098275e-07, "loss": 0.2991, "reward": 0.7343750298023224, "reward_std": 0.2647612430155277, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928954601288, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 1385.3103332519531, "epoch": 0.5654544096781421, "grad_norm": 27.096145629882812, "kl": 5.3515625, "learning_rate": 1.5768155067519853e-07, "loss": 0.4017, "reward": 0.6646205633878708, "reward_std": 0.3141920119524002, "rewards/accuracy_reward": 0.12276785913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5418527126312256, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 1384.8572082519531, "epoch": 0.5657531177656635, "grad_norm": 20.733558654785156, "kl": 4.59375, "learning_rate": 1.5754095330232518e-07, "loss": 0.3311, "reward": 0.6757812798023224, "reward_std": 0.2998349256813526, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.553013414144516, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 1375.4420471191406, "epoch": 0.5660518258531849, "grad_norm": 11.478658676147461, "kl": 3.4609375, "learning_rate": 1.5740036404416352e-07, "loss": 0.3322, "reward": 0.6562500447034836, "reward_std": 0.30472536385059357, "rewards/accuracy_reward": 0.06919643306173384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.587053582072258, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 1513.3884887695312, "epoch": 0.5663505339407064, "grad_norm": 5.773934841156006, "kl": 4.99609375, "learning_rate": 1.5725978305366097e-07, "loss": 0.2544, "reward": 0.7154018133878708, "reward_std": 0.2312828041613102, "rewards/accuracy_reward": 0.15178571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5636161118745804, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 1394.3884582519531, "epoch": 0.5666492420282279, "grad_norm": 14.000238418579102, "kl": 3.421875, "learning_rate": 1.5711921048375583e-07, "loss": 0.3253, "reward": 0.6311384290456772, "reward_std": 0.2548329047858715, "rewards/accuracy_reward": 0.040178574388846755, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 1368.8572387695312, "epoch": 0.5669479501157494, "grad_norm": 6.613846302032471, "kl": 3.91796875, "learning_rate": 1.5697864648737728e-07, "loss": 0.2855, "reward": 0.680245578289032, "reward_std": 0.29822012782096863, "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 1300.3304138183594, "epoch": 0.5672466582032708, "grad_norm": 6.335110664367676, "kl": 3.62109375, "learning_rate": 1.568380912174452e-07, "loss": 0.288, "reward": 0.6501116305589676, "reward_std": 0.30537545680999756, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437649011612, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 1380.2500610351562, "epoch": 0.5675453662907923, "grad_norm": 13.746726036071777, "kl": 3.5078125, "learning_rate": 1.5669754482686986e-07, "loss": 0.2941, "reward": 0.6344866305589676, "reward_std": 0.28729943186044693, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585379496216774, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 1469.6897888183594, "epoch": 0.5678440743783137, "grad_norm": 20.4333438873291, "kl": 3.43359375, "learning_rate": 1.5655700746855207e-07, "loss": 0.2834, "reward": 0.6746652275323868, "reward_std": 0.3097744919359684, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187649011612, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 1403.357177734375, "epoch": 0.5681427824658353, "grad_norm": 32.224605560302734, "kl": 5.2890625, "learning_rate": 1.5641647929538263e-07, "loss": 0.4192, "reward": 0.6372768133878708, "reward_std": 0.31116385012865067, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5725446790456772, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 1342.2969360351562, "epoch": 0.5684414905533567, "grad_norm": 8.271327018737793, "kl": 3.75390625, "learning_rate": 1.5627596046024245e-07, "loss": 0.3403, "reward": 0.6735491454601288, "reward_std": 0.28192390501499176, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 1307.2255249023438, "epoch": 0.5687401986408782, "grad_norm": 12.247727394104004, "kl": 3.9765625, "learning_rate": 1.5613545111600223e-07, "loss": 0.2726, "reward": 0.7282366454601288, "reward_std": 0.28172971680760384, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116305589676, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 1324.2500610351562, "epoch": 0.5690389067283996, "grad_norm": 6.331303596496582, "kl": 3.98046875, "learning_rate": 1.5599495141552244e-07, "loss": 0.3644, "reward": 0.6813616156578064, "reward_std": 0.3220340982079506, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585379496216774, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 1379.9866333007812, "epoch": 0.5693376148159212, "grad_norm": 15.62591552734375, "kl": 4.51171875, "learning_rate": 1.5585446151165284e-07, "loss": 0.355, "reward": 0.6294643133878708, "reward_std": 0.25295573472976685, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 1320.6942443847656, "epoch": 0.5696363229034426, "grad_norm": 23.15890121459961, "kl": 4.68359375, "learning_rate": 1.5571398155723277e-07, "loss": 0.3571, "reward": 0.6657366305589676, "reward_std": 0.24996113404631615, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366305589676, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 1424.0335388183594, "epoch": 0.5699350309909641, "grad_norm": 15.789984703063965, "kl": 5.21875, "learning_rate": 1.5557351170509067e-07, "loss": 0.4118, "reward": 0.7332589626312256, "reward_std": 0.32338325679302216, "rewards/accuracy_reward": 0.1540178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5792410969734192, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 1373.5804443359375, "epoch": 0.5702337390784855, "grad_norm": 14.429099082946777, "kl": 4.375, "learning_rate": 1.5543305210804394e-07, "loss": 0.3003, "reward": 0.6707589626312256, "reward_std": 0.29293566942214966, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.597098246216774, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 1255.69873046875, "epoch": 0.570532447166007, "grad_norm": 18.307212829589844, "kl": 4.94140625, "learning_rate": 1.5529260291889882e-07, "loss": 0.3829, "reward": 0.729910746216774, "reward_std": 0.3018101491034031, "rewards/accuracy_reward": 0.14732143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825892984867096, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 1442.1920166015625, "epoch": 0.5708311552535285, "grad_norm": 11.762177467346191, "kl": 3.77734375, "learning_rate": 1.5515216429045032e-07, "loss": 0.3253, "reward": 0.6858259290456772, "reward_std": 0.2684941254556179, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.545200914144516, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 1399.2947082519531, "epoch": 0.57112986334105, "grad_norm": 23.930805206298828, "kl": 3.24609375, "learning_rate": 1.5501173637548188e-07, "loss": 0.2637, "reward": 0.7209821939468384, "reward_std": 0.35404370725154877, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 1409.8482666015625, "epoch": 0.5714285714285714, "grad_norm": 23.586381912231445, "kl": 2.83203125, "learning_rate": 1.548713193267653e-07, "loss": 0.2536, "reward": 0.6467634290456772, "reward_std": 0.342055581510067, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.553013414144516, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 1367.6942749023438, "epoch": 0.5717272795160929, "grad_norm": 10.011240005493164, "kl": 3.375, "learning_rate": 1.5473091329706058e-07, "loss": 0.2944, "reward": 0.6981027126312256, "reward_std": 0.3256761357188225, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.559709832072258, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 1374.6361999511719, "epoch": 0.5720259876036143, "grad_norm": 19.35411834716797, "kl": 2.5859375, "learning_rate": 1.5459051843911573e-07, "loss": 0.2205, "reward": 0.7678571790456772, "reward_std": 0.25900914892554283, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178954601288, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 1450.4263916015625, "epoch": 0.5723246956911359, "grad_norm": 5.638742446899414, "kl": 3.24609375, "learning_rate": 1.5445013490566655e-07, "loss": 0.233, "reward": 0.7148437798023224, "reward_std": 0.27841370552778244, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6010044813156128, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 1368.33935546875, "epoch": 0.5726234037786573, "grad_norm": 9.68177318572998, "kl": 5.28125, "learning_rate": 1.543097628494366e-07, "loss": 0.3582, "reward": 0.6964285969734192, "reward_std": 0.25139642134308815, "rewards/accuracy_reward": 0.12053572246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 1509.7925109863281, "epoch": 0.5729221118661788, "grad_norm": 13.936638832092285, "kl": 4.421875, "learning_rate": 1.541694024231369e-07, "loss": 0.3158, "reward": 0.5652901977300644, "reward_std": 0.23869851231575012, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5496651977300644, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 1457.0357666015625, "epoch": 0.5732208199537002, "grad_norm": 13.078454971313477, "kl": 4.671875, "learning_rate": 1.540290537794659e-07, "loss": 0.3524, "reward": 0.6395089477300644, "reward_std": 0.2726872190833092, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.556919664144516, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 1372.1697082519531, "epoch": 0.5735195280412217, "grad_norm": 5.247359752655029, "kl": 4.16015625, "learning_rate": 1.5388871707110912e-07, "loss": 0.3598, "reward": 0.685825914144516, "reward_std": 0.27826663851737976, "rewards/accuracy_reward": 0.12053571944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652902126312256, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 1380.3482666015625, "epoch": 0.5738182361287432, "grad_norm": 6.251551628112793, "kl": 4.1328125, "learning_rate": 1.5374839245073908e-07, "loss": 0.295, "reward": 0.6540178954601288, "reward_std": 0.27331798151135445, "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 1423.3750915527344, "epoch": 0.5741169442162647, "grad_norm": 9.717523574829102, "kl": 4.40625, "learning_rate": 1.5360808007101533e-07, "loss": 0.3432, "reward": 0.6880580633878708, "reward_std": 0.28817786276340485, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5764509290456772, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 1385.4531860351562, "epoch": 0.5744156523037861, "grad_norm": 6.986042499542236, "kl": 3.64453125, "learning_rate": 1.5346778008458392e-07, "loss": 0.3174, "reward": 0.783482164144516, "reward_std": 0.255219716578722, "rewards/accuracy_reward": 0.20982143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.573660746216774, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 1464.1920471191406, "epoch": 0.5747143603913076, "grad_norm": 4.5904154777526855, "kl": 3.48828125, "learning_rate": 1.5332749264407747e-07, "loss": 0.289, "reward": 0.7265625149011612, "reward_std": 0.293657049536705, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839626312256, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 1327.7813110351562, "epoch": 0.575013068478829, "grad_norm": 9.009905815124512, "kl": 3.79296875, "learning_rate": 1.5318721790211505e-07, "loss": 0.3636, "reward": 0.6796875298023224, "reward_std": 0.28179340064525604, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5658482611179352, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 1342.2879638671875, "epoch": 0.5753117765663506, "grad_norm": 18.013227462768555, "kl": 4.8203125, "learning_rate": 1.5304695601130176e-07, "loss": 0.3781, "reward": 0.6434151977300644, "reward_std": 0.30055392906069756, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.569754496216774, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 1387.6986999511719, "epoch": 0.575610484653872, "grad_norm": 15.544002532958984, "kl": 5.3203125, "learning_rate": 1.529067071242288e-07, "loss": 0.4253, "reward": 0.6026785969734192, "reward_std": 0.3063415288925171, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.549107164144516, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 1456.7813415527344, "epoch": 0.5759091927413935, "grad_norm": 6.908018112182617, "kl": 4.0625, "learning_rate": 1.5276647139347316e-07, "loss": 0.3155, "reward": 0.613839328289032, "reward_std": 0.29881251603364944, "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 1329.63623046875, "epoch": 0.5762079008289149, "grad_norm": 14.821483612060547, "kl": 3.98046875, "learning_rate": 1.5262624897159772e-07, "loss": 0.3011, "reward": 0.6969866305589676, "reward_std": 0.2955911308526993, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6143973469734192, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 1386.1786499023438, "epoch": 0.5765066089164365, "grad_norm": 5.2094268798828125, "kl": 3.5625, "learning_rate": 1.5248604001115057e-07, "loss": 0.2694, "reward": 0.680803582072258, "reward_std": 0.32700880244374275, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 1473.87060546875, "epoch": 0.5768053170039579, "grad_norm": 7.692128658294678, "kl": 6.3203125, "learning_rate": 1.523458446646654e-07, "loss": 0.2693, "reward": 0.6082589477300644, "reward_std": 0.2695879228413105, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572544664144516, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 1405.7210388183594, "epoch": 0.5771040250914794, "grad_norm": 7.895233154296875, "kl": 3.50390625, "learning_rate": 1.5220566308466102e-07, "loss": 0.2596, "reward": 0.7734375298023224, "reward_std": 0.3238030895590782, "rewards/accuracy_reward": 0.19196429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5814732313156128, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 1376.0335388183594, "epoch": 0.5774027331790008, "grad_norm": 7.1541924476623535, "kl": 3.94921875, "learning_rate": 1.5206549542364125e-07, "loss": 0.3316, "reward": 0.671316996216774, "reward_std": 0.27328693866729736, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5507812798023224, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 1322.9420166015625, "epoch": 0.5777014412665223, "grad_norm": 5.020935535430908, "kl": 3.34765625, "learning_rate": 1.519253418340947e-07, "loss": 0.2796, "reward": 0.6210937649011612, "reward_std": 0.2695132866501808, "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437649011612, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 1373.9620666503906, "epoch": 0.5780001493540438, "grad_norm": 7.337472438812256, "kl": 3.89453125, "learning_rate": 1.5178520246849477e-07, "loss": 0.3027, "reward": 0.6724330633878708, "reward_std": 0.31244557350873947, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5853794813156128, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 1380.5736999511719, "epoch": 0.5782988574415653, "grad_norm": 7.608755111694336, "kl": 4.125, "learning_rate": 1.5164507747929938e-07, "loss": 0.3287, "reward": 0.7165178805589676, "reward_std": 0.3155674710869789, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714286118745804, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 1371.05810546875, "epoch": 0.5785975655290867, "grad_norm": 6.090319633483887, "kl": 3.03515625, "learning_rate": 1.515049670189508e-07, "loss": 0.1917, "reward": 0.6316964477300644, "reward_std": 0.28099002316594124, "rewards/accuracy_reward": 0.04687500302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214626312256, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 1285.7880249023438, "epoch": 0.5788962736166081, "grad_norm": 7.86739444732666, "kl": 3.53125, "learning_rate": 1.513648712398754e-07, "loss": 0.3528, "reward": 0.7327009290456772, "reward_std": 0.29888515919446945, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5943080484867096, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 1338.5246276855469, "epoch": 0.5791949817041296, "grad_norm": 8.26458740234375, "kl": 3.734375, "learning_rate": 1.5122479029448375e-07, "loss": 0.3294, "reward": 0.6958705633878708, "reward_std": 0.31446077674627304, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312649011612, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 1370.5960388183594, "epoch": 0.5794936897916511, "grad_norm": 5.58909797668457, "kl": 4.125, "learning_rate": 1.510847243351701e-07, "loss": 0.3706, "reward": 0.6607143133878708, "reward_std": 0.30466414615511894, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 1402.0938110351562, "epoch": 0.5797923978791726, "grad_norm": 8.996810913085938, "kl": 4.28515625, "learning_rate": 1.5094467351431257e-07, "loss": 0.3039, "reward": 0.6155134290456772, "reward_std": 0.3025863841176033, "rewards/accuracy_reward": 0.053571432596072555, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5619419813156128, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 1340.83935546875, "epoch": 0.580091105966694, "grad_norm": 11.24134635925293, "kl": 3.03515625, "learning_rate": 1.5080463798427265e-07, "loss": 0.2945, "reward": 0.7527902126312256, "reward_std": 0.3421020954847336, "rewards/accuracy_reward": 0.16964286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5831473618745804, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 1380.9822082519531, "epoch": 0.5803898140542155, "grad_norm": 6.453120231628418, "kl": 4.12890625, "learning_rate": 1.5066461789739532e-07, "loss": 0.3737, "reward": 0.7070312798023224, "reward_std": 0.27580878138542175, "rewards/accuracy_reward": 0.15848215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491454601288, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 1408.4442749023438, "epoch": 0.5806885221417369, "grad_norm": 6.05728006362915, "kl": 3.1953125, "learning_rate": 1.5052461340600876e-07, "loss": 0.252, "reward": 0.6316964626312256, "reward_std": 0.3103446662425995, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 1406.7991638183594, "epoch": 0.5809872302292585, "grad_norm": 8.349687576293945, "kl": 3.69140625, "learning_rate": 1.5038462466242396e-07, "loss": 0.244, "reward": 0.6367187649011612, "reward_std": 0.2740139998495579, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.560825914144516, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 1356.8460388183594, "epoch": 0.5812859383167799, "grad_norm": 6.206048011779785, "kl": 4.23828125, "learning_rate": 1.5024465181893515e-07, "loss": 0.3288, "reward": 0.6875000298023224, "reward_std": 0.29526805505156517, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 1393.5513916015625, "epoch": 0.5815846464043014, "grad_norm": 14.71874713897705, "kl": 3.52734375, "learning_rate": 1.5010469502781893e-07, "loss": 0.2794, "reward": 0.726004496216774, "reward_std": 0.296544186770916, "rewards/accuracy_reward": 0.15625000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.569754496216774, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 1368.4911193847656, "epoch": 0.5818833544918228, "grad_norm": 15.23145580291748, "kl": 5.2265625, "learning_rate": 1.499647544413347e-07, "loss": 0.4455, "reward": 0.6808035969734192, "reward_std": 0.3072381317615509, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5424107313156128, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 1387.4754638671875, "epoch": 0.5821820625793444, "grad_norm": 7.739885330200195, "kl": 3.8359375, "learning_rate": 1.4982483021172393e-07, "loss": 0.3311, "reward": 0.6657366305589676, "reward_std": 0.28103844076395035, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187649011612, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 1432.9933471679688, "epoch": 0.5824807706668658, "grad_norm": 18.051645278930664, "kl": 4.76171875, "learning_rate": 1.4968492249121062e-07, "loss": 0.3693, "reward": 0.6824776977300644, "reward_std": 0.26480428874492645, "rewards/accuracy_reward": 0.13616072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546316996216774, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 1310.9129943847656, "epoch": 0.5827794787543873, "grad_norm": 6.450240135192871, "kl": 4.16796875, "learning_rate": 1.495450314320006e-07, "loss": 0.3775, "reward": 0.7131696790456772, "reward_std": 0.29962307959795, "rewards/accuracy_reward": 0.15401786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5591518133878708, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 1310.6808471679688, "epoch": 0.5830781868419087, "grad_norm": 9.06146240234375, "kl": 3.84375, "learning_rate": 1.4940515718628157e-07, "loss": 0.3579, "reward": 0.694754496216774, "reward_std": 0.333117812871933, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.560825914144516, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 1426.2679138183594, "epoch": 0.5833768949294302, "grad_norm": 5.897479057312012, "kl": 3.76953125, "learning_rate": 1.4926529990622317e-07, "loss": 0.2734, "reward": 0.6205357313156128, "reward_std": 0.2962794080376625, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000149011612, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 1285.2701721191406, "epoch": 0.5836756030169516, "grad_norm": 6.376990795135498, "kl": 3.4296875, "learning_rate": 1.4912545974397625e-07, "loss": 0.2978, "reward": 0.7829241305589676, "reward_std": 0.3116602823138237, "rewards/accuracy_reward": 0.21205358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705484867096, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 1416.83935546875, "epoch": 0.5839743111044732, "grad_norm": 7.64666748046875, "kl": 3.02734375, "learning_rate": 1.4898563685167327e-07, "loss": 0.2497, "reward": 0.6612723618745804, "reward_std": 0.3043437749147415, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5518973469734192, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 1400.4107971191406, "epoch": 0.5842730191919946, "grad_norm": 13.71502685546875, "kl": 3.86328125, "learning_rate": 1.4884583138142775e-07, "loss": 0.2762, "reward": 0.6411830484867096, "reward_std": 0.26736724004149437, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187649011612, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 1334.5736999511719, "epoch": 0.5845717272795161, "grad_norm": 7.495909214019775, "kl": 4.40234375, "learning_rate": 1.4870604348533448e-07, "loss": 0.3716, "reward": 0.7036830484867096, "reward_std": 0.3061627447605133, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5340401977300644, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 1348.8460388183594, "epoch": 0.5848704353670375, "grad_norm": 7.124397277832031, "kl": 3.46875, "learning_rate": 1.4856627331546892e-07, "loss": 0.3117, "reward": 0.689732164144516, "reward_std": 0.3047820031642914, "rewards/accuracy_reward": 0.12500000675208867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 1387.5201721191406, "epoch": 0.5851691434545591, "grad_norm": 7.831532001495361, "kl": 2.89453125, "learning_rate": 1.4842652102388727e-07, "loss": 0.2485, "reward": 0.694754496216774, "reward_std": 0.3176226541399956, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 1385.5245971679688, "epoch": 0.5854678515420805, "grad_norm": 7.4912238121032715, "kl": 3.3203125, "learning_rate": 1.482867867626264e-07, "loss": 0.2671, "reward": 0.6322544813156128, "reward_std": 0.2844536602497101, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5630580633878708, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 1413.3705749511719, "epoch": 0.585766559629602, "grad_norm": 5.603409767150879, "kl": 3.0546875, "learning_rate": 1.4814707068370347e-07, "loss": 0.2356, "reward": 0.6283482313156128, "reward_std": 0.262585386633873, "rewards/accuracy_reward": 0.0736607147846371, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875149011612, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 1393.94873046875, "epoch": 0.5860652677171234, "grad_norm": 17.369138717651367, "kl": 4.29296875, "learning_rate": 1.480073729391159e-07, "loss": 0.3258, "reward": 0.7539062798023224, "reward_std": 0.3215068466961384, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098469734192, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 1426.7969360351562, "epoch": 0.5863639758046449, "grad_norm": 8.792394638061523, "kl": 3.14453125, "learning_rate": 1.4786769368084102e-07, "loss": 0.2859, "reward": 0.6473214477300644, "reward_std": 0.26325012743473053, "rewards/accuracy_reward": 0.07812500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964626312256, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 1327.7210388183594, "epoch": 0.5866626838921664, "grad_norm": 5.443241596221924, "kl": 3.54296875, "learning_rate": 1.4772803306083634e-07, "loss": 0.3387, "reward": 0.6138393133878708, "reward_std": 0.28456345945596695, "rewards/accuracy_reward": 0.049107144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321492433548, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 1296.2031860351562, "epoch": 0.5869613919796879, "grad_norm": 7.9027018547058105, "kl": 3.16015625, "learning_rate": 1.4758839123103885e-07, "loss": 0.3054, "reward": 0.6679687947034836, "reward_std": 0.32707367092370987, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.592075914144516, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 1367.3013916015625, "epoch": 0.5872601000672093, "grad_norm": 6.960598468780518, "kl": 3.140625, "learning_rate": 1.4744876834336513e-07, "loss": 0.3105, "reward": 0.6802455484867096, "reward_std": 0.28270360454916954, "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.553013414144516, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 1254.6272888183594, "epoch": 0.5875588081547308, "grad_norm": 14.233488082885742, "kl": 4.11328125, "learning_rate": 1.473091645497113e-07, "loss": 0.3625, "reward": 0.7566964626312256, "reward_std": 0.3341137617826462, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870535969734192, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 1367.5781555175781, "epoch": 0.5878575162422522, "grad_norm": 15.641640663146973, "kl": 4.05078125, "learning_rate": 1.4716958000195254e-07, "loss": 0.3311, "reward": 0.5987723469734192, "reward_std": 0.2587144561111927, "rewards/accuracy_reward": 0.04017857299186289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937649011612, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 1495.368408203125, "epoch": 0.5881562243297738, "grad_norm": 9.600568771362305, "kl": 4.34765625, "learning_rate": 1.470300148519432e-07, "loss": 0.3022, "reward": 0.6344866305589676, "reward_std": 0.26814424246549606, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937798023224, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 1377.7389221191406, "epoch": 0.5884549324172952, "grad_norm": 18.730422973632812, "kl": 3.6796875, "learning_rate": 1.4689046925151637e-07, "loss": 0.2714, "reward": 0.6434151977300644, "reward_std": 0.23121271282434464, "rewards/accuracy_reward": 0.08928571618162096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.554129496216774, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 1361.72998046875, "epoch": 0.5887536405048167, "grad_norm": 7.908825874328613, "kl": 2.80078125, "learning_rate": 1.4675094335248415e-07, "loss": 0.2282, "reward": 0.6780134290456772, "reward_std": 0.2550499439239502, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 1434.3861999511719, "epoch": 0.5890523485923381, "grad_norm": 4.169092178344727, "kl": 3.2421875, "learning_rate": 1.466114373066369e-07, "loss": 0.2807, "reward": 0.8080357760190964, "reward_std": 0.2966434247791767, "rewards/accuracy_reward": 0.2544642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714328289032, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 1431.5826721191406, "epoch": 0.5893510566798597, "grad_norm": 4.9884419441223145, "kl": 3.1640625, "learning_rate": 1.4647195126574367e-07, "loss": 0.2712, "reward": 0.638950914144516, "reward_std": 0.29749497026205063, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.549665205180645, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 1391.9933776855469, "epoch": 0.5896497647673811, "grad_norm": 11.601157188415527, "kl": 3.12109375, "learning_rate": 1.4633248538155142e-07, "loss": 0.2806, "reward": 0.6238839477300644, "reward_std": 0.3072785958647728, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5636160969734192, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 1390.6831359863281, "epoch": 0.5899484728549026, "grad_norm": 9.811483383178711, "kl": 3.06640625, "learning_rate": 1.4619303980578554e-07, "loss": 0.2128, "reward": 0.752232164144516, "reward_std": 0.31431031227111816, "rewards/accuracy_reward": 0.1674107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 1414.0647888183594, "epoch": 0.590247180942424, "grad_norm": 7.929187774658203, "kl": 2.34765625, "learning_rate": 1.4605361469014902e-07, "loss": 0.1907, "reward": 0.695870578289032, "reward_std": 0.24410007148981094, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5574777126312256, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 1384.1116943359375, "epoch": 0.5905458890299455, "grad_norm": 14.09411907196045, "kl": 2.216796875, "learning_rate": 1.4591421018632278e-07, "loss": 0.1917, "reward": 0.7282366305589676, "reward_std": 0.27439112216234207, "rewards/accuracy_reward": 0.11830357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6099330633878708, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 1418.6875610351562, "epoch": 0.590844597117467, "grad_norm": 8.055232048034668, "kl": 2.66796875, "learning_rate": 1.4577482644596528e-07, "loss": 0.2833, "reward": 0.7047991454601288, "reward_std": 0.23440709710121155, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991305589676, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 1420.0313110351562, "epoch": 0.5911433052049885, "grad_norm": 3.697152614593506, "kl": 3.1640625, "learning_rate": 1.4563546362071234e-07, "loss": 0.2111, "reward": 0.6054687649011612, "reward_std": 0.2922612242400646, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937798023224, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 1421.2478332519531, "epoch": 0.5914420132925099, "grad_norm": 6.970768451690674, "kl": 3.11328125, "learning_rate": 1.45496121862177e-07, "loss": 0.2758, "reward": 0.6891741305589676, "reward_std": 0.27772384136915207, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 1364.3370971679688, "epoch": 0.5917407213800313, "grad_norm": 9.672041893005371, "kl": 3.142578125, "learning_rate": 1.453568013219495e-07, "loss": 0.2505, "reward": 0.6763393133878708, "reward_std": 0.26078562811017036, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.558035746216774, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 1415.8639221191406, "epoch": 0.5920394294675528, "grad_norm": 12.074259757995605, "kl": 3.76953125, "learning_rate": 1.4521750215159697e-07, "loss": 0.2853, "reward": 0.6378348469734192, "reward_std": 0.27555037289857864, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.584263414144516, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 1353.2478637695312, "epoch": 0.5923381375550743, "grad_norm": 8.450279235839844, "kl": 3.65625, "learning_rate": 1.450782245026632e-07, "loss": 0.2919, "reward": 0.6573661118745804, "reward_std": 0.2825522795319557, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125298023224, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 1487.6027526855469, "epoch": 0.5926368456425958, "grad_norm": 8.778834342956543, "kl": 3.25, "learning_rate": 1.4493896852666856e-07, "loss": 0.2536, "reward": 0.6501116305589676, "reward_std": 0.30807413905858994, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 1481.1920166015625, "epoch": 0.5929355537301172, "grad_norm": 4.010795593261719, "kl": 3.2734375, "learning_rate": 1.4479973437511e-07, "loss": 0.2421, "reward": 0.7477678954601288, "reward_std": 0.3143846467137337, "rewards/accuracy_reward": 0.20089286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5468750447034836, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 1343.7344360351562, "epoch": 0.5932342618176387, "grad_norm": 17.09865951538086, "kl": 2.373046875, "learning_rate": 1.4466052219946066e-07, "loss": 0.2046, "reward": 0.6618303805589676, "reward_std": 0.28607727959752083, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089626312256, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 1392.6942749023438, "epoch": 0.5935329699051601, "grad_norm": 8.985151290893555, "kl": 2.45703125, "learning_rate": 1.4452133215116964e-07, "loss": 0.2145, "reward": 0.6316964626312256, "reward_std": 0.253093596547842, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964626312256, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 1319.5023193359375, "epoch": 0.5938316779926817, "grad_norm": 24.96198081970215, "kl": 2.4921875, "learning_rate": 1.4438216438166208e-07, "loss": 0.3013, "reward": 0.706473246216774, "reward_std": 0.3130028322339058, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 1296.7657165527344, "epoch": 0.5941303860802031, "grad_norm": 14.484698295593262, "kl": 2.27734375, "learning_rate": 1.4424301904233896e-07, "loss": 0.2217, "reward": 0.7092634290456772, "reward_std": 0.26312778890132904, "rewards/accuracy_reward": 0.1361607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731027275323868, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 1331.6830749511719, "epoch": 0.5944290941677246, "grad_norm": 6.195709228515625, "kl": 2.54296875, "learning_rate": 1.4410389628457673e-07, "loss": 0.2469, "reward": 0.7131696790456772, "reward_std": 0.32255779951810837, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 1449.2322082519531, "epoch": 0.594727802255246, "grad_norm": 7.8950114250183105, "kl": 2.90234375, "learning_rate": 1.4396479625972723e-07, "loss": 0.2539, "reward": 0.6367187798023224, "reward_std": 0.3096061274409294, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 1353.1719665527344, "epoch": 0.5950265103427675, "grad_norm": 20.94936752319336, "kl": 3.80859375, "learning_rate": 1.438257191191178e-07, "loss": 0.3085, "reward": 0.684151828289032, "reward_std": 0.2808511294424534, "rewards/accuracy_reward": 0.12053572363220155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.563616082072258, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 1414.2723693847656, "epoch": 0.595325218430289, "grad_norm": 5.279806613922119, "kl": 3.2578125, "learning_rate": 1.4368666501405062e-07, "loss": 0.2811, "reward": 0.6155134290456772, "reward_std": 0.2500479482114315, "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.599888414144516, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 1440.6808776855469, "epoch": 0.5956239265178105, "grad_norm": 17.93058967590332, "kl": 3.7734375, "learning_rate": 1.43547634095803e-07, "loss": 0.2858, "reward": 0.6462053805589676, "reward_std": 0.3061857894062996, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.579241082072258, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 1375.1563415527344, "epoch": 0.5959226346053319, "grad_norm": 14.843433380126953, "kl": 4.171875, "learning_rate": 1.434086265156269e-07, "loss": 0.2724, "reward": 0.6110491305589676, "reward_std": 0.2276722900569439, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098618745804, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 1451.1920776367188, "epoch": 0.5962213426928534, "grad_norm": 4.387238502502441, "kl": 3.390625, "learning_rate": 1.43269642424749e-07, "loss": 0.2254, "reward": 0.718191996216774, "reward_std": 0.2646103650331497, "rewards/accuracy_reward": 0.14508928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 1313.49560546875, "epoch": 0.5965200507803748, "grad_norm": 10.169485092163086, "kl": 3.34765625, "learning_rate": 1.4313068197437032e-07, "loss": 0.2781, "reward": 0.7137277126312256, "reward_std": 0.30214254558086395, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 1454.7656860351562, "epoch": 0.5968187588678964, "grad_norm": 11.764808654785156, "kl": 4.0078125, "learning_rate": 1.4299174531566622e-07, "loss": 0.3049, "reward": 0.6802455633878708, "reward_std": 0.2555863745510578, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5507812649011612, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 1366.2411499023438, "epoch": 0.5971174669554178, "grad_norm": 4.61937141418457, "kl": 3.3828125, "learning_rate": 1.4285283259978622e-07, "loss": 0.3594, "reward": 0.582589328289032, "reward_std": 0.2607192099094391, "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5580357313156128, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 1340.7813110351562, "epoch": 0.5974161750429393, "grad_norm": 8.287111282348633, "kl": 2.39453125, "learning_rate": 1.4271394397785363e-07, "loss": 0.2216, "reward": 0.702566996216774, "reward_std": 0.29358669370412827, "rewards/accuracy_reward": 0.13169643585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705633878708, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 1424.0736999511719, "epoch": 0.5977148831304607, "grad_norm": 3.136125326156616, "kl": 2.240234375, "learning_rate": 1.4257507960096579e-07, "loss": 0.2178, "reward": 0.6261161118745804, "reward_std": 0.29503367096185684, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803954601288, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 1383.0648193359375, "epoch": 0.5980135912179823, "grad_norm": 7.328308582305908, "kl": 3.21484375, "learning_rate": 1.4243623962019343e-07, "loss": 0.3225, "reward": 0.7438616305589676, "reward_std": 0.3307199329137802, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.567522332072258, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 1309.5402221679688, "epoch": 0.5983122993055037, "grad_norm": 9.66983413696289, "kl": 2.6171875, "learning_rate": 1.4229742418658084e-07, "loss": 0.2866, "reward": 0.777901828289032, "reward_std": 0.3283437117934227, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5636160969734192, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 1418.08935546875, "epoch": 0.5986110073930252, "grad_norm": 5.183480262756348, "kl": 3.03515625, "learning_rate": 1.4215863345114572e-07, "loss": 0.2293, "reward": 0.615513414144516, "reward_std": 0.2601779028773308, "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705484867096, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 1241.607177734375, "epoch": 0.5989097154805466, "grad_norm": 5.825594902038574, "kl": 3.1015625, "learning_rate": 1.4201986756487864e-07, "loss": 0.3164, "reward": 0.7494420111179352, "reward_std": 0.2913287319242954, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.593191996216774, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 1400.1473999023438, "epoch": 0.5992084235680681, "grad_norm": 10.661396026611328, "kl": 4.02734375, "learning_rate": 1.418811266787434e-07, "loss": 0.2828, "reward": 0.6847098618745804, "reward_std": 0.2777964249253273, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5507812798023224, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 1422.2991638183594, "epoch": 0.5995071316555896, "grad_norm": 5.692141056060791, "kl": 3.2734375, "learning_rate": 1.4174241094367645e-07, "loss": 0.2537, "reward": 0.599888414144516, "reward_std": 0.2918461337685585, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.553013414144516, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 1475.3013916015625, "epoch": 0.5998058397431111, "grad_norm": 5.6396870613098145, "kl": 3.2890625, "learning_rate": 1.4160372051058687e-07, "loss": 0.2859, "reward": 0.6378348469734192, "reward_std": 0.3051426038146019, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991305589676, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 1374.7857666015625, "epoch": 0.6001045478306325, "grad_norm": 6.649861812591553, "kl": 3.734375, "learning_rate": 1.414650555303563e-07, "loss": 0.2962, "reward": 0.6529017984867096, "reward_std": 0.2985694110393524, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125447034836, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 1415.7210388183594, "epoch": 0.600403255918154, "grad_norm": 18.136808395385742, "kl": 3.921875, "learning_rate": 1.4132641615383866e-07, "loss": 0.3012, "reward": 0.6266741305589676, "reward_std": 0.2935921438038349, "rewards/accuracy_reward": 0.08705357369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5396205484867096, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 1475.1139221191406, "epoch": 0.6007019640056754, "grad_norm": 6.444738388061523, "kl": 3.05859375, "learning_rate": 1.4118780253185998e-07, "loss": 0.3008, "reward": 0.6238839626312256, "reward_std": 0.29765721037983894, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5591518133878708, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 1363.169677734375, "epoch": 0.601000672093197, "grad_norm": 5.034937858581543, "kl": 2.96875, "learning_rate": 1.4104921481521825e-07, "loss": 0.1864, "reward": 0.7371651977300644, "reward_std": 0.3083040118217468, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937649011612, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 1351.4397888183594, "epoch": 0.6012993801807184, "grad_norm": 3.793933391571045, "kl": 3.3125, "learning_rate": 1.4091065315468343e-07, "loss": 0.2507, "reward": 0.6729910969734192, "reward_std": 0.29275859519839287, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803805589676, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 1306.450927734375, "epoch": 0.6015980882682399, "grad_norm": 9.880385398864746, "kl": 2.61328125, "learning_rate": 1.4077211770099696e-07, "loss": 0.2459, "reward": 0.6445312798023224, "reward_std": 0.26968057081103325, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5686384290456772, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 1371.513427734375, "epoch": 0.6018967963557613, "grad_norm": 5.6552581787109375, "kl": 3.11328125, "learning_rate": 1.4063360860487183e-07, "loss": 0.3035, "reward": 0.6668526977300644, "reward_std": 0.2999422550201416, "rewards/accuracy_reward": 0.10937500279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5574777126312256, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 1413.0156860351562, "epoch": 0.6021955044432828, "grad_norm": 10.961566925048828, "kl": 2.30078125, "learning_rate": 1.4049512601699238e-07, "loss": 0.2256, "reward": 0.6579241454601288, "reward_std": 0.2867502197623253, "rewards/accuracy_reward": 0.08035714691504836, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.577566996216774, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 1322.8281860351562, "epoch": 0.6024942125308043, "grad_norm": 18.35201644897461, "kl": 2.451171875, "learning_rate": 1.403566700880141e-07, "loss": 0.2431, "reward": 0.6640625447034836, "reward_std": 0.3272792622447014, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 1501.3728332519531, "epoch": 0.6027929206183258, "grad_norm": 16.268569946289062, "kl": 3.40625, "learning_rate": 1.4021824096856342e-07, "loss": 0.247, "reward": 0.6551339477300644, "reward_std": 0.3274150714278221, "rewards/accuracy_reward": 0.1138392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.541294664144516, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 1500.4732971191406, "epoch": 0.6030916287058472, "grad_norm": 8.358407974243164, "kl": 2.96875, "learning_rate": 1.4007983880923772e-07, "loss": 0.2262, "reward": 0.6623884290456772, "reward_std": 0.2701106294989586, "rewards/accuracy_reward": 0.11383928661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491305589676, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 1378.33935546875, "epoch": 0.6033903367933687, "grad_norm": 2.6728768348693848, "kl": 3.3125, "learning_rate": 1.3994146376060497e-07, "loss": 0.2989, "reward": 0.706473246216774, "reward_std": 0.29209083318710327, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5368303954601288, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 1343.3929138183594, "epoch": 0.6036890448808901, "grad_norm": 4.25686502456665, "kl": 2.9453125, "learning_rate": 1.3980311597320366e-07, "loss": 0.2968, "reward": 0.6668527275323868, "reward_std": 0.28350237756967545, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5552455633878708, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 1337.8170166015625, "epoch": 0.6039877529684117, "grad_norm": 9.100079536437988, "kl": 3.5703125, "learning_rate": 1.3966479559754257e-07, "loss": 0.3116, "reward": 0.6590401977300644, "reward_std": 0.286414697766304, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 1468.1473693847656, "epoch": 0.6042864610559331, "grad_norm": 5.173929691314697, "kl": 2.9375, "learning_rate": 1.3952650278410075e-07, "loss": 0.2498, "reward": 0.6540178805589676, "reward_std": 0.2640705928206444, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714477300644, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 1322.9688110351562, "epoch": 0.6045851691434545, "grad_norm": 6.981040000915527, "kl": 2.2734375, "learning_rate": 1.3938823768332722e-07, "loss": 0.2417, "reward": 0.6316964477300644, "reward_std": 0.2715758867561817, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214626312256, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 1433.1451416015625, "epoch": 0.604883877230976, "grad_norm": 5.941267490386963, "kl": 3.34375, "learning_rate": 1.3925000044564093e-07, "loss": 0.2682, "reward": 0.6640625298023224, "reward_std": 0.3082190304994583, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625298023224, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 1447.7991638183594, "epoch": 0.6051825853184974, "grad_norm": 4.679924488067627, "kl": 3.21484375, "learning_rate": 1.3911179122143034e-07, "loss": 0.2791, "reward": 0.6311384290456772, "reward_std": 0.2832742929458618, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.561941996216774, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 1379.9040832519531, "epoch": 0.605481293406019, "grad_norm": 4.6137614250183105, "kl": 3.08203125, "learning_rate": 1.389736101610536e-07, "loss": 0.2855, "reward": 0.631138414144516, "reward_std": 0.2681901827454567, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568638414144516, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 1392.9933471679688, "epoch": 0.6057800014935404, "grad_norm": 20.459091186523438, "kl": 3.65625, "learning_rate": 1.3883545741483812e-07, "loss": 0.3269, "reward": 0.6981027126312256, "reward_std": 0.2959746941924095, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 1324.6585388183594, "epoch": 0.6060787095810619, "grad_norm": 6.81165885925293, "kl": 3.3671875, "learning_rate": 1.386973331330806e-07, "loss": 0.2986, "reward": 0.672991082072258, "reward_std": 0.3134932443499565, "rewards/accuracy_reward": 0.1071428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5658482313156128, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 1355.7657165527344, "epoch": 0.6063774176685833, "grad_norm": 32.33526611328125, "kl": 3.97265625, "learning_rate": 1.3855923746604664e-07, "loss": 0.322, "reward": 0.690848246216774, "reward_std": 0.3211059272289276, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875298023224, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 1301.13623046875, "epoch": 0.6066761257561049, "grad_norm": 8.162555694580078, "kl": 2.6484375, "learning_rate": 1.3842117056397087e-07, "loss": 0.2658, "reward": 0.7137277126312256, "reward_std": 0.3362724632024765, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5552455484867096, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 1401.6295166015625, "epoch": 0.6069748338436263, "grad_norm": 7.253874778747559, "kl": 3.556640625, "learning_rate": 1.3828313257705655e-07, "loss": 0.3138, "reward": 0.6328125447034836, "reward_std": 0.2536350041627884, "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089477300644, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 1492.8259582519531, "epoch": 0.6072735419311478, "grad_norm": 7.0123395919799805, "kl": 2.837890625, "learning_rate": 1.3814512365547543e-07, "loss": 0.2003, "reward": 0.6177455633878708, "reward_std": 0.2501892112195492, "rewards/accuracy_reward": 0.06919643259607255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491305589676, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 1520.7165832519531, "epoch": 0.6075722500186692, "grad_norm": 8.260126113891602, "kl": 2.3671875, "learning_rate": 1.3800714394936775e-07, "loss": 0.1497, "reward": 0.5781250298023224, "reward_std": 0.26952284574508667, "rewards/accuracy_reward": 0.04017857415601611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5379464477300644, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 1307.4844360351562, "epoch": 0.6078709581061907, "grad_norm": 5.001379013061523, "kl": 2.23046875, "learning_rate": 1.3786919360884182e-07, "loss": 0.2094, "reward": 0.707589328289032, "reward_std": 0.2728531025350094, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607313156128, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 1323.6920166015625, "epoch": 0.6081696661937122, "grad_norm": 12.456847190856934, "kl": 2.48828125, "learning_rate": 1.3773127278397416e-07, "loss": 0.2117, "reward": 0.6138393133878708, "reward_std": 0.2713956981897354, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 1374.9308776855469, "epoch": 0.6084683742812337, "grad_norm": 19.171274185180664, "kl": 3.69140625, "learning_rate": 1.3759338162480907e-07, "loss": 0.24, "reward": 0.6339286118745804, "reward_std": 0.25061483308672905, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 1335.6250610351562, "epoch": 0.6087670823687551, "grad_norm": 12.806414604187012, "kl": 2.12109375, "learning_rate": 1.3745552028135863e-07, "loss": 0.1734, "reward": 0.7126116305589676, "reward_std": 0.3050399199128151, "rewards/accuracy_reward": 0.12053571548312902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.592075914144516, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 1392.0357666015625, "epoch": 0.6090657904562766, "grad_norm": 25.608442306518555, "kl": 2.134765625, "learning_rate": 1.3731768890360254e-07, "loss": 0.2539, "reward": 0.6947544813156128, "reward_std": 0.283842321485281, "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5407366305589676, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 1390.3415832519531, "epoch": 0.609364498543798, "grad_norm": 6.262856960296631, "kl": 2.8671875, "learning_rate": 1.3717988764148773e-07, "loss": 0.264, "reward": 0.7036830484867096, "reward_std": 0.2763158231973648, "rewards/accuracy_reward": 0.15178572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5518973469734192, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 1307.8884887695312, "epoch": 0.6096632066313196, "grad_norm": 5.579747200012207, "kl": 2.484375, "learning_rate": 1.370421166449285e-07, "loss": 0.2646, "reward": 0.6071428656578064, "reward_std": 0.273663017898798, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 1460.3504943847656, "epoch": 0.609961914718841, "grad_norm": 10.784658432006836, "kl": 3.22265625, "learning_rate": 1.3690437606380615e-07, "loss": 0.3206, "reward": 0.6824777126312256, "reward_std": 0.2947229743003845, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098469734192, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 1378.5223693847656, "epoch": 0.6102606228063625, "grad_norm": 7.041727542877197, "kl": 3.21484375, "learning_rate": 1.3676666604796901e-07, "loss": 0.3, "reward": 0.6205357313156128, "reward_std": 0.29026759415864944, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5758928805589676, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 1389.1942443847656, "epoch": 0.6105593308938839, "grad_norm": 4.712844371795654, "kl": 2.466796875, "learning_rate": 1.3662898674723209e-07, "loss": 0.183, "reward": 0.6953125447034836, "reward_std": 0.2734471969306469, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375447034836, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 1404.8036499023438, "epoch": 0.6108580389814054, "grad_norm": 19.491504669189453, "kl": 3.75, "learning_rate": 1.3649133831137684e-07, "loss": 0.2946, "reward": 0.6489955633878708, "reward_std": 0.3216875195503235, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705633878708, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 1554.8036499023438, "epoch": 0.6111567470689269, "grad_norm": 19.158523559570312, "kl": 3.9765625, "learning_rate": 1.3635372089015142e-07, "loss": 0.2524, "reward": 0.6110491454601288, "reward_std": 0.2757703699171543, "rewards/accuracy_reward": 0.053571432596072555, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5574776977300644, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 1362.3192443847656, "epoch": 0.6114554551564484, "grad_norm": 7.84423828125, "kl": 3.3203125, "learning_rate": 1.3621613463327e-07, "loss": 0.276, "reward": 0.6802455633878708, "reward_std": 0.25528425350785255, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 1351.3907165527344, "epoch": 0.6117541632439698, "grad_norm": 8.967618942260742, "kl": 4.03125, "learning_rate": 1.3607857969041303e-07, "loss": 0.3172, "reward": 0.6484375298023224, "reward_std": 0.30337879061698914, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5412946790456772, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 1409.2679443359375, "epoch": 0.6120528713314913, "grad_norm": 3.4414706230163574, "kl": 2.19140625, "learning_rate": 1.359410562112267e-07, "loss": 0.1572, "reward": 0.6657366454601288, "reward_std": 0.26253141090273857, "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 1462.1853332519531, "epoch": 0.6123515794190127, "grad_norm": 9.656447410583496, "kl": 2.802734375, "learning_rate": 1.3580356434532315e-07, "loss": 0.2315, "reward": 0.6529018133878708, "reward_std": 0.2661077566444874, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625298023224, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 1398.9152526855469, "epoch": 0.6126502875065343, "grad_norm": 12.31200885772705, "kl": 2.306640625, "learning_rate": 1.3566610424228006e-07, "loss": 0.2702, "reward": 0.6735491305589676, "reward_std": 0.3250267580151558, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741454601288, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 1427.1942443847656, "epoch": 0.6129489955940557, "grad_norm": 11.687807083129883, "kl": 2.052734375, "learning_rate": 1.355286760516405e-07, "loss": 0.2169, "reward": 0.707589328289032, "reward_std": 0.2624897062778473, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000447034836, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 1438.5179443359375, "epoch": 0.6132477036815772, "grad_norm": 5.864814758300781, "kl": 2.51171875, "learning_rate": 1.353912799229129e-07, "loss": 0.2849, "reward": 0.6523437798023224, "reward_std": 0.2844872809946537, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5630580633878708, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 1352.6295166015625, "epoch": 0.6135464117690986, "grad_norm": 8.97548770904541, "kl": 2.689453125, "learning_rate": 1.3525391600557074e-07, "loss": 0.2225, "reward": 0.7070312798023224, "reward_std": 0.249535471200943, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 1391.5290832519531, "epoch": 0.6138451198566202, "grad_norm": 4.30704927444458, "kl": 2.041015625, "learning_rate": 1.351165844490526e-07, "loss": 0.1553, "reward": 0.6010044813156128, "reward_std": 0.24478978663682938, "rewards/accuracy_reward": 0.022321428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830484867096, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 1425.76123046875, "epoch": 0.6141438279441416, "grad_norm": 9.433969497680664, "kl": 2.025390625, "learning_rate": 1.3497928540276163e-07, "loss": 0.1562, "reward": 0.6238839477300644, "reward_std": 0.24698534235358238, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5591517984867096, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 1288.6272888183594, "epoch": 0.6144425360316631, "grad_norm": 4.861375331878662, "kl": 2.46484375, "learning_rate": 1.3484201901606583e-07, "loss": 0.2041, "reward": 0.6400669813156128, "reward_std": 0.3128587156534195, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348469734192, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 1416.4688110351562, "epoch": 0.6147412441191845, "grad_norm": 4.7002129554748535, "kl": 2.62109375, "learning_rate": 1.3470478543829754e-07, "loss": 0.2285, "reward": 0.6847098469734192, "reward_std": 0.32094500213861465, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5418527126312256, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 1284.2678833007812, "epoch": 0.615039952206706, "grad_norm": 29.02745246887207, "kl": 3.6484375, "learning_rate": 1.3456758481875347e-07, "loss": 0.3334, "reward": 0.7165178805589676, "reward_std": 0.3420348986983299, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000298023224, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 1334.9888916015625, "epoch": 0.6153386602942275, "grad_norm": 7.206047058105469, "kl": 2.76171875, "learning_rate": 1.3443041730669435e-07, "loss": 0.216, "reward": 0.6333705633878708, "reward_std": 0.29448676109313965, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568638414144516, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 1358.44873046875, "epoch": 0.615637368381749, "grad_norm": 2.4809181690216064, "kl": 2.72265625, "learning_rate": 1.3429328305134512e-07, "loss": 0.2425, "reward": 0.6367187798023224, "reward_std": 0.3128928542137146, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809152275323868, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 1424.3081359863281, "epoch": 0.6159360764692704, "grad_norm": 4.828571319580078, "kl": 2.220703125, "learning_rate": 1.3415618220189433e-07, "loss": 0.2425, "reward": 0.739397332072258, "reward_std": 0.2939066104590893, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187798023224, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 1370.8728332519531, "epoch": 0.6162347845567919, "grad_norm": 3.928506851196289, "kl": 2.3828125, "learning_rate": 1.3401911490749423e-07, "loss": 0.2275, "reward": 0.6718750298023224, "reward_std": 0.2761625908315182, "rewards/accuracy_reward": 0.11160714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678805589676, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 1308.1317749023438, "epoch": 0.6165334926443133, "grad_norm": 3.7572546005249023, "kl": 2.822265625, "learning_rate": 1.338820813172607e-07, "loss": 0.2989, "reward": 0.6026785969734192, "reward_std": 0.27735285460948944, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 1388.2880249023438, "epoch": 0.6168322007318349, "grad_norm": 4.223583698272705, "kl": 2.296875, "learning_rate": 1.3374508158027285e-07, "loss": 0.2175, "reward": 0.766183078289032, "reward_std": 0.3078824132680893, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.569754496216774, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 1391.8326416015625, "epoch": 0.6171309088193563, "grad_norm": 4.50452184677124, "kl": 2.08984375, "learning_rate": 1.3360811584557288e-07, "loss": 0.1916, "reward": 0.6316964626312256, "reward_std": 0.27487511932849884, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500149011612, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 1424.2656860351562, "epoch": 0.6174296169068777, "grad_norm": 10.256278038024902, "kl": 3.05078125, "learning_rate": 1.3347118426216614e-07, "loss": 0.2579, "reward": 0.5898437798023224, "reward_std": 0.28863874077796936, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5429687798023224, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 1411.8750305175781, "epoch": 0.6177283249943992, "grad_norm": 9.15918254852295, "kl": 3.1171875, "learning_rate": 1.333342869790208e-07, "loss": 0.2218, "reward": 0.685825914144516, "reward_std": 0.24110550433397293, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5697544813156128, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 1355.3527526855469, "epoch": 0.6180270330819206, "grad_norm": 5.4397358894348145, "kl": 2.74609375, "learning_rate": 1.3319742414506765e-07, "loss": 0.3641, "reward": 0.6601562649011612, "reward_std": 0.272803645581007, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5530133992433548, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 1306.4531860351562, "epoch": 0.6183257411694422, "grad_norm": 3.8741517066955566, "kl": 1.99609375, "learning_rate": 1.3306059590920002e-07, "loss": 0.2208, "reward": 0.6925223469734192, "reward_std": 0.26930828392505646, "rewards/accuracy_reward": 0.10044643259607255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5920759290456772, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 1375.5960388183594, "epoch": 0.6186244492569636, "grad_norm": 7.92760705947876, "kl": 2.359375, "learning_rate": 1.3292380242027366e-07, "loss": 0.2552, "reward": 0.7187500298023224, "reward_std": 0.24194421619176865, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 1379.5156860351562, "epoch": 0.6189231573444851, "grad_norm": 4.306384086608887, "kl": 2.40234375, "learning_rate": 1.3278704382710645e-07, "loss": 0.2601, "reward": 0.6255580633878708, "reward_std": 0.24379045143723488, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.560825914144516, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 1431.7098693847656, "epoch": 0.6192218654320065, "grad_norm": 2.9878029823303223, "kl": 2.41796875, "learning_rate": 1.3265032027847834e-07, "loss": 0.2325, "reward": 0.6512276977300644, "reward_std": 0.31337952613830566, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 1443.3348693847656, "epoch": 0.619520573519528, "grad_norm": 10.72780704498291, "kl": 2.6953125, "learning_rate": 1.3251363192313116e-07, "loss": 0.1724, "reward": 0.6199777126312256, "reward_std": 0.2291683629155159, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741305589676, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 1412.35498046875, "epoch": 0.6198192816070495, "grad_norm": 4.105729103088379, "kl": 2.240234375, "learning_rate": 1.3237697890976846e-07, "loss": 0.244, "reward": 0.6367187798023224, "reward_std": 0.32357896119356155, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937798023224, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 1333.9531860351562, "epoch": 0.620117989694571, "grad_norm": 4.657469272613525, "kl": 2.666015625, "learning_rate": 1.322403613870553e-07, "loss": 0.2637, "reward": 0.77901791036129, "reward_std": 0.3138384744524956, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321790456772, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 1469.6629943847656, "epoch": 0.6204166977820924, "grad_norm": 8.70986557006836, "kl": 2.51953125, "learning_rate": 1.3210377950361815e-07, "loss": 0.2423, "reward": 0.7075893133878708, "reward_std": 0.28733591362833977, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 1271.5558471679688, "epoch": 0.6207154058696139, "grad_norm": 8.724376678466797, "kl": 2.302734375, "learning_rate": 1.3196723340804473e-07, "loss": 0.196, "reward": 0.6428571790456772, "reward_std": 0.2373412288725376, "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 1414.1585693359375, "epoch": 0.6210141139571353, "grad_norm": 3.3355889320373535, "kl": 1.67578125, "learning_rate": 1.318307232488838e-07, "loss": 0.1346, "reward": 0.659598246216774, "reward_std": 0.28299809992313385, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6060268133878708, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 1340.4107971191406, "epoch": 0.6213128220446569, "grad_norm": 5.921475410461426, "kl": 1.927734375, "learning_rate": 1.3169424917464506e-07, "loss": 0.2179, "reward": 0.705357164144516, "reward_std": 0.33479221910238266, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214626312256, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 1366.08935546875, "epoch": 0.6216115301321783, "grad_norm": 4.801887035369873, "kl": 2.625, "learning_rate": 1.3155781133379884e-07, "loss": 0.2274, "reward": 0.5976562649011612, "reward_std": 0.24859394133090973, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568638414144516, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 1399.4107971191406, "epoch": 0.6219102382196998, "grad_norm": 9.350871086120605, "kl": 1.9296875, "learning_rate": 1.3142140987477624e-07, "loss": 0.2318, "reward": 0.7321428805589676, "reward_std": 0.2812781296670437, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513393133878708, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 1442.5380249023438, "epoch": 0.6222089463072212, "grad_norm": 3.3561131954193115, "kl": 2.251953125, "learning_rate": 1.3128504494596867e-07, "loss": 0.2363, "reward": 0.5731026977300644, "reward_std": 0.2721835933625698, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5396205484867096, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 1351.5715026855469, "epoch": 0.6225076543947428, "grad_norm": 11.010128021240234, "kl": 1.84765625, "learning_rate": 1.3114871669572772e-07, "loss": 0.1969, "reward": 0.719308078289032, "reward_std": 0.28124114871025085, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687649011612, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 1430.8460388183594, "epoch": 0.6228063624822642, "grad_norm": 3.7862820625305176, "kl": 2.326171875, "learning_rate": 1.3101242527236529e-07, "loss": 0.2139, "reward": 0.7042410969734192, "reward_std": 0.2883504256606102, "rewards/accuracy_reward": 0.15401786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.550223246216774, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 1439.5179138183594, "epoch": 0.6231050705697857, "grad_norm": 7.4686713218688965, "kl": 2.5703125, "learning_rate": 1.30876170824153e-07, "loss": 0.2261, "reward": 0.5987723469734192, "reward_std": 0.2615841254591942, "rewards/accuracy_reward": 0.06696429057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5318080633878708, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 1376.6741943359375, "epoch": 0.6234037786573071, "grad_norm": 3.8755974769592285, "kl": 2.3984375, "learning_rate": 1.3073995349932233e-07, "loss": 0.2474, "reward": 0.6902902126312256, "reward_std": 0.3172275274991989, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5630580484867096, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 1304.2255249023438, "epoch": 0.6237024867448286, "grad_norm": 8.920968055725098, "kl": 2.009765625, "learning_rate": 1.306037734460644e-07, "loss": 0.2533, "reward": 0.6880580633878708, "reward_std": 0.3066554367542267, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187649011612, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 1420.8906860351562, "epoch": 0.6240011948323501, "grad_norm": 6.379368305206299, "kl": 2.375, "learning_rate": 1.304676308125298e-07, "loss": 0.2612, "reward": 0.6941964626312256, "reward_std": 0.31725019589066505, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870535969734192, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 1416.2902221679688, "epoch": 0.6242999029198716, "grad_norm": 6.957844257354736, "kl": 2.98046875, "learning_rate": 1.3033152574682837e-07, "loss": 0.242, "reward": 0.6250000298023224, "reward_std": 0.25323544815182686, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 1386.6384582519531, "epoch": 0.624598611007393, "grad_norm": 5.364038944244385, "kl": 3.08984375, "learning_rate": 1.30195458397029e-07, "loss": 0.2207, "reward": 0.6556919813156128, "reward_std": 0.2463594637811184, "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312798023224, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 1384.0514221191406, "epoch": 0.6248973190949145, "grad_norm": 6.207615375518799, "kl": 2.873046875, "learning_rate": 1.3005942891115967e-07, "loss": 0.2798, "reward": 0.7031250298023224, "reward_std": 0.28937672451138496, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250298023224, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 1387.4263916015625, "epoch": 0.6251960271824359, "grad_norm": 3.209023952484131, "kl": 2.51171875, "learning_rate": 1.299234374372072e-07, "loss": 0.2607, "reward": 0.655691996216774, "reward_std": 0.3084185943007469, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568638414144516, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 1381.6563415527344, "epoch": 0.6254947352699575, "grad_norm": 3.562760829925537, "kl": 2.75390625, "learning_rate": 1.297874841231169e-07, "loss": 0.2734, "reward": 0.7003348469734192, "reward_std": 0.2965191900730133, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348469734192, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 1399.8527526855469, "epoch": 0.6257934433574789, "grad_norm": 6.045639514923096, "kl": 2.267578125, "learning_rate": 1.2965156911679264e-07, "loss": 0.2473, "reward": 0.6422991305589676, "reward_std": 0.3078872561454773, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.584263414144516, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 1446.3951721191406, "epoch": 0.6260921514450004, "grad_norm": 2.7999768257141113, "kl": 2.283203125, "learning_rate": 1.2951569256609674e-07, "loss": 0.202, "reward": 0.6529017984867096, "reward_std": 0.2885247468948364, "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125298023224, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 1386.7567749023438, "epoch": 0.6263908595325218, "grad_norm": 3.1927220821380615, "kl": 2.419921875, "learning_rate": 1.293798546188494e-07, "loss": 0.2193, "reward": 0.737723246216774, "reward_std": 0.25781120732426643, "rewards/accuracy_reward": 0.16964286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803954601288, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 1419.02685546875, "epoch": 0.6266895676200434, "grad_norm": 6.42357063293457, "kl": 2.3359375, "learning_rate": 1.2924405542282914e-07, "loss": 0.1759, "reward": 0.5993303805589676, "reward_std": 0.28914379328489304, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5636160969734192, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 1328.1942749023438, "epoch": 0.6269882757075648, "grad_norm": 11.820815086364746, "kl": 2.677734375, "learning_rate": 1.2910829512577212e-07, "loss": 0.2343, "reward": 0.616629496216774, "reward_std": 0.2939216010272503, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937723517418, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 1484.9063110351562, "epoch": 0.6272869837950863, "grad_norm": 5.640720367431641, "kl": 2.6796875, "learning_rate": 1.2897257387537222e-07, "loss": 0.2088, "reward": 0.627232164144516, "reward_std": 0.28691086173057556, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5200892984867096, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 1468.88623046875, "epoch": 0.6275856918826077, "grad_norm": 4.4649834632873535, "kl": 2.4453125, "learning_rate": 1.288368918192809e-07, "loss": 0.1883, "reward": 0.6568080633878708, "reward_std": 0.2899632006883621, "rewards/accuracy_reward": 0.07812500582076609, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 1507.3861999511719, "epoch": 0.6278843999701292, "grad_norm": 5.000542163848877, "kl": 2.39453125, "learning_rate": 1.287012491051069e-07, "loss": 0.2294, "reward": 0.6540178954601288, "reward_std": 0.2886112555861473, "rewards/accuracy_reward": 0.10267857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513392984867096, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 1330.1741638183594, "epoch": 0.6281831080576507, "grad_norm": 7.716157913208008, "kl": 2.01171875, "learning_rate": 1.285656458804162e-07, "loss": 0.218, "reward": 0.6824777126312256, "reward_std": 0.30813799425959587, "rewards/accuracy_reward": 0.12053572293370962, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5619419813156128, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 1342.3527526855469, "epoch": 0.6284818161451722, "grad_norm": 5.932388782501221, "kl": 1.595703125, "learning_rate": 1.2843008229273186e-07, "loss": 0.2425, "reward": 0.6685268133878708, "reward_std": 0.2924739643931389, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572544664144516, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 1313.2790832519531, "epoch": 0.6287805242326936, "grad_norm": 6.5753912925720215, "kl": 1.990234375, "learning_rate": 1.282945584895337e-07, "loss": 0.1985, "reward": 0.679129496216774, "reward_std": 0.3045887388288975, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5541294813156128, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 1416.1406860351562, "epoch": 0.6290792323202151, "grad_norm": 7.194782257080078, "kl": 1.802734375, "learning_rate": 1.2815907461825843e-07, "loss": 0.2341, "reward": 0.7098214477300644, "reward_std": 0.3220200762152672, "rewards/accuracy_reward": 0.15848215483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513393133878708, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 1360.9263916015625, "epoch": 0.6293779404077365, "grad_norm": 5.4587554931640625, "kl": 2.91796875, "learning_rate": 1.2802363082629916e-07, "loss": 0.2762, "reward": 0.6981027126312256, "reward_std": 0.27804142236709595, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491305589676, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 1283.7589721679688, "epoch": 0.6296766484952581, "grad_norm": 6.8890228271484375, "kl": 2.005859375, "learning_rate": 1.278882272610055e-07, "loss": 0.2111, "reward": 0.7282366454601288, "reward_std": 0.30846159905195236, "rewards/accuracy_reward": 0.1361607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5920759290456772, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 1336.1786193847656, "epoch": 0.6299753565827795, "grad_norm": 7.333356857299805, "kl": 2.4765625, "learning_rate": 1.277528640696832e-07, "loss": 0.3076, "reward": 0.6718750298023224, "reward_std": 0.2914937771856785, "rewards/accuracy_reward": 0.12053572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5513393133878708, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 1458.0558776855469, "epoch": 0.6302740646703009, "grad_norm": 5.116927146911621, "kl": 2.490234375, "learning_rate": 1.276175413995942e-07, "loss": 0.2045, "reward": 0.5954241156578064, "reward_std": 0.2811330556869507, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5552455633878708, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 1341.3348999023438, "epoch": 0.6305727727578224, "grad_norm": 6.623912811279297, "kl": 2.20703125, "learning_rate": 1.2748225939795632e-07, "loss": 0.2447, "reward": 0.681919664144516, "reward_std": 0.32608944177627563, "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5591518133878708, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 1269.5781860351562, "epoch": 0.6308714808453438, "grad_norm": 3.612905740737915, "kl": 2.083984375, "learning_rate": 1.27347018211943e-07, "loss": 0.2409, "reward": 0.651785746216774, "reward_std": 0.2762058563530445, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964477300644, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 1402.1897888183594, "epoch": 0.6311701889328654, "grad_norm": 7.141191482543945, "kl": 2.46484375, "learning_rate": 1.272118179886835e-07, "loss": 0.2809, "reward": 0.6891741454601288, "reward_std": 0.30765021592378616, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5597098469734192, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 1442.5045471191406, "epoch": 0.6314688970203868, "grad_norm": 6.427230358123779, "kl": 2.431640625, "learning_rate": 1.270766588752624e-07, "loss": 0.225, "reward": 0.6796875298023224, "reward_std": 0.2853126935660839, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.565848246216774, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 1405.0402526855469, "epoch": 0.6317676051079083, "grad_norm": 20.11049461364746, "kl": 2.498046875, "learning_rate": 1.269415410187196e-07, "loss": 0.2259, "reward": 0.6456473469734192, "reward_std": 0.2962990254163742, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937649011612, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 1386.7567443847656, "epoch": 0.6320663131954297, "grad_norm": 2.8638453483581543, "kl": 2.42578125, "learning_rate": 1.2680646456604998e-07, "loss": 0.2442, "reward": 0.7165178954601288, "reward_std": 0.33770665526390076, "rewards/accuracy_reward": 0.16071428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.555803582072258, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 1413.2277526855469, "epoch": 0.6323650212829512, "grad_norm": 2.9624698162078857, "kl": 2.400390625, "learning_rate": 1.266714296642036e-07, "loss": 0.2232, "reward": 0.654575914144516, "reward_std": 0.2738899439573288, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5541295036673546, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 1477.6027221679688, "epoch": 0.6326637293704727, "grad_norm": 5.595718860626221, "kl": 1.869140625, "learning_rate": 1.265364364600851e-07, "loss": 0.1664, "reward": 0.5904018133878708, "reward_std": 0.2781102657318115, "rewards/accuracy_reward": 0.03794643119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.552455373108387, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 1470.0491943359375, "epoch": 0.6329624374579942, "grad_norm": 3.4569947719573975, "kl": 2.044921875, "learning_rate": 1.2640148510055388e-07, "loss": 0.2036, "reward": 0.6015625298023224, "reward_std": 0.269117571413517, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839477300644, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 1399.5291137695312, "epoch": 0.6332611455455156, "grad_norm": 6.199786186218262, "kl": 2.076171875, "learning_rate": 1.2626657573242385e-07, "loss": 0.2055, "reward": 0.629464328289032, "reward_std": 0.2783103287220001, "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 1435.0245971679688, "epoch": 0.6335598536330371, "grad_norm": 3.2284181118011475, "kl": 1.5283203125, "learning_rate": 1.2613170850246313e-07, "loss": 0.1849, "reward": 0.6579241305589676, "reward_std": 0.29290829598903656, "rewards/accuracy_reward": 0.08928571990691125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568638414144516, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 1349.9598693847656, "epoch": 0.6338585617205585, "grad_norm": 9.995537757873535, "kl": 2.45703125, "learning_rate": 1.2599688355739407e-07, "loss": 0.2669, "reward": 0.6316964626312256, "reward_std": 0.297170490026474, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.549107164144516, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 1483.2991943359375, "epoch": 0.6341572698080801, "grad_norm": 4.6857805252075195, "kl": 2.19140625, "learning_rate": 1.2586210104389292e-07, "loss": 0.2529, "reward": 0.6316964626312256, "reward_std": 0.30523815006017685, "rewards/accuracy_reward": 0.11830357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5133928805589676, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 1378.7656860351562, "epoch": 0.6344559778956015, "grad_norm": 3.967373847961426, "kl": 1.76953125, "learning_rate": 1.2572736110858998e-07, "loss": 0.3286, "reward": 0.7349330633878708, "reward_std": 0.26253537833690643, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809152126312256, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 1422.8371276855469, "epoch": 0.634754685983123, "grad_norm": 3.1180083751678467, "kl": 1.8203125, "learning_rate": 1.25592663898069e-07, "loss": 0.1697, "reward": 0.6551339477300644, "reward_std": 0.25101371109485626, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839477300644, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 1429.2656860351562, "epoch": 0.6350533940706444, "grad_norm": 3.853020191192627, "kl": 2.193359375, "learning_rate": 1.2545800955886735e-07, "loss": 0.2608, "reward": 0.619419664144516, "reward_std": 0.29072704166173935, "rewards/accuracy_reward": 0.07366071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5457589626312256, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 1399.7098693847656, "epoch": 0.635352102158166, "grad_norm": 8.610684394836426, "kl": 1.73046875, "learning_rate": 1.2532339823747578e-07, "loss": 0.182, "reward": 0.6852678805589676, "reward_std": 0.30202198028564453, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 1491.01123046875, "epoch": 0.6356508102456874, "grad_norm": 4.101110935211182, "kl": 1.53125, "learning_rate": 1.251888300803382e-07, "loss": 0.1557, "reward": 0.738839328289032, "reward_std": 0.3007631450891495, "rewards/accuracy_reward": 0.17633929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000298023224, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 1320.6473999023438, "epoch": 0.6359495183332089, "grad_norm": 7.237894535064697, "kl": 1.611328125, "learning_rate": 1.250543052338516e-07, "loss": 0.2491, "reward": 0.6595982611179352, "reward_std": 0.2420138418674469, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572544664144516, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 1421.8705749511719, "epoch": 0.6362482264207303, "grad_norm": 5.037436485290527, "kl": 2.12109375, "learning_rate": 1.2491982384436578e-07, "loss": 0.2201, "reward": 0.6004464626312256, "reward_std": 0.3135596886277199, "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5424107313156128, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 1372.2344665527344, "epoch": 0.6365469345082518, "grad_norm": 3.6699914932250977, "kl": 1.359375, "learning_rate": 1.2478538605818337e-07, "loss": 0.233, "reward": 0.7176339626312256, "reward_std": 0.35362474620342255, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.581473246216774, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 1507.0781860351562, "epoch": 0.6368456425957733, "grad_norm": 4.3013739585876465, "kl": 2.619140625, "learning_rate": 1.2465099202155942e-07, "loss": 0.2238, "reward": 0.6601562649011612, "reward_std": 0.30155930668115616, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5262276902794838, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 1240.9755249023438, "epoch": 0.6371443506832948, "grad_norm": 4.705128192901611, "kl": 2.484375, "learning_rate": 1.2451664188070152e-07, "loss": 0.2723, "reward": 0.7176339626312256, "reward_std": 0.34276438876986504, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875149011612, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 1451.450927734375, "epoch": 0.6374430587708162, "grad_norm": 3.5489754676818848, "kl": 1.982421875, "learning_rate": 1.2438233578176951e-07, "loss": 0.2224, "reward": 0.6824777126312256, "reward_std": 0.2915274240076542, "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741156578064, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 1364.8281860351562, "epoch": 0.6377417668583377, "grad_norm": 3.726219892501831, "kl": 1.814453125, "learning_rate": 1.242480738708752e-07, "loss": 0.2256, "reward": 0.6847098469734192, "reward_std": 0.3242516592144966, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 1391.9532165527344, "epoch": 0.6380404749458591, "grad_norm": 7.02950382232666, "kl": 2.265625, "learning_rate": 1.241138562940824e-07, "loss": 0.2461, "reward": 0.679129496216774, "reward_std": 0.2779528647661209, "rewards/accuracy_reward": 0.12053572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937798023224, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 1359.0022583007812, "epoch": 0.6383391830333807, "grad_norm": 4.604991436004639, "kl": 2.197265625, "learning_rate": 1.2397968319740663e-07, "loss": 0.2288, "reward": 0.7198660969734192, "reward_std": 0.33364418894052505, "rewards/accuracy_reward": 0.15178571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5680803805589676, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 1359.2433471679688, "epoch": 0.6386378911209021, "grad_norm": 10.755806922912598, "kl": 2.95703125, "learning_rate": 1.2384555472681518e-07, "loss": 0.3292, "reward": 0.7109375149011612, "reward_std": 0.3355864882469177, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875298023224, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 1415.0982666015625, "epoch": 0.6389365992084236, "grad_norm": 3.0915651321411133, "kl": 1.8671875, "learning_rate": 1.2371147102822658e-07, "loss": 0.2391, "reward": 0.6629464477300644, "reward_std": 0.28157851845026016, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607313156128, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 1326.0558471679688, "epoch": 0.639235307295945, "grad_norm": 3.208778142929077, "kl": 1.912109375, "learning_rate": 1.2357743224751084e-07, "loss": 0.2156, "reward": 0.7198661267757416, "reward_std": 0.3105626069009304, "rewards/accuracy_reward": 0.12053571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303805589676, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 1333.1563110351562, "epoch": 0.6395340153834665, "grad_norm": 4.325551509857178, "kl": 1.78125, "learning_rate": 1.2344343853048893e-07, "loss": 0.1987, "reward": 0.6964285969734192, "reward_std": 0.2925548367202282, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825892984867096, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 1390.7388916015625, "epoch": 0.639832723470988, "grad_norm": 6.199978828430176, "kl": 1.58984375, "learning_rate": 1.2330949002293293e-07, "loss": 0.1643, "reward": 0.7656250447034836, "reward_std": 0.2898242250084877, "rewards/accuracy_reward": 0.18303572665899992, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 1439.5335388183594, "epoch": 0.6401314315585095, "grad_norm": 5.976885795593262, "kl": 2.328125, "learning_rate": 1.2317558687056566e-07, "loss": 0.2008, "reward": 0.5814732313156128, "reward_std": 0.24973329529166222, "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5569196790456772, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 1265.8348999023438, "epoch": 0.6404301396460309, "grad_norm": 4.494076728820801, "kl": 1.548828125, "learning_rate": 1.2304172921906063e-07, "loss": 0.2182, "reward": 0.6601562947034836, "reward_std": 0.24035287648439407, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5664062798023224, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 1503.8080749511719, "epoch": 0.6407288477335524, "grad_norm": 5.811822891235352, "kl": 2.23046875, "learning_rate": 1.2290791721404189e-07, "loss": 0.2198, "reward": 0.5881696492433548, "reward_std": 0.27221618220210075, "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5591518133878708, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 1399.1317749023438, "epoch": 0.6410275558210738, "grad_norm": 6.952997207641602, "kl": 1.3486328125, "learning_rate": 1.2277415100108373e-07, "loss": 0.227, "reward": 0.6300223469734192, "reward_std": 0.27737608924508095, "rewards/accuracy_reward": 0.08035714877769351, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5496651977300644, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 1365.9375915527344, "epoch": 0.6413262639085954, "grad_norm": 7.489591598510742, "kl": 1.544921875, "learning_rate": 1.2264043072571074e-07, "loss": 0.1784, "reward": 0.6657366454601288, "reward_std": 0.25264665111899376, "rewards/accuracy_reward": 0.09821428940631449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223618745804, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 1431.9777526855469, "epoch": 0.6416249719961168, "grad_norm": 10.032479286193848, "kl": 1.947265625, "learning_rate": 1.2250675653339736e-07, "loss": 0.1739, "reward": 0.5982143133878708, "reward_std": 0.24617671966552734, "rewards/accuracy_reward": 0.044642857974395156, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714328289032, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 1406.8795471191406, "epoch": 0.6419236800836383, "grad_norm": 9.71904182434082, "kl": 1.615234375, "learning_rate": 1.2237312856956816e-07, "loss": 0.2146, "reward": 0.6629464626312256, "reward_std": 0.3386014997959137, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5379464626312256, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 1323.7723999023438, "epoch": 0.6422223881711597, "grad_norm": 12.888879776000977, "kl": 1.369140625, "learning_rate": 1.2223954697959716e-07, "loss": 0.1824, "reward": 0.683035746216774, "reward_std": 0.27888573333621025, "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.571428582072258, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 1385.6719665527344, "epoch": 0.6425210962586813, "grad_norm": 16.899938583374023, "kl": 1.546875, "learning_rate": 1.2210601190880807e-07, "loss": 0.1541, "reward": 0.5714286118745804, "reward_std": 0.23563386872410774, "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5558035969734192, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 1360.5692749023438, "epoch": 0.6428198043462027, "grad_norm": 14.38919734954834, "kl": 1.0302734375, "learning_rate": 1.21972523502474e-07, "loss": 0.1793, "reward": 0.6930803954601288, "reward_std": 0.3022030368447304, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339626312256, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 1399.85498046875, "epoch": 0.6431185124337241, "grad_norm": 19.154022216796875, "kl": 1.10546875, "learning_rate": 1.2183908190581718e-07, "loss": 0.1851, "reward": 0.6495536118745804, "reward_std": 0.2808775566518307, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678656578064, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 1397.0647888183594, "epoch": 0.6434172205212456, "grad_norm": 20.123092651367188, "kl": 1.587890625, "learning_rate": 1.2170568726400902e-07, "loss": 0.1322, "reward": 0.659598246216774, "reward_std": 0.28198201954364777, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5725446790456772, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 1409.1719360351562, "epoch": 0.643715928608767, "grad_norm": 27.475080490112305, "kl": 1.0400390625, "learning_rate": 1.215723397221698e-07, "loss": 0.1673, "reward": 0.6992187798023224, "reward_std": 0.36600881814956665, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 1333.9018249511719, "epoch": 0.6440146366962886, "grad_norm": 28.332015991210938, "kl": 1.193359375, "learning_rate": 1.2143903942536863e-07, "loss": 0.1648, "reward": 0.761160746216774, "reward_std": 0.3346707969903946, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 1388.4465026855469, "epoch": 0.64431334478381, "grad_norm": 29.5283203125, "kl": 1.1982421875, "learning_rate": 1.213057865186231e-07, "loss": 0.1763, "reward": 0.6127232313156128, "reward_std": 0.2654416300356388, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.559151828289032, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 1359.8884582519531, "epoch": 0.6446120528713315, "grad_norm": 40.5582275390625, "kl": 1.353515625, "learning_rate": 1.211725811468994e-07, "loss": 0.2386, "reward": 0.6004464626312256, "reward_std": 0.24707583710551262, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.549107164144516, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 1261.8638916015625, "epoch": 0.6449107609588529, "grad_norm": 41.07429504394531, "kl": 1.2138671875, "learning_rate": 1.210394234551118e-07, "loss": 0.1828, "reward": 0.803013414144516, "reward_std": 0.3148578405380249, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5998884290456772, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 1365.0022888183594, "epoch": 0.6452094690463744, "grad_norm": 31.883703231811523, "kl": 2.1796875, "learning_rate": 1.2090631358812294e-07, "loss": 0.2547, "reward": 0.6679687798023224, "reward_std": 0.3408259227871895, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5719866454601288, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 1347.404052734375, "epoch": 0.6455081771338959, "grad_norm": 27.301483154296875, "kl": 2.5859375, "learning_rate": 1.2077325169074322e-07, "loss": 0.252, "reward": 0.7282366305589676, "reward_std": 0.30438920855522156, "rewards/accuracy_reward": 0.14062501024454832, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116305589676, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 1389.1138916015625, "epoch": 0.6458068852214174, "grad_norm": 29.976776123046875, "kl": 3.5078125, "learning_rate": 1.2064023790773094e-07, "loss": 0.3309, "reward": 0.6774553805589676, "reward_std": 0.2650519460439682, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5792410969734192, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 1411.4420166015625, "epoch": 0.6461055933089388, "grad_norm": 15.261807441711426, "kl": 2.984375, "learning_rate": 1.2050727238379215e-07, "loss": 0.2303, "reward": 0.6054687798023224, "reward_std": 0.24818962067365646, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5518973618745804, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 1296.2522888183594, "epoch": 0.6464043013964603, "grad_norm": 66.42962646484375, "kl": 6.140625, "learning_rate": 1.2037435526358025e-07, "loss": 0.5483, "reward": 0.6529017984867096, "reward_std": 0.2707866206765175, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.588169664144516, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 1375.2322082519531, "epoch": 0.6467030094839817, "grad_norm": 66.26404571533203, "kl": 7.2578125, "learning_rate": 1.2024148669169599e-07, "loss": 0.4956, "reward": 0.6462053954601288, "reward_std": 0.2729775495827198, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 1348.0223693847656, "epoch": 0.6470017175715033, "grad_norm": 54.00050735473633, "kl": 6.16796875, "learning_rate": 1.2010866681268742e-07, "loss": 0.4333, "reward": 0.6802455484867096, "reward_std": 0.29837119951844215, "rewards/accuracy_reward": 0.10267857764847577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 1357.466552734375, "epoch": 0.6473004256590247, "grad_norm": 74.06742095947266, "kl": 6.28515625, "learning_rate": 1.1997589577104957e-07, "loss": 0.4874, "reward": 0.7539062947034836, "reward_std": 0.25274548679590225, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741454601288, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 1269.0179443359375, "epoch": 0.6475991337465462, "grad_norm": 25.177818298339844, "kl": 4.43359375, "learning_rate": 1.1984317371122428e-07, "loss": 0.4251, "reward": 0.6981027126312256, "reward_std": 0.29326799511909485, "rewards/accuracy_reward": 0.12946429383009672, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568638414144516, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 1208.1161499023438, "epoch": 0.6478978418340676, "grad_norm": 31.788057327270508, "kl": 4.35546875, "learning_rate": 1.1971050077760009e-07, "loss": 0.3806, "reward": 0.6618303954601288, "reward_std": 0.28962986171245575, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572544664144516, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 1430.5670166015625, "epoch": 0.6481965499215891, "grad_norm": 23.701160430908203, "kl": 5.3203125, "learning_rate": 1.1957787711451225e-07, "loss": 0.3019, "reward": 0.646763414144516, "reward_std": 0.27272987365722656, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348469734192, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 1368.3750610351562, "epoch": 0.6484952580091106, "grad_norm": 16.18306541442871, "kl": 3.166015625, "learning_rate": 1.1944530286624226e-07, "loss": 0.2753, "reward": 0.6562500298023224, "reward_std": 0.26049675419926643, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5580357313156128, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 1315.2723693847656, "epoch": 0.6487939660966321, "grad_norm": 32.106117248535156, "kl": 2.546875, "learning_rate": 1.193127781770179e-07, "loss": 0.2557, "reward": 0.6635044813156128, "reward_std": 0.30005498975515366, "rewards/accuracy_reward": 0.08928572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187947034836, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 1383.3415832519531, "epoch": 0.6490926741841535, "grad_norm": 29.474843978881836, "kl": 2.26953125, "learning_rate": 1.1918030319101305e-07, "loss": 0.2511, "reward": 0.6199777126312256, "reward_std": 0.32744527608156204, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5507812947034836, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 1349.3348693847656, "epoch": 0.649391382271675, "grad_norm": 21.166479110717773, "kl": 2.537109375, "learning_rate": 1.1904787805234751e-07, "loss": 0.2942, "reward": 0.6367187649011612, "reward_std": 0.2631147652864456, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116156578064, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 1341.154052734375, "epoch": 0.6496900903591964, "grad_norm": 41.76475524902344, "kl": 1.83984375, "learning_rate": 1.1891550290508689e-07, "loss": 0.1969, "reward": 0.7315848469734192, "reward_std": 0.33886146545410156, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241156578064, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 1341.8795166015625, "epoch": 0.649988798446718, "grad_norm": 36.626319885253906, "kl": 2.11328125, "learning_rate": 1.1878317789324229e-07, "loss": 0.1808, "reward": 0.623325914144516, "reward_std": 0.2850510850548744, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5853794813156128, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 1336.5157165527344, "epoch": 0.6502875065342394, "grad_norm": 27.408517837524414, "kl": 2.7421875, "learning_rate": 1.1865090316077049e-07, "loss": 0.251, "reward": 0.6227678656578064, "reward_std": 0.26120518520474434, "rewards/accuracy_reward": 0.04687500325962901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5758928805589676, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 1458.7098693847656, "epoch": 0.6505862146217609, "grad_norm": 7.765546798706055, "kl": 3.5625, "learning_rate": 1.1851867885157331e-07, "loss": 0.2813, "reward": 0.6093750149011612, "reward_std": 0.23203716427087784, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321790456772, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 1319.2098693847656, "epoch": 0.6508849227092823, "grad_norm": 10.848382949829102, "kl": 4.01171875, "learning_rate": 1.1838650510949785e-07, "loss": 0.3229, "reward": 0.655691996216774, "reward_std": 0.3008229210972786, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741305589676, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 1288.3259582519531, "epoch": 0.6511836307968039, "grad_norm": 17.436628341674805, "kl": 3.59765625, "learning_rate": 1.1825438207833626e-07, "loss": 0.3654, "reward": 0.6768973469734192, "reward_std": 0.29073626548051834, "rewards/accuracy_reward": 0.09375000698491931, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5831473618745804, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 1506.4063415527344, "epoch": 0.6514823388843253, "grad_norm": 11.136756896972656, "kl": 3.298828125, "learning_rate": 1.1812230990182539e-07, "loss": 0.2292, "reward": 0.6199777126312256, "reward_std": 0.25546471402049065, "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5574776977300644, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 1338.2701416015625, "epoch": 0.6517810469718468, "grad_norm": 9.202800750732422, "kl": 3.4140625, "learning_rate": 1.1799028872364685e-07, "loss": 0.3184, "reward": 0.7098214626312256, "reward_std": 0.30808036774396896, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 1333.4420166015625, "epoch": 0.6520797550593682, "grad_norm": 23.63656997680664, "kl": 4.74609375, "learning_rate": 1.1785831868742667e-07, "loss": 0.4089, "reward": 0.6395089477300644, "reward_std": 0.2767876833677292, "rewards/accuracy_reward": 0.08035714528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5591517984867096, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 1348.4598693847656, "epoch": 0.6523784631468897, "grad_norm": 16.53373146057129, "kl": 2.658203125, "learning_rate": 1.1772639993673545e-07, "loss": 0.2379, "reward": 0.655691996216774, "reward_std": 0.27690887451171875, "rewards/accuracy_reward": 0.07366071850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312947034836, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 1299.5045471191406, "epoch": 0.6526771712344112, "grad_norm": 12.397130966186523, "kl": 2.80078125, "learning_rate": 1.1759453261508776e-07, "loss": 0.2837, "reward": 0.665178582072258, "reward_std": 0.287909809499979, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 1302.5379943847656, "epoch": 0.6529758793219327, "grad_norm": 20.79892349243164, "kl": 3.33984375, "learning_rate": 1.1746271686594234e-07, "loss": 0.2804, "reward": 0.6824777126312256, "reward_std": 0.29105184227228165, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5775669813156128, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 1317.7098999023438, "epoch": 0.6532745874094541, "grad_norm": 7.796280384063721, "kl": 2.875, "learning_rate": 1.1733095283270183e-07, "loss": 0.2898, "reward": 0.6623884290456772, "reward_std": 0.2581850402057171, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5664062798023224, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 1304.6898193359375, "epoch": 0.6535732954969756, "grad_norm": 9.8681001663208, "kl": 2.8828125, "learning_rate": 1.1719924065871259e-07, "loss": 0.3116, "reward": 0.695870578289032, "reward_std": 0.30393117666244507, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5976562798023224, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 1274.7768249511719, "epoch": 0.653872003584497, "grad_norm": 10.431411743164062, "kl": 3.27734375, "learning_rate": 1.1706758048726453e-07, "loss": 0.288, "reward": 0.6523437649011612, "reward_std": 0.27167749032378197, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809152126312256, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 1254.2500610351562, "epoch": 0.6541707116720186, "grad_norm": 7.445821762084961, "kl": 2.921875, "learning_rate": 1.1693597246159096e-07, "loss": 0.3112, "reward": 0.610491082072258, "reward_std": 0.26748326048254967, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089477300644, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 1276.8125610351562, "epoch": 0.65446941975954, "grad_norm": 12.464532852172852, "kl": 3.72265625, "learning_rate": 1.1680441672486862e-07, "loss": 0.2917, "reward": 0.6411830633878708, "reward_std": 0.26477304473519325, "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.607700914144516, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 1373.6027526855469, "epoch": 0.6547681278470615, "grad_norm": 6.319514751434326, "kl": 3.62890625, "learning_rate": 1.1667291342021722e-07, "loss": 0.2239, "reward": 0.6523437798023224, "reward_std": 0.3034873381257057, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937798023224, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 1290.2210388183594, "epoch": 0.6550668359345829, "grad_norm": 6.608279705047607, "kl": 3.5703125, "learning_rate": 1.165414626906994e-07, "loss": 0.2981, "reward": 0.6406250298023224, "reward_std": 0.30158745497465134, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5558036118745804, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 1344.9911193847656, "epoch": 0.6553655440221045, "grad_norm": 9.843609809875488, "kl": 3.60546875, "learning_rate": 1.164100646793208e-07, "loss": 0.2634, "reward": 0.7555803954601288, "reward_std": 0.26637889817357063, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375149011612, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 1283.7366638183594, "epoch": 0.6556642521096259, "grad_norm": 22.484560012817383, "kl": 2.015625, "learning_rate": 1.1627871952902945e-07, "loss": 0.2652, "reward": 0.6534598618745804, "reward_std": 0.23672886192798615, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 1354.91748046875, "epoch": 0.6559629601971473, "grad_norm": 8.092353820800781, "kl": 2.5234375, "learning_rate": 1.1614742738271609e-07, "loss": 0.1351, "reward": 0.6400669813156128, "reward_std": 0.23884201422333717, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991454601288, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 1398.90185546875, "epoch": 0.6562616682846688, "grad_norm": 14.627788543701172, "kl": 3.09375, "learning_rate": 1.1601618838321365e-07, "loss": 0.2101, "reward": 0.6640625298023224, "reward_std": 0.2857498489320278, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 1217.5759582519531, "epoch": 0.6565603763721902, "grad_norm": 8.244027137756348, "kl": 2.7890625, "learning_rate": 1.1588500267329731e-07, "loss": 0.2143, "reward": 0.6529018133878708, "reward_std": 0.27442512661218643, "rewards/accuracy_reward": 0.07142857601866126, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.581473246216774, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 1276.7589721679688, "epoch": 0.6568590844597118, "grad_norm": 8.03844928741455, "kl": 3.21875, "learning_rate": 1.1575387039568428e-07, "loss": 0.3328, "reward": 0.646763414144516, "reward_std": 0.3244488872587681, "rewards/accuracy_reward": 0.07812500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568638414144516, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 1292.2567443847656, "epoch": 0.6571577925472332, "grad_norm": 10.688858985900879, "kl": 3.53125, "learning_rate": 1.156227916930336e-07, "loss": 0.326, "reward": 0.6227678954601288, "reward_std": 0.26804909482598305, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714285969734192, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 1320.9955749511719, "epoch": 0.6574565006347547, "grad_norm": 6.687772750854492, "kl": 3.1875, "learning_rate": 1.154917667079461e-07, "loss": 0.2014, "reward": 0.6981026977300644, "reward_std": 0.29023513942956924, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 1222.18310546875, "epoch": 0.6577552087222761, "grad_norm": 18.32094383239746, "kl": 3.20703125, "learning_rate": 1.153607955829641e-07, "loss": 0.3196, "reward": 0.7014509290456772, "reward_std": 0.3288364000618458, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5786830633878708, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 1296.3594360351562, "epoch": 0.6580539168097976, "grad_norm": 9.670331001281738, "kl": 3.56640625, "learning_rate": 1.1522987846057133e-07, "loss": 0.2652, "reward": 0.6032366454601288, "reward_std": 0.26804960519075394, "rewards/accuracy_reward": 0.03794642956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652902126312256, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 1139.122802734375, "epoch": 0.658352624897319, "grad_norm": 6.5104451179504395, "kl": 2.6875, "learning_rate": 1.1509901548319277e-07, "loss": 0.2699, "reward": 0.7020089626312256, "reward_std": 0.28657471016049385, "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747768133878708, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 1232.4777221679688, "epoch": 0.6586513329848406, "grad_norm": 17.159061431884766, "kl": 2.361328125, "learning_rate": 1.1496820679319457e-07, "loss": 0.2069, "reward": 0.6383928954601288, "reward_std": 0.2809177190065384, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 1236.0982666015625, "epoch": 0.658950041072362, "grad_norm": 9.559858322143555, "kl": 2.56640625, "learning_rate": 1.1483745253288379e-07, "loss": 0.2097, "reward": 0.6395089626312256, "reward_std": 0.26000865548849106, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5725446492433548, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 1354.8259582519531, "epoch": 0.6592487491598835, "grad_norm": 7.111260890960693, "kl": 3.78125, "learning_rate": 1.147067528445081e-07, "loss": 0.2911, "reward": 0.616629496216774, "reward_std": 0.2938385307788849, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5697544813156128, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 1252.1607666015625, "epoch": 0.6595474572474049, "grad_norm": 10.59782886505127, "kl": 3.37890625, "learning_rate": 1.145761078702561e-07, "loss": 0.3104, "reward": 0.6138393133878708, "reward_std": 0.30381081253290176, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678954601288, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 1248.5848999023438, "epoch": 0.6598461653349265, "grad_norm": 10.714112281799316, "kl": 2.59375, "learning_rate": 1.1444551775225668e-07, "loss": 0.2449, "reward": 0.6964285969734192, "reward_std": 0.29549482464790344, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.571428582072258, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 1302.40185546875, "epoch": 0.6601448734224479, "grad_norm": 9.697484016418457, "kl": 3.16015625, "learning_rate": 1.1431498263257906e-07, "loss": 0.1989, "reward": 0.694754496216774, "reward_std": 0.25823649764060974, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5920759290456772, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 1196.1540832519531, "epoch": 0.6604435815099694, "grad_norm": 7.210424900054932, "kl": 2.76953125, "learning_rate": 1.1418450265323261e-07, "loss": 0.2179, "reward": 0.6618303805589676, "reward_std": 0.267881840467453, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 1287.4465026855469, "epoch": 0.6607422895974908, "grad_norm": 10.1727933883667, "kl": 4.74609375, "learning_rate": 1.1405407795616687e-07, "loss": 0.1943, "reward": 0.7265625447034836, "reward_std": 0.2778467833995819, "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.579241082072258, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 1270.3996276855469, "epoch": 0.6610409976850123, "grad_norm": 10.106118202209473, "kl": 3.0703125, "learning_rate": 1.1392370868327109e-07, "loss": 0.2762, "reward": 0.741629496216774, "reward_std": 0.24678070098161697, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5608259290456772, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 1277.1228332519531, "epoch": 0.6613397057725338, "grad_norm": 8.331401824951172, "kl": 3.65625, "learning_rate": 1.1379339497637421e-07, "loss": 0.2272, "reward": 0.638950914144516, "reward_std": 0.2505715414881706, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187798023224, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 1371.6473999023438, "epoch": 0.6616384138600553, "grad_norm": 6.566041469573975, "kl": 3.16015625, "learning_rate": 1.1366313697724483e-07, "loss": 0.2349, "reward": 0.791294664144516, "reward_std": 0.29596511274576187, "rewards/accuracy_reward": 0.19419644260779023, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.597098246216774, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 1287.4107666015625, "epoch": 0.6619371219475767, "grad_norm": 8.430551528930664, "kl": 3.57421875, "learning_rate": 1.135329348275909e-07, "loss": 0.2714, "reward": 0.7410714477300644, "reward_std": 0.2751925215125084, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 1284.1004943847656, "epoch": 0.6622358300350982, "grad_norm": 5.523737907409668, "kl": 3.171875, "learning_rate": 1.1340278866905953e-07, "loss": 0.2672, "reward": 0.6417411118745804, "reward_std": 0.29891035705804825, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339626312256, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 1265.7656555175781, "epoch": 0.6625345381226196, "grad_norm": 3.5422070026397705, "kl": 3.17578125, "learning_rate": 1.1327269864323698e-07, "loss": 0.2328, "reward": 0.6880580633878708, "reward_std": 0.2744850628077984, "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437947034836, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 1202.3348388671875, "epoch": 0.6628332462101412, "grad_norm": 9.798402786254883, "kl": 3.734375, "learning_rate": 1.1314266489164854e-07, "loss": 0.3089, "reward": 0.6328125447034836, "reward_std": 0.27701159566640854, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839626312256, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 1210.450942993164, "epoch": 0.6631319542976626, "grad_norm": 6.21781587600708, "kl": 3.38671875, "learning_rate": 1.1301268755575811e-07, "loss": 0.2721, "reward": 0.6629464477300644, "reward_std": 0.2657296694815159, "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5803571790456772, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 1300.40185546875, "epoch": 0.6634306623851841, "grad_norm": 4.835981369018555, "kl": 2.76953125, "learning_rate": 1.1288276677696836e-07, "loss": 0.2082, "reward": 0.7829241454601288, "reward_std": 0.3256767615675926, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5976562798023224, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 1349.0915832519531, "epoch": 0.6637293704727055, "grad_norm": 5.313284397125244, "kl": 3.359375, "learning_rate": 1.1275290269662023e-07, "loss": 0.2543, "reward": 0.6428571790456772, "reward_std": 0.261856023222208, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5803571790456772, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 1365.63623046875, "epoch": 0.664028078560227, "grad_norm": 4.757678508758545, "kl": 2.98046875, "learning_rate": 1.1262309545599328e-07, "loss": 0.2244, "reward": 0.6992187798023224, "reward_std": 0.24115609377622604, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 1246.5871276855469, "epoch": 0.6643267866477485, "grad_norm": 5.314688205718994, "kl": 2.984375, "learning_rate": 1.1249334519630496e-07, "loss": 0.1837, "reward": 0.7020089775323868, "reward_std": 0.2473197914659977, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5613839477300644, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 1318.9955749511719, "epoch": 0.66462549473527, "grad_norm": 4.28885555267334, "kl": 2.9296875, "learning_rate": 1.1236365205871083e-07, "loss": 0.2225, "reward": 0.710379496216774, "reward_std": 0.2871346101164818, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 1264.0736999511719, "epoch": 0.6649242028227914, "grad_norm": 14.489293098449707, "kl": 3.2890625, "learning_rate": 1.1223401618430438e-07, "loss": 0.192, "reward": 0.6869419813156128, "reward_std": 0.3203050419688225, "rewards/accuracy_reward": 0.09598214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 1279.4531860351562, "epoch": 0.6652229109103129, "grad_norm": 20.28910255432129, "kl": 2.373046875, "learning_rate": 1.1210443771411669e-07, "loss": 0.2824, "reward": 0.6975446790456772, "reward_std": 0.29667896777391434, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572544664144516, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 1225.8616333007812, "epoch": 0.6655216189978344, "grad_norm": 10.445996284484863, "kl": 3.40625, "learning_rate": 1.1197491678911644e-07, "loss": 0.1994, "reward": 0.750558078289032, "reward_std": 0.33675388991832733, "rewards/accuracy_reward": 0.14732143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366305589676, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 1289.5670166015625, "epoch": 0.6658203270853559, "grad_norm": 8.442909240722656, "kl": 3.28515625, "learning_rate": 1.1184545355020973e-07, "loss": 0.1529, "reward": 0.7087053805589676, "reward_std": 0.26820190623402596, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.588169664144516, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 1310.7924499511719, "epoch": 0.6661190351728773, "grad_norm": 6.642758846282959, "kl": 2.92578125, "learning_rate": 1.1171604813823982e-07, "loss": 0.2557, "reward": 0.6266741454601288, "reward_std": 0.30624349415302277, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991305589676, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 1230.4107666015625, "epoch": 0.6664177432603988, "grad_norm": 9.357885360717773, "kl": 2.84375, "learning_rate": 1.1158670069398717e-07, "loss": 0.2314, "reward": 0.7092634290456772, "reward_std": 0.31391704082489014, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 1193.8035888671875, "epoch": 0.6667164513479202, "grad_norm": 15.366133689880371, "kl": 2.82421875, "learning_rate": 1.1145741135816905e-07, "loss": 0.2416, "reward": 0.7790178954601288, "reward_std": 0.30235378444194794, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 1247.2902221679688, "epoch": 0.6670151594354418, "grad_norm": 10.00097370147705, "kl": 3.13671875, "learning_rate": 1.1132818027143972e-07, "loss": 0.234, "reward": 0.6395089477300644, "reward_std": 0.25660669803619385, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6082589477300644, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 1407.2098693847656, "epoch": 0.6673138675229632, "grad_norm": 17.041473388671875, "kl": 3.88671875, "learning_rate": 1.1119900757438978e-07, "loss": 0.2133, "reward": 0.5904018133878708, "reward_std": 0.26011642068624496, "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5591517984867096, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 1173.5469360351562, "epoch": 0.6676125756104847, "grad_norm": 23.50022315979004, "kl": 3.81640625, "learning_rate": 1.1106989340754655e-07, "loss": 0.2614, "reward": 0.7170759290456772, "reward_std": 0.26537613943219185, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687649011612, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 1293.3928833007812, "epoch": 0.6679112836980061, "grad_norm": 16.36182403564453, "kl": 3.7265625, "learning_rate": 1.1094083791137359e-07, "loss": 0.2254, "reward": 0.6289062947034836, "reward_std": 0.2263307049870491, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.577566996216774, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 1219.7857666015625, "epoch": 0.6682099917855276, "grad_norm": 4.597226619720459, "kl": 3.68359375, "learning_rate": 1.1081184122627063e-07, "loss": 0.2352, "reward": 0.6595982611179352, "reward_std": 0.2843211852014065, "rewards/accuracy_reward": 0.06473214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.594866082072258, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 1264.1763916015625, "epoch": 0.6685086998730491, "grad_norm": 6.883023262023926, "kl": 3.14453125, "learning_rate": 1.1068290349257337e-07, "loss": 0.1966, "reward": 0.7087053805589676, "reward_std": 0.20685461908578873, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339477300644, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 1291.71435546875, "epoch": 0.6688074079605706, "grad_norm": 6.359742164611816, "kl": 3.6640625, "learning_rate": 1.1055402485055345e-07, "loss": 0.2961, "reward": 0.7120535969734192, "reward_std": 0.32696324586868286, "rewards/accuracy_reward": 0.13839286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607313156128, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 1385.6697082519531, "epoch": 0.669106116048092, "grad_norm": 6.44364595413208, "kl": 3.1796875, "learning_rate": 1.1042520544041826e-07, "loss": 0.247, "reward": 0.6757812649011612, "reward_std": 0.26268157735466957, "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 1268.7254943847656, "epoch": 0.6694048241356134, "grad_norm": 4.46699333190918, "kl": 3.029296875, "learning_rate": 1.1029644540231064e-07, "loss": 0.2006, "reward": 0.666294664144516, "reward_std": 0.31936271488666534, "rewards/accuracy_reward": 0.09151786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5747767984867096, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 1241.6585388183594, "epoch": 0.669703532223135, "grad_norm": 17.365930557250977, "kl": 2.59765625, "learning_rate": 1.101677448763089e-07, "loss": 0.1901, "reward": 0.7087053954601288, "reward_std": 0.2950812950730324, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5792410969734192, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 1250.2813110351562, "epoch": 0.6700022403106564, "grad_norm": 14.754571914672852, "kl": 2.732421875, "learning_rate": 1.1003910400242653e-07, "loss": 0.2821, "reward": 0.8130580633878708, "reward_std": 0.26795394718647003, "rewards/accuracy_reward": 0.22321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 1421.0335388183594, "epoch": 0.6703009483981779, "grad_norm": 3.6707687377929688, "kl": 3.60546875, "learning_rate": 1.0991052292061233e-07, "loss": 0.2638, "reward": 0.6891741454601288, "reward_std": 0.2825320065021515, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5998884290456772, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 1247.4509582519531, "epoch": 0.6705996564856993, "grad_norm": 6.324822902679443, "kl": 3.08984375, "learning_rate": 1.0978200177074982e-07, "loss": 0.1878, "reward": 0.788504496216774, "reward_std": 0.29396024718880653, "rewards/accuracy_reward": 0.18750000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.601004496216774, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 1285.0647888183594, "epoch": 0.6708983645732208, "grad_norm": 9.000921249389648, "kl": 3.7265625, "learning_rate": 1.0965354069265744e-07, "loss": 0.255, "reward": 0.6902901977300644, "reward_std": 0.22906672954559326, "rewards/accuracy_reward": 0.12276786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 1220.1629943847656, "epoch": 0.6711970726607422, "grad_norm": 5.317891597747803, "kl": 3.29296875, "learning_rate": 1.0952513982608827e-07, "loss": 0.2351, "reward": 0.7539062798023224, "reward_std": 0.2865912392735481, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241156578064, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 1292.6094665527344, "epoch": 0.6714957807482638, "grad_norm": 5.697662353515625, "kl": 3.8671875, "learning_rate": 1.0939679931072981e-07, "loss": 0.2351, "reward": 0.6116071790456772, "reward_std": 0.23898279666900635, "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825892984867096, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 1170.4397735595703, "epoch": 0.6717944888357852, "grad_norm": 6.297085285186768, "kl": 2.9296875, "learning_rate": 1.0926851928620404e-07, "loss": 0.2606, "reward": 0.6556919813156128, "reward_std": 0.2947057671844959, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 1260.65185546875, "epoch": 0.6720931969233067, "grad_norm": 6.021477699279785, "kl": 3.84765625, "learning_rate": 1.0914029989206697e-07, "loss": 0.2746, "reward": 0.7622767984867096, "reward_std": 0.3203646391630173, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.612723246216774, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 1336.2902526855469, "epoch": 0.6723919050108281, "grad_norm": 7.365692138671875, "kl": 3.875, "learning_rate": 1.0901214126780881e-07, "loss": 0.2652, "reward": 0.7137277126312256, "reward_std": 0.26505226269364357, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 1273.5781860351562, "epoch": 0.6726906130983497, "grad_norm": 12.409294128417969, "kl": 2.65625, "learning_rate": 1.0888404355285354e-07, "loss": 0.2091, "reward": 0.7148437947034836, "reward_std": 0.3085460886359215, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6277901977300644, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 1354.9308471679688, "epoch": 0.6729893211858711, "grad_norm": 21.035236358642578, "kl": 4.05078125, "learning_rate": 1.0875600688655897e-07, "loss": 0.2274, "reward": 0.7081473618745804, "reward_std": 0.27097371593117714, "rewards/accuracy_reward": 0.11383929150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.594308078289032, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 1357.075927734375, "epoch": 0.6732880292733926, "grad_norm": 14.206377029418945, "kl": 4.1015625, "learning_rate": 1.0862803140821634e-07, "loss": 0.2948, "reward": 0.6250000298023224, "reward_std": 0.2945426292717457, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5647321790456772, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 1320.9687805175781, "epoch": 0.673586737360914, "grad_norm": 5.682405948638916, "kl": 3.42578125, "learning_rate": 1.085001172570505e-07, "loss": 0.225, "reward": 0.642857164144516, "reward_std": 0.26980045065283775, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026785969734192, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 1360.6719055175781, "epoch": 0.6738854454484355, "grad_norm": 7.813249588012695, "kl": 3.66015625, "learning_rate": 1.0837226457221953e-07, "loss": 0.2135, "reward": 0.676339328289032, "reward_std": 0.27038780972361565, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 1409.72998046875, "epoch": 0.674184153535957, "grad_norm": 10.07277774810791, "kl": 4.484375, "learning_rate": 1.0824447349281458e-07, "loss": 0.2033, "reward": 0.6880580484867096, "reward_std": 0.27119705080986023, "rewards/accuracy_reward": 0.10044643236324191, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116454601288, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 1261.857177734375, "epoch": 0.6744828616234785, "grad_norm": 7.5177764892578125, "kl": 3.2421875, "learning_rate": 1.0811674415785982e-07, "loss": 0.2366, "reward": 0.680245578289032, "reward_std": 0.2389700710773468, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 1258.4598999023438, "epoch": 0.6747815697109999, "grad_norm": 4.343350887298584, "kl": 3.953125, "learning_rate": 1.0798907670631229e-07, "loss": 0.2488, "reward": 0.7388393133878708, "reward_std": 0.2784036882221699, "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 1232.6942749023438, "epoch": 0.6750802777985214, "grad_norm": 8.371895790100098, "kl": 2.609375, "learning_rate": 1.078614712770616e-07, "loss": 0.1757, "reward": 0.7193080633878708, "reward_std": 0.29037467390298843, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687798023224, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 1321.66748046875, "epoch": 0.6753789858860428, "grad_norm": 6.787572860717773, "kl": 3.4921875, "learning_rate": 1.0773392800893001e-07, "loss": 0.3125, "reward": 0.647879496216774, "reward_std": 0.2730447091162205, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.560825914144516, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 1386.6205749511719, "epoch": 0.6756776939735644, "grad_norm": 6.332686901092529, "kl": 3.5234375, "learning_rate": 1.0760644704067202e-07, "loss": 0.2075, "reward": 0.6015625298023224, "reward_std": 0.2580542452633381, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 1281.0826416015625, "epoch": 0.6759764020610858, "grad_norm": 13.663507461547852, "kl": 2.41015625, "learning_rate": 1.0747902851097452e-07, "loss": 0.2096, "reward": 0.7550223618745804, "reward_std": 0.3000285364687443, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.623325914144516, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 1365.7991943359375, "epoch": 0.6762751101486073, "grad_norm": 14.53727912902832, "kl": 2.78515625, "learning_rate": 1.0735167255845633e-07, "loss": 0.189, "reward": 0.7527901977300644, "reward_std": 0.3545078821480274, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6210937798023224, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 1278.6138916015625, "epoch": 0.6765738182361287, "grad_norm": 11.065526962280273, "kl": 3.44921875, "learning_rate": 1.0722437932166825e-07, "loss": 0.2331, "reward": 0.7243303805589676, "reward_std": 0.33591238409280777, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5993303805589676, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 1230.9888916015625, "epoch": 0.6768725263236502, "grad_norm": 6.716328144073486, "kl": 3.5859375, "learning_rate": 1.0709714893909283e-07, "loss": 0.1905, "reward": 0.697544664144516, "reward_std": 0.26240358129143715, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6082589477300644, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 1351.8906555175781, "epoch": 0.6771712344111717, "grad_norm": 20.283918380737305, "kl": 4.6640625, "learning_rate": 1.0696998154914433e-07, "loss": 0.2824, "reward": 0.7226562798023224, "reward_std": 0.24892936274409294, "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598618745804, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 1339.4040832519531, "epoch": 0.6774699424986932, "grad_norm": 27.165618896484375, "kl": 5.6796875, "learning_rate": 1.0684287729016828e-07, "loss": 0.3692, "reward": 0.7315848618745804, "reward_std": 0.32014407962560654, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6110491305589676, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 1410.5781860351562, "epoch": 0.6777686505862146, "grad_norm": 28.509201049804688, "kl": 5.4921875, "learning_rate": 1.0671583630044177e-07, "loss": 0.3148, "reward": 0.7254464477300644, "reward_std": 0.27926065400242805, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982142984867096, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 1325.7076721191406, "epoch": 0.6780673586737361, "grad_norm": 16.10182762145996, "kl": 3.58203125, "learning_rate": 1.065888587181729e-07, "loss": 0.1964, "reward": 0.6813616454601288, "reward_std": 0.25983719900250435, "rewards/accuracy_reward": 0.08035714877769351, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6010044813156128, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 1373.0491333007812, "epoch": 0.6783660667612575, "grad_norm": 12.181142807006836, "kl": 4.86328125, "learning_rate": 1.0646194468150083e-07, "loss": 0.3817, "reward": 0.6489955633878708, "reward_std": 0.24713659659028053, "rewards/accuracy_reward": 0.06919643306173384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5797991305589676, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 1370.93310546875, "epoch": 0.6786647748487791, "grad_norm": 13.284525871276855, "kl": 4.28515625, "learning_rate": 1.0633509432849562e-07, "loss": 0.3018, "reward": 0.6796875298023224, "reward_std": 0.33276815712451935, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5792411118745804, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 1344.9152221679688, "epoch": 0.6789634829363005, "grad_norm": 12.134236335754395, "kl": 4.66015625, "learning_rate": 1.0620830779715798e-07, "loss": 0.2122, "reward": 0.7059152126312256, "reward_std": 0.2530835457146168, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 1312.2143249511719, "epoch": 0.679262191023822, "grad_norm": 10.417510032653809, "kl": 3.982421875, "learning_rate": 1.0608158522541924e-07, "loss": 0.2318, "reward": 0.6958705633878708, "reward_std": 0.2567104622721672, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 1253.96435546875, "epoch": 0.6795608991113434, "grad_norm": 13.804922103881836, "kl": 2.71875, "learning_rate": 1.059549267511412e-07, "loss": 0.1528, "reward": 0.671316996216774, "reward_std": 0.27210231497883797, "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6177455633878708, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 1274.8438110351562, "epoch": 0.679859607198865, "grad_norm": 17.061227798461914, "kl": 3.3125, "learning_rate": 1.0582833251211576e-07, "loss": 0.1806, "reward": 0.6668526977300644, "reward_std": 0.29580795019865036, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5842634290456772, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 1305.1340026855469, "epoch": 0.6801583152863864, "grad_norm": 14.93547248840332, "kl": 2.75, "learning_rate": 1.057018026460651e-07, "loss": 0.177, "reward": 0.787388414144516, "reward_std": 0.3120637349784374, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 1347.8504943847656, "epoch": 0.6804570233739079, "grad_norm": 11.26961898803711, "kl": 2.91015625, "learning_rate": 1.0557533729064138e-07, "loss": 0.1936, "reward": 0.6718750298023224, "reward_std": 0.2814745269715786, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 1203.7701416015625, "epoch": 0.6807557314614293, "grad_norm": 19.066024780273438, "kl": 2.67578125, "learning_rate": 1.0544893658342637e-07, "loss": 0.2265, "reward": 0.6473214775323868, "reward_std": 0.2774766683578491, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026786118745804, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 1344.0067749023438, "epoch": 0.6810544395489508, "grad_norm": 8.024489402770996, "kl": 4.0234375, "learning_rate": 1.0532260066193173e-07, "loss": 0.251, "reward": 0.7399553954601288, "reward_std": 0.27210915461182594, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5948660969734192, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 1335.1072387695312, "epoch": 0.6813531476364723, "grad_norm": 5.953866481781006, "kl": 3.71484375, "learning_rate": 1.0519632966359863e-07, "loss": 0.2529, "reward": 0.7109375298023224, "reward_std": 0.2612548917531967, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5970982313156128, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 1225.4063110351562, "epoch": 0.6816518557239938, "grad_norm": 6.262638568878174, "kl": 3.5625, "learning_rate": 1.0507012372579738e-07, "loss": 0.2364, "reward": 0.6333705633878708, "reward_std": 0.28285492211580276, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.590959832072258, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 1272.1875610351562, "epoch": 0.6819505638115152, "grad_norm": 16.660594940185547, "kl": 4.1640625, "learning_rate": 1.0494398298582778e-07, "loss": 0.264, "reward": 0.7472098618745804, "reward_std": 0.2548659220337868, "rewards/accuracy_reward": 0.14732143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.599888414144516, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 1234.0603637695312, "epoch": 0.6822492718990366, "grad_norm": 17.326435089111328, "kl": 3.88671875, "learning_rate": 1.0481790758091862e-07, "loss": 0.2052, "reward": 0.6813616305589676, "reward_std": 0.28053176030516624, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5943080484867096, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 1379.9621276855469, "epoch": 0.6825479799865581, "grad_norm": 16.289024353027344, "kl": 4.29296875, "learning_rate": 1.0469189764822749e-07, "loss": 0.1648, "reward": 0.6997768133878708, "reward_std": 0.2740178219974041, "rewards/accuracy_reward": 0.09598215157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6037946790456772, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 1347.7679138183594, "epoch": 0.6828466880740796, "grad_norm": 9.668549537658691, "kl": 4.2265625, "learning_rate": 1.0456595332484093e-07, "loss": 0.2898, "reward": 0.6741071790456772, "reward_std": 0.29271210730075836, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.573660746216774, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 1312.1228332519531, "epoch": 0.6831453961616011, "grad_norm": 10.529151916503906, "kl": 3.44921875, "learning_rate": 1.0444007474777393e-07, "loss": 0.2172, "reward": 0.641183078289032, "reward_std": 0.2787024527788162, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.592075914144516, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 1201.2031860351562, "epoch": 0.6834441042491225, "grad_norm": 6.904878616333008, "kl": 3.578125, "learning_rate": 1.043142620539701e-07, "loss": 0.2936, "reward": 0.7500000447034836, "reward_std": 0.28998078778386116, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 1305.482177734375, "epoch": 0.683742812336644, "grad_norm": 6.334644794464111, "kl": 2.806640625, "learning_rate": 1.0418851538030135e-07, "loss": 0.1332, "reward": 0.7131696790456772, "reward_std": 0.2615365609526634, "rewards/accuracy_reward": 0.12723214854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375298023224, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 1227.1763916015625, "epoch": 0.6840415204241654, "grad_norm": 16.890336990356445, "kl": 2.5625, "learning_rate": 1.0406283486356766e-07, "loss": 0.2312, "reward": 0.7952009290456772, "reward_std": 0.24089835584163666, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.625558078289032, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 1238.4844055175781, "epoch": 0.684340228511687, "grad_norm": 7.136548042297363, "kl": 3.140625, "learning_rate": 1.0393722064049714e-07, "loss": 0.188, "reward": 0.708147332072258, "reward_std": 0.3037822097539902, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 1374.8527526855469, "epoch": 0.6846389365992084, "grad_norm": 9.089056968688965, "kl": 4.859375, "learning_rate": 1.0381167284774582e-07, "loss": 0.2363, "reward": 0.650669664144516, "reward_std": 0.2959219366312027, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5770089477300644, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 1302.5670166015625, "epoch": 0.6849376446867299, "grad_norm": 13.70199203491211, "kl": 3.703125, "learning_rate": 1.0368619162189732e-07, "loss": 0.2481, "reward": 0.7421875298023224, "reward_std": 0.3043416142463684, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339626312256, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 1235.7522888183594, "epoch": 0.6852363527742513, "grad_norm": 9.203460693359375, "kl": 3.515625, "learning_rate": 1.0356077709946289e-07, "loss": 0.2196, "reward": 0.6434151977300644, "reward_std": 0.26158539950847626, "rewards/accuracy_reward": 0.04464285960420966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5987723469734192, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 1225.3840026855469, "epoch": 0.6855350608617728, "grad_norm": 11.907964706420898, "kl": 3.39453125, "learning_rate": 1.0343542941688138e-07, "loss": 0.2822, "reward": 0.7170759290456772, "reward_std": 0.28034820035099983, "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6054687798023224, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 1252.3772888183594, "epoch": 0.6858337689492943, "grad_norm": 10.350884437561035, "kl": 3.66796875, "learning_rate": 1.0331014871051863e-07, "loss": 0.3216, "reward": 0.7477678954601288, "reward_std": 0.2866504415869713, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964626312256, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 1249.2188110351562, "epoch": 0.6861324770368158, "grad_norm": 14.122386932373047, "kl": 3.3984375, "learning_rate": 1.031849351166678e-07, "loss": 0.276, "reward": 0.7818080633878708, "reward_std": 0.27217550575733185, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5853794813156128, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 1322.5045166015625, "epoch": 0.6864311851243372, "grad_norm": 6.600432872772217, "kl": 3.44140625, "learning_rate": 1.030597887715491e-07, "loss": 0.2528, "reward": 0.6685268133878708, "reward_std": 0.25527651607990265, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5926339477300644, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 1405.9465026855469, "epoch": 0.6867298932118587, "grad_norm": 12.145788192749023, "kl": 4.31640625, "learning_rate": 1.0293470981130938e-07, "loss": 0.1995, "reward": 0.6640625298023224, "reward_std": 0.24141032248735428, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125149011612, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 1263.0134582519531, "epoch": 0.6870286012993801, "grad_norm": 8.587471008300781, "kl": 3.50390625, "learning_rate": 1.0280969837202238e-07, "loss": 0.249, "reward": 0.6690848618745804, "reward_std": 0.2907792739570141, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6043526977300644, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 1326.7701721191406, "epoch": 0.6873273093869017, "grad_norm": 5.3577961921691895, "kl": 3.51171875, "learning_rate": 1.0268475458968817e-07, "loss": 0.2322, "reward": 0.6473214626312256, "reward_std": 0.3002990186214447, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 1324.2701110839844, "epoch": 0.6876260174744231, "grad_norm": 9.560115814208984, "kl": 3.34765625, "learning_rate": 1.0255987860023345e-07, "loss": 0.1809, "reward": 0.7075893133878708, "reward_std": 0.2587887719273567, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.604910746216774, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 1377.0291137695312, "epoch": 0.6879247255619446, "grad_norm": 4.710623741149902, "kl": 3.3359375, "learning_rate": 1.0243507053951106e-07, "loss": 0.2226, "reward": 0.6835937649011612, "reward_std": 0.2997298091650009, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5675223469734192, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 1293.07373046875, "epoch": 0.688223433649466, "grad_norm": 6.982100486755371, "kl": 2.646484375, "learning_rate": 1.0231033054329986e-07, "loss": 0.1761, "reward": 0.6919643133878708, "reward_std": 0.2740386947989464, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 1304.43310546875, "epoch": 0.6885221417369876, "grad_norm": 8.509564399719238, "kl": 2.9765625, "learning_rate": 1.0218565874730479e-07, "loss": 0.1923, "reward": 0.761160746216774, "reward_std": 0.3250751718878746, "rewards/accuracy_reward": 0.18750001396983862, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607313156128, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 1271.2478332519531, "epoch": 0.688820849824509, "grad_norm": 8.550511360168457, "kl": 3.09765625, "learning_rate": 1.0206105528715657e-07, "loss": 0.2613, "reward": 0.7583705633878708, "reward_std": 0.28846072405576706, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 1286.0513916015625, "epoch": 0.6891195579120305, "grad_norm": 11.335152626037598, "kl": 3.052734375, "learning_rate": 1.0193652029841147e-07, "loss": 0.2208, "reward": 0.6690848618745804, "reward_std": 0.3247236981987953, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.593191996216774, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 1253.7634582519531, "epoch": 0.6894182659995519, "grad_norm": 5.213254928588867, "kl": 3.5, "learning_rate": 1.0181205391655141e-07, "loss": 0.1845, "reward": 0.6484375149011612, "reward_std": 0.2883983924984932, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375149011612, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 1255.9888916015625, "epoch": 0.6897169740870734, "grad_norm": 5.008433818817139, "kl": 2.69140625, "learning_rate": 1.0168765627698363e-07, "loss": 0.1497, "reward": 0.7142857611179352, "reward_std": 0.3065972812473774, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 1282.919677734375, "epoch": 0.6900156821745949, "grad_norm": 10.218542098999023, "kl": 2.98046875, "learning_rate": 1.015633275150405e-07, "loss": 0.2169, "reward": 0.7081473469734192, "reward_std": 0.2698041684925556, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965401977300644, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 1272.5201110839844, "epoch": 0.6903143902621164, "grad_norm": 30.616607666015625, "kl": 4.625, "learning_rate": 1.0143906776597959e-07, "loss": 0.263, "reward": 0.6540178805589676, "reward_std": 0.26825369894504547, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5959821790456772, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 1328.0179138183594, "epoch": 0.6906130983496378, "grad_norm": 8.677101135253906, "kl": 3.76171875, "learning_rate": 1.0131487716498335e-07, "loss": 0.2546, "reward": 0.7047991454601288, "reward_std": 0.2815021425485611, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 1310.1786193847656, "epoch": 0.6909118064371593, "grad_norm": 8.113252639770508, "kl": 3.78125, "learning_rate": 1.011907558471589e-07, "loss": 0.2783, "reward": 0.7299107313156128, "reward_std": 0.2740260027348995, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750149011612, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 1274.1451416015625, "epoch": 0.6912105145246807, "grad_norm": 35.1121826171875, "kl": 5.6171875, "learning_rate": 1.0106670394753814e-07, "loss": 0.3902, "reward": 0.616629496216774, "reward_std": 0.28494788706302643, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.576450914144516, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 1339.5067443847656, "epoch": 0.6915092226122023, "grad_norm": 22.978851318359375, "kl": 4.3671875, "learning_rate": 1.009427216010773e-07, "loss": 0.2544, "reward": 0.7371652126312256, "reward_std": 0.2991964928805828, "rewards/accuracy_reward": 0.17187501303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5652902126312256, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 1244.7299499511719, "epoch": 0.6918079306997237, "grad_norm": 7.3316650390625, "kl": 3.40234375, "learning_rate": 1.008188089426571e-07, "loss": 0.1974, "reward": 0.6930803954601288, "reward_std": 0.2736350931227207, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303954601288, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 1308.5982666015625, "epoch": 0.6921066387872452, "grad_norm": 4.695497035980225, "kl": 3.037109375, "learning_rate": 1.0069496610708235e-07, "loss": 0.2101, "reward": 0.667410746216774, "reward_std": 0.2995019927620888, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500298023224, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 1327.6429138183594, "epoch": 0.6924053468747666, "grad_norm": 7.834883213043213, "kl": 3.78515625, "learning_rate": 1.0057119322908188e-07, "loss": 0.2321, "reward": 0.6796875298023224, "reward_std": 0.2772112190723419, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5904018133878708, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 1415.5245971679688, "epoch": 0.6927040549622882, "grad_norm": 8.200823783874512, "kl": 4.23046875, "learning_rate": 1.0044749044330847e-07, "loss": 0.1236, "reward": 0.6579241305589676, "reward_std": 0.29142703860998154, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.575334832072258, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 1316.6250610351562, "epoch": 0.6930027630498096, "grad_norm": 9.017902374267578, "kl": 3.078125, "learning_rate": 1.0032385788433869e-07, "loss": 0.16, "reward": 0.7137277275323868, "reward_std": 0.3106463924050331, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 1315.9911499023438, "epoch": 0.6933014711373311, "grad_norm": 8.776732444763184, "kl": 10.53515625, "learning_rate": 1.0020029568667252e-07, "loss": 0.2497, "reward": 0.6065848469734192, "reward_std": 0.25708920881152153, "rewards/accuracy_reward": 0.02901785890571773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.577566996216774, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 1181.3861999511719, "epoch": 0.6936001792248525, "grad_norm": 12.33814811706543, "kl": 3.30078125, "learning_rate": 1.0007680398473359e-07, "loss": 0.1711, "reward": 0.7036830633878708, "reward_std": 0.28370194882154465, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6099330484867096, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 1199.3750457763672, "epoch": 0.693898887312374, "grad_norm": 10.47700309753418, "kl": 4.34375, "learning_rate": 9.995338291286883e-08, "loss": 0.2323, "reward": 0.7472098618745804, "reward_std": 0.3090968281030655, "rewards/accuracy_reward": 0.1562500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5909598469734192, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 1335.8058471679688, "epoch": 0.6941975953998955, "grad_norm": 8.364538192749023, "kl": 2.98828125, "learning_rate": 9.983003260534815e-08, "loss": 0.1669, "reward": 0.6791294813156128, "reward_std": 0.2504252716898918, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.607700914144516, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 1290.0246276855469, "epoch": 0.694496303487417, "grad_norm": 11.89277458190918, "kl": 4.0, "learning_rate": 9.970675319636466e-08, "loss": 0.2377, "reward": 0.637276828289032, "reward_std": 0.2589140757918358, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.581473246216774, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 1260.8125610351562, "epoch": 0.6947950115749384, "grad_norm": 14.621297836303711, "kl": 5.8828125, "learning_rate": 9.958354482003432e-08, "loss": 0.3675, "reward": 0.6941964626312256, "reward_std": 0.3379990719258785, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214626312256, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 1252.1473693847656, "epoch": 0.6950937196624598, "grad_norm": 6.911675930023193, "kl": 3.84765625, "learning_rate": 9.946040761039563e-08, "loss": 0.1722, "reward": 0.7198660969734192, "reward_std": 0.30083436891436577, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.597098246216774, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 1346.3907165527344, "epoch": 0.6953924277499813, "grad_norm": 8.59952449798584, "kl": 3.8359375, "learning_rate": 9.933734170140996e-08, "loss": 0.2577, "reward": 0.6902902126312256, "reward_std": 0.26429567113518715, "rewards/accuracy_reward": 0.10491072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585379496216774, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 1242.7813110351562, "epoch": 0.6956911358375027, "grad_norm": 8.230659484863281, "kl": 4.0859375, "learning_rate": 9.921434722696086e-08, "loss": 0.3143, "reward": 0.726004496216774, "reward_std": 0.2933182567358017, "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.594308078289032, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 1184.8594055175781, "epoch": 0.6959898439250243, "grad_norm": 6.3023200035095215, "kl": 3.33203125, "learning_rate": 9.909142432085427e-08, "loss": 0.2127, "reward": 0.7260044813156128, "reward_std": 0.30354177951812744, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5831473469734192, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 1162.4732666015625, "epoch": 0.6962885520125457, "grad_norm": 6.266427993774414, "kl": 3.59375, "learning_rate": 9.896857311681834e-08, "loss": 0.2782, "reward": 0.6869419813156128, "reward_std": 0.3207213804125786, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5842634290456772, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 1304.4063110351562, "epoch": 0.6965872601000672, "grad_norm": 6.980648040771484, "kl": 3.71875, "learning_rate": 9.884579374850303e-08, "loss": 0.2491, "reward": 0.7427455633878708, "reward_std": 0.2867315299808979, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5976562798023224, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 1214.8415832519531, "epoch": 0.6968859681875886, "grad_norm": 16.840261459350586, "kl": 2.681640625, "learning_rate": 9.87230863494803e-08, "loss": 0.1909, "reward": 0.6718750298023224, "reward_std": 0.28727586939930916, "rewards/accuracy_reward": 0.06026786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.611607164144516, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 1361.9978332519531, "epoch": 0.6971846762751102, "grad_norm": 17.215986251831055, "kl": 3.2890625, "learning_rate": 9.860045105324385e-08, "loss": 0.3289, "reward": 0.613839328289032, "reward_std": 0.3041745088994503, "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5602678805589676, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 1321.1183471679688, "epoch": 0.6974833843626316, "grad_norm": 13.37051773071289, "kl": 2.662109375, "learning_rate": 9.847788799320874e-08, "loss": 0.222, "reward": 0.7137277126312256, "reward_std": 0.2636103890836239, "rewards/accuracy_reward": 0.08928571967408061, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6244419813156128, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 1337.83935546875, "epoch": 0.6977820924501531, "grad_norm": 10.646956443786621, "kl": 3.765625, "learning_rate": 9.835539730271165e-08, "loss": 0.2483, "reward": 0.711495578289032, "reward_std": 0.2802666947245598, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.584263414144516, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 1309.8214721679688, "epoch": 0.6980808005376745, "grad_norm": 9.508569717407227, "kl": 3.29296875, "learning_rate": 9.823297911501044e-08, "loss": 0.1891, "reward": 0.669084832072258, "reward_std": 0.24774615839123726, "rewards/accuracy_reward": 0.058035718044266105, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6110491305589676, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 1269.1205749511719, "epoch": 0.698379508625196, "grad_norm": 8.95870304107666, "kl": 3.52734375, "learning_rate": 9.811063356328405e-08, "loss": 0.1975, "reward": 0.6768973469734192, "reward_std": 0.2726754695177078, "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5898437798023224, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 1283.41748046875, "epoch": 0.6986782167127175, "grad_norm": 12.749387741088867, "kl": 3.30859375, "learning_rate": 9.798836078063249e-08, "loss": 0.1596, "reward": 0.686941996216774, "reward_std": 0.25389809533953667, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919813156128, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 1229.6518249511719, "epoch": 0.698976924800239, "grad_norm": 9.466233253479004, "kl": 3.63671875, "learning_rate": 9.786616090007655e-08, "loss": 0.3202, "reward": 0.6964285969734192, "reward_std": 0.2986431419849396, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6004464626312256, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 1184.1317443847656, "epoch": 0.6992756328877604, "grad_norm": 9.157320022583008, "kl": 2.923828125, "learning_rate": 9.774403405455775e-08, "loss": 0.1876, "reward": 0.7187500298023224, "reward_std": 0.2520625777542591, "rewards/accuracy_reward": 0.12053571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5982143133878708, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 1157.6183624267578, "epoch": 0.6995743409752819, "grad_norm": 14.85285758972168, "kl": 3.375, "learning_rate": 9.762198037693811e-08, "loss": 0.2411, "reward": 0.6534598469734192, "reward_std": 0.2525767460465431, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6132812798023224, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 1254.7187805175781, "epoch": 0.6998730490628033, "grad_norm": 5.239717483520508, "kl": 3.7421875, "learning_rate": 9.750000000000003e-08, "loss": 0.1852, "reward": 0.7460937798023224, "reward_std": 0.32147248834371567, "rewards/accuracy_reward": 0.1651785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809151977300644, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 1262.8973693847656, "epoch": 0.7001717571503249, "grad_norm": 5.808890342712402, "kl": 3.859375, "learning_rate": 9.737809305644623e-08, "loss": 0.285, "reward": 0.7098214477300644, "reward_std": 0.2799481153488159, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178954601288, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 1325.5111999511719, "epoch": 0.7004704652378463, "grad_norm": 4.100822448730469, "kl": 2.90625, "learning_rate": 9.725625967889956e-08, "loss": 0.1315, "reward": 0.6757812798023224, "reward_std": 0.26172278821468353, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5708705633878708, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 1326.9665832519531, "epoch": 0.7007691733253678, "grad_norm": 4.460084438323975, "kl": 3.30859375, "learning_rate": 9.713449999990277e-08, "loss": 0.2133, "reward": 0.6277901977300644, "reward_std": 0.22736192122101784, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187798023224, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 1363.0692443847656, "epoch": 0.7010678814128892, "grad_norm": 5.208163261413574, "kl": 3.296875, "learning_rate": 9.70128141519184e-08, "loss": 0.2151, "reward": 0.6556919813156128, "reward_std": 0.2862604595720768, "rewards/accuracy_reward": 0.05133928661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6043527126312256, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 1356.60498046875, "epoch": 0.7013665895004108, "grad_norm": 4.828615665435791, "kl": 3.55078125, "learning_rate": 9.689120226732881e-08, "loss": 0.1762, "reward": 0.671316996216774, "reward_std": 0.2544281780719757, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 1169.79248046875, "epoch": 0.7016652975879322, "grad_norm": 16.84052085876465, "kl": 3.734375, "learning_rate": 9.676966447843576e-08, "loss": 0.1744, "reward": 0.683035746216774, "reward_std": 0.2943399176001549, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 1244.0558776855469, "epoch": 0.7019640056754537, "grad_norm": 6.768239974975586, "kl": 3.15625, "learning_rate": 9.664820091746043e-08, "loss": 0.2149, "reward": 0.7159598469734192, "reward_std": 0.25970980897545815, "rewards/accuracy_reward": 0.12276786239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919813156128, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 1271.5804138183594, "epoch": 0.7022627137629751, "grad_norm": 6.253688335418701, "kl": 3.1328125, "learning_rate": 9.652681171654338e-08, "loss": 0.2204, "reward": 0.5982143133878708, "reward_std": 0.2681058496236801, "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669642984867096, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 1121.2723693847656, "epoch": 0.7025614218504966, "grad_norm": 8.073145866394043, "kl": 4.38671875, "learning_rate": 9.640549700774402e-08, "loss": 0.2647, "reward": 0.8091518133878708, "reward_std": 0.3530017212033272, "rewards/accuracy_reward": 0.2388392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125149011612, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 1238.24560546875, "epoch": 0.702860129938018, "grad_norm": 9.713029861450195, "kl": 2.8515625, "learning_rate": 9.628425692304102e-08, "loss": 0.2516, "reward": 0.6919643133878708, "reward_std": 0.3011253774166107, "rewards/accuracy_reward": 0.10714286332949996, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 1420.5045166015625, "epoch": 0.7031588380255396, "grad_norm": 8.465681076049805, "kl": 2.986328125, "learning_rate": 9.616309159433153e-08, "loss": 0.2373, "reward": 0.666294664144516, "reward_std": 0.30399011075496674, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.579241082072258, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 1223.7924499511719, "epoch": 0.703457546113061, "grad_norm": 3.346055507659912, "kl": 3.171875, "learning_rate": 9.604200115343168e-08, "loss": 0.1897, "reward": 0.6422991454601288, "reward_std": 0.27297185361385345, "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5641741156578064, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 1312.2500610351562, "epoch": 0.7037562542005825, "grad_norm": 4.060539245605469, "kl": 3.26171875, "learning_rate": 9.592098573207597e-08, "loss": 0.2405, "reward": 0.6450893133878708, "reward_std": 0.26909032464027405, "rewards/accuracy_reward": 0.051339287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500298023224, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 1209.9754638671875, "epoch": 0.7040549622881039, "grad_norm": 13.4741849899292, "kl": 3.98828125, "learning_rate": 9.58000454619173e-08, "loss": 0.253, "reward": 0.7187500298023224, "reward_std": 0.28337881714105606, "rewards/accuracy_reward": 0.13616072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 1191.1562805175781, "epoch": 0.7043536703756255, "grad_norm": 13.953530311584473, "kl": 3.078125, "learning_rate": 9.567918047452682e-08, "loss": 0.1967, "reward": 0.748325914144516, "reward_std": 0.2795608937740326, "rewards/accuracy_reward": 0.16294643888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585379496216774, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 1306.5514221191406, "epoch": 0.7046523784631469, "grad_norm": 4.803835391998291, "kl": 2.607421875, "learning_rate": 9.555839090139387e-08, "loss": 0.1185, "reward": 0.6802455633878708, "reward_std": 0.286084558814764, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.606584832072258, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 1320.3839721679688, "epoch": 0.7049510865506684, "grad_norm": 9.575790405273438, "kl": 2.96484375, "learning_rate": 9.543767687392557e-08, "loss": 0.2659, "reward": 0.7148437649011612, "reward_std": 0.31097670644521713, "rewards/accuracy_reward": 0.11383929080329835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.601004496216774, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 1296.41748046875, "epoch": 0.7052497946381898, "grad_norm": 4.206470489501953, "kl": 3.05859375, "learning_rate": 9.531703852344697e-08, "loss": 0.2245, "reward": 0.6149553954601288, "reward_std": 0.24276749789714813, "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5881696790456772, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 1310.7991638183594, "epoch": 0.7055485027257113, "grad_norm": 6.636565685272217, "kl": 2.81640625, "learning_rate": 9.519647598120087e-08, "loss": 0.1951, "reward": 0.6986607313156128, "reward_std": 0.2702910602092743, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.602678582072258, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 1292.1965026855469, "epoch": 0.7058472108132328, "grad_norm": 6.0662360191345215, "kl": 3.17578125, "learning_rate": 9.507598937834737e-08, "loss": 0.2446, "reward": 0.6824777126312256, "reward_std": 0.3114074766635895, "rewards/accuracy_reward": 0.09375000093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 1317.935302734375, "epoch": 0.7061459189007543, "grad_norm": 6.625920295715332, "kl": 3.171875, "learning_rate": 9.495557884596411e-08, "loss": 0.2082, "reward": 0.679129496216774, "reward_std": 0.2755851410329342, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5965402126312256, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 1434.8639526367188, "epoch": 0.7064446269882757, "grad_norm": 20.878808975219727, "kl": 4.5390625, "learning_rate": 9.483524451504605e-08, "loss": 0.3328, "reward": 0.6383928954601288, "reward_std": 0.25362880900502205, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625000298023224, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 1221.5692443847656, "epoch": 0.7067433350757972, "grad_norm": 11.851842880249023, "kl": 8.8203125, "learning_rate": 9.471498651650501e-08, "loss": 0.1819, "reward": 0.8225446790456772, "reward_std": 0.2683912068605423, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6037946790456772, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 1395.2835083007812, "epoch": 0.7070420431633186, "grad_norm": 10.966005325317383, "kl": 3.89453125, "learning_rate": 9.459480498117e-08, "loss": 0.2425, "reward": 0.733816996216774, "reward_std": 0.26096317544579506, "rewards/accuracy_reward": 0.14508929522708058, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 1247.810302734375, "epoch": 0.7073407512508402, "grad_norm": 11.211149215698242, "kl": 2.734375, "learning_rate": 9.447470003978669e-08, "loss": 0.1815, "reward": 0.7321428954601288, "reward_std": 0.2665700539946556, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 1252.6473999023438, "epoch": 0.7076394593383616, "grad_norm": 8.666232109069824, "kl": 3.15234375, "learning_rate": 9.435467182301751e-08, "loss": 0.3462, "reward": 0.6975446790456772, "reward_std": 0.2756659463047981, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375149011612, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 1361.2098693847656, "epoch": 0.707938167425883, "grad_norm": 9.12386417388916, "kl": 2.640625, "learning_rate": 9.423472046144147e-08, "loss": 0.1978, "reward": 0.6774553805589676, "reward_std": 0.2626424469053745, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6060268133878708, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 1233.0938110351562, "epoch": 0.7082368755134045, "grad_norm": 10.492502212524414, "kl": 3.1015625, "learning_rate": 9.411484608555373e-08, "loss": 0.2707, "reward": 0.707589328289032, "reward_std": 0.32741425186395645, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 1190.4844055175781, "epoch": 0.7085355836009259, "grad_norm": 12.646745681762695, "kl": 3.7421875, "learning_rate": 9.399504882576599e-08, "loss": 0.1624, "reward": 0.6986607611179352, "reward_std": 0.26093753427267075, "rewards/accuracy_reward": 0.10937500302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 1240.7455749511719, "epoch": 0.7088342916884475, "grad_norm": 9.225347518920898, "kl": 2.62109375, "learning_rate": 9.387532881240595e-08, "loss": 0.0501, "reward": 0.668526828289032, "reward_std": 0.24313373491168022, "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.612723246216774, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 1326.5804138183594, "epoch": 0.7091329997759689, "grad_norm": 12.524370193481445, "kl": 3.375, "learning_rate": 9.375568617571714e-08, "loss": 0.2581, "reward": 0.6629464626312256, "reward_std": 0.2972703129053116, "rewards/accuracy_reward": 0.0915178635623306, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5714286118745804, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 1253.88623046875, "epoch": 0.7094317078634904, "grad_norm": 56.7125358581543, "kl": 4.7421875, "learning_rate": 9.363612104585907e-08, "loss": 0.3362, "reward": 0.660714328289032, "reward_std": 0.2788502499461174, "rewards/accuracy_reward": 0.10267857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5580357313156128, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 1191.3728637695312, "epoch": 0.7097304159510118, "grad_norm": 15.170226097106934, "kl": 11.15234375, "learning_rate": 9.351663355290693e-08, "loss": 0.3034, "reward": 0.7148437798023224, "reward_std": 0.30759797990322113, "rewards/accuracy_reward": 0.11383929383009672, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.601004496216774, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 1265.4866638183594, "epoch": 0.7100291240385334, "grad_norm": 9.430963516235352, "kl": 3.73046875, "learning_rate": 9.339722382685133e-08, "loss": 0.3154, "reward": 0.693638414144516, "reward_std": 0.2630933113396168, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312649011612, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 1286.2500610351562, "epoch": 0.7103278321260548, "grad_norm": 14.363691329956055, "kl": 3.60546875, "learning_rate": 9.327789199759839e-08, "loss": 0.2378, "reward": 0.6595982313156128, "reward_std": 0.30017556995153427, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125149011612, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 1349.6652221679688, "epoch": 0.7106265402135763, "grad_norm": 16.111955642700195, "kl": 3.51953125, "learning_rate": 9.315863819496947e-08, "loss": 0.1581, "reward": 0.7137276977300644, "reward_std": 0.23903813585639, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887276977300644, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 1218.2009582519531, "epoch": 0.7109252483010977, "grad_norm": 6.859330177307129, "kl": 2.8671875, "learning_rate": 9.303946254870093e-08, "loss": 0.2707, "reward": 0.6763393133878708, "reward_std": 0.2975311428308487, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580357164144516, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 1280.9933471679688, "epoch": 0.7112239563886192, "grad_norm": 5.526957035064697, "kl": 4.1640625, "learning_rate": 9.29203651884443e-08, "loss": 0.1753, "reward": 0.6294643133878708, "reward_std": 0.2936662584543228, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5915178805589676, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 1349.3192443847656, "epoch": 0.7115226644761407, "grad_norm": 7.174475193023682, "kl": 3.1640625, "learning_rate": 9.280134624376573e-08, "loss": 0.2322, "reward": 0.6780134290456772, "reward_std": 0.24268411099910736, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312798023224, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 1300.575927734375, "epoch": 0.7118213725636622, "grad_norm": 8.321044921875, "kl": 3.59765625, "learning_rate": 9.268240584414622e-08, "loss": 0.2307, "reward": 0.619419664144516, "reward_std": 0.29051343351602554, "rewards/accuracy_reward": 0.04687500139698386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5725446790456772, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 1332.9241638183594, "epoch": 0.7121200806511836, "grad_norm": 7.940063953399658, "kl": 3.46484375, "learning_rate": 9.256354411898132e-08, "loss": 0.225, "reward": 0.6177455633878708, "reward_std": 0.2652920186519623, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5820312798023224, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 1356.8103332519531, "epoch": 0.7124187887387051, "grad_norm": 3.73805570602417, "kl": 3.1875, "learning_rate": 9.244476119758082e-08, "loss": 0.2396, "reward": 0.6735491305589676, "reward_std": 0.27680082991719246, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5842634290456772, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 1328.6964721679688, "epoch": 0.7127174968262265, "grad_norm": 15.513975143432617, "kl": 3.16015625, "learning_rate": 9.232605720916896e-08, "loss": 0.1883, "reward": 0.683035746216774, "reward_std": 0.26868053153157234, "rewards/accuracy_reward": 0.08705357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595982164144516, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 1286.6920166015625, "epoch": 0.7130162049137481, "grad_norm": 11.142290115356445, "kl": 2.890625, "learning_rate": 9.22074322828841e-08, "loss": 0.2286, "reward": 0.6662946790456772, "reward_std": 0.2699391283094883, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6127232313156128, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 1172.1741638183594, "epoch": 0.7133149130012695, "grad_norm": 9.126872062683105, "kl": 2.51953125, "learning_rate": 9.208888654777844e-08, "loss": 0.1697, "reward": 0.7773437798023224, "reward_std": 0.2531043328344822, "rewards/accuracy_reward": 0.17857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5987723469734192, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 1160.6919860839844, "epoch": 0.713613621088791, "grad_norm": 7.7735276222229, "kl": 2.6953125, "learning_rate": 9.197042013281816e-08, "loss": 0.2435, "reward": 0.729910746216774, "reward_std": 0.25304046273231506, "rewards/accuracy_reward": 0.1428571459837258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5870535969734192, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 1278.638427734375, "epoch": 0.7139123291763124, "grad_norm": 9.93947982788086, "kl": 3.8515625, "learning_rate": 9.185203316688313e-08, "loss": 0.2895, "reward": 0.6222098469734192, "reward_std": 0.28531399741768837, "rewards/accuracy_reward": 0.03794643119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.584263414144516, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 1288.9353332519531, "epoch": 0.714211037263834, "grad_norm": 7.667771339416504, "kl": 3.29296875, "learning_rate": 9.173372577876671e-08, "loss": 0.2749, "reward": 0.7187500298023224, "reward_std": 0.2849833592772484, "rewards/accuracy_reward": 0.13616072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5825893133878708, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 1215.1875457763672, "epoch": 0.7145097453513554, "grad_norm": 6.463720321655273, "kl": 3.19921875, "learning_rate": 9.161549809717577e-08, "loss": 0.3185, "reward": 0.6897321790456772, "reward_std": 0.24646060541272163, "rewards/accuracy_reward": 0.11160715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250149011612, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 1239.8683471679688, "epoch": 0.7148084534388769, "grad_norm": 7.574667453765869, "kl": 3.4765625, "learning_rate": 9.149735025073051e-08, "loss": 0.1896, "reward": 0.6763393133878708, "reward_std": 0.2982737794518471, "rewards/accuracy_reward": 0.09598214854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5803571492433548, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 1319.9777221679688, "epoch": 0.7151071615263983, "grad_norm": 4.391722202301025, "kl": 3.08203125, "learning_rate": 9.137928236796407e-08, "loss": 0.2236, "reward": 0.627232164144516, "reward_std": 0.27201249822974205, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5803571492433548, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 1248.6808471679688, "epoch": 0.7154058696139198, "grad_norm": 12.88924789428711, "kl": 2.96484375, "learning_rate": 9.126129457732288e-08, "loss": 0.2491, "reward": 0.662388414144516, "reward_std": 0.2688029371201992, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5864955633878708, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 1286.6808776855469, "epoch": 0.7157045777014412, "grad_norm": 4.2848615646362305, "kl": 2.8671875, "learning_rate": 9.114338700716595e-08, "loss": 0.2993, "reward": 0.723214328289032, "reward_std": 0.35130756348371506, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580357164144516, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 1119.2254943847656, "epoch": 0.7160032857889628, "grad_norm": 5.271495819091797, "kl": 2.8046875, "learning_rate": 9.102555978576525e-08, "loss": 0.2443, "reward": 0.702566996216774, "reward_std": 0.2867296189069748, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205633878708, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 1276.71435546875, "epoch": 0.7163019938764842, "grad_norm": 8.559344291687012, "kl": 2.69140625, "learning_rate": 9.090781304130529e-08, "loss": 0.2247, "reward": 0.7293527126312256, "reward_std": 0.3090045303106308, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.593191996216774, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 1186.8795166015625, "epoch": 0.7166007019640057, "grad_norm": 12.730549812316895, "kl": 2.826171875, "learning_rate": 9.079014690188285e-08, "loss": 0.2394, "reward": 0.6612723469734192, "reward_std": 0.2588639557361603, "rewards/accuracy_reward": 0.07366071548312902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5876116305589676, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 1270.7924499511719, "epoch": 0.7168994100515271, "grad_norm": 5.14673376083374, "kl": 3.029296875, "learning_rate": 9.067256149550724e-08, "loss": 0.1609, "reward": 0.767857164144516, "reward_std": 0.2847466431558132, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580357164144516, "step": 2400 }, { "clip_ratio": 0.0, "completion_length": 1488.524658203125, "epoch": 0.7171981181390487, "grad_norm": 2.9017698764801025, "kl": 0.07476806640625, "learning_rate": 9.055505695009991e-08, "loss": 0.1349, "reward": 0.8643973469734192, "reward_std": 0.24767646566033363, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6880580633878708, "step": 2401 }, { "clip_ratio": 0.0, "completion_length": 1318.8103332519531, "epoch": 0.7174968262265701, "grad_norm": 1.7965271472930908, "kl": 0.0814208984375, "learning_rate": 9.043763339349416e-08, "loss": 0.1551, "reward": 0.7516741454601288, "reward_std": 0.26677660271525383, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6579241305589676, "step": 2402 }, { "clip_ratio": 0.0, "completion_length": 1431.6719360351562, "epoch": 0.7177955343140916, "grad_norm": 2.190194606781006, "kl": 0.0699462890625, "learning_rate": 9.03202909534354e-08, "loss": 0.134, "reward": 0.8956473618745804, "reward_std": 0.2803487181663513, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7081473469734192, "step": 2403 }, { "clip_ratio": 0.0, "completion_length": 1445.1317749023438, "epoch": 0.718094242401613, "grad_norm": 1.4968972206115723, "kl": 0.094970703125, "learning_rate": 9.020302975758073e-08, "loss": 0.1274, "reward": 0.832589328289032, "reward_std": 0.2561938129365444, "rewards/accuracy_reward": 0.1517857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6808035969734192, "step": 2404 }, { "clip_ratio": 0.0, "completion_length": 1398.4375610351562, "epoch": 0.7183929504891345, "grad_norm": 1.7282212972640991, "kl": 0.0638427734375, "learning_rate": 9.008584993349872e-08, "loss": 0.1527, "reward": 0.8766741454601288, "reward_std": 0.28567566722631454, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6981027126312256, "step": 2405 }, { "clip_ratio": 0.0, "completion_length": 1429.4844665527344, "epoch": 0.718691658576656, "grad_norm": 2.4578797817230225, "kl": 0.1451416015625, "learning_rate": 8.996875160866963e-08, "loss": 0.0697, "reward": 0.8828125298023224, "reward_std": 0.22262226417660713, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6930803954601288, "step": 2406 }, { "clip_ratio": 0.0, "completion_length": 1394.6161499023438, "epoch": 0.7189903666641775, "grad_norm": 2.7526206970214844, "kl": 0.08447265625, "learning_rate": 8.985173491048493e-08, "loss": 0.1336, "reward": 0.785714328289032, "reward_std": 0.21866211667656898, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098214626312256, "step": 2407 }, { "clip_ratio": 0.0, "completion_length": 1301.5223693847656, "epoch": 0.7192890747516989, "grad_norm": 1.859082579612732, "kl": 0.08221435546875, "learning_rate": 8.973479996624734e-08, "loss": 0.1156, "reward": 0.852120578289032, "reward_std": 0.29504360258579254, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7025670111179352, "step": 2408 }, { "clip_ratio": 0.0, "completion_length": 1456.4531860351562, "epoch": 0.7195877828392204, "grad_norm": 1.3194388151168823, "kl": 0.04156494140625, "learning_rate": 8.961794690317064e-08, "loss": 0.0953, "reward": 0.82979916036129, "reward_std": 0.2458791770040989, "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7248884290456772, "step": 2409 }, { "clip_ratio": 0.0, "completion_length": 1367.08935546875, "epoch": 0.7198864909267418, "grad_norm": 1.2813589572906494, "kl": 0.06378173828125, "learning_rate": 8.950117584837943e-08, "loss": 0.1006, "reward": 0.8683036118745804, "reward_std": 0.23173322156071663, "rewards/accuracy_reward": 0.15625000977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7120536118745804, "step": 2410 }, { "clip_ratio": 0.0, "completion_length": 1363.6786193847656, "epoch": 0.7201851990142634, "grad_norm": 1.3596330881118774, "kl": 0.04541015625, "learning_rate": 8.938448692890921e-08, "loss": 0.1046, "reward": 0.8376116454601288, "reward_std": 0.2525482140481472, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7237723618745804, "step": 2411 }, { "clip_ratio": 0.0, "completion_length": 1465.1741485595703, "epoch": 0.7204839071017848, "grad_norm": 1.4262899160385132, "kl": 0.06005859375, "learning_rate": 8.926788027170614e-08, "loss": 0.0913, "reward": 0.8515625596046448, "reward_std": 0.17930520325899124, "rewards/accuracy_reward": 0.1428571513388306, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7087053954601288, "step": 2412 }, { "clip_ratio": 0.0, "completion_length": 1357.1741638183594, "epoch": 0.7207826151893062, "grad_norm": 1.9193506240844727, "kl": 0.0594482421875, "learning_rate": 8.915135600362675e-08, "loss": 0.1371, "reward": 0.7594866305589676, "reward_std": 0.2259424887597561, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7036830633878708, "step": 2413 }, { "clip_ratio": 0.0, "completion_length": 1163.0246276855469, "epoch": 0.7210813232768277, "grad_norm": 2.263153076171875, "kl": 0.06591796875, "learning_rate": 8.90349142514381e-08, "loss": 0.1294, "reward": 0.9268973618745804, "reward_std": 0.27190297469496727, "rewards/accuracy_reward": 0.18526786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.741629496216774, "step": 2414 }, { "clip_ratio": 0.0, "completion_length": 1377.83935546875, "epoch": 0.7213800313643491, "grad_norm": 11.419490814208984, "kl": 0.421630859375, "learning_rate": 8.891855514181733e-08, "loss": 0.1368, "reward": 0.9246652126312256, "reward_std": 0.25088971108198166, "rewards/accuracy_reward": 0.20535715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7193080484867096, "step": 2415 }, { "clip_ratio": 0.0, "completion_length": 1393.7902221679688, "epoch": 0.7216787394518707, "grad_norm": 1.3076660633087158, "kl": 0.032073974609375, "learning_rate": 8.880227880135179e-08, "loss": 0.1016, "reward": 0.9324777275323868, "reward_std": 0.16018594056367874, "rewards/accuracy_reward": 0.17857143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7539062947034836, "step": 2416 }, { "clip_ratio": 0.0, "completion_length": 1404.6183471679688, "epoch": 0.7219774475393921, "grad_norm": 2.2656195163726807, "kl": 0.0838623046875, "learning_rate": 8.868608535653877e-08, "loss": 0.1329, "reward": 0.8610491454601288, "reward_std": 0.23709703981876373, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7204241454601288, "step": 2417 }, { "clip_ratio": 0.0, "completion_length": 1385.4107666015625, "epoch": 0.7222761556269136, "grad_norm": 3.0917396545410156, "kl": 0.07135009765625, "learning_rate": 8.856997493378526e-08, "loss": 0.1296, "reward": 0.8208705633878708, "reward_std": 0.18417184427380562, "rewards/accuracy_reward": 0.10267857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.718191996216774, "step": 2418 }, { "clip_ratio": 0.0, "completion_length": 1360.7255249023438, "epoch": 0.722574863714435, "grad_norm": 1.8201297521591187, "kl": 0.06097412109375, "learning_rate": 8.84539476594081e-08, "loss": 0.0994, "reward": 0.9257812947034836, "reward_std": 0.190464923158288, "rewards/accuracy_reward": 0.19419643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7315848469734192, "step": 2419 }, { "clip_ratio": 0.0, "completion_length": 1299.8170166015625, "epoch": 0.7228735718019565, "grad_norm": 1.845396876335144, "kl": 0.0484619140625, "learning_rate": 8.83380036596336e-08, "loss": 0.1248, "reward": 0.9352679252624512, "reward_std": 0.22369522973895073, "rewards/accuracy_reward": 0.19419643748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7410714626312256, "step": 2420 }, { "clip_ratio": 0.0, "completion_length": 1275.5871276855469, "epoch": 0.723172279889478, "grad_norm": 2.5576090812683105, "kl": 0.03985595703125, "learning_rate": 8.822214306059737e-08, "loss": 0.089, "reward": 0.9748884290456772, "reward_std": 0.2692871317267418, "rewards/accuracy_reward": 0.2433035895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7315848618745804, "step": 2421 }, { "clip_ratio": 0.0, "completion_length": 1442.2076721191406, "epoch": 0.7234709879769995, "grad_norm": 2.0948798656463623, "kl": 0.0379638671875, "learning_rate": 8.810636598834447e-08, "loss": 0.0785, "reward": 0.8069196790456772, "reward_std": 0.19584274291992188, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7488839477300644, "step": 2422 }, { "clip_ratio": 0.0, "completion_length": 1335.8861999511719, "epoch": 0.7237696960645209, "grad_norm": 1.759556770324707, "kl": 0.03668212890625, "learning_rate": 8.799067256882902e-08, "loss": 0.0588, "reward": 0.8688616454601288, "reward_std": 0.17387894541025162, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.750558078289032, "step": 2423 }, { "clip_ratio": 0.0, "completion_length": 1428.9620971679688, "epoch": 0.7240684041520424, "grad_norm": 4.1666083335876465, "kl": 0.043731689453125, "learning_rate": 8.787506292791406e-08, "loss": 0.0977, "reward": 0.8822545111179352, "reward_std": 0.20769862830638885, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7349330633878708, "step": 2424 }, { "clip_ratio": 0.0, "completion_length": 1336.62060546875, "epoch": 0.7243671122395638, "grad_norm": 3.340583324432373, "kl": 0.04547119140625, "learning_rate": 8.775953719137157e-08, "loss": 0.0524, "reward": 0.8816964626312256, "reward_std": 0.2083745263516903, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7366071790456772, "step": 2425 }, { "clip_ratio": 0.0, "completion_length": 1343.2545471191406, "epoch": 0.7246658203270854, "grad_norm": 3.623704671859741, "kl": 0.036865234375, "learning_rate": 8.764409548488229e-08, "loss": 0.0717, "reward": 0.9481027126312256, "reward_std": 0.22321400046348572, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7382812947034836, "step": 2426 }, { "clip_ratio": 0.0, "completion_length": 1300.3839721679688, "epoch": 0.7249645284146068, "grad_norm": 5.438773155212402, "kl": 0.0443115234375, "learning_rate": 8.752873793403538e-08, "loss": 0.1117, "reward": 0.8543526977300644, "reward_std": 0.21565252169966698, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7338169813156128, "step": 2427 }, { "clip_ratio": 0.0, "completion_length": 1311.90185546875, "epoch": 0.7252632365021283, "grad_norm": 7.234127998352051, "kl": 0.06097412109375, "learning_rate": 8.741346466432866e-08, "loss": 0.1343, "reward": 0.9034598618745804, "reward_std": 0.2432812936604023, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7271205633878708, "step": 2428 }, { "clip_ratio": 0.0, "completion_length": 1421.7745971679688, "epoch": 0.7255619445896497, "grad_norm": 6.939084053039551, "kl": 0.041473388671875, "learning_rate": 8.729827580116803e-08, "loss": 0.0972, "reward": 0.835379496216774, "reward_std": 0.2007909044623375, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7193080633878708, "step": 2429 }, { "clip_ratio": 0.0, "completion_length": 1395.7813415527344, "epoch": 0.7258606526771713, "grad_norm": 3.006706953048706, "kl": 0.032440185546875, "learning_rate": 8.718317146986771e-08, "loss": 0.0473, "reward": 0.9804687798023224, "reward_std": 0.21100644022226334, "rewards/accuracy_reward": 0.2187500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7617187947034836, "step": 2430 }, { "clip_ratio": 0.0, "completion_length": 1297.7009582519531, "epoch": 0.7261593607646927, "grad_norm": 9.193634986877441, "kl": 0.06884765625, "learning_rate": 8.706815179565004e-08, "loss": 0.1163, "reward": 0.8214286118745804, "reward_std": 0.24260573461651802, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7321428954601288, "step": 2431 }, { "clip_ratio": 0.0, "completion_length": 1316.8728332519531, "epoch": 0.7264580688522142, "grad_norm": 9.236931800842285, "kl": 0.054473876953125, "learning_rate": 8.695321690364503e-08, "loss": 0.1107, "reward": 0.9475446790456772, "reward_std": 0.19664354622364044, "rewards/accuracy_reward": 0.21875001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7287946790456772, "step": 2432 }, { "clip_ratio": 0.0, "completion_length": 1366.9464721679688, "epoch": 0.7267567769397356, "grad_norm": 8.916204452514648, "kl": 0.052947998046875, "learning_rate": 8.683836691889062e-08, "loss": 0.1087, "reward": 0.9157366454601288, "reward_std": 0.23673748597502708, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7393973618745804, "step": 2433 }, { "clip_ratio": 0.0, "completion_length": 1454.9509582519531, "epoch": 0.7270554850272571, "grad_norm": 11.013885498046875, "kl": 0.06988525390625, "learning_rate": 8.672360196633237e-08, "loss": 0.1227, "reward": 0.8253348469734192, "reward_std": 0.23696309700608253, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7248884290456772, "step": 2434 }, { "clip_ratio": 0.0, "completion_length": 1251.154052734375, "epoch": 0.7273541931147786, "grad_norm": 12.794118881225586, "kl": 0.07452392578125, "learning_rate": 8.660892217082324e-08, "loss": 0.1283, "reward": 0.8761161118745804, "reward_std": 0.22811128199100494, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7131696790456772, "step": 2435 }, { "clip_ratio": 0.0, "completion_length": 1421.3036193847656, "epoch": 0.7276529012023001, "grad_norm": 4.873563766479492, "kl": 0.04254150390625, "learning_rate": 8.649432765712362e-08, "loss": 0.0726, "reward": 0.9040178954601288, "reward_std": 0.25663920119404793, "rewards/accuracy_reward": 0.15848215413279831, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7455357611179352, "step": 2436 }, { "clip_ratio": 0.0, "completion_length": 1514.6407165527344, "epoch": 0.7279516092898215, "grad_norm": 10.19567584991455, "kl": 0.0533447265625, "learning_rate": 8.637981854990117e-08, "loss": 0.0773, "reward": 0.8470982611179352, "reward_std": 0.17787379957735538, "rewards/accuracy_reward": 0.10937500419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.737723246216774, "step": 2437 }, { "clip_ratio": 0.0, "completion_length": 1365.0960388183594, "epoch": 0.728250317377343, "grad_norm": 4.511603355407715, "kl": 0.043121337890625, "learning_rate": 8.62653949737305e-08, "loss": 0.0519, "reward": 0.9229911416769028, "reward_std": 0.2257305532693863, "rewards/accuracy_reward": 0.18526786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.737723246216774, "step": 2438 }, { "clip_ratio": 0.0, "completion_length": 1348.18310546875, "epoch": 0.7285490254648644, "grad_norm": 7.289315700531006, "kl": 0.072021484375, "learning_rate": 8.615105705309332e-08, "loss": 0.0904, "reward": 0.893973246216774, "reward_std": 0.2414497435092926, "rewards/accuracy_reward": 0.16964286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7243303954601288, "step": 2439 }, { "clip_ratio": 0.0, "completion_length": 1372.4219360351562, "epoch": 0.728847733552386, "grad_norm": 7.924818515777588, "kl": 0.053863525390625, "learning_rate": 8.6036804912378e-08, "loss": 0.0944, "reward": 0.9715402275323868, "reward_std": 0.2839400991797447, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7483259290456772, "step": 2440 }, { "clip_ratio": 0.0, "completion_length": 1373.2947082519531, "epoch": 0.7291464416399074, "grad_norm": 7.3409013748168945, "kl": 0.08746337890625, "learning_rate": 8.59226386758797e-08, "loss": 0.0376, "reward": 0.9012277126312256, "reward_std": 0.16089221090078354, "rewards/accuracy_reward": 0.16741071990691125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.733816996216774, "step": 2441 }, { "clip_ratio": 0.0, "completion_length": 1395.93310546875, "epoch": 0.7294451497274289, "grad_norm": 6.586035251617432, "kl": 0.06494140625, "learning_rate": 8.580855846780016e-08, "loss": 0.0519, "reward": 0.9101562947034836, "reward_std": 0.200714323669672, "rewards/accuracy_reward": 0.15848214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7516741454601288, "step": 2442 }, { "clip_ratio": 0.0, "completion_length": 1364.5670166015625, "epoch": 0.7297438578149503, "grad_norm": 5.596032619476318, "kl": 0.075439453125, "learning_rate": 8.569456441224737e-08, "loss": 0.0259, "reward": 0.9441964775323868, "reward_std": 0.2456359788775444, "rewards/accuracy_reward": 0.2098214440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750447034836, "step": 2443 }, { "clip_ratio": 0.0, "completion_length": 1317.2545166015625, "epoch": 0.7300425659024719, "grad_norm": 8.565279006958008, "kl": 0.08203125, "learning_rate": 8.558065663323572e-08, "loss": 0.121, "reward": 0.8537946790456772, "reward_std": 0.2028096690773964, "rewards/accuracy_reward": 0.10937500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7444196790456772, "step": 2444 }, { "clip_ratio": 0.0, "completion_length": 1409.2232666015625, "epoch": 0.7303412739899933, "grad_norm": 10.123638153076172, "kl": 0.1026611328125, "learning_rate": 8.546683525468576e-08, "loss": 0.0963, "reward": 0.8046875298023224, "reward_std": 0.19906986877322197, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7399553954601288, "step": 2445 }, { "clip_ratio": 0.0, "completion_length": 1300.4911193847656, "epoch": 0.7306399820775148, "grad_norm": 13.724224090576172, "kl": 0.11474609375, "learning_rate": 8.535310040042394e-08, "loss": 0.0743, "reward": 0.823660746216774, "reward_std": 0.16226254031062126, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7366071790456772, "step": 2446 }, { "clip_ratio": 0.0, "completion_length": 1435.8683776855469, "epoch": 0.7309386901650362, "grad_norm": 9.227072715759277, "kl": 0.085418701171875, "learning_rate": 8.523945219418263e-08, "loss": 0.0635, "reward": 0.8872768431901932, "reward_std": 0.17947234585881233, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7488839626312256, "step": 2447 }, { "clip_ratio": 0.0, "completion_length": 1162.4107666015625, "epoch": 0.7312373982525577, "grad_norm": 13.055530548095703, "kl": 0.11578369140625, "learning_rate": 8.51258907596e-08, "loss": 0.1182, "reward": 0.8655134290456772, "reward_std": 0.2405831553041935, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7449777126312256, "step": 2448 }, { "clip_ratio": 0.0, "completion_length": 1280.685302734375, "epoch": 0.7315361063400792, "grad_norm": 8.472529411315918, "kl": 0.1055908203125, "learning_rate": 8.501241622021967e-08, "loss": 0.079, "reward": 0.9341518431901932, "reward_std": 0.1946385484188795, "rewards/accuracy_reward": 0.18526786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7488839626312256, "step": 2449 }, { "clip_ratio": 0.0, "completion_length": 1279.8883972167969, "epoch": 0.7318348144276007, "grad_norm": 15.680326461791992, "kl": 0.150390625, "learning_rate": 8.489902869949087e-08, "loss": 0.1123, "reward": 0.8387276977300644, "reward_std": 0.21425873413681984, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7248884290456772, "step": 2450 }, { "clip_ratio": 0.0, "completion_length": 1335.2188110351562, "epoch": 0.7321335225151221, "grad_norm": 10.421350479125977, "kl": 0.2332763671875, "learning_rate": 8.478572832076817e-08, "loss": 0.0919, "reward": 0.881138414144516, "reward_std": 0.18434598855674267, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7315848618745804, "step": 2451 }, { "clip_ratio": 0.0, "completion_length": 1330.2165832519531, "epoch": 0.7324322306026436, "grad_norm": 11.091117858886719, "kl": 0.1455078125, "learning_rate": 8.467251520731114e-08, "loss": 0.1023, "reward": 0.8772321939468384, "reward_std": 0.18608069606125355, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7321428954601288, "step": 2452 }, { "clip_ratio": 0.0, "completion_length": 1351.9062805175781, "epoch": 0.732730938690165, "grad_norm": 6.459686756134033, "kl": 0.28399658203125, "learning_rate": 8.45593894822847e-08, "loss": 0.0737, "reward": 0.831473246216774, "reward_std": 0.17948808334767818, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7310268133878708, "step": 2453 }, { "clip_ratio": 0.0, "completion_length": 1384.5223999023438, "epoch": 0.7330296467776866, "grad_norm": 11.399949073791504, "kl": 0.205078125, "learning_rate": 8.444635126875841e-08, "loss": 0.1077, "reward": 0.918526828289032, "reward_std": 0.2769586518406868, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.715401828289032, "step": 2454 }, { "clip_ratio": 0.0, "completion_length": 1453.2500915527344, "epoch": 0.733328354865208, "grad_norm": 15.567118644714355, "kl": 0.31298828125, "learning_rate": 8.433340068970683e-08, "loss": 0.1025, "reward": 0.8152901977300644, "reward_std": 0.26883457973599434, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7237723618745804, "step": 2455 }, { "clip_ratio": 0.0, "completion_length": 1324.2255249023438, "epoch": 0.7336270629527294, "grad_norm": 18.372377395629883, "kl": 0.235107421875, "learning_rate": 8.422053786800917e-08, "loss": 0.1495, "reward": 0.90792416036129, "reward_std": 0.23796940967440605, "rewards/accuracy_reward": 0.18080357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.727120578289032, "step": 2456 }, { "clip_ratio": 0.0, "completion_length": 1233.3482666015625, "epoch": 0.7339257710402509, "grad_norm": 20.28638458251953, "kl": 0.388427734375, "learning_rate": 8.410776292644903e-08, "loss": 0.2039, "reward": 0.9441964626312256, "reward_std": 0.2361326515674591, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7321428954601288, "step": 2457 }, { "clip_ratio": 0.0, "completion_length": 1235.3728332519531, "epoch": 0.7342244791277723, "grad_norm": 16.95560646057129, "kl": 0.232666015625, "learning_rate": 8.399507598771454e-08, "loss": 0.0928, "reward": 0.9458705931901932, "reward_std": 0.21660345047712326, "rewards/accuracy_reward": 0.20982144214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7360491305589676, "step": 2458 }, { "clip_ratio": 0.0, "completion_length": 1423.0022583007812, "epoch": 0.7345231872152939, "grad_norm": 10.582751274108887, "kl": 0.2470703125, "learning_rate": 8.388247717439808e-08, "loss": 0.1275, "reward": 0.8777902275323868, "reward_std": 0.23787943832576275, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7349330633878708, "step": 2459 }, { "clip_ratio": 0.0, "completion_length": 1297.9219665527344, "epoch": 0.7348218953028153, "grad_norm": 14.038068771362305, "kl": 0.470947265625, "learning_rate": 8.376996660899605e-08, "loss": 0.1298, "reward": 0.905691996216774, "reward_std": 0.22592200711369514, "rewards/accuracy_reward": 0.18080358440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7248884290456772, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 1384.227783203125, "epoch": 0.7351206033903368, "grad_norm": 8.440620422363281, "kl": 0.35009765625, "learning_rate": 8.365754441390893e-08, "loss": 0.1084, "reward": 1.02511166036129, "reward_std": 0.21199503540992737, "rewards/accuracy_reward": 0.294642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7304687947034836, "step": 2461 }, { "clip_ratio": 0.0, "completion_length": 1401.0357666015625, "epoch": 0.7354193114778582, "grad_norm": 12.484953880310059, "kl": 0.9765625, "learning_rate": 8.354521071144114e-08, "loss": 0.1395, "reward": 0.8214285969734192, "reward_std": 0.21820151060819626, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7165178805589676, "step": 2462 }, { "clip_ratio": 0.0, "completion_length": 1283.013427734375, "epoch": 0.7357180195653797, "grad_norm": 2.2765276432037354, "kl": 0.208740234375, "learning_rate": 8.34329656238006e-08, "loss": 0.032, "reward": 1.0412946939468384, "reward_std": 0.19382445886731148, "rewards/accuracy_reward": 0.3102678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7310268133878708, "step": 2463 }, { "clip_ratio": 0.0, "completion_length": 1413.3906860351562, "epoch": 0.7360167276529012, "grad_norm": 10.847990036010742, "kl": 1.056640625, "learning_rate": 8.332080927309903e-08, "loss": 0.179, "reward": 0.8158482611179352, "reward_std": 0.20573430880904198, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7131696790456772, "step": 2464 }, { "clip_ratio": 0.0, "completion_length": 1419.2389221191406, "epoch": 0.7363154357404227, "grad_norm": 8.657658576965332, "kl": 1.302734375, "learning_rate": 8.320874178135162e-08, "loss": 0.1664, "reward": 0.8231027126312256, "reward_std": 0.24647342413663864, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7204241454601288, "step": 2465 }, { "clip_ratio": 0.0, "completion_length": 1457.3482971191406, "epoch": 0.7366141438279441, "grad_norm": 10.064896583557129, "kl": 1.8134765625, "learning_rate": 8.30967632704767e-08, "loss": 0.1932, "reward": 0.827566996216774, "reward_std": 0.2508687637746334, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6981027126312256, "step": 2466 }, { "clip_ratio": 0.0, "completion_length": 1551.3036193847656, "epoch": 0.7369128519154656, "grad_norm": 8.559224128723145, "kl": 1.6650390625, "learning_rate": 8.298487386229597e-08, "loss": 0.1474, "reward": 0.9190848767757416, "reward_std": 0.1734718456864357, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.727120578289032, "step": 2467 }, { "clip_ratio": 0.0, "completion_length": 1390.5335693359375, "epoch": 0.737211560002987, "grad_norm": 8.781705856323242, "kl": 2.013671875, "learning_rate": 8.287307367853412e-08, "loss": 0.2349, "reward": 0.8186384290456772, "reward_std": 0.23062734305858612, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7159598618745804, "step": 2468 }, { "clip_ratio": 0.0, "completion_length": 1375.8929138183594, "epoch": 0.7375102680905086, "grad_norm": 7.941634178161621, "kl": 2.330078125, "learning_rate": 8.27613628408188e-08, "loss": 0.2647, "reward": 0.8671875447034836, "reward_std": 0.28685034811496735, "rewards/accuracy_reward": 0.17410715529695153, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6930803954601288, "step": 2469 }, { "clip_ratio": 0.0, "completion_length": 1369.5022888183594, "epoch": 0.73780897617803, "grad_norm": 6.170969486236572, "kl": 3.478515625, "learning_rate": 8.264974147068049e-08, "loss": 0.3272, "reward": 0.8655134439468384, "reward_std": 0.21860246732831, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6891741305589676, "step": 2470 }, { "clip_ratio": 0.0, "completion_length": 1332.0736999511719, "epoch": 0.7381076842655515, "grad_norm": 4.162290096282959, "kl": 2.625, "learning_rate": 8.253820968955224e-08, "loss": 0.2555, "reward": 0.7868303954601288, "reward_std": 0.2295728698372841, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6975446790456772, "step": 2471 }, { "clip_ratio": 0.0, "completion_length": 1365.1697082519531, "epoch": 0.7384063923530729, "grad_norm": 7.398095607757568, "kl": 3.169921875, "learning_rate": 8.24267676187697e-08, "loss": 0.3341, "reward": 0.9843750298023224, "reward_std": 0.2455889880657196, "rewards/accuracy_reward": 0.2790178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7053571790456772, "step": 2472 }, { "clip_ratio": 0.0, "completion_length": 1405.2746276855469, "epoch": 0.7387051004405945, "grad_norm": 7.97052001953125, "kl": 3.5390625, "learning_rate": 8.231541537957102e-08, "loss": 0.2793, "reward": 0.9341518133878708, "reward_std": 0.22154590860009193, "rewards/accuracy_reward": 0.23883929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125298023224, "step": 2473 }, { "clip_ratio": 0.0, "completion_length": 1406.2545471191406, "epoch": 0.7390038085281159, "grad_norm": 4.687631607055664, "kl": 2.7880859375, "learning_rate": 8.220415309309636e-08, "loss": 0.2605, "reward": 0.881138414144516, "reward_std": 0.20670834928750992, "rewards/accuracy_reward": 0.17410715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7070312798023224, "step": 2474 }, { "clip_ratio": 0.0, "completion_length": 1446.1362609863281, "epoch": 0.7393025166156374, "grad_norm": 10.906356811523438, "kl": 4.75390625, "learning_rate": 8.209298088038831e-08, "loss": 0.3822, "reward": 0.7829241454601288, "reward_std": 0.2610509768128395, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6847098618745804, "step": 2475 }, { "clip_ratio": 0.0, "completion_length": 1579.8884582519531, "epoch": 0.7396012247031588, "grad_norm": 8.325968742370605, "kl": 5.5078125, "learning_rate": 8.198189886239132e-08, "loss": 0.3748, "reward": 0.690848246216774, "reward_std": 0.22907022386789322, "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6707589477300644, "step": 2476 }, { "clip_ratio": 0.0, "completion_length": 1348.4799499511719, "epoch": 0.7398999327906803, "grad_norm": 11.955333709716797, "kl": 4.2109375, "learning_rate": 8.187090715995168e-08, "loss": 0.3614, "reward": 0.789620578289032, "reward_std": 0.2715499736368656, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.686941996216774, "step": 2477 }, { "clip_ratio": 0.0, "completion_length": 1471.5291137695312, "epoch": 0.7401986408782018, "grad_norm": 11.825909614562988, "kl": 5.09375, "learning_rate": 8.176000589381752e-08, "loss": 0.3871, "reward": 0.754464328289032, "reward_std": 0.2581000216305256, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6607143133878708, "step": 2478 }, { "clip_ratio": 0.0, "completion_length": 1500.950927734375, "epoch": 0.7404973489657233, "grad_norm": 10.648296356201172, "kl": 4.87890625, "learning_rate": 8.164919518463861e-08, "loss": 0.3665, "reward": 0.8309152126312256, "reward_std": 0.2874912843108177, "rewards/accuracy_reward": 0.16071428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6702009290456772, "step": 2479 }, { "clip_ratio": 0.0, "completion_length": 1517.1295166015625, "epoch": 0.7407960570532447, "grad_norm": 11.287418365478516, "kl": 5.6796875, "learning_rate": 8.153847515296604e-08, "loss": 0.4145, "reward": 0.7578125298023224, "reward_std": 0.2619761452078819, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375298023224, "step": 2480 }, { "clip_ratio": 0.0, "completion_length": 1467.8304138183594, "epoch": 0.7410947651407662, "grad_norm": 5.106165409088135, "kl": 4.17578125, "learning_rate": 8.142784591925242e-08, "loss": 0.419, "reward": 0.930245578289032, "reward_std": 0.2810891829431057, "rewards/accuracy_reward": 0.2589285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6713169813156128, "step": 2481 }, { "clip_ratio": 0.0, "completion_length": 1430.2098999023438, "epoch": 0.7413934732282876, "grad_norm": 11.196635246276855, "kl": 5.828125, "learning_rate": 8.131730760385147e-08, "loss": 0.4751, "reward": 0.8627232611179352, "reward_std": 0.32340461760759354, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6439732313156128, "step": 2482 }, { "clip_ratio": 0.0, "completion_length": 1457.9197387695312, "epoch": 0.7416921813158092, "grad_norm": 18.913850784301758, "kl": 5.7578125, "learning_rate": 8.120686032701805e-08, "loss": 0.4089, "reward": 0.7460937798023224, "reward_std": 0.2449783869087696, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.638950914144516, "step": 2483 }, { "clip_ratio": 0.0, "completion_length": 1363.5089721679688, "epoch": 0.7419908894033306, "grad_norm": 4.757077217102051, "kl": 4.3125, "learning_rate": 8.109650420890801e-08, "loss": 0.3735, "reward": 0.7589285969734192, "reward_std": 0.29346825927495956, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250298023224, "step": 2484 }, { "clip_ratio": 0.0, "completion_length": 1648.9063110351562, "epoch": 0.7422895974908521, "grad_norm": 9.732283592224121, "kl": 4.09765625, "learning_rate": 8.098623936957793e-08, "loss": 0.2476, "reward": 0.7399553954601288, "reward_std": 0.25482188537716866, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6506696790456772, "step": 2485 }, { "clip_ratio": 0.0, "completion_length": 1417.5045166015625, "epoch": 0.7425883055783735, "grad_norm": 7.326340675354004, "kl": 3.146484375, "learning_rate": 8.08760659289852e-08, "loss": 0.2726, "reward": 0.79073666036129, "reward_std": 0.2609051279723644, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6657366305589676, "step": 2486 }, { "clip_ratio": 0.0, "completion_length": 1507.41748046875, "epoch": 0.742887013665895, "grad_norm": 3.8793134689331055, "kl": 4.01953125, "learning_rate": 8.076598400698772e-08, "loss": 0.2726, "reward": 0.7148437649011612, "reward_std": 0.2206031009554863, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6501116305589676, "step": 2487 }, { "clip_ratio": 0.0, "completion_length": 1560.3906860351562, "epoch": 0.7431857217534165, "grad_norm": 4.520893096923828, "kl": 4.171875, "learning_rate": 8.065599372334379e-08, "loss": 0.2881, "reward": 0.6958705633878708, "reward_std": 0.2293752133846283, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6400669813156128, "step": 2488 }, { "clip_ratio": 0.0, "completion_length": 1492.6161499023438, "epoch": 0.743484429840938, "grad_norm": 5.5767292976379395, "kl": 3.759765625, "learning_rate": 8.05460951977121e-08, "loss": 0.3066, "reward": 0.8007812649011612, "reward_std": 0.2784340903162956, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6445312798023224, "step": 2489 }, { "clip_ratio": 0.0, "completion_length": 1533.6563415527344, "epoch": 0.7437831379284594, "grad_norm": 4.962903022766113, "kl": 3.37109375, "learning_rate": 8.04362885496515e-08, "loss": 0.2244, "reward": 0.7232142984867096, "reward_std": 0.20915500074625015, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6473214477300644, "step": 2490 }, { "clip_ratio": 0.0, "completion_length": 1498.825927734375, "epoch": 0.7440818460159809, "grad_norm": 4.104568004608154, "kl": 3.31640625, "learning_rate": 8.032657389862078e-08, "loss": 0.2456, "reward": 0.7840402126312256, "reward_std": 0.2556596212089062, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6478795111179352, "step": 2491 }, { "clip_ratio": 0.0, "completion_length": 1582.96435546875, "epoch": 0.7443805541035023, "grad_norm": 8.535561561584473, "kl": 5.296875, "learning_rate": 8.021695136397876e-08, "loss": 0.4134, "reward": 0.7399553805589676, "reward_std": 0.292412206530571, "rewards/accuracy_reward": 0.12946428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.610491082072258, "step": 2492 }, { "clip_ratio": 0.0, "completion_length": 1439.029052734375, "epoch": 0.7446792621910239, "grad_norm": 5.330016136169434, "kl": 3.87890625, "learning_rate": 8.01074210649841e-08, "loss": 0.3623, "reward": 0.7561384290456772, "reward_std": 0.23902245610952377, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6244419664144516, "step": 2493 }, { "clip_ratio": 0.0, "completion_length": 1376.9219360351562, "epoch": 0.7449779702785453, "grad_norm": 3.8980307579040527, "kl": 4.18359375, "learning_rate": 7.999798312079487e-08, "loss": 0.3646, "reward": 0.792410746216774, "reward_std": 0.3084932714700699, "rewards/accuracy_reward": 0.17410714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6183035969734192, "step": 2494 }, { "clip_ratio": 0.0, "completion_length": 1493.7366943359375, "epoch": 0.7452766783660668, "grad_norm": 3.805356740951538, "kl": 3.80078125, "learning_rate": 7.988863765046897e-08, "loss": 0.3007, "reward": 0.7656250298023224, "reward_std": 0.27441396564245224, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6294643133878708, "step": 2495 }, { "clip_ratio": 0.0, "completion_length": 1417.6295166015625, "epoch": 0.7455753864535882, "grad_norm": 8.007915496826172, "kl": 3.587890625, "learning_rate": 7.977938477296346e-08, "loss": 0.2925, "reward": 0.8214286118745804, "reward_std": 0.28871244192123413, "rewards/accuracy_reward": 0.1964285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000298023224, "step": 2496 }, { "clip_ratio": 0.0, "completion_length": 1490.5090026855469, "epoch": 0.7458740945411098, "grad_norm": 9.198023796081543, "kl": 3.044921875, "learning_rate": 7.967022460713477e-08, "loss": 0.3205, "reward": 0.7985491454601288, "reward_std": 0.28835824877023697, "rewards/accuracy_reward": 0.12946428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6690848469734192, "step": 2497 }, { "clip_ratio": 0.0, "completion_length": 1501.2455749511719, "epoch": 0.7461728026286312, "grad_norm": 5.906368732452393, "kl": 3.609375, "learning_rate": 7.956115727173858e-08, "loss": 0.3234, "reward": 0.722098246216774, "reward_std": 0.26928331330418587, "rewards/accuracy_reward": 0.08035714575089514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6417411118745804, "step": 2498 }, { "clip_ratio": 0.0, "completion_length": 1336.4241638183594, "epoch": 0.7464715107161526, "grad_norm": 8.559525489807129, "kl": 3.328125, "learning_rate": 7.945218288542931e-08, "loss": 0.3018, "reward": 0.8476562947034836, "reward_std": 0.2700815387070179, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6489955633878708, "step": 2499 }, { "clip_ratio": 0.0, "completion_length": 1436.716552734375, "epoch": 0.7467702188036741, "grad_norm": 6.4193010330200195, "kl": 3.609375, "learning_rate": 7.934330156676046e-08, "loss": 0.3196, "reward": 0.772879496216774, "reward_std": 0.3097839131951332, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6367187947034836, "step": 2500 }, { "clip_ratio": 0.0, "completion_length": 1460.8013916015625, "epoch": 0.7470689268911955, "grad_norm": 9.145577430725098, "kl": 3.646484375, "learning_rate": 7.923451343418429e-08, "loss": 0.2817, "reward": 0.7304687798023224, "reward_std": 0.2731305845081806, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.654575914144516, "step": 2501 }, { "clip_ratio": 0.0, "completion_length": 1485.6897888183594, "epoch": 0.747367634978717, "grad_norm": 3.25166916847229, "kl": 4.45703125, "learning_rate": 7.912581860605155e-08, "loss": 0.4065, "reward": 0.7304687798023224, "reward_std": 0.2740616276860237, "rewards/accuracy_reward": 0.10714286146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.623325914144516, "step": 2502 }, { "clip_ratio": 0.0, "completion_length": 1467.4130249023438, "epoch": 0.7476663430662385, "grad_norm": 3.7756783962249756, "kl": 5.0390625, "learning_rate": 7.901721720061159e-08, "loss": 0.3823, "reward": 0.7834821939468384, "reward_std": 0.30708567053079605, "rewards/accuracy_reward": 0.19419643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.589285746216774, "step": 2503 }, { "clip_ratio": 0.0, "completion_length": 1512.2388916015625, "epoch": 0.74796505115376, "grad_norm": 9.19835090637207, "kl": 5.078125, "learning_rate": 7.890870933601215e-08, "loss": 0.3615, "reward": 0.6718750298023224, "reward_std": 0.29263587296009064, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026785969734192, "step": 2504 }, { "clip_ratio": 0.0, "completion_length": 1436.2746276855469, "epoch": 0.7482637592412814, "grad_norm": 3.6667330265045166, "kl": 3.79296875, "learning_rate": 7.880029513029904e-08, "loss": 0.3051, "reward": 0.7801339626312256, "reward_std": 0.3121573105454445, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6462053954601288, "step": 2505 }, { "clip_ratio": 0.0, "completion_length": 1411.3415832519531, "epoch": 0.7485624673288029, "grad_norm": 4.460873126983643, "kl": 4.21484375, "learning_rate": 7.869197470141636e-08, "loss": 0.3309, "reward": 0.7265625298023224, "reward_std": 0.29983094707131386, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.641741082072258, "step": 2506 }, { "clip_ratio": 0.0, "completion_length": 1466.0670166015625, "epoch": 0.7488611754163244, "grad_norm": 4.134634017944336, "kl": 4.0546875, "learning_rate": 7.858374816720614e-08, "loss": 0.334, "reward": 0.6953125447034836, "reward_std": 0.2387200891971588, "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.659598246216774, "step": 2507 }, { "clip_ratio": 0.0, "completion_length": 1320.6473999023438, "epoch": 0.7491598835038459, "grad_norm": 6.670360088348389, "kl": 4.81640625, "learning_rate": 7.84756156454082e-08, "loss": 0.4206, "reward": 0.7505580633878708, "reward_std": 0.29847588762640953, "rewards/accuracy_reward": 0.14062500675208867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6099330633878708, "step": 2508 }, { "clip_ratio": 0.0, "completion_length": 1349.4085388183594, "epoch": 0.7494585915913673, "grad_norm": 6.039951801300049, "kl": 4.49609375, "learning_rate": 7.836757725366014e-08, "loss": 0.3939, "reward": 0.6819196790456772, "reward_std": 0.2859914302825928, "rewards/accuracy_reward": 0.06696428661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6149553805589676, "step": 2509 }, { "clip_ratio": 0.0, "completion_length": 1538.8750610351562, "epoch": 0.7497572996788888, "grad_norm": 9.423096656799316, "kl": 5.1015625, "learning_rate": 7.825963310949712e-08, "loss": 0.3023, "reward": 0.6411830633878708, "reward_std": 0.21119241416454315, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5987723469734192, "step": 2510 }, { "clip_ratio": 0.0, "completion_length": 1373.02685546875, "epoch": 0.7500560077664102, "grad_norm": 10.391846656799316, "kl": 4.9921875, "learning_rate": 7.81517833303518e-08, "loss": 0.4072, "reward": 0.7399553954601288, "reward_std": 0.2489834986627102, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.619419664144516, "step": 2511 }, { "clip_ratio": 0.0, "completion_length": 1328.0781860351562, "epoch": 0.7503547158539318, "grad_norm": 3.9064736366271973, "kl": 4.75390625, "learning_rate": 7.804402803355423e-08, "loss": 0.4378, "reward": 0.775669664144516, "reward_std": 0.28409355506300926, "rewards/accuracy_reward": 0.17633929592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303805589676, "step": 2512 }, { "clip_ratio": 0.0, "completion_length": 1357.2679443359375, "epoch": 0.7506534239414532, "grad_norm": 9.797927856445312, "kl": 5.5859375, "learning_rate": 7.793636733633148e-08, "loss": 0.4072, "reward": 0.7388392984867096, "reward_std": 0.3099265471100807, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580357164144516, "step": 2513 }, { "clip_ratio": 0.0, "completion_length": 1501.4241638183594, "epoch": 0.7509521320289747, "grad_norm": 3.472304105758667, "kl": 4.4609375, "learning_rate": 7.782880135580795e-08, "loss": 0.3714, "reward": 0.6930803805589676, "reward_std": 0.28199928253889084, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6037946790456772, "step": 2514 }, { "clip_ratio": 0.0, "completion_length": 1464.9397888183594, "epoch": 0.7512508401164961, "grad_norm": 4.613271236419678, "kl": 4.17578125, "learning_rate": 7.772133020900488e-08, "loss": 0.3507, "reward": 0.6953125298023224, "reward_std": 0.28416552394628525, "rewards/accuracy_reward": 0.07812500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875298023224, "step": 2515 }, { "clip_ratio": 0.0, "completion_length": 1359.4531860351562, "epoch": 0.7515495482040176, "grad_norm": 7.5032148361206055, "kl": 3.33984375, "learning_rate": 7.761395401284027e-08, "loss": 0.334, "reward": 0.7304687798023224, "reward_std": 0.3217102587223053, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6367187798023224, "step": 2516 }, { "clip_ratio": 0.0, "completion_length": 1404.3817749023438, "epoch": 0.7518482562915391, "grad_norm": 4.698227405548096, "kl": 4.03515625, "learning_rate": 7.750667288412895e-08, "loss": 0.3777, "reward": 0.742745578289032, "reward_std": 0.2903097942471504, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 2517 }, { "clip_ratio": 0.0, "completion_length": 1453.1361999511719, "epoch": 0.7521469643790606, "grad_norm": 4.690629959106445, "kl": 4.0, "learning_rate": 7.73994869395823e-08, "loss": 0.3139, "reward": 0.7672991305589676, "reward_std": 0.27981405705213547, "rewards/accuracy_reward": 0.15848215483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.608816996216774, "step": 2518 }, { "clip_ratio": 0.0, "completion_length": 1549.7991638183594, "epoch": 0.752445672466582, "grad_norm": 3.6644039154052734, "kl": 3.6796875, "learning_rate": 7.729239629580803e-08, "loss": 0.268, "reward": 0.6941964626312256, "reward_std": 0.2512863539159298, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6205357313156128, "step": 2519 }, { "clip_ratio": 0.0, "completion_length": 1497.1697082519531, "epoch": 0.7527443805541035, "grad_norm": 6.193227767944336, "kl": 3.35546875, "learning_rate": 7.718540106931039e-08, "loss": 0.2496, "reward": 0.7064732313156128, "reward_std": 0.22189078480005264, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.612723246216774, "step": 2520 }, { "clip_ratio": 0.0, "completion_length": 1469.33935546875, "epoch": 0.753043088641625, "grad_norm": 4.169887542724609, "kl": 4.84765625, "learning_rate": 7.707850137648958e-08, "loss": 0.3853, "reward": 0.6501116305589676, "reward_std": 0.26109007373452187, "rewards/accuracy_reward": 0.0691964307334274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5809152126312256, "step": 2521 }, { "clip_ratio": 0.0, "completion_length": 1531.24560546875, "epoch": 0.7533417967291465, "grad_norm": 6.556000709533691, "kl": 4.79296875, "learning_rate": 7.697169733364201e-08, "loss": 0.2957, "reward": 0.662388414144516, "reward_std": 0.28486568853259087, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5753348618745804, "step": 2522 }, { "clip_ratio": 0.0, "completion_length": 1296.0781555175781, "epoch": 0.7536405048166679, "grad_norm": 4.61934232711792, "kl": 3.640625, "learning_rate": 7.686498905696011e-08, "loss": 0.3262, "reward": 0.754464328289032, "reward_std": 0.3101292811334133, "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750149011612, "step": 2523 }, { "clip_ratio": 0.0, "completion_length": 1513.0692443847656, "epoch": 0.7539392129041894, "grad_norm": 3.643409252166748, "kl": 4.48828125, "learning_rate": 7.675837666253192e-08, "loss": 0.306, "reward": 0.6629464477300644, "reward_std": 0.2551358602941036, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 2524 }, { "clip_ratio": 0.0, "completion_length": 1420.3281860351562, "epoch": 0.7542379209917108, "grad_norm": 3.827061891555786, "kl": 4.48828125, "learning_rate": 7.665186026634132e-08, "loss": 0.3314, "reward": 0.6981026977300644, "reward_std": 0.30361639708280563, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5731026977300644, "step": 2525 }, { "clip_ratio": 0.0, "completion_length": 1554.6116638183594, "epoch": 0.7545366290792324, "grad_norm": 7.117210388183594, "kl": 5.0078125, "learning_rate": 7.654543998426771e-08, "loss": 0.3104, "reward": 0.640066996216774, "reward_std": 0.28806914016604424, "rewards/accuracy_reward": 0.0781250053551048, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5619419813156128, "step": 2526 }, { "clip_ratio": 0.0, "completion_length": 1309.0402221679688, "epoch": 0.7548353371667538, "grad_norm": 5.428173542022705, "kl": 4.59375, "learning_rate": 7.64391159320859e-08, "loss": 0.4196, "reward": 0.8526786118745804, "reward_std": 0.3171989582479, "rewards/accuracy_reward": 0.2991071604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714402794838, "step": 2527 }, { "clip_ratio": 0.0, "completion_length": 1351.8839721679688, "epoch": 0.7551340452542753, "grad_norm": 7.831776142120361, "kl": 4.99609375, "learning_rate": 7.633288822546604e-08, "loss": 0.3779, "reward": 0.6858259290456772, "reward_std": 0.2529207915067673, "rewards/accuracy_reward": 0.12500001047737896, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5608259066939354, "step": 2528 }, { "clip_ratio": 0.0, "completion_length": 1487.1340026855469, "epoch": 0.7554327533417967, "grad_norm": 3.507140874862671, "kl": 4.609375, "learning_rate": 7.622675697997354e-08, "loss": 0.3322, "reward": 0.6668527275323868, "reward_std": 0.27649252489209175, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.570870578289032, "step": 2529 }, { "clip_ratio": 0.0, "completion_length": 1456.6607666015625, "epoch": 0.7557314614293182, "grad_norm": 9.491260528564453, "kl": 5.08203125, "learning_rate": 7.61207223110687e-08, "loss": 0.3235, "reward": 0.6512277126312256, "reward_std": 0.28526054322719574, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5686384290456772, "step": 2530 }, { "clip_ratio": 0.0, "completion_length": 1436.47998046875, "epoch": 0.7560301695168397, "grad_norm": 6.571896553039551, "kl": 5.0078125, "learning_rate": 7.601478433410686e-08, "loss": 0.3667, "reward": 0.6545759290456772, "reward_std": 0.27678558230400085, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5474330633878708, "step": 2531 }, { "clip_ratio": 0.0, "completion_length": 1400.3572082519531, "epoch": 0.7563288776043612, "grad_norm": 3.287790536880493, "kl": 4.28125, "learning_rate": 7.590894316433821e-08, "loss": 0.3661, "reward": 0.6406250298023224, "reward_std": 0.2779371738433838, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964626312256, "step": 2532 }, { "clip_ratio": 0.0, "completion_length": 1460.69873046875, "epoch": 0.7566275856918826, "grad_norm": 3.6731910705566406, "kl": 4.0625, "learning_rate": 7.580319891690751e-08, "loss": 0.2947, "reward": 0.7527902126312256, "reward_std": 0.30551546812057495, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.576450914144516, "step": 2533 }, { "clip_ratio": 0.0, "completion_length": 1440.1697082519531, "epoch": 0.7569262937794041, "grad_norm": 3.592280149459839, "kl": 4.21484375, "learning_rate": 7.56975517068542e-08, "loss": 0.3021, "reward": 0.6835937649011612, "reward_std": 0.3514469861984253, "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5697544813156128, "step": 2534 }, { "clip_ratio": 0.0, "completion_length": 1419.9241638183594, "epoch": 0.7572250018669255, "grad_norm": 3.0929412841796875, "kl": 4.02734375, "learning_rate": 7.559200164911201e-08, "loss": 0.342, "reward": 0.6568080633878708, "reward_std": 0.30903464555740356, "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5585937649011612, "step": 2535 }, { "clip_ratio": 0.0, "completion_length": 1383.9397888183594, "epoch": 0.7575237099544471, "grad_norm": 3.4560768604278564, "kl": 4.36328125, "learning_rate": 7.548654885850911e-08, "loss": 0.2973, "reward": 0.6785714626312256, "reward_std": 0.3042497932910919, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 2536 }, { "clip_ratio": 0.0, "completion_length": 1402.6406860351562, "epoch": 0.7578224180419685, "grad_norm": 3.9043233394622803, "kl": 4.03515625, "learning_rate": 7.538119344976782e-08, "loss": 0.3184, "reward": 0.5535714626312256, "reward_std": 0.24339472502470016, "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5446428805589676, "step": 2537 }, { "clip_ratio": 0.0, "completion_length": 1466.8259582519531, "epoch": 0.75812112612949, "grad_norm": 8.508223533630371, "kl": 5.328125, "learning_rate": 7.527593553750443e-08, "loss": 0.3844, "reward": 0.6562500447034836, "reward_std": 0.29162415862083435, "rewards/accuracy_reward": 0.12946429383009672, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.526785746216774, "step": 2538 }, { "clip_ratio": 0.0, "completion_length": 1492.9085388183594, "epoch": 0.7584198342170114, "grad_norm": 3.4418418407440186, "kl": 4.30859375, "learning_rate": 7.517077523622927e-08, "loss": 0.2954, "reward": 0.6138393133878708, "reward_std": 0.29314372688531876, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5401785969734192, "step": 2539 }, { "clip_ratio": 0.0, "completion_length": 1484.1451721191406, "epoch": 0.758718542304533, "grad_norm": 3.9418768882751465, "kl": 4.12109375, "learning_rate": 7.50657126603465e-08, "loss": 0.3018, "reward": 0.6501116454601288, "reward_std": 0.2596344016492367, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5496651977300644, "step": 2540 }, { "clip_ratio": 0.0, "completion_length": 1490.6005249023438, "epoch": 0.7590172503920544, "grad_norm": 3.2615020275115967, "kl": 4.43359375, "learning_rate": 7.496074792415379e-08, "loss": 0.2974, "reward": 0.5965401977300644, "reward_std": 0.26321347430348396, "rewards/accuracy_reward": 0.05803571850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5385044813156128, "step": 2541 }, { "clip_ratio": 0.0, "completion_length": 1353.7031860351562, "epoch": 0.7593159584795758, "grad_norm": 3.572887659072876, "kl": 4.18359375, "learning_rate": 7.485588114184253e-08, "loss": 0.3361, "reward": 0.6863839477300644, "reward_std": 0.2836240828037262, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625223517418, "step": 2542 }, { "clip_ratio": 0.0, "completion_length": 1448.2478332519531, "epoch": 0.7596146665670973, "grad_norm": 2.67768931388855, "kl": 4.38671875, "learning_rate": 7.47511124274976e-08, "loss": 0.3049, "reward": 0.5948660969734192, "reward_std": 0.27599427103996277, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5256696715950966, "step": 2543 }, { "clip_ratio": 0.0, "completion_length": 1352.0715026855469, "epoch": 0.7599133746546187, "grad_norm": 3.649730920791626, "kl": 4.46875, "learning_rate": 7.464644189509691e-08, "loss": 0.2941, "reward": 0.5535714477300644, "reward_std": 0.2870084084570408, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5200893133878708, "step": 2544 }, { "clip_ratio": 0.0, "completion_length": 1520.9197082519531, "epoch": 0.7602120827421402, "grad_norm": 5.573822975158691, "kl": 4.9375, "learning_rate": 7.454186965851189e-08, "loss": 0.3817, "reward": 0.5742187798023224, "reward_std": 0.2868487574160099, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937798023224, "step": 2545 }, { "clip_ratio": 0.0, "completion_length": 1347.9888916015625, "epoch": 0.7605107908296617, "grad_norm": 3.911674737930298, "kl": 3.484375, "learning_rate": 7.443739583150684e-08, "loss": 0.2173, "reward": 0.6422991454601288, "reward_std": 0.29883669316768646, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5485491305589676, "step": 2546 }, { "clip_ratio": 0.0, "completion_length": 1463.4777526855469, "epoch": 0.7608094989171832, "grad_norm": 4.083090305328369, "kl": 4.04296875, "learning_rate": 7.4333020527739e-08, "loss": 0.2665, "reward": 0.6925223469734192, "reward_std": 0.27821653708815575, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5452009290456772, "step": 2547 }, { "clip_ratio": 0.0, "completion_length": 1440.2411499023438, "epoch": 0.7611082070047046, "grad_norm": 6.037135601043701, "kl": 4.27734375, "learning_rate": 7.422874386075855e-08, "loss": 0.2688, "reward": 0.6222098469734192, "reward_std": 0.22934091091156006, "rewards/accuracy_reward": 0.12053572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741380095482, "step": 2548 }, { "clip_ratio": 0.0, "completion_length": 1418.8170471191406, "epoch": 0.7614069150922261, "grad_norm": 4.231720924377441, "kl": 4.12109375, "learning_rate": 7.412456594400821e-08, "loss": 0.3465, "reward": 0.6406250447034836, "reward_std": 0.2924224063754082, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.511160746216774, "step": 2549 }, { "clip_ratio": 0.0, "completion_length": 1430.5022888183594, "epoch": 0.7617056231797475, "grad_norm": 4.260475158691406, "kl": 3.87890625, "learning_rate": 7.402048689082336e-08, "loss": 0.2653, "reward": 0.5117187798023224, "reward_std": 0.25425297766923904, "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330633878708, "step": 2550 }, { "clip_ratio": 0.0, "completion_length": 1402.4777526855469, "epoch": 0.7620043312672691, "grad_norm": 3.903911590576172, "kl": 3.66796875, "learning_rate": 7.391650681443187e-08, "loss": 0.2355, "reward": 0.6116071790456772, "reward_std": 0.2922630086541176, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5223214626312256, "step": 2551 }, { "clip_ratio": 0.0, "completion_length": 1476.8861999511719, "epoch": 0.7623030393547905, "grad_norm": 6.77730655670166, "kl": 4.08984375, "learning_rate": 7.381262582795377e-08, "loss": 0.2923, "reward": 0.5357142984867096, "reward_std": 0.2679766118526459, "rewards/accuracy_reward": 0.042410716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933036044239998, "step": 2552 }, { "clip_ratio": 0.0, "completion_length": 1528.982177734375, "epoch": 0.762601747442312, "grad_norm": 5.388480186462402, "kl": 3.97265625, "learning_rate": 7.370884404440147e-08, "loss": 0.2893, "reward": 0.5407366305589676, "reward_std": 0.25286004692316055, "rewards/accuracy_reward": 0.058035715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4827009066939354, "step": 2553 }, { "clip_ratio": 0.0, "completion_length": 1505.60498046875, "epoch": 0.7629004555298334, "grad_norm": 3.839338541030884, "kl": 3.890625, "learning_rate": 7.360516157667938e-08, "loss": 0.3152, "reward": 0.5167411118745804, "reward_std": 0.2664858475327492, "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4944196715950966, "step": 2554 }, { "clip_ratio": 0.0, "completion_length": 1478.9754943847656, "epoch": 0.763199163617355, "grad_norm": 4.479970455169678, "kl": 3.9375, "learning_rate": 7.35015785375838e-08, "loss": 0.2817, "reward": 0.6422991305589676, "reward_std": 0.2930327355861664, "rewards/accuracy_reward": 0.15625000651925802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491380095482, "step": 2555 }, { "clip_ratio": 0.0, "completion_length": 1410.0938110351562, "epoch": 0.7634978717048764, "grad_norm": 2.597588300704956, "kl": 3.4140625, "learning_rate": 7.339809503980295e-08, "loss": 0.2896, "reward": 0.6065848469734192, "reward_std": 0.3026038929820061, "rewards/accuracy_reward": 0.11160714970901608, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 2556 }, { "clip_ratio": 0.0, "completion_length": 1392.1652526855469, "epoch": 0.7637965797923979, "grad_norm": 6.921197414398193, "kl": 3.96875, "learning_rate": 7.329471119591681e-08, "loss": 0.2677, "reward": 0.561941996216774, "reward_std": 0.25799792632460594, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419813156128, "step": 2557 }, { "clip_ratio": 0.0, "completion_length": 1439.7790832519531, "epoch": 0.7640952878799193, "grad_norm": 2.9062278270721436, "kl": 4.3984375, "learning_rate": 7.319142711839674e-08, "loss": 0.3676, "reward": 0.568638414144516, "reward_std": 0.27320733293890953, "rewards/accuracy_reward": 0.09598214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562723517418, "step": 2558 }, { "clip_ratio": 0.0, "completion_length": 1488.21435546875, "epoch": 0.7643939959674408, "grad_norm": 3.168872356414795, "kl": 4.171875, "learning_rate": 7.308824291960578e-08, "loss": 0.3502, "reward": 0.5128348469734192, "reward_std": 0.29363976418972015, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 2559 }, { "clip_ratio": 0.0, "completion_length": 1434.8951721191406, "epoch": 0.7646927040549623, "grad_norm": 2.8386731147766113, "kl": 3.45703125, "learning_rate": 7.298515871179823e-08, "loss": 0.2298, "reward": 0.6250000149011612, "reward_std": 0.27283021807670593, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750149011612, "step": 2560 }, { "clip_ratio": 0.0, "completion_length": 1460.1495971679688, "epoch": 0.7649914121424838, "grad_norm": 3.150726556777954, "kl": 3.40625, "learning_rate": 7.288217460711953e-08, "loss": 0.2406, "reward": 0.6054687798023224, "reward_std": 0.2731866426765919, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937723517418, "step": 2561 }, { "clip_ratio": 0.0, "completion_length": 1492.4107971191406, "epoch": 0.7652901202300052, "grad_norm": 2.542271137237549, "kl": 3.27734375, "learning_rate": 7.27792907176064e-08, "loss": 0.2597, "reward": 0.595982164144516, "reward_std": 0.30839690938591957, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5089285969734192, "step": 2562 }, { "clip_ratio": 0.0, "completion_length": 1467.9152526855469, "epoch": 0.7655888283175267, "grad_norm": 5.855329513549805, "kl": 3.7265625, "learning_rate": 7.267650715518632e-08, "loss": 0.2623, "reward": 0.5139509066939354, "reward_std": 0.24579787626862526, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116305589676, "step": 2563 }, { "clip_ratio": 0.0, "completion_length": 1470.8103332519531, "epoch": 0.7658875364050481, "grad_norm": 3.3398022651672363, "kl": 3.76171875, "learning_rate": 7.257382403167778e-08, "loss": 0.2854, "reward": 0.490513414144516, "reward_std": 0.2760273292660713, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4481026902794838, "step": 2564 }, { "clip_ratio": 0.0, "completion_length": 1392.384033203125, "epoch": 0.7661862444925697, "grad_norm": 5.672695159912109, "kl": 3.39453125, "learning_rate": 7.247124145878998e-08, "loss": 0.29, "reward": 0.5055803880095482, "reward_std": 0.28113606572151184, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018133878708, "step": 2565 }, { "clip_ratio": 0.0, "completion_length": 1407.8661193847656, "epoch": 0.7664849525800911, "grad_norm": 3.1168434619903564, "kl": 3.05078125, "learning_rate": 7.236875954812266e-08, "loss": 0.2025, "reward": 0.5786830633878708, "reward_std": 0.22938113659620285, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580559372902, "step": 2566 }, { "clip_ratio": 0.0, "completion_length": 1472.5625915527344, "epoch": 0.7667836606676126, "grad_norm": 3.5228750705718994, "kl": 3.22265625, "learning_rate": 7.226637841116611e-08, "loss": 0.2579, "reward": 0.5429687723517418, "reward_std": 0.2576995827257633, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044887661934, "step": 2567 }, { "clip_ratio": 0.0, "completion_length": 1465.3995971679688, "epoch": 0.767082368755134, "grad_norm": 3.4257750511169434, "kl": 3.37109375, "learning_rate": 7.216409815930102e-08, "loss": 0.2509, "reward": 0.5719866156578064, "reward_std": 0.26150618493556976, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4827009066939354, "step": 2568 }, { "clip_ratio": 0.0, "completion_length": 1345.4978332519531, "epoch": 0.7673810768426556, "grad_norm": 6.839754581451416, "kl": 3.16796875, "learning_rate": 7.206191890379819e-08, "loss": 0.2763, "reward": 0.6367187798023224, "reward_std": 0.300087284296751, "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937649011612, "step": 2569 }, { "clip_ratio": 0.0, "completion_length": 1433.1585388183594, "epoch": 0.767679784930177, "grad_norm": 3.2995197772979736, "kl": 3.76171875, "learning_rate": 7.195984075581869e-08, "loss": 0.2877, "reward": 0.5424107387661934, "reward_std": 0.245046004652977, "rewards/accuracy_reward": 0.08258928707800806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4598214477300644, "step": 2570 }, { "clip_ratio": 0.0, "completion_length": 1453.7344055175781, "epoch": 0.7679784930176985, "grad_norm": 2.56752872467041, "kl": 3.40234375, "learning_rate": 7.185786382641356e-08, "loss": 0.2823, "reward": 0.565848246216774, "reward_std": 0.2698989547789097, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4966518133878708, "step": 2571 }, { "clip_ratio": 0.0, "completion_length": 1475.1942443847656, "epoch": 0.7682772011052199, "grad_norm": 3.169257164001465, "kl": 3.73046875, "learning_rate": 7.175598822652364e-08, "loss": 0.2668, "reward": 0.5318080559372902, "reward_std": 0.2502553127706051, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687798023224, "step": 2572 }, { "clip_ratio": 0.0, "completion_length": 1416.1853332519531, "epoch": 0.7685759091927414, "grad_norm": 3.2522671222686768, "kl": 3.30078125, "learning_rate": 7.165421406697964e-08, "loss": 0.2189, "reward": 0.5133928805589676, "reward_std": 0.2771376632153988, "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4866071566939354, "step": 2573 }, { "clip_ratio": 0.0, "completion_length": 1410.435302734375, "epoch": 0.7688746172802629, "grad_norm": 4.868887424468994, "kl": 3.95703125, "learning_rate": 7.15525414585019e-08, "loss": 0.3087, "reward": 0.5234375149011612, "reward_std": 0.2595262937247753, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 2574 }, { "clip_ratio": 0.0, "completion_length": 1423.0023193359375, "epoch": 0.7691733253677844, "grad_norm": 4.693687915802002, "kl": 3.5, "learning_rate": 7.145097051170017e-08, "loss": 0.2471, "reward": 0.5970982313156128, "reward_std": 0.24030239135026932, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 2575 }, { "clip_ratio": 0.0, "completion_length": 1440.5469360351562, "epoch": 0.7694720334553058, "grad_norm": 3.793468713760376, "kl": 3.60546875, "learning_rate": 7.134950133707379e-08, "loss": 0.2595, "reward": 0.5212053880095482, "reward_std": 0.28437764570116997, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4832589477300644, "step": 2576 }, { "clip_ratio": 0.0, "completion_length": 1392.1474304199219, "epoch": 0.7697707415428273, "grad_norm": 4.591440677642822, "kl": 3.6796875, "learning_rate": 7.124813404501117e-08, "loss": 0.3229, "reward": 0.5351562798023224, "reward_std": 0.29570965096354485, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205559372902, "step": 2577 }, { "clip_ratio": 0.0, "completion_length": 1433.3460388183594, "epoch": 0.7700694496303487, "grad_norm": 4.880648136138916, "kl": 3.8671875, "learning_rate": 7.114686874579004e-08, "loss": 0.3058, "reward": 0.585379496216774, "reward_std": 0.271225206553936, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4893973469734192, "step": 2578 }, { "clip_ratio": 0.0, "completion_length": 1430.966552734375, "epoch": 0.7703681577178703, "grad_norm": 4.3041887283325195, "kl": 3.86328125, "learning_rate": 7.104570554957716e-08, "loss": 0.2853, "reward": 0.5228794887661934, "reward_std": 0.235908854752779, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080633878708, "step": 2579 }, { "clip_ratio": 0.0, "completion_length": 1338.0357666015625, "epoch": 0.7706668658053917, "grad_norm": 4.358524322509766, "kl": 3.24609375, "learning_rate": 7.094464456642808e-08, "loss": 0.2431, "reward": 0.5753348469734192, "reward_std": 0.28300196304917336, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062649011612, "step": 2580 }, { "clip_ratio": 0.0, "completion_length": 1511.7791137695312, "epoch": 0.7709655738929132, "grad_norm": 4.889552593231201, "kl": 3.6171875, "learning_rate": 7.084368590628731e-08, "loss": 0.2627, "reward": 0.5915178805589676, "reward_std": 0.24714617058634758, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 2581 }, { "clip_ratio": 0.0, "completion_length": 1430.4755249023438, "epoch": 0.7712642819804346, "grad_norm": 4.500431537628174, "kl": 3.8125, "learning_rate": 7.0742829678988e-08, "loss": 0.2714, "reward": 0.5959821715950966, "reward_std": 0.2821343094110489, "rewards/accuracy_reward": 0.10491071711294353, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4910714700818062, "step": 2582 }, { "clip_ratio": 0.0, "completion_length": 1400.9263916015625, "epoch": 0.7715629900679561, "grad_norm": 3.8489644527435303, "kl": 3.6171875, "learning_rate": 7.064207599425176e-08, "loss": 0.2801, "reward": 0.5128348395228386, "reward_std": 0.28298480808734894, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919813156128, "step": 2583 }, { "clip_ratio": 0.0, "completion_length": 1382.6116638183594, "epoch": 0.7718616981554776, "grad_norm": 3.3260433673858643, "kl": 3.546875, "learning_rate": 7.054142496168878e-08, "loss": 0.2609, "reward": 0.536830373108387, "reward_std": 0.26655254885554314, "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303880095482, "step": 2584 }, { "clip_ratio": 0.0, "completion_length": 1445.5045471191406, "epoch": 0.772160406242999, "grad_norm": 4.552841663360596, "kl": 3.6484375, "learning_rate": 7.044087669079754e-08, "loss": 0.2578, "reward": 0.5675223469734192, "reward_std": 0.274973351508379, "rewards/accuracy_reward": 0.10937500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473395228386, "step": 2585 }, { "clip_ratio": 0.0, "completion_length": 1398.8281860351562, "epoch": 0.7724591143305205, "grad_norm": 3.004720687866211, "kl": 3.375, "learning_rate": 7.034043129096464e-08, "loss": 0.2472, "reward": 0.5446428805589676, "reward_std": 0.2682071104645729, "rewards/accuracy_reward": 0.06919643119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4754464477300644, "step": 2586 }, { "clip_ratio": 0.0, "completion_length": 1345.8750610351562, "epoch": 0.7727578224180419, "grad_norm": 4.053915500640869, "kl": 2.60546875, "learning_rate": 7.024008887146482e-08, "loss": 0.1857, "reward": 0.584263414144516, "reward_std": 0.27247968688607216, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.521763414144516, "step": 2587 }, { "clip_ratio": 0.0, "completion_length": 1375.1920166015625, "epoch": 0.7730565305055634, "grad_norm": 9.405174255371094, "kl": 2.84765625, "learning_rate": 7.013984954146093e-08, "loss": 0.2818, "reward": 0.5535714477300644, "reward_std": 0.2576105631887913, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933035969734192, "step": 2588 }, { "clip_ratio": 0.0, "completion_length": 1339.2991638183594, "epoch": 0.7733552385930849, "grad_norm": 3.6177995204925537, "kl": 2.8828125, "learning_rate": 7.003971341000337e-08, "loss": 0.2452, "reward": 0.5546875149011612, "reward_std": 0.2657958194613457, "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 2589 }, { "clip_ratio": 0.0, "completion_length": 1399.3437805175781, "epoch": 0.7736539466806064, "grad_norm": 2.5798943042755127, "kl": 3.11328125, "learning_rate": 6.993968058603056e-08, "loss": 0.223, "reward": 0.518415205180645, "reward_std": 0.2871100455522537, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482700914144516, "step": 2590 }, { "clip_ratio": 0.0, "completion_length": 1378.7388916015625, "epoch": 0.7739526547681278, "grad_norm": 5.662426471710205, "kl": 3.125, "learning_rate": 6.983975117836834e-08, "loss": 0.1686, "reward": 0.5697544887661934, "reward_std": 0.2586437910795212, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4559151977300644, "step": 2591 }, { "clip_ratio": 0.0, "completion_length": 1317.8973999023438, "epoch": 0.7742513628556493, "grad_norm": 5.691442489624023, "kl": 2.83203125, "learning_rate": 6.973992529573011e-08, "loss": 0.2352, "reward": 0.5842634290456772, "reward_std": 0.3063507303595543, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062798023224, "step": 2592 }, { "clip_ratio": 0.0, "completion_length": 1407.8929138183594, "epoch": 0.7745500709431707, "grad_norm": 3.545459270477295, "kl": 3.0234375, "learning_rate": 6.964020304671671e-08, "loss": 0.2444, "reward": 0.6015625298023224, "reward_std": 0.2687632516026497, "rewards/accuracy_reward": 0.14062500349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 2593 }, { "clip_ratio": 0.0, "completion_length": 1399.0759582519531, "epoch": 0.7748487790306923, "grad_norm": 2.2142581939697266, "kl": 2.9609375, "learning_rate": 6.95405845398161e-08, "loss": 0.2039, "reward": 0.5362723469734192, "reward_std": 0.3115056976675987, "rewards/accuracy_reward": 0.05803571781143546, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366305589676, "step": 2594 }, { "clip_ratio": 0.0, "completion_length": 1462.6585388183594, "epoch": 0.7751474871182137, "grad_norm": 3.9100492000579834, "kl": 2.935546875, "learning_rate": 6.944106988340346e-08, "loss": 0.2322, "reward": 0.5089286044239998, "reward_std": 0.27555834874510765, "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4709821566939354, "step": 2595 }, { "clip_ratio": 0.0, "completion_length": 1420.3884582519531, "epoch": 0.7754461952057352, "grad_norm": 2.5835907459259033, "kl": 3.23046875, "learning_rate": 6.934165918574106e-08, "loss": 0.2746, "reward": 0.5362723469734192, "reward_std": 0.29172713309526443, "rewards/accuracy_reward": 0.07366071850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116305589676, "step": 2596 }, { "clip_ratio": 0.0, "completion_length": 1431.1563415527344, "epoch": 0.7757449032932566, "grad_norm": 2.683986186981201, "kl": 3.0703125, "learning_rate": 6.92423525549779e-08, "loss": 0.2568, "reward": 0.5412946715950966, "reward_std": 0.26296210289001465, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467633955180645, "step": 2597 }, { "clip_ratio": 0.0, "completion_length": 1521.57373046875, "epoch": 0.7760436113807782, "grad_norm": 5.00552225112915, "kl": 3.48046875, "learning_rate": 6.91431500991499e-08, "loss": 0.2282, "reward": 0.5703125298023224, "reward_std": 0.24414315819740295, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4363839477300644, "step": 2598 }, { "clip_ratio": 0.0, "completion_length": 1457.4263916015625, "epoch": 0.7763423194682996, "grad_norm": 2.3749020099639893, "kl": 3.19921875, "learning_rate": 6.904405192617967e-08, "loss": 0.2509, "reward": 0.5061384215950966, "reward_std": 0.28572429716587067, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598469734192, "step": 2599 }, { "clip_ratio": 0.0, "completion_length": 1445.65185546875, "epoch": 0.7766410275558211, "grad_norm": 4.443459987640381, "kl": 3.26953125, "learning_rate": 6.89450581438762e-08, "loss": 0.2655, "reward": 0.528459832072258, "reward_std": 0.2723132483661175, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205559372902, "step": 2600 }, { "clip_ratio": 0.0, "completion_length": 1309.7567443847656, "epoch": 0.7769397356433425, "grad_norm": 4.507551670074463, "kl": 3.375, "learning_rate": 6.88461688599351e-08, "loss": 0.2551, "reward": 0.6171875298023224, "reward_std": 0.2813008911907673, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660895228386, "step": 2601 }, { "clip_ratio": 0.0, "completion_length": 1418.1072082519531, "epoch": 0.777238443730864, "grad_norm": 3.1587259769439697, "kl": 3.3515625, "learning_rate": 6.874738418193817e-08, "loss": 0.2797, "reward": 0.6199776977300644, "reward_std": 0.29073670879006386, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062649011612, "step": 2602 }, { "clip_ratio": 0.0, "completion_length": 1409.7590026855469, "epoch": 0.7775371518183855, "grad_norm": 2.4071545600891113, "kl": 3.1171875, "learning_rate": 6.864870421735345e-08, "loss": 0.2432, "reward": 0.4972098469734192, "reward_std": 0.25812819600105286, "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 2603 }, { "clip_ratio": 0.0, "completion_length": 1434.5558776855469, "epoch": 0.777835859905907, "grad_norm": 7.160818099975586, "kl": 3.4375, "learning_rate": 6.855012907353517e-08, "loss": 0.2221, "reward": 0.5468750074505806, "reward_std": 0.24009573832154274, "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459821455180645, "step": 2604 }, { "clip_ratio": 0.0, "completion_length": 1408.0737609863281, "epoch": 0.7781345679934284, "grad_norm": 3.109304189682007, "kl": 3.06640625, "learning_rate": 6.845165885772331e-08, "loss": 0.2097, "reward": 0.6356027126312256, "reward_std": 0.258532777428627, "rewards/accuracy_reward": 0.1674107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919887661934, "step": 2605 }, { "clip_ratio": 0.0, "completion_length": 1419.9509582519531, "epoch": 0.7784332760809499, "grad_norm": 2.4595818519592285, "kl": 3.078125, "learning_rate": 6.835329367704384e-08, "loss": 0.2227, "reward": 0.5792410895228386, "reward_std": 0.2629173696041107, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 2606 }, { "clip_ratio": 0.0, "completion_length": 1360.1853332519531, "epoch": 0.7787319841684713, "grad_norm": 3.071436882019043, "kl": 2.9453125, "learning_rate": 6.825503363850851e-08, "loss": 0.2068, "reward": 0.563616082072258, "reward_std": 0.25773436576128006, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 2607 }, { "clip_ratio": 0.0, "completion_length": 1392.5380249023438, "epoch": 0.7790306922559929, "grad_norm": 3.5730066299438477, "kl": 3.01953125, "learning_rate": 6.815687884901451e-08, "loss": 0.2318, "reward": 0.5703125298023224, "reward_std": 0.26709799095988274, "rewards/accuracy_reward": 0.10044643189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660895228386, "step": 2608 }, { "clip_ratio": 0.0, "completion_length": 1428.0111999511719, "epoch": 0.7793294003435143, "grad_norm": 3.4510879516601562, "kl": 3.17578125, "learning_rate": 6.805882941534469e-08, "loss": 0.2413, "reward": 0.5552455484867096, "reward_std": 0.23780958727002144, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562649011612, "step": 2609 }, { "clip_ratio": 0.0, "completion_length": 1410.1005249023438, "epoch": 0.7796281084310358, "grad_norm": 2.621751070022583, "kl": 3.25, "learning_rate": 6.796088544416724e-08, "loss": 0.2848, "reward": 0.5189732387661934, "reward_std": 0.26005808264017105, "rewards/accuracy_reward": 0.06026785844005644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458705373108387, "step": 2610 }, { "clip_ratio": 0.0, "completion_length": 1413.2433776855469, "epoch": 0.7799268165185572, "grad_norm": 6.331118106842041, "kl": 2.8671875, "learning_rate": 6.786304704203553e-08, "loss": 0.2444, "reward": 0.5217634215950966, "reward_std": 0.24976066499948502, "rewards/accuracy_reward": 0.022321428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4994419813156128, "step": 2611 }, { "clip_ratio": 0.0, "completion_length": 1444.6943054199219, "epoch": 0.7802255246060787, "grad_norm": 6.15047025680542, "kl": 3.21484375, "learning_rate": 6.776531431538819e-08, "loss": 0.1817, "reward": 0.4921875223517418, "reward_std": 0.2569071687757969, "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660969734192, "step": 2612 }, { "clip_ratio": 0.0, "completion_length": 1443.4219665527344, "epoch": 0.7805242326936002, "grad_norm": 2.6462936401367188, "kl": 3.203125, "learning_rate": 6.766768737054889e-08, "loss": 0.212, "reward": 0.4921875298023224, "reward_std": 0.2545490562915802, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474330373108387, "step": 2613 }, { "clip_ratio": 0.0, "completion_length": 1370.0804138183594, "epoch": 0.7808229407811217, "grad_norm": 2.2137129306793213, "kl": 2.984375, "learning_rate": 6.757016631372611e-08, "loss": 0.1943, "reward": 0.5418527126312256, "reward_std": 0.25590069964528084, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205559372902, "step": 2614 }, { "clip_ratio": 0.0, "completion_length": 1433.9129943847656, "epoch": 0.7811216488686431, "grad_norm": 2.7323966026306152, "kl": 3.00390625, "learning_rate": 6.747275125101327e-08, "loss": 0.1803, "reward": 0.5937500298023224, "reward_std": 0.3021204471588135, "rewards/accuracy_reward": 0.11383928707800806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107387661934, "step": 2615 }, { "clip_ratio": 0.0, "completion_length": 1478.7121276855469, "epoch": 0.7814203569561646, "grad_norm": 2.1519880294799805, "kl": 2.9296875, "learning_rate": 6.737544228838832e-08, "loss": 0.2062, "reward": 0.5100446492433548, "reward_std": 0.2439529225230217, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4497767984867096, "step": 2616 }, { "clip_ratio": 0.0, "completion_length": 1456.3304443359375, "epoch": 0.781719065043686, "grad_norm": 4.964725971221924, "kl": 3.3046875, "learning_rate": 6.727823953171393e-08, "loss": 0.2406, "reward": 0.4771205633878708, "reward_std": 0.2504374720156193, "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312723517418, "step": 2617 }, { "clip_ratio": 0.0, "completion_length": 1428.8661499023438, "epoch": 0.7820177731312076, "grad_norm": 2.3935763835906982, "kl": 3.23046875, "learning_rate": 6.718114308673722e-08, "loss": 0.2271, "reward": 0.5535714477300644, "reward_std": 0.2940879724919796, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4419643059372902, "step": 2618 }, { "clip_ratio": 0.0, "completion_length": 1394.7991943359375, "epoch": 0.782316481218729, "grad_norm": 6.298231601715088, "kl": 2.39453125, "learning_rate": 6.70841530590895e-08, "loss": 0.2093, "reward": 0.5362723469734192, "reward_std": 0.30938760936260223, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871652126312256, "step": 2619 }, { "clip_ratio": 0.0, "completion_length": 1369.5960693359375, "epoch": 0.7826151893062505, "grad_norm": 6.032042980194092, "kl": 2.5546875, "learning_rate": 6.698726955428648e-08, "loss": 0.2274, "reward": 0.4960937723517418, "reward_std": 0.27856264263391495, "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080559372902, "step": 2620 }, { "clip_ratio": 0.0, "completion_length": 1435.8348693847656, "epoch": 0.7829138973937719, "grad_norm": 2.7751517295837402, "kl": 2.77734375, "learning_rate": 6.689049267772792e-08, "loss": 0.2013, "reward": 0.5753348544239998, "reward_std": 0.28582100570201874, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598395228386, "step": 2621 }, { "clip_ratio": 0.0, "completion_length": 1321.732177734375, "epoch": 0.7832126054812935, "grad_norm": 5.151723861694336, "kl": 2.64453125, "learning_rate": 6.679382253469754e-08, "loss": 0.2456, "reward": 0.620535746216774, "reward_std": 0.3042909801006317, "rewards/accuracy_reward": 0.14732143329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143133878708, "step": 2622 }, { "clip_ratio": 0.0, "completion_length": 1449.1741943359375, "epoch": 0.7835113135688149, "grad_norm": 3.922912120819092, "kl": 2.70703125, "learning_rate": 6.6697259230363e-08, "loss": 0.2047, "reward": 0.471540205180645, "reward_std": 0.2231445200741291, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437723517418, "step": 2623 }, { "clip_ratio": 0.0, "completion_length": 1321.4286193847656, "epoch": 0.7838100216563364, "grad_norm": 3.578671455383301, "kl": 2.69140625, "learning_rate": 6.660080286977575e-08, "loss": 0.2354, "reward": 0.5094866454601288, "reward_std": 0.26905300840735435, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366380095482, "step": 2624 }, { "clip_ratio": 0.0, "completion_length": 1321.5826110839844, "epoch": 0.7841087297438578, "grad_norm": 3.0284647941589355, "kl": 2.79296875, "learning_rate": 6.650445355787079e-08, "loss": 0.2541, "reward": 0.5831473395228386, "reward_std": 0.25988756120204926, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4737723395228386, "step": 2625 }, { "clip_ratio": 0.0, "completion_length": 1369.1562805175781, "epoch": 0.7844074378313793, "grad_norm": 3.1526870727539062, "kl": 2.4921875, "learning_rate": 6.640821139946674e-08, "loss": 0.1692, "reward": 0.5954241454601288, "reward_std": 0.2718263231217861, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949777126312256, "step": 2626 }, { "clip_ratio": 0.0, "completion_length": 1386.7389221191406, "epoch": 0.7847061459189008, "grad_norm": 2.6513702869415283, "kl": 3.05078125, "learning_rate": 6.63120764992657e-08, "loss": 0.2021, "reward": 0.4994419887661934, "reward_std": 0.2807919383049011, "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276902794838, "step": 2627 }, { "clip_ratio": 0.0, "completion_length": 1384.7411804199219, "epoch": 0.7850048540064222, "grad_norm": 3.3756155967712402, "kl": 2.8828125, "learning_rate": 6.621604896185294e-08, "loss": 0.1501, "reward": 0.5424107387661934, "reward_std": 0.26499244198203087, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143133878708, "step": 2628 }, { "clip_ratio": 0.0, "completion_length": 1539.35498046875, "epoch": 0.7853035620939437, "grad_norm": 2.4418108463287354, "kl": 2.72265625, "learning_rate": 6.612012889169709e-08, "loss": 0.1856, "reward": 0.4760044813156128, "reward_std": 0.2502598762512207, "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437723517418, "step": 2629 }, { "clip_ratio": 0.0, "completion_length": 1409.1763916015625, "epoch": 0.7856022701814651, "grad_norm": 3.031278610229492, "kl": 2.9375, "learning_rate": 6.602431639314967e-08, "loss": 0.2439, "reward": 0.5396205559372902, "reward_std": 0.2826448790729046, "rewards/accuracy_reward": 0.07812500395812094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 2630 }, { "clip_ratio": 0.0, "completion_length": 1361.0089721679688, "epoch": 0.7859009782689866, "grad_norm": 2.1842167377471924, "kl": 2.78125, "learning_rate": 6.592861157044538e-08, "loss": 0.2176, "reward": 0.6300223469734192, "reward_std": 0.3119395077228546, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616380095482, "step": 2631 }, { "clip_ratio": 0.0, "completion_length": 1429.4732971191406, "epoch": 0.786199686356508, "grad_norm": 2.8986029624938965, "kl": 3.09375, "learning_rate": 6.583301452770169e-08, "loss": 0.2365, "reward": 0.5279018133878708, "reward_std": 0.27129340171813965, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654017984867096, "step": 2632 }, { "clip_ratio": 0.0, "completion_length": 1292.8683471679688, "epoch": 0.7864983944440296, "grad_norm": 3.007565975189209, "kl": 2.64453125, "learning_rate": 6.573752536891876e-08, "loss": 0.1937, "reward": 0.5424107313156128, "reward_std": 0.32185664772987366, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393059372902, "step": 2633 }, { "clip_ratio": 0.0, "completion_length": 1372.6317749023438, "epoch": 0.786797102531551, "grad_norm": 2.6694607734680176, "kl": 3.19140625, "learning_rate": 6.564214419797941e-08, "loss": 0.2609, "reward": 0.5848214626312256, "reward_std": 0.2951810508966446, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178805589676, "step": 2634 }, { "clip_ratio": 0.0, "completion_length": 1400.7567443847656, "epoch": 0.7870958106190725, "grad_norm": 5.145925998687744, "kl": 3.05078125, "learning_rate": 6.554687111864914e-08, "loss": 0.2096, "reward": 0.502790205180645, "reward_std": 0.25045982003211975, "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.476004496216774, "step": 2635 }, { "clip_ratio": 0.0, "completion_length": 1351.8728332519531, "epoch": 0.7873945187065939, "grad_norm": 2.8748557567596436, "kl": 2.828125, "learning_rate": 6.54517062345756e-08, "loss": 0.1923, "reward": 0.5429687798023224, "reward_std": 0.32116470858454704, "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4893973469734192, "step": 2636 }, { "clip_ratio": 0.0, "completion_length": 1440.5424499511719, "epoch": 0.7876932267941155, "grad_norm": 6.647035121917725, "kl": 3.32421875, "learning_rate": 6.535664964928888e-08, "loss": 0.2291, "reward": 0.5066964477300644, "reward_std": 0.2518190033733845, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4508928805589676, "step": 2637 }, { "clip_ratio": 0.0, "completion_length": 1404.7589721679688, "epoch": 0.7879919348816369, "grad_norm": 2.495553731918335, "kl": 3.04296875, "learning_rate": 6.52617014662013e-08, "loss": 0.2222, "reward": 0.525669664144516, "reward_std": 0.2601354829967022, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375298023224, "step": 2638 }, { "clip_ratio": 0.0, "completion_length": 1339.1139221191406, "epoch": 0.7882906429691584, "grad_norm": 2.5032143592834473, "kl": 2.681640625, "learning_rate": 6.516686178860707e-08, "loss": 0.2172, "reward": 0.6071428880095482, "reward_std": 0.2700594738125801, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143133878708, "step": 2639 }, { "clip_ratio": 0.0, "completion_length": 1329.2991333007812, "epoch": 0.7885893510566798, "grad_norm": 3.879722833633423, "kl": 2.796875, "learning_rate": 6.507213071968252e-08, "loss": 0.1569, "reward": 0.5558035969734192, "reward_std": 0.2702362835407257, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 2640 }, { "clip_ratio": 0.0, "completion_length": 1409.9509582519531, "epoch": 0.7888880591442013, "grad_norm": 2.388223171234131, "kl": 3.328125, "learning_rate": 6.497750836248581e-08, "loss": 0.2805, "reward": 0.4737723469734192, "reward_std": 0.2580346316099167, "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4536830559372902, "step": 2641 }, { "clip_ratio": 0.0, "completion_length": 1368.4844055175781, "epoch": 0.7891867672317228, "grad_norm": 5.486515045166016, "kl": 2.71484375, "learning_rate": 6.488299481995668e-08, "loss": 0.2276, "reward": 0.5234375223517418, "reward_std": 0.2615923769772053, "rewards/accuracy_reward": 0.060267861699685454, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463169664144516, "step": 2642 }, { "clip_ratio": 0.0, "completion_length": 1403.49560546875, "epoch": 0.7894854753192443, "grad_norm": 2.397101879119873, "kl": 2.96484375, "learning_rate": 6.478859019491671e-08, "loss": 0.231, "reward": 0.542410746216774, "reward_std": 0.27155381068587303, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 2643 }, { "clip_ratio": 0.0, "completion_length": 1386.3995971679688, "epoch": 0.7897841834067657, "grad_norm": 2.737846612930298, "kl": 2.7734375, "learning_rate": 6.46942945900688e-08, "loss": 0.2269, "reward": 0.486049123108387, "reward_std": 0.2514694482088089, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276977300644, "step": 2644 }, { "clip_ratio": 0.0, "completion_length": 1308.7232971191406, "epoch": 0.7900828914942872, "grad_norm": 3.103006601333618, "kl": 2.541015625, "learning_rate": 6.460010810799737e-08, "loss": 0.2189, "reward": 0.5323660969734192, "reward_std": 0.26151422783732414, "rewards/accuracy_reward": 0.07366071850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053880095482, "step": 2645 }, { "clip_ratio": 0.0, "completion_length": 1324.7277526855469, "epoch": 0.7903815995818086, "grad_norm": 4.572160720825195, "kl": 2.42578125, "learning_rate": 6.45060308511681e-08, "loss": 0.176, "reward": 0.604910746216774, "reward_std": 0.23893817886710167, "rewards/accuracy_reward": 0.12276786239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 2646 }, { "clip_ratio": 0.0, "completion_length": 1415.3103332519531, "epoch": 0.7906803076693302, "grad_norm": 4.707891464233398, "kl": 2.390625, "learning_rate": 6.441206292192777e-08, "loss": 0.1181, "reward": 0.5937500223517418, "reward_std": 0.2424975112080574, "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4553571715950966, "step": 2647 }, { "clip_ratio": 0.0, "completion_length": 1354.8281860351562, "epoch": 0.7909790157568516, "grad_norm": 2.5538408756256104, "kl": 2.6015625, "learning_rate": 6.431820442250429e-08, "loss": 0.2257, "reward": 0.5239955633878708, "reward_std": 0.26624443382024765, "rewards/accuracy_reward": 0.06026785844005644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276977300644, "step": 2648 }, { "clip_ratio": 0.0, "completion_length": 1273.0313110351562, "epoch": 0.7912777238443731, "grad_norm": 7.908884525299072, "kl": 2.4765625, "learning_rate": 6.422445545500655e-08, "loss": 0.2912, "reward": 0.6026785969734192, "reward_std": 0.2811516411602497, "rewards/accuracy_reward": 0.14508929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 2649 }, { "clip_ratio": 0.0, "completion_length": 1429.6005249023438, "epoch": 0.7915764319318945, "grad_norm": 2.493206739425659, "kl": 2.515625, "learning_rate": 6.41308161214242e-08, "loss": 0.158, "reward": 0.537388414144516, "reward_std": 0.29494592547416687, "rewards/accuracy_reward": 0.0736607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463727705180645, "step": 2650 }, { "clip_ratio": 0.0, "completion_length": 1376.3705749511719, "epoch": 0.7918751400194161, "grad_norm": 3.2798027992248535, "kl": 2.984375, "learning_rate": 6.403728652362765e-08, "loss": 0.2145, "reward": 0.5524553805589676, "reward_std": 0.2574452795088291, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4520089477300644, "step": 2651 }, { "clip_ratio": 0.0, "completion_length": 1341.65185546875, "epoch": 0.7921738481069375, "grad_norm": 3.025753974914551, "kl": 2.734375, "learning_rate": 6.394386676336799e-08, "loss": 0.262, "reward": 0.5122768208384514, "reward_std": 0.24586911126971245, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053805589676, "step": 2652 }, { "clip_ratio": 0.0, "completion_length": 1415.321533203125, "epoch": 0.792472556194459, "grad_norm": 3.2698161602020264, "kl": 2.34765625, "learning_rate": 6.38505569422767e-08, "loss": 0.1633, "reward": 0.498325914144516, "reward_std": 0.25613511726260185, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044813156128, "step": 2653 }, { "clip_ratio": 0.0, "completion_length": 1402.8148193359375, "epoch": 0.7927712642819804, "grad_norm": 2.8306376934051514, "kl": 2.67578125, "learning_rate": 6.375735716186574e-08, "loss": 0.1833, "reward": 0.5033482238650322, "reward_std": 0.24754974991083145, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4542410895228386, "step": 2654 }, { "clip_ratio": 0.0, "completion_length": 1435.0000610351562, "epoch": 0.7930699723695019, "grad_norm": 4.0068535804748535, "kl": 3.07421875, "learning_rate": 6.366426752352737e-08, "loss": 0.2236, "reward": 0.4598214477300644, "reward_std": 0.26698973029851913, "rewards/accuracy_reward": 0.02678571525029838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4330357387661934, "step": 2655 }, { "clip_ratio": 0.0, "completion_length": 1398.7656860351562, "epoch": 0.7933686804570234, "grad_norm": 3.145747423171997, "kl": 2.46875, "learning_rate": 6.357128812853393e-08, "loss": 0.2011, "reward": 0.5279017984867096, "reward_std": 0.29374533891677856, "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4944196715950966, "step": 2656 }, { "clip_ratio": 0.0, "completion_length": 1344.4308776855469, "epoch": 0.7936673885445449, "grad_norm": 2.278310775756836, "kl": 2.6484375, "learning_rate": 6.3478419078038e-08, "loss": 0.2299, "reward": 0.5323660895228386, "reward_std": 0.26032571122050285, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982387661934, "step": 2657 }, { "clip_ratio": 0.0, "completion_length": 1388.7188110351562, "epoch": 0.7939660966320663, "grad_norm": 4.731141567230225, "kl": 2.78515625, "learning_rate": 6.338566047307184e-08, "loss": 0.197, "reward": 0.523437537252903, "reward_std": 0.24872685223817825, "rewards/accuracy_reward": 0.05580357299186289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 2658 }, { "clip_ratio": 0.0, "completion_length": 1365.1116638183594, "epoch": 0.7942648047195878, "grad_norm": 6.463346481323242, "kl": 2.296875, "learning_rate": 6.329301241454782e-08, "loss": 0.2041, "reward": 0.5708705484867096, "reward_std": 0.30290500447154045, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4905134215950966, "step": 2659 }, { "clip_ratio": 0.0, "completion_length": 1372.8750610351562, "epoch": 0.7945635128071092, "grad_norm": 2.4067606925964355, "kl": 2.66796875, "learning_rate": 6.3200475003258e-08, "loss": 0.2288, "reward": 0.5083705559372902, "reward_std": 0.2981945499777794, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562649011612, "step": 2660 }, { "clip_ratio": 0.0, "completion_length": 1290.3281860351562, "epoch": 0.7948622208946308, "grad_norm": 6.018568515777588, "kl": 2.322265625, "learning_rate": 6.31080483398739e-08, "loss": 0.2534, "reward": 0.6054687649011612, "reward_std": 0.2960629537701607, "rewards/accuracy_reward": 0.10714286682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498325914144516, "step": 2661 }, { "clip_ratio": 0.0, "completion_length": 1415.2634582519531, "epoch": 0.7951609289821522, "grad_norm": 5.5035247802734375, "kl": 3.09375, "learning_rate": 6.30157325249467e-08, "loss": 0.2027, "reward": 0.5279018133878708, "reward_std": 0.27667446434497833, "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4542410895228386, "step": 2662 }, { "clip_ratio": 0.0, "completion_length": 1253.5156555175781, "epoch": 0.7954596370696737, "grad_norm": 3.6793174743652344, "kl": 2.4296875, "learning_rate": 6.292352765890706e-08, "loss": 0.249, "reward": 0.5574776977300644, "reward_std": 0.32121453434228897, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741380095482, "step": 2663 }, { "clip_ratio": 0.0, "completion_length": 1352.3929138183594, "epoch": 0.7957583451571951, "grad_norm": 4.712541103363037, "kl": 2.375, "learning_rate": 6.283143384206467e-08, "loss": 0.2032, "reward": 0.6395089477300644, "reward_std": 0.25568678230047226, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5167411044239998, "step": 2664 }, { "clip_ratio": 0.0, "completion_length": 1384.63623046875, "epoch": 0.7960570532447167, "grad_norm": 2.528169870376587, "kl": 2.7734375, "learning_rate": 6.27394511746087e-08, "loss": 0.1881, "reward": 0.6104911044239998, "reward_std": 0.2982405461370945, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696715950966, "step": 2665 }, { "clip_ratio": 0.0, "completion_length": 1377.8973693847656, "epoch": 0.7963557613322381, "grad_norm": 3.059217929840088, "kl": 2.87109375, "learning_rate": 6.264757975660727e-08, "loss": 0.2126, "reward": 0.487165205180645, "reward_std": 0.25661297515034676, "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4737723544239998, "step": 2666 }, { "clip_ratio": 0.0, "completion_length": 1277.1518249511719, "epoch": 0.7966544694197596, "grad_norm": 5.036462783813477, "kl": 2.4609375, "learning_rate": 6.255581968800741e-08, "loss": 0.2109, "reward": 0.581473246216774, "reward_std": 0.31972554326057434, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4877232387661934, "step": 2667 }, { "clip_ratio": 0.0, "completion_length": 1301.8370971679688, "epoch": 0.796953177507281, "grad_norm": 2.814972162246704, "kl": 2.4609375, "learning_rate": 6.246417106863513e-08, "loss": 0.1294, "reward": 0.5496651977300644, "reward_std": 0.24778460711240768, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027902126312256, "step": 2668 }, { "clip_ratio": 0.0, "completion_length": 1425.8326721191406, "epoch": 0.7972518855948025, "grad_norm": 4.780095100402832, "kl": 3.2890625, "learning_rate": 6.237263399819515e-08, "loss": 0.2356, "reward": 0.4933035969734192, "reward_std": 0.28055183216929436, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250223517418, "step": 2669 }, { "clip_ratio": 0.0, "completion_length": 1310.9420166015625, "epoch": 0.797550593682324, "grad_norm": 2.5169894695281982, "kl": 2.66796875, "learning_rate": 6.228120857627079e-08, "loss": 0.192, "reward": 0.5407366305589676, "reward_std": 0.29513730481266975, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 2670 }, { "clip_ratio": 0.0, "completion_length": 1308.8125610351562, "epoch": 0.7978493017698454, "grad_norm": 4.844010353088379, "kl": 2.72265625, "learning_rate": 6.218989490232403e-08, "loss": 0.2097, "reward": 0.6138393133878708, "reward_std": 0.26760081946849823, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393133878708, "step": 2671 }, { "clip_ratio": 0.0, "completion_length": 1352.1563415527344, "epoch": 0.7981480098573669, "grad_norm": 2.4367833137512207, "kl": 2.63671875, "learning_rate": 6.209869307569508e-08, "loss": 0.2134, "reward": 0.5680803954601288, "reward_std": 0.28309860453009605, "rewards/accuracy_reward": 0.07589286169968545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875149011612, "step": 2672 }, { "clip_ratio": 0.0, "completion_length": 1359.8750305175781, "epoch": 0.7984467179448883, "grad_norm": 3.498563289642334, "kl": 2.69921875, "learning_rate": 6.200760319560267e-08, "loss": 0.2328, "reward": 0.6171875149011612, "reward_std": 0.26495246961712837, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875074505806, "step": 2673 }, { "clip_ratio": 0.0, "completion_length": 1317.8348693847656, "epoch": 0.7987454260324098, "grad_norm": 3.0384790897369385, "kl": 2.494140625, "learning_rate": 6.191662536114368e-08, "loss": 0.2002, "reward": 0.542410746216774, "reward_std": 0.2742885798215866, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178805589676, "step": 2674 }, { "clip_ratio": 0.0, "completion_length": 1372.32373046875, "epoch": 0.7990441341199312, "grad_norm": 6.008838176727295, "kl": 3.0703125, "learning_rate": 6.1825759671293e-08, "loss": 0.1841, "reward": 0.5502232387661934, "reward_std": 0.24622982367873192, "rewards/accuracy_reward": 0.09151785913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053880095482, "step": 2675 }, { "clip_ratio": 0.0, "completion_length": 1369.6094360351562, "epoch": 0.7993428422074528, "grad_norm": 5.154383659362793, "kl": 2.83203125, "learning_rate": 6.173500622490363e-08, "loss": 0.1649, "reward": 0.487165205180645, "reward_std": 0.24001963809132576, "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.471540205180645, "step": 2676 }, { "clip_ratio": 0.0, "completion_length": 1338.5558776855469, "epoch": 0.7996415502949742, "grad_norm": 9.004364967346191, "kl": 3.14453125, "learning_rate": 6.164436512070648e-08, "loss": 0.2274, "reward": 0.5279018133878708, "reward_std": 0.2518659755587578, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053805589676, "step": 2677 }, { "clip_ratio": 0.0, "completion_length": 1389.6741638183594, "epoch": 0.7999402583824957, "grad_norm": 4.328060150146484, "kl": 2.90234375, "learning_rate": 6.15538364573101e-08, "loss": 0.207, "reward": 0.5474330633878708, "reward_std": 0.26973677054047585, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4425223395228386, "step": 2678 }, { "clip_ratio": 0.0, "completion_length": 1384.2590026855469, "epoch": 0.8002389664700171, "grad_norm": 2.413113832473755, "kl": 2.77734375, "learning_rate": 6.146342033320082e-08, "loss": 0.1835, "reward": 0.5401785895228386, "reward_std": 0.2939421087503433, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143133878708, "step": 2679 }, { "clip_ratio": 0.0, "completion_length": 1394.60498046875, "epoch": 0.8005376745575387, "grad_norm": 5.127553939819336, "kl": 2.72265625, "learning_rate": 6.137311684674259e-08, "loss": 0.2369, "reward": 0.5016741305589676, "reward_std": 0.2535094544291496, "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705559372902, "step": 2680 }, { "clip_ratio": 0.0, "completion_length": 1300.8370971679688, "epoch": 0.8008363826450601, "grad_norm": 6.193166732788086, "kl": 2.484375, "learning_rate": 6.128292609617666e-08, "loss": 0.1938, "reward": 0.5228794887661934, "reward_std": 0.2675994411110878, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437798023224, "step": 2681 }, { "clip_ratio": 0.0, "completion_length": 1206.2790832519531, "epoch": 0.8011350907325816, "grad_norm": 6.803974151611328, "kl": 2.2265625, "learning_rate": 6.119284817962182e-08, "loss": 0.1849, "reward": 0.6060268133878708, "reward_std": 0.3334376849234104, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 2682 }, { "clip_ratio": 0.0, "completion_length": 1328.7344055175781, "epoch": 0.801433798820103, "grad_norm": 3.377375602722168, "kl": 2.66015625, "learning_rate": 6.110288319507394e-08, "loss": 0.2056, "reward": 0.5390625149011612, "reward_std": 0.2780890204012394, "rewards/accuracy_reward": 0.06696428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.472098246216774, "step": 2683 }, { "clip_ratio": 0.0, "completion_length": 1320.1920166015625, "epoch": 0.8017325069076245, "grad_norm": 3.905672550201416, "kl": 2.4765625, "learning_rate": 6.101303124040612e-08, "loss": 0.1755, "reward": 0.5066964477300644, "reward_std": 0.2641409933567047, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500223517418, "step": 2684 }, { "clip_ratio": 0.0, "completion_length": 1279.6763916015625, "epoch": 0.802031214995146, "grad_norm": 4.252340793609619, "kl": 2.3828125, "learning_rate": 6.092329241336853e-08, "loss": 0.2297, "reward": 0.679129496216774, "reward_std": 0.3319476321339607, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794887661934, "step": 2685 }, { "clip_ratio": 0.0, "completion_length": 1429.0715026855469, "epoch": 0.8023299230826675, "grad_norm": 2.4402036666870117, "kl": 2.78515625, "learning_rate": 6.083366681158812e-08, "loss": 0.1562, "reward": 0.4994419887661934, "reward_std": 0.2849939092993736, "rewards/accuracy_reward": 0.03571428754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276977300644, "step": 2686 }, { "clip_ratio": 0.0, "completion_length": 1324.6562805175781, "epoch": 0.8026286311701889, "grad_norm": 2.311089515686035, "kl": 2.80078125, "learning_rate": 6.074415453256885e-08, "loss": 0.2665, "reward": 0.5055803805589676, "reward_std": 0.2670268677175045, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4408482313156128, "step": 2687 }, { "clip_ratio": 0.0, "completion_length": 1309.5670166015625, "epoch": 0.8029273392577104, "grad_norm": 3.026024580001831, "kl": 2.38671875, "learning_rate": 6.065475567369131e-08, "loss": 0.1956, "reward": 0.6194196790456772, "reward_std": 0.31992819160223007, "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4787946715950966, "step": 2688 }, { "clip_ratio": 0.0, "completion_length": 1368.7745971679688, "epoch": 0.8032260473452318, "grad_norm": 2.6669251918792725, "kl": 2.66015625, "learning_rate": 6.056547033221262e-08, "loss": 0.1935, "reward": 0.5390625223517418, "reward_std": 0.26451193168759346, "rewards/accuracy_reward": 0.06473214458674192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 2689 }, { "clip_ratio": 0.0, "completion_length": 1342.9085388183594, "epoch": 0.8035247554327534, "grad_norm": 2.232454299926758, "kl": 2.84375, "learning_rate": 6.047629860526653e-08, "loss": 0.1938, "reward": 0.5429687723517418, "reward_std": 0.31912822276353836, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478236623108387, "step": 2690 }, { "clip_ratio": 0.0, "completion_length": 1418.4219360351562, "epoch": 0.8038234635202748, "grad_norm": 3.7043495178222656, "kl": 2.859375, "learning_rate": 6.038724058986318e-08, "loss": 0.1596, "reward": 0.5312500149011612, "reward_std": 0.28454039990901947, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 2691 }, { "clip_ratio": 0.0, "completion_length": 1308.2701416015625, "epoch": 0.8041221716077963, "grad_norm": 2.7096123695373535, "kl": 2.73828125, "learning_rate": 6.029829638288887e-08, "loss": 0.2285, "reward": 0.5747768133878708, "reward_std": 0.24027696251869202, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053805589676, "step": 2692 }, { "clip_ratio": 0.0, "completion_length": 1309.6763916015625, "epoch": 0.8044208796953177, "grad_norm": 4.306579113006592, "kl": 2.64453125, "learning_rate": 6.020946608110623e-08, "loss": 0.2267, "reward": 0.5139508992433548, "reward_std": 0.28548676148056984, "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687798023224, "step": 2693 }, { "clip_ratio": 0.0, "completion_length": 1404.2031860351562, "epoch": 0.8047195877828393, "grad_norm": 5.140834331512451, "kl": 3.1640625, "learning_rate": 6.012074978115393e-08, "loss": 0.2166, "reward": 0.550223246216774, "reward_std": 0.2747742347419262, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.452008955180645, "step": 2694 }, { "clip_ratio": 0.0, "completion_length": 1351.3995971679688, "epoch": 0.8050182958703607, "grad_norm": 2.664311647415161, "kl": 2.62109375, "learning_rate": 6.00321475795466e-08, "loss": 0.1866, "reward": 0.5970982313156128, "reward_std": 0.30565501004457474, "rewards/accuracy_reward": 0.11383929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4832589477300644, "step": 2695 }, { "clip_ratio": 0.0, "completion_length": 1392.5938110351562, "epoch": 0.8053170039578822, "grad_norm": 4.102873802185059, "kl": 3.11328125, "learning_rate": 5.994365957267474e-08, "loss": 0.2163, "reward": 0.4375000149011612, "reward_std": 0.2279616855084896, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4330357313156128, "step": 2696 }, { "clip_ratio": 0.0, "completion_length": 1369.3638916015625, "epoch": 0.8056157120454036, "grad_norm": 3.5367801189422607, "kl": 2.7578125, "learning_rate": 5.985528585680463e-08, "loss": 0.246, "reward": 0.5703125298023224, "reward_std": 0.27271652966737747, "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 2697 }, { "clip_ratio": 0.0, "completion_length": 1378.77685546875, "epoch": 0.8059144201329251, "grad_norm": 2.703206777572632, "kl": 2.796875, "learning_rate": 5.976702652807821e-08, "loss": 0.1738, "reward": 0.513950914144516, "reward_std": 0.24666185304522514, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116156578064, "step": 2698 }, { "clip_ratio": 0.0, "completion_length": 1311.2790832519531, "epoch": 0.8062131282204466, "grad_norm": 2.754666328430176, "kl": 2.59765625, "learning_rate": 5.967888168251303e-08, "loss": 0.2327, "reward": 0.5228794813156128, "reward_std": 0.28600436449050903, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4536830633878708, "step": 2699 }, { "clip_ratio": 0.0, "completion_length": 1311.7255249023438, "epoch": 0.8065118363079681, "grad_norm": 2.592423915863037, "kl": 2.73046875, "learning_rate": 5.9590851416001964e-08, "loss": 0.2344, "reward": 0.5647321566939354, "reward_std": 0.30572114884853363, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4553571715950966, "step": 2700 }, { "clip_ratio": 0.0, "completion_length": 1349.9420471191406, "epoch": 0.8068105443954895, "grad_norm": 2.5474624633789062, "kl": 2.6953125, "learning_rate": 5.950293582431334e-08, "loss": 0.2054, "reward": 0.581473246216774, "reward_std": 0.2691146284341812, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483258955180645, "step": 2701 }, { "clip_ratio": 0.0, "completion_length": 1257.1317138671875, "epoch": 0.807109252483011, "grad_norm": 2.943457841873169, "kl": 2.390625, "learning_rate": 5.941513500309076e-08, "loss": 0.2083, "reward": 0.5970982387661934, "reward_std": 0.25067298486828804, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625298023224, "step": 2702 }, { "clip_ratio": 0.0, "completion_length": 1353.7456359863281, "epoch": 0.8074079605705324, "grad_norm": 3.377930164337158, "kl": 2.43359375, "learning_rate": 5.932744904785282e-08, "loss": 0.1497, "reward": 0.5251116305589676, "reward_std": 0.2594091594219208, "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.471540205180645, "step": 2703 }, { "clip_ratio": 0.0, "completion_length": 1401.1965026855469, "epoch": 0.807706668658054, "grad_norm": 2.4785704612731934, "kl": 2.4609375, "learning_rate": 5.923987805399331e-08, "loss": 0.1526, "reward": 0.528459832072258, "reward_std": 0.2918044924736023, "rewards/accuracy_reward": 0.06919643003493547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4592634215950966, "step": 2704 }, { "clip_ratio": 0.0, "completion_length": 1418.8527526855469, "epoch": 0.8080053767455754, "grad_norm": 3.6194510459899902, "kl": 2.453125, "learning_rate": 5.9152422116780904e-08, "loss": 0.2267, "reward": 0.5033482313156128, "reward_std": 0.2752573564648628, "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 2705 }, { "clip_ratio": 0.0, "completion_length": 1237.6518249511719, "epoch": 0.8083040848330969, "grad_norm": 4.195130348205566, "kl": 2.271484375, "learning_rate": 5.906508133135901e-08, "loss": 0.2227, "reward": 0.5245535895228386, "reward_std": 0.29464997351169586, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.479910746216774, "step": 2706 }, { "clip_ratio": 0.0, "completion_length": 1441.4599304199219, "epoch": 0.8086027929206183, "grad_norm": 2.859577178955078, "kl": 2.66015625, "learning_rate": 5.8977855792745915e-08, "loss": 0.1725, "reward": 0.5418527126312256, "reward_std": 0.306526854634285, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276977300644, "step": 2707 }, { "clip_ratio": 0.0, "completion_length": 1284.6853332519531, "epoch": 0.8089015010081398, "grad_norm": 2.673067569732666, "kl": 2.8203125, "learning_rate": 5.889074559583444e-08, "loss": 0.2941, "reward": 0.5613839626312256, "reward_std": 0.3061329312622547, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698661044239998, "step": 2708 }, { "clip_ratio": 0.0, "completion_length": 1322.1473693847656, "epoch": 0.8092002090956613, "grad_norm": 2.743711233139038, "kl": 2.69921875, "learning_rate": 5.88037508353919e-08, "loss": 0.1828, "reward": 0.5396205559372902, "reward_std": 0.3114079050719738, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205559372902, "step": 2709 }, { "clip_ratio": 0.0, "completion_length": 1493.6027526855469, "epoch": 0.8094989171831828, "grad_norm": 5.491214752197266, "kl": 2.78125, "learning_rate": 5.871687160606015e-08, "loss": 0.1946, "reward": 0.4810267984867096, "reward_std": 0.2632999159395695, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431919664144516, "step": 2710 }, { "clip_ratio": 0.0, "completion_length": 1428.6964721679688, "epoch": 0.8097976252707042, "grad_norm": 3.7100868225097656, "kl": 2.69140625, "learning_rate": 5.863010800235518e-08, "loss": 0.1555, "reward": 0.5083705708384514, "reward_std": 0.2938474342226982, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241305589676, "step": 2711 }, { "clip_ratio": 0.0, "completion_length": 1294.4107360839844, "epoch": 0.8100963333582257, "grad_norm": 4.268005847930908, "kl": 2.3671875, "learning_rate": 5.854346011866733e-08, "loss": 0.1308, "reward": 0.4938616305589676, "reward_std": 0.27560579031705856, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116305589676, "step": 2712 }, { "clip_ratio": 0.0, "completion_length": 1346.2143249511719, "epoch": 0.8103950414457471, "grad_norm": 3.684821605682373, "kl": 2.546875, "learning_rate": 5.845692804926105e-08, "loss": 0.1395, "reward": 0.5011160895228386, "reward_std": 0.2664199732244015, "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268133878708, "step": 2713 }, { "clip_ratio": 0.0, "completion_length": 1386.2098693847656, "epoch": 0.8106937495332686, "grad_norm": 8.558638572692871, "kl": 2.5625, "learning_rate": 5.837051188827467e-08, "loss": 0.1953, "reward": 0.5401786118745804, "reward_std": 0.2900131456553936, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785969734192, "step": 2714 }, { "clip_ratio": 0.0, "completion_length": 1364.5111999511719, "epoch": 0.8109924576207901, "grad_norm": 3.828298568725586, "kl": 2.61328125, "learning_rate": 5.828421172972054e-08, "loss": 0.2432, "reward": 0.5887276977300644, "reward_std": 0.3184030130505562, "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598395228386, "step": 2715 }, { "clip_ratio": 0.0, "completion_length": 1337.8795471191406, "epoch": 0.8112911657083115, "grad_norm": 6.978218078613281, "kl": 2.44140625, "learning_rate": 5.8198027667484834e-08, "loss": 0.2229, "reward": 0.6383928954601288, "reward_std": 0.2877889648079872, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4955357313156128, "step": 2716 }, { "clip_ratio": 0.0, "completion_length": 1411.9978332519531, "epoch": 0.811589873795833, "grad_norm": 2.662527084350586, "kl": 2.38671875, "learning_rate": 5.8111959795327245e-08, "loss": 0.1509, "reward": 0.4815848395228386, "reward_std": 0.25374581292271614, "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919813156128, "step": 2717 }, { "clip_ratio": 0.0, "completion_length": 1390.3661193847656, "epoch": 0.8118885818833544, "grad_norm": 2.2279045581817627, "kl": 2.609375, "learning_rate": 5.802600820688125e-08, "loss": 0.1957, "reward": 0.5128348469734192, "reward_std": 0.2865200936794281, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4525669887661934, "step": 2718 }, { "clip_ratio": 0.0, "completion_length": 1341.2857666015625, "epoch": 0.812187289970876, "grad_norm": 3.73152756690979, "kl": 2.1953125, "learning_rate": 5.7940172995653795e-08, "loss": 0.1483, "reward": 0.5446428805589676, "reward_std": 0.27670155465602875, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 2719 }, { "clip_ratio": 0.0, "completion_length": 1363.1652526855469, "epoch": 0.8124859980583974, "grad_norm": 4.034478187561035, "kl": 2.8203125, "learning_rate": 5.7854454255025086e-08, "loss": 0.2286, "reward": 0.5597098469734192, "reward_std": 0.31760183349251747, "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.448102705180645, "step": 2720 }, { "clip_ratio": 0.0, "completion_length": 1412.1942749023438, "epoch": 0.8127847061459189, "grad_norm": 2.5207152366638184, "kl": 2.62109375, "learning_rate": 5.7768852078248757e-08, "loss": 0.2271, "reward": 0.5150669738650322, "reward_std": 0.28755562007427216, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4503348395228386, "step": 2721 }, { "clip_ratio": 0.0, "completion_length": 1377.6183471679688, "epoch": 0.8130834142334403, "grad_norm": 2.312723398208618, "kl": 2.734375, "learning_rate": 5.768336655845163e-08, "loss": 0.1827, "reward": 0.5518973618745804, "reward_std": 0.27638472244143486, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4469866380095482, "step": 2722 }, { "clip_ratio": 0.0, "completion_length": 1352.1897888183594, "epoch": 0.8133821223209619, "grad_norm": 3.918238401412964, "kl": 2.671875, "learning_rate": 5.759799778863348e-08, "loss": 0.2123, "reward": 0.5842634290456772, "reward_std": 0.28274378925561905, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470424123108387, "step": 2723 }, { "clip_ratio": 0.0, "completion_length": 1396.6764221191406, "epoch": 0.8136808304084833, "grad_norm": 5.160913467407227, "kl": 2.80859375, "learning_rate": 5.751274586166725e-08, "loss": 0.2124, "reward": 0.5284598395228386, "reward_std": 0.29199130088090897, "rewards/accuracy_reward": 0.07142857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312723517418, "step": 2724 }, { "clip_ratio": 0.0, "completion_length": 1356.35498046875, "epoch": 0.8139795384960048, "grad_norm": 3.6226799488067627, "kl": 2.60546875, "learning_rate": 5.742761087029859e-08, "loss": 0.1785, "reward": 0.488839328289032, "reward_std": 0.2719229571521282, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535895228386, "step": 2725 }, { "clip_ratio": 0.0, "completion_length": 1405.8527526855469, "epoch": 0.8142782465835262, "grad_norm": 2.6296393871307373, "kl": 2.7109375, "learning_rate": 5.734259290714609e-08, "loss": 0.1577, "reward": 0.541294664144516, "reward_std": 0.26199913397431374, "rewards/accuracy_reward": 0.09375000325962901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4475446566939354, "step": 2726 }, { "clip_ratio": 0.0, "completion_length": 1354.9688110351562, "epoch": 0.8145769546710477, "grad_norm": 3.806881904602051, "kl": 2.49609375, "learning_rate": 5.725769206470098e-08, "loss": 0.1748, "reward": 0.592075914144516, "reward_std": 0.2758549153804779, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473395228386, "step": 2727 }, { "clip_ratio": 0.0, "completion_length": 1350.1183776855469, "epoch": 0.8148756627585692, "grad_norm": 3.8596932888031006, "kl": 2.59765625, "learning_rate": 5.717290843532699e-08, "loss": 0.1567, "reward": 0.5178571790456772, "reward_std": 0.27464301511645317, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4441964477300644, "step": 2728 }, { "clip_ratio": 0.0, "completion_length": 1400.9107971191406, "epoch": 0.8151743708460907, "grad_norm": 2.3869435787200928, "kl": 2.43359375, "learning_rate": 5.708824211126043e-08, "loss": 0.2027, "reward": 0.4771205484867096, "reward_std": 0.2596498802304268, "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312723517418, "step": 2729 }, { "clip_ratio": 0.0, "completion_length": 1365.9554443359375, "epoch": 0.8154730789336121, "grad_norm": 4.742448329925537, "kl": 2.71484375, "learning_rate": 5.7003693184609994e-08, "loss": 0.1739, "reward": 0.5239955633878708, "reward_std": 0.2579520232975483, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4324776977300644, "step": 2730 }, { "clip_ratio": 0.0, "completion_length": 1373.0134582519531, "epoch": 0.8157717870211336, "grad_norm": 3.5375308990478516, "kl": 2.3828125, "learning_rate": 5.6919261747356554e-08, "loss": 0.1733, "reward": 0.489397332072258, "reward_std": 0.2584659978747368, "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116380095482, "step": 2731 }, { "clip_ratio": 0.0, "completion_length": 1441.5469055175781, "epoch": 0.816070495108655, "grad_norm": 3.4222195148468018, "kl": 2.4609375, "learning_rate": 5.6834947891353296e-08, "loss": 0.1829, "reward": 0.4776785895228386, "reward_std": 0.28365007042884827, "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.444196455180645, "step": 2732 }, { "clip_ratio": 0.0, "completion_length": 1406.4554138183594, "epoch": 0.8163692031961766, "grad_norm": 3.6270945072174072, "kl": 2.5, "learning_rate": 5.6750751708325445e-08, "loss": 0.1488, "reward": 0.493861623108387, "reward_std": 0.2583364024758339, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4335937723517418, "step": 2733 }, { "clip_ratio": 0.0, "completion_length": 1250.591552734375, "epoch": 0.816667911283698, "grad_norm": 7.0690717697143555, "kl": 2.267578125, "learning_rate": 5.666667328987013e-08, "loss": 0.2475, "reward": 0.565848246216774, "reward_std": 0.3072020635008812, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4787946566939354, "step": 2734 }, { "clip_ratio": 0.0, "completion_length": 1396.4620971679688, "epoch": 0.8169666193712195, "grad_norm": 2.997589349746704, "kl": 2.453125, "learning_rate": 5.658271272745648e-08, "loss": 0.166, "reward": 0.498883955180645, "reward_std": 0.27057982608675957, "rewards/accuracy_reward": 0.06250000093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4363839477300644, "step": 2735 }, { "clip_ratio": 0.0, "completion_length": 1281.40185546875, "epoch": 0.8172653274587409, "grad_norm": 10.544676780700684, "kl": 2.126953125, "learning_rate": 5.6498870112425386e-08, "loss": 0.164, "reward": 0.556919664144516, "reward_std": 0.318227618932724, "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018059372902, "step": 2736 }, { "clip_ratio": 0.0, "completion_length": 1378.85498046875, "epoch": 0.8175640355462624, "grad_norm": 5.855522155761719, "kl": 2.107421875, "learning_rate": 5.641514553598932e-08, "loss": 0.1665, "reward": 0.4877232387661934, "reward_std": 0.2576640695333481, "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018133878708, "step": 2737 }, { "clip_ratio": 0.0, "completion_length": 1422.1875610351562, "epoch": 0.8178627436337839, "grad_norm": 4.889698505401611, "kl": 2.515625, "learning_rate": 5.6331539089232466e-08, "loss": 0.2, "reward": 0.4726562798023224, "reward_std": 0.2709751985967159, "rewards/accuracy_reward": 0.029017859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443638414144516, "step": 2738 }, { "clip_ratio": 0.0, "completion_length": 1404.2679443359375, "epoch": 0.8181614517213054, "grad_norm": 2.811293125152588, "kl": 2.103515625, "learning_rate": 5.6248050863110405e-08, "loss": 0.1395, "reward": 0.5597098469734192, "reward_std": 0.2689204588532448, "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919887661934, "step": 2739 }, { "clip_ratio": 0.0, "completion_length": 1478.3036193847656, "epoch": 0.8184601598088268, "grad_norm": 3.637096643447876, "kl": 2.33984375, "learning_rate": 5.6164680948450185e-08, "loss": 0.1668, "reward": 0.521205373108387, "reward_std": 0.2637876868247986, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4520089402794838, "step": 2740 }, { "clip_ratio": 0.0, "completion_length": 1381.2701416015625, "epoch": 0.8187588678963483, "grad_norm": 2.299168348312378, "kl": 2.42578125, "learning_rate": 5.6081429435950096e-08, "loss": 0.203, "reward": 0.5189732313156128, "reward_std": 0.2683997377753258, "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4408482313156128, "step": 2741 }, { "clip_ratio": 0.0, "completion_length": 1385.9620971679688, "epoch": 0.8190575759838697, "grad_norm": 3.0321803092956543, "kl": 2.4140625, "learning_rate": 5.59982964161796e-08, "loss": 0.1875, "reward": 0.506138414144516, "reward_std": 0.29881956055760384, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 2742 }, { "clip_ratio": 0.0, "completion_length": 1401.0469360351562, "epoch": 0.8193562840713913, "grad_norm": 2.638434410095215, "kl": 2.3984375, "learning_rate": 5.591528197957927e-08, "loss": 0.1769, "reward": 0.5039062723517418, "reward_std": 0.25507163628935814, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4503348469734192, "step": 2743 }, { "clip_ratio": 0.0, "completion_length": 1331.2054138183594, "epoch": 0.8196549921589127, "grad_norm": 4.2876763343811035, "kl": 2.236328125, "learning_rate": 5.5832386216460736e-08, "loss": 0.148, "reward": 0.5379464402794838, "reward_std": 0.28747303783893585, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4754464477300644, "step": 2744 }, { "clip_ratio": 0.0, "completion_length": 1393.4375915527344, "epoch": 0.8199537002464342, "grad_norm": 2.7342209815979004, "kl": 2.572265625, "learning_rate": 5.574960921700639e-08, "loss": 0.1973, "reward": 0.5524553880095482, "reward_std": 0.30932923778891563, "rewards/accuracy_reward": 0.09151786239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375149011612, "step": 2745 }, { "clip_ratio": 0.0, "completion_length": 1367.8817443847656, "epoch": 0.8202524083339556, "grad_norm": 4.9842424392700195, "kl": 2.26171875, "learning_rate": 5.566695107126952e-08, "loss": 0.1821, "reward": 0.5915178805589676, "reward_std": 0.28297385573387146, "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.493303582072258, "step": 2746 }, { "clip_ratio": 0.0, "completion_length": 1422.7254943847656, "epoch": 0.8205511164214772, "grad_norm": 6.0929741859436035, "kl": 2.83984375, "learning_rate": 5.558441186917413e-08, "loss": 0.2246, "reward": 0.4743303805589676, "reward_std": 0.2652265578508377, "rewards/accuracy_reward": 0.04910714388824999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4252232387661934, "step": 2747 }, { "clip_ratio": 0.0, "completion_length": 1370.7478332519531, "epoch": 0.8208498245089986, "grad_norm": 4.071722030639648, "kl": 2.353515625, "learning_rate": 5.550199170051472e-08, "loss": 0.1977, "reward": 0.5150669813156128, "reward_std": 0.3028615266084671, "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4815848469734192, "step": 2748 }, { "clip_ratio": 0.0, "completion_length": 1398.3169860839844, "epoch": 0.8211485325965201, "grad_norm": 2.590019702911377, "kl": 2.2890625, "learning_rate": 5.541969065495638e-08, "loss": 0.1554, "reward": 0.4910714626312256, "reward_std": 0.27137700468301773, "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535895228386, "step": 2749 }, { "clip_ratio": 0.0, "completion_length": 1261.8973693847656, "epoch": 0.8214472406840415, "grad_norm": 2.815826892852783, "kl": 2.515625, "learning_rate": 5.533750882203462e-08, "loss": 0.1987, "reward": 0.5284598469734192, "reward_std": 0.28247538954019547, "rewards/accuracy_reward": 0.042410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491305589676, "step": 2750 }, { "clip_ratio": 0.0, "completion_length": 1338.2545166015625, "epoch": 0.821745948771563, "grad_norm": 4.100876808166504, "kl": 2.59375, "learning_rate": 5.525544629115514e-08, "loss": 0.1749, "reward": 0.5161830484867096, "reward_std": 0.27134324982762337, "rewards/accuracy_reward": 0.06250000395812094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4536830559372902, "step": 2751 }, { "clip_ratio": 0.0, "completion_length": 1422.2388916015625, "epoch": 0.8220446568590845, "grad_norm": 9.336560249328613, "kl": 2.9375, "learning_rate": 5.517350315159401e-08, "loss": 0.2103, "reward": 0.482700914144516, "reward_std": 0.2858666032552719, "rewards/accuracy_reward": 0.03348214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187723517418, "step": 2752 }, { "clip_ratio": 0.0, "completion_length": 1362.1853637695312, "epoch": 0.822343364946606, "grad_norm": 3.880415201187134, "kl": 2.48046875, "learning_rate": 5.509167949249727e-08, "loss": 0.2233, "reward": 0.505022332072258, "reward_std": 0.24920187518000603, "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462611623108387, "step": 2753 }, { "clip_ratio": 0.0, "completion_length": 1340.6563110351562, "epoch": 0.8226420730341274, "grad_norm": 3.540314197540283, "kl": 2.49609375, "learning_rate": 5.500997540288106e-08, "loss": 0.2114, "reward": 0.5145089477300644, "reward_std": 0.31031583994627, "rewards/accuracy_reward": 0.042410717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982387661934, "step": 2754 }, { "clip_ratio": 0.0, "completion_length": 1334.16748046875, "epoch": 0.8229407811216489, "grad_norm": 7.6485419273376465, "kl": 2.37109375, "learning_rate": 5.4928390971631466e-08, "loss": 0.2191, "reward": 0.588169664144516, "reward_std": 0.2910461239516735, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 2755 }, { "clip_ratio": 0.0, "completion_length": 1419.49560546875, "epoch": 0.8232394892091703, "grad_norm": 3.286504030227661, "kl": 2.859375, "learning_rate": 5.4846926287504275e-08, "loss": 0.2231, "reward": 0.5424107313156128, "reward_std": 0.36325570940971375, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4754464626312256, "step": 2756 }, { "clip_ratio": 0.0, "completion_length": 1354.6562805175781, "epoch": 0.8235381972966918, "grad_norm": 6.844689846038818, "kl": 2.875, "learning_rate": 5.4765581439125125e-08, "loss": 0.2097, "reward": 0.5106027126312256, "reward_std": 0.283752653747797, "rewards/accuracy_reward": 0.0669642889406532, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443638414144516, "step": 2757 }, { "clip_ratio": 0.0, "completion_length": 1351.99560546875, "epoch": 0.8238369053842133, "grad_norm": 2.799654960632324, "kl": 2.37109375, "learning_rate": 5.468435651498928e-08, "loss": 0.1814, "reward": 0.5145089477300644, "reward_std": 0.28400325402617455, "rewards/accuracy_reward": 0.029017857974395156, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.485491082072258, "step": 2758 }, { "clip_ratio": 0.0, "completion_length": 1372.5759887695312, "epoch": 0.8241356134717347, "grad_norm": 4.450092792510986, "kl": 2.4296875, "learning_rate": 5.460325160346142e-08, "loss": 0.1356, "reward": 0.545200914144516, "reward_std": 0.2943231984972954, "rewards/accuracy_reward": 0.08258929057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116305589676, "step": 2759 }, { "clip_ratio": 0.0, "completion_length": 1414.9888610839844, "epoch": 0.8244343215592562, "grad_norm": 4.169230937957764, "kl": 2.75, "learning_rate": 5.452226679277576e-08, "loss": 0.1844, "reward": 0.6099330708384514, "reward_std": 0.29833365604281425, "rewards/accuracy_reward": 0.1495535767171532, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794887661934, "step": 2760 }, { "clip_ratio": 0.0, "completion_length": 1431.94873046875, "epoch": 0.8247330296467776, "grad_norm": 3.103463649749756, "kl": 2.408203125, "learning_rate": 5.4441402171035884e-08, "loss": 0.1547, "reward": 0.5351562574505806, "reward_std": 0.30173874646425247, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562649011612, "step": 2761 }, { "clip_ratio": 0.0, "completion_length": 1392.8147583007812, "epoch": 0.8250317377342992, "grad_norm": 5.35961389541626, "kl": 2.42578125, "learning_rate": 5.4360657826214505e-08, "loss": 0.2005, "reward": 0.5195312649011612, "reward_std": 0.28935767710208893, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443638414144516, "step": 2762 }, { "clip_ratio": 0.0, "completion_length": 1399.4576721191406, "epoch": 0.8253304458218206, "grad_norm": 6.131446838378906, "kl": 2.1015625, "learning_rate": 5.428003384615362e-08, "loss": 0.1451, "reward": 0.5407366305589676, "reward_std": 0.31973376125097275, "rewards/accuracy_reward": 0.040178572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580633878708, "step": 2763 }, { "clip_ratio": 0.0, "completion_length": 1414.2277526855469, "epoch": 0.8256291539093421, "grad_norm": 2.8199949264526367, "kl": 2.70703125, "learning_rate": 5.4199530318564166e-08, "loss": 0.1923, "reward": 0.5535714626312256, "reward_std": 0.29176216572523117, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4419642984867096, "step": 2764 }, { "clip_ratio": 0.0, "completion_length": 1322.0781860351562, "epoch": 0.8259278619968635, "grad_norm": 2.8062357902526855, "kl": 2.91796875, "learning_rate": 5.411914733102611e-08, "loss": 0.2245, "reward": 0.518973246216774, "reward_std": 0.2592063434422016, "rewards/accuracy_reward": 0.06473214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4542410895228386, "step": 2765 }, { "clip_ratio": 0.0, "completion_length": 1287.3393249511719, "epoch": 0.826226570084385, "grad_norm": 3.903737783432007, "kl": 2.16796875, "learning_rate": 5.4038884970988286e-08, "loss": 0.2184, "reward": 0.5440848618745804, "reward_std": 0.30539288371801376, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598469734192, "step": 2766 }, { "clip_ratio": 0.0, "completion_length": 1423.7188415527344, "epoch": 0.8265252781719065, "grad_norm": 4.51344108581543, "kl": 2.62109375, "learning_rate": 5.395874332576823e-08, "loss": 0.1243, "reward": 0.4681919887661934, "reward_std": 0.2899039499461651, "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4391741305589676, "step": 2767 }, { "clip_ratio": 0.0, "completion_length": 1455.6764221191406, "epoch": 0.826823986259428, "grad_norm": 3.4935250282287598, "kl": 2.625, "learning_rate": 5.387872248255223e-08, "loss": 0.1231, "reward": 0.4732143059372902, "reward_std": 0.26298218965530396, "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250223517418, "step": 2768 }, { "clip_ratio": 0.0, "completion_length": 1349.3259582519531, "epoch": 0.8271226943469494, "grad_norm": 3.111039638519287, "kl": 2.37109375, "learning_rate": 5.379882252839514e-08, "loss": 0.1992, "reward": 0.6434152126312256, "reward_std": 0.2800692394375801, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467075914144516, "step": 2769 }, { "clip_ratio": 0.0, "completion_length": 1315.5625610351562, "epoch": 0.8274214024344709, "grad_norm": 5.995444297790527, "kl": 2.67578125, "learning_rate": 5.371904355022022e-08, "loss": 0.1747, "reward": 0.5552455633878708, "reward_std": 0.25505512952804565, "rewards/accuracy_reward": 0.08928571990691125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598469734192, "step": 2770 }, { "clip_ratio": 0.0, "completion_length": 1390.2098693847656, "epoch": 0.8277201105219923, "grad_norm": 5.303922653198242, "kl": 2.28515625, "learning_rate": 5.363938563481921e-08, "loss": 0.1532, "reward": 0.5708705633878708, "reward_std": 0.2939527817070484, "rewards/accuracy_reward": 0.08258929057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.488281287252903, "step": 2771 }, { "clip_ratio": 0.0, "completion_length": 1368.2188110351562, "epoch": 0.8280188186095139, "grad_norm": 6.614811897277832, "kl": 2.4140625, "learning_rate": 5.355984886885215e-08, "loss": 0.1954, "reward": 0.5641741305589676, "reward_std": 0.3089471384882927, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098395228386, "step": 2772 }, { "clip_ratio": 0.0, "completion_length": 1411.3460388183594, "epoch": 0.8283175266970353, "grad_norm": 6.557124614715576, "kl": 2.009765625, "learning_rate": 5.3480433338847175e-08, "loss": 0.1526, "reward": 0.651785746216774, "reward_std": 0.338114395737648, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607313156128, "step": 2773 }, { "clip_ratio": 0.0, "completion_length": 1423.1853332519531, "epoch": 0.8286162347845568, "grad_norm": 2.157545566558838, "kl": 2.7421875, "learning_rate": 5.3401139131200636e-08, "loss": 0.1889, "reward": 0.5329241380095482, "reward_std": 0.28696248680353165, "rewards/accuracy_reward": 0.0736607180442661, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4592634066939354, "step": 2774 }, { "clip_ratio": 0.0, "completion_length": 1395.3103332519531, "epoch": 0.8289149428720782, "grad_norm": 4.592647552490234, "kl": 2.66015625, "learning_rate": 5.33219663321769e-08, "loss": 0.1687, "reward": 0.530691996216774, "reward_std": 0.3284444622695446, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705484867096, "step": 2775 }, { "clip_ratio": 0.0, "completion_length": 1315.8014221191406, "epoch": 0.8292136509595998, "grad_norm": 3.645077705383301, "kl": 2.380859375, "learning_rate": 5.3242915027908126e-08, "loss": 0.1751, "reward": 0.6434152126312256, "reward_std": 0.2822473347187042, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4916294813156128, "step": 2776 }, { "clip_ratio": 0.0, "completion_length": 1418.7232971191406, "epoch": 0.8295123590471212, "grad_norm": 3.593034505844116, "kl": 2.67578125, "learning_rate": 5.316398530439448e-08, "loss": 0.1967, "reward": 0.4748884215950966, "reward_std": 0.2694871909916401, "rewards/accuracy_reward": 0.04464285867288709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4302455559372902, "step": 2777 }, { "clip_ratio": 0.0, "completion_length": 1337.0245971679688, "epoch": 0.8298110671346427, "grad_norm": 4.494168281555176, "kl": 2.380859375, "learning_rate": 5.308517724750367e-08, "loss": 0.2008, "reward": 0.608816996216774, "reward_std": 0.30558140203356743, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4481026902794838, "step": 2778 }, { "clip_ratio": 0.0, "completion_length": 1435.0781860351562, "epoch": 0.8301097752221641, "grad_norm": 3.2158350944519043, "kl": 2.453125, "learning_rate": 5.300649094297118e-08, "loss": 0.1806, "reward": 0.494419664144516, "reward_std": 0.30528538301587105, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4430803805589676, "step": 2779 }, { "clip_ratio": 0.0, "completion_length": 1398.0826721191406, "epoch": 0.8304084833096856, "grad_norm": 5.923099040985107, "kl": 2.68359375, "learning_rate": 5.292792647640004e-08, "loss": 0.1802, "reward": 0.4743303880095482, "reward_std": 0.2728661969304085, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463169664144516, "step": 2780 }, { "clip_ratio": 0.0, "completion_length": 1378.6451416015625, "epoch": 0.8307071913972071, "grad_norm": 4.794301986694336, "kl": 2.2109375, "learning_rate": 5.284948393326062e-08, "loss": 0.1685, "reward": 0.5569196715950966, "reward_std": 0.30198775976896286, "rewards/accuracy_reward": 0.08258929150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 2781 }, { "clip_ratio": 0.0, "completion_length": 1436.6250305175781, "epoch": 0.8310058994847286, "grad_norm": 6.441556453704834, "kl": 2.40625, "learning_rate": 5.2771163398890737e-08, "loss": 0.1722, "reward": 0.5496652126312256, "reward_std": 0.2855289541184902, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044813156128, "step": 2782 }, { "clip_ratio": 0.0, "completion_length": 1384.65185546875, "epoch": 0.83130460757225, "grad_norm": 2.3932745456695557, "kl": 2.6953125, "learning_rate": 5.269296495849548e-08, "loss": 0.1848, "reward": 0.5446428805589676, "reward_std": 0.27013618499040604, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4419643059372902, "step": 2783 }, { "clip_ratio": 0.0, "completion_length": 1380.1183776855469, "epoch": 0.8316033156597715, "grad_norm": 2.4570815563201904, "kl": 2.796875, "learning_rate": 5.261488869714706e-08, "loss": 0.2294, "reward": 0.4464285895228386, "reward_std": 0.2403767928481102, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000223517418, "step": 2784 }, { "clip_ratio": 0.0, "completion_length": 1337.7790832519531, "epoch": 0.8319020237472929, "grad_norm": 2.506418466567993, "kl": 2.5234375, "learning_rate": 5.253693469978487e-08, "loss": 0.1963, "reward": 0.5998884290456772, "reward_std": 0.3127107433974743, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483816996216774, "step": 2785 }, { "clip_ratio": 0.0, "completion_length": 1371.763427734375, "epoch": 0.8322007318348145, "grad_norm": 3.042508602142334, "kl": 2.375, "learning_rate": 5.245910305121512e-08, "loss": 0.1832, "reward": 0.5747768133878708, "reward_std": 0.296519473195076, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854911044239998, "step": 2786 }, { "clip_ratio": 0.0, "completion_length": 1462.44873046875, "epoch": 0.8324994399223359, "grad_norm": 3.8713390827178955, "kl": 2.50390625, "learning_rate": 5.238139383611104e-08, "loss": 0.1381, "reward": 0.5267857313156128, "reward_std": 0.29304686933755875, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 2787 }, { "clip_ratio": 0.0, "completion_length": 1379.6585693359375, "epoch": 0.8327981480098574, "grad_norm": 9.736207008361816, "kl": 2.54296875, "learning_rate": 5.2303807139012696e-08, "loss": 0.202, "reward": 0.5295759364962578, "reward_std": 0.27497106045484543, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.464843787252903, "step": 2788 }, { "clip_ratio": 0.0, "completion_length": 1438.5625610351562, "epoch": 0.8330968560973788, "grad_norm": 3.8638875484466553, "kl": 2.8671875, "learning_rate": 5.2226343044326744e-08, "loss": 0.1788, "reward": 0.502232164144516, "reward_std": 0.3152356594800949, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4508928805589676, "step": 2789 }, { "clip_ratio": 0.0, "completion_length": 1350.3125610351562, "epoch": 0.8333955641849004, "grad_norm": 2.396867036819458, "kl": 2.37109375, "learning_rate": 5.214900163632657e-08, "loss": 0.1873, "reward": 0.6010044887661934, "reward_std": 0.31079768389463425, "rewards/accuracy_reward": 0.12053572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 2790 }, { "clip_ratio": 0.0, "completion_length": 1411.2835693359375, "epoch": 0.8336942722724218, "grad_norm": 4.708254814147949, "kl": 2.58203125, "learning_rate": 5.207178299915203e-08, "loss": 0.1937, "reward": 0.5602678805589676, "reward_std": 0.2595757953822613, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4642857387661934, "step": 2791 }, { "clip_ratio": 0.0, "completion_length": 1496.4554443359375, "epoch": 0.8339929803599433, "grad_norm": 5.148940563201904, "kl": 2.5390625, "learning_rate": 5.199468721680942e-08, "loss": 0.1543, "reward": 0.4966517984867096, "reward_std": 0.2639166824519634, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4363839477300644, "step": 2792 }, { "clip_ratio": 0.0, "completion_length": 1413.3973999023438, "epoch": 0.8342916884474647, "grad_norm": 2.6693954467773438, "kl": 2.48046875, "learning_rate": 5.1917714373171406e-08, "loss": 0.1238, "reward": 0.517857164144516, "reward_std": 0.2699213922023773, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4553571715950966, "step": 2793 }, { "clip_ratio": 0.0, "completion_length": 1407.5045471191406, "epoch": 0.8345903965349862, "grad_norm": 4.84650993347168, "kl": 2.94921875, "learning_rate": 5.184086455197692e-08, "loss": 0.2213, "reward": 0.5541294738650322, "reward_std": 0.28608500212430954, "rewards/accuracy_reward": 0.09821429033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4559151977300644, "step": 2794 }, { "clip_ratio": 0.0, "completion_length": 1401.5581359863281, "epoch": 0.8348891046225076, "grad_norm": 5.148187637329102, "kl": 2.40625, "learning_rate": 5.176413783683099e-08, "loss": 0.1537, "reward": 0.5306919887661934, "reward_std": 0.2805827036499977, "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276902794838, "step": 2795 }, { "clip_ratio": 0.0, "completion_length": 1372.7500915527344, "epoch": 0.8351878127100292, "grad_norm": 4.057096004486084, "kl": 2.34765625, "learning_rate": 5.1687534311204825e-08, "loss": 0.1287, "reward": 0.5078125223517418, "reward_std": 0.27174222469329834, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4408482313156128, "step": 2796 }, { "clip_ratio": 0.0, "completion_length": 1350.2857666015625, "epoch": 0.8354865207975506, "grad_norm": 2.704801321029663, "kl": 2.068359375, "learning_rate": 5.161105405843547e-08, "loss": 0.157, "reward": 0.5083705633878708, "reward_std": 0.31423135846853256, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312723517418, "step": 2797 }, { "clip_ratio": 0.0, "completion_length": 1330.263427734375, "epoch": 0.8357852288850721, "grad_norm": 8.09300422668457, "kl": 1.861328125, "learning_rate": 5.153469716172599e-08, "loss": 0.145, "reward": 0.5491071715950966, "reward_std": 0.29655369743704796, "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 2798 }, { "clip_ratio": 0.0, "completion_length": 1396.8348388671875, "epoch": 0.8360839369725935, "grad_norm": 10.998167037963867, "kl": 2.015625, "learning_rate": 5.1458463704145234e-08, "loss": 0.1842, "reward": 0.6579241454601288, "reward_std": 0.3020564317703247, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598544239998, "step": 2799 }, { "clip_ratio": 0.0, "completion_length": 1428.384033203125, "epoch": 0.836382645060115, "grad_norm": 6.622757434844971, "kl": 2.017578125, "learning_rate": 5.138235376862768e-08, "loss": 0.1821, "reward": 0.502790205180645, "reward_std": 0.303842268884182, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187649011612, "step": 2800 }, { "clip_ratio": 0.0, "completion_length": 1404.8549499511719, "epoch": 0.8366813531476365, "grad_norm": 2.693490743637085, "kl": 2.37109375, "learning_rate": 5.130636743797349e-08, "loss": 0.1847, "reward": 0.6964286118745804, "reward_std": 0.28897156566381454, "rewards/accuracy_reward": 0.247767873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607313156128, "step": 2801 }, { "clip_ratio": 0.0, "completion_length": 1380.1384582519531, "epoch": 0.8369800612351579, "grad_norm": 6.8874993324279785, "kl": 2.41015625, "learning_rate": 5.123050479484839e-08, "loss": 0.1608, "reward": 0.589285746216774, "reward_std": 0.27994633466005325, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.450892873108387, "step": 2802 }, { "clip_ratio": 0.0, "completion_length": 1362.4353332519531, "epoch": 0.8372787693226794, "grad_norm": 11.19797134399414, "kl": 2.017578125, "learning_rate": 5.115476592178343e-08, "loss": 0.1512, "reward": 0.5937500223517418, "reward_std": 0.2986426427960396, "rewards/accuracy_reward": 0.10044642956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933036044239998, "step": 2803 }, { "clip_ratio": 0.0, "completion_length": 1422.66748046875, "epoch": 0.8375774774102008, "grad_norm": 2.8936092853546143, "kl": 2.2734375, "learning_rate": 5.107915090117512e-08, "loss": 0.1493, "reward": 0.490513414144516, "reward_std": 0.28174422308802605, "rewards/accuracy_reward": 0.04464285750873387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705559372902, "step": 2804 }, { "clip_ratio": 0.0, "completion_length": 1425.6965026855469, "epoch": 0.8378761854977224, "grad_norm": 4.602198600769043, "kl": 2.08984375, "learning_rate": 5.1003659815285205e-08, "loss": 0.1339, "reward": 0.5881696715950966, "reward_std": 0.2985876686871052, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982387661934, "step": 2805 }, { "clip_ratio": 0.0, "completion_length": 1455.3572082519531, "epoch": 0.8381748935852438, "grad_norm": 2.4890096187591553, "kl": 2.59765625, "learning_rate": 5.092829274624055e-08, "loss": 0.1881, "reward": 0.565290205180645, "reward_std": 0.3328619450330734, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4559151977300644, "step": 2806 }, { "clip_ratio": 0.0, "completion_length": 1359.5045166015625, "epoch": 0.8384736016727653, "grad_norm": 2.0408947467803955, "kl": 2.65234375, "learning_rate": 5.0853049776033155e-08, "loss": 0.2282, "reward": 0.553571455180645, "reward_std": 0.2735126316547394, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 2807 }, { "clip_ratio": 0.0, "completion_length": 1277.8415832519531, "epoch": 0.8387723097602867, "grad_norm": 3.3162906169891357, "kl": 2.166015625, "learning_rate": 5.077793098652002e-08, "loss": 0.147, "reward": 0.5329241305589676, "reward_std": 0.2865087389945984, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491305589676, "step": 2808 }, { "clip_ratio": 0.0, "completion_length": 1350.9888916015625, "epoch": 0.8390710178478082, "grad_norm": 5.663393974304199, "kl": 2.375, "learning_rate": 5.0702936459422995e-08, "loss": 0.1886, "reward": 0.566964328289032, "reward_std": 0.2900156192481518, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785969734192, "step": 2809 }, { "clip_ratio": 0.0, "completion_length": 1386.2410888671875, "epoch": 0.8393697259353297, "grad_norm": 3.7770283222198486, "kl": 2.328125, "learning_rate": 5.062806627632881e-08, "loss": 0.1819, "reward": 0.6132812798023224, "reward_std": 0.3049631491303444, "rewards/accuracy_reward": 0.15625000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312649011612, "step": 2810 }, { "clip_ratio": 0.0, "completion_length": 1348.6920166015625, "epoch": 0.8396684340228512, "grad_norm": 3.617988348007202, "kl": 2.248046875, "learning_rate": 5.0553320518688817e-08, "loss": 0.1591, "reward": 0.5479910969734192, "reward_std": 0.33868086338043213, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4877232387661934, "step": 2811 }, { "clip_ratio": 0.0, "completion_length": 1313.6272888183594, "epoch": 0.8399671421103726, "grad_norm": 3.2508668899536133, "kl": 2.37890625, "learning_rate": 5.047869926781914e-08, "loss": 0.1687, "reward": 0.5809152126312256, "reward_std": 0.2510317675769329, "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116305589676, "step": 2812 }, { "clip_ratio": 0.0, "completion_length": 1417.1875915527344, "epoch": 0.8402658501978941, "grad_norm": 5.983640193939209, "kl": 2.38671875, "learning_rate": 5.040420260490039e-08, "loss": 0.1738, "reward": 0.502790205180645, "reward_std": 0.25041284784674644, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451450914144516, "step": 2813 }, { "clip_ratio": 0.0, "completion_length": 1371.9263610839844, "epoch": 0.8405645582854155, "grad_norm": 6.401571750640869, "kl": 2.765625, "learning_rate": 5.032983061097758e-08, "loss": 0.1659, "reward": 0.553013414144516, "reward_std": 0.28092309460043907, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884215950966, "step": 2814 }, { "clip_ratio": 0.0, "completion_length": 1443.1094360351562, "epoch": 0.8408632663729371, "grad_norm": 9.357488632202148, "kl": 2.984375, "learning_rate": 5.025558336696018e-08, "loss": 0.2125, "reward": 0.4631696566939354, "reward_std": 0.24247664585709572, "rewards/accuracy_reward": 0.01785714295692742, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125149011612, "step": 2815 }, { "clip_ratio": 0.0, "completion_length": 1374.950927734375, "epoch": 0.8411619744604585, "grad_norm": 5.009710311889648, "kl": 2.8828125, "learning_rate": 5.018146095362194e-08, "loss": 0.2031, "reward": 0.620535746216774, "reward_std": 0.29668501019477844, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500223517418, "step": 2816 }, { "clip_ratio": 0.0, "completion_length": 1433.9486999511719, "epoch": 0.84146068254798, "grad_norm": 4.314777851104736, "kl": 2.689453125, "learning_rate": 5.0107463451600717e-08, "loss": 0.2167, "reward": 0.595982164144516, "reward_std": 0.2876514531672001, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4598214477300644, "step": 2817 }, { "clip_ratio": 0.0, "completion_length": 1317.4420166015625, "epoch": 0.8417593906355014, "grad_norm": 3.487900495529175, "kl": 2.560546875, "learning_rate": 5.00335909413986e-08, "loss": 0.1882, "reward": 0.4832589402794838, "reward_std": 0.25559449940919876, "rewards/accuracy_reward": 0.01785714295692742, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654017984867096, "step": 2818 }, { "clip_ratio": 0.0, "completion_length": 1389.5781860351562, "epoch": 0.842058098723023, "grad_norm": 4.5836968421936035, "kl": 2.33203125, "learning_rate": 4.995984350338164e-08, "loss": 0.1767, "reward": 0.5485491380095482, "reward_std": 0.28340672329068184, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4815848395228386, "step": 2819 }, { "clip_ratio": 0.0, "completion_length": 1375.6786193847656, "epoch": 0.8423568068105444, "grad_norm": 4.673241138458252, "kl": 2.43359375, "learning_rate": 4.988622121777978e-08, "loss": 0.1905, "reward": 0.5239955484867096, "reward_std": 0.28886622190475464, "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4905134215950966, "step": 2820 }, { "clip_ratio": 0.0, "completion_length": 1452.4353332519531, "epoch": 0.8426555148980659, "grad_norm": 5.749076843261719, "kl": 2.60546875, "learning_rate": 4.9812724164686864e-08, "loss": 0.1423, "reward": 0.5636161118745804, "reward_std": 0.27256588265299797, "rewards/accuracy_reward": 0.09598214528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 2821 }, { "clip_ratio": 0.0, "completion_length": 1360.9263916015625, "epoch": 0.8429542229855873, "grad_norm": 5.485799312591553, "kl": 2.6015625, "learning_rate": 4.9739352424060526e-08, "loss": 0.1836, "reward": 0.548549123108387, "reward_std": 0.3236684910953045, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4547991305589676, "step": 2822 }, { "clip_ratio": 0.0, "completion_length": 1329.4152221679688, "epoch": 0.8432529310731088, "grad_norm": 2.326915979385376, "kl": 2.62890625, "learning_rate": 4.966610607572195e-08, "loss": 0.2244, "reward": 0.6060268133878708, "reward_std": 0.29894138872623444, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875223517418, "step": 2823 }, { "clip_ratio": 0.0, "completion_length": 1371.8929443359375, "epoch": 0.8435516391606303, "grad_norm": 5.336257457733154, "kl": 2.859375, "learning_rate": 4.959298519935608e-08, "loss": 0.2614, "reward": 0.5424107313156128, "reward_std": 0.2723047398030758, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607313156128, "step": 2824 }, { "clip_ratio": 0.0, "completion_length": 1356.5179138183594, "epoch": 0.8438503472481518, "grad_norm": 4.2870330810546875, "kl": 2.5234375, "learning_rate": 4.951998987451116e-08, "loss": 0.186, "reward": 0.581473246216774, "reward_std": 0.2708768658339977, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4430803805589676, "step": 2825 }, { "clip_ratio": 0.0, "completion_length": 1399.7545166015625, "epoch": 0.8441490553356732, "grad_norm": 4.060235023498535, "kl": 2.58203125, "learning_rate": 4.9447120180599e-08, "loss": 0.2095, "reward": 0.5379464477300644, "reward_std": 0.27569518238306046, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4642857387661934, "step": 2826 }, { "clip_ratio": 0.0, "completion_length": 1389.4129943847656, "epoch": 0.8444477634231947, "grad_norm": 2.042632818222046, "kl": 2.451171875, "learning_rate": 4.9374376196894724e-08, "loss": 0.1768, "reward": 0.5864955633878708, "reward_std": 0.38354455679655075, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312798023224, "step": 2827 }, { "clip_ratio": 0.0, "completion_length": 1342.7589721679688, "epoch": 0.8447464715107161, "grad_norm": 2.8938286304473877, "kl": 2.341796875, "learning_rate": 4.930175800253658e-08, "loss": 0.185, "reward": 0.5256696715950966, "reward_std": 0.265509519726038, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053805589676, "step": 2828 }, { "clip_ratio": 0.0, "completion_length": 1417.43310546875, "epoch": 0.8450451795982377, "grad_norm": 12.652461051940918, "kl": 2.33203125, "learning_rate": 4.9229265676526094e-08, "loss": 0.1612, "reward": 0.590959832072258, "reward_std": 0.2649711109697819, "rewards/accuracy_reward": 0.09375000349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4972098395228386, "step": 2829 }, { "clip_ratio": 0.0, "completion_length": 1400.3236999511719, "epoch": 0.8453438876857591, "grad_norm": 3.270181179046631, "kl": 2.41796875, "learning_rate": 4.9156899297727805e-08, "loss": 0.1879, "reward": 0.5747768133878708, "reward_std": 0.2983627915382385, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696715950966, "step": 2830 }, { "clip_ratio": 0.0, "completion_length": 1426.8482666015625, "epoch": 0.8456425957732806, "grad_norm": 4.62266731262207, "kl": 2.17578125, "learning_rate": 4.908465894486923e-08, "loss": 0.0934, "reward": 0.5189732313156128, "reward_std": 0.24897146970033646, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 2831 }, { "clip_ratio": 0.0, "completion_length": 1468.82373046875, "epoch": 0.845941303860802, "grad_norm": 2.9305553436279297, "kl": 2.462890625, "learning_rate": 4.90125446965408e-08, "loss": 0.1908, "reward": 0.5708705633878708, "reward_std": 0.2903853505849838, "rewards/accuracy_reward": 0.12053571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4503348395228386, "step": 2832 }, { "clip_ratio": 0.0, "completion_length": 1407.1138610839844, "epoch": 0.8462400119483235, "grad_norm": 3.6265411376953125, "kl": 2.53515625, "learning_rate": 4.894055663119576e-08, "loss": 0.1688, "reward": 0.5926339626312256, "reward_std": 0.31638549268245697, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732387661934, "step": 2833 }, { "clip_ratio": 0.0, "completion_length": 1398.83935546875, "epoch": 0.846538720035845, "grad_norm": 4.549892902374268, "kl": 2.328125, "learning_rate": 4.886869482715005e-08, "loss": 0.1629, "reward": 0.5061384290456772, "reward_std": 0.2775503471493721, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241305589676, "step": 2834 }, { "clip_ratio": 0.0, "completion_length": 1492.6496276855469, "epoch": 0.8468374281233665, "grad_norm": 7.062963962554932, "kl": 2.37109375, "learning_rate": 4.8796959362582254e-08, "loss": 0.1554, "reward": 0.5770089626312256, "reward_std": 0.2716165818274021, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.476562537252903, "step": 2835 }, { "clip_ratio": 0.0, "completion_length": 1356.0402526855469, "epoch": 0.8471361362108879, "grad_norm": 6.037487030029297, "kl": 2.37890625, "learning_rate": 4.872535031553359e-08, "loss": 0.1916, "reward": 0.4698660969734192, "reward_std": 0.26499462127685547, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.452008955180645, "step": 2836 }, { "clip_ratio": 0.0, "completion_length": 1366.9040832519531, "epoch": 0.8474348442984094, "grad_norm": 3.1819097995758057, "kl": 2.51953125, "learning_rate": 4.865386776390761e-08, "loss": 0.174, "reward": 0.5546875223517418, "reward_std": 0.29122116044163704, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4787946715950966, "step": 2837 }, { "clip_ratio": 0.0, "completion_length": 1434.1295471191406, "epoch": 0.8477335523859308, "grad_norm": 3.874659299850464, "kl": 2.59375, "learning_rate": 4.858251178547039e-08, "loss": 0.1806, "reward": 0.612165205180645, "reward_std": 0.3185655400156975, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467075914144516, "step": 2838 }, { "clip_ratio": 0.0, "completion_length": 1377.8505554199219, "epoch": 0.8480322604734524, "grad_norm": 5.599626541137695, "kl": 2.3203125, "learning_rate": 4.851128245785017e-08, "loss": 0.2019, "reward": 0.5792411118745804, "reward_std": 0.3226524218916893, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625149011612, "step": 2839 }, { "clip_ratio": 0.0, "completion_length": 1413.1205749511719, "epoch": 0.8483309685609738, "grad_norm": 3.8569858074188232, "kl": 2.548828125, "learning_rate": 4.844017985853751e-08, "loss": 0.1922, "reward": 0.5161830633878708, "reward_std": 0.25410476699471474, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794887661934, "step": 2840 }, { "clip_ratio": 0.0, "completion_length": 1392.7054443359375, "epoch": 0.8486296766484953, "grad_norm": 5.358004093170166, "kl": 2.484375, "learning_rate": 4.8369204064885126e-08, "loss": 0.177, "reward": 0.6216518133878708, "reward_std": 0.2724447250366211, "rewards/accuracy_reward": 0.14062500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810267984867096, "step": 2841 }, { "clip_ratio": 0.0, "completion_length": 1431.8304138183594, "epoch": 0.8489283847360167, "grad_norm": 4.785943984985352, "kl": 2.55078125, "learning_rate": 4.8298355154107623e-08, "loss": 0.193, "reward": 0.6065848618745804, "reward_std": 0.28570692241191864, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4905134066939354, "step": 2842 }, { "clip_ratio": 0.0, "completion_length": 1436.8192749023438, "epoch": 0.8492270928235381, "grad_norm": 3.5231950283050537, "kl": 2.859375, "learning_rate": 4.822763320328174e-08, "loss": 0.1877, "reward": 0.5859375298023224, "reward_std": 0.31566282361745834, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375149011612, "step": 2843 }, { "clip_ratio": 0.0, "completion_length": 1365.4063110351562, "epoch": 0.8495258009110597, "grad_norm": 3.30615234375, "kl": 2.5625, "learning_rate": 4.815703828934602e-08, "loss": 0.226, "reward": 0.5803571790456772, "reward_std": 0.31315377354621887, "rewards/accuracy_reward": 0.10937500582076609, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4709821566939354, "step": 2844 }, { "clip_ratio": 0.0, "completion_length": 1387.3527221679688, "epoch": 0.8498245089985811, "grad_norm": 2.8413755893707275, "kl": 2.44140625, "learning_rate": 4.808657048910077e-08, "loss": 0.1629, "reward": 0.4933035969734192, "reward_std": 0.28593025356531143, "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535969734192, "step": 2845 }, { "clip_ratio": 0.0, "completion_length": 1469.4598999023438, "epoch": 0.8501232170861026, "grad_norm": 3.19149112701416, "kl": 2.6875, "learning_rate": 4.8016229879208104e-08, "loss": 0.1882, "reward": 0.494977705180645, "reward_std": 0.2971639446914196, "rewards/accuracy_reward": 0.024553572991862893, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470424123108387, "step": 2846 }, { "clip_ratio": 0.0, "completion_length": 1378.0848999023438, "epoch": 0.850421925173624, "grad_norm": 7.353912353515625, "kl": 2.5859375, "learning_rate": 4.7946016536191704e-08, "loss": 0.2029, "reward": 0.5206473469734192, "reward_std": 0.32165826857089996, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044813156128, "step": 2847 }, { "clip_ratio": 0.0, "completion_length": 1404.0380249023438, "epoch": 0.8507206332611456, "grad_norm": 7.891163349151611, "kl": 2.21875, "learning_rate": 4.7875930536436754e-08, "loss": 0.1678, "reward": 0.6183035969734192, "reward_std": 0.29450076818466187, "rewards/accuracy_reward": 0.13169643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486607164144516, "step": 2848 }, { "clip_ratio": 0.0, "completion_length": 1364.372802734375, "epoch": 0.851019341348667, "grad_norm": 4.3154191970825195, "kl": 2.3046875, "learning_rate": 4.780597195619002e-08, "loss": 0.1968, "reward": 0.5602678880095482, "reward_std": 0.2951088696718216, "rewards/accuracy_reward": 0.10044643119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459821455180645, "step": 2849 }, { "clip_ratio": 0.0, "completion_length": 1389.9040832519531, "epoch": 0.8513180494361885, "grad_norm": 5.685265064239502, "kl": 2.72265625, "learning_rate": 4.773614087155952e-08, "loss": 0.1835, "reward": 0.581473246216774, "reward_std": 0.3294804319739342, "rewards/accuracy_reward": 0.13392857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447544664144516, "step": 2850 }, { "clip_ratio": 0.0, "completion_length": 1408.47998046875, "epoch": 0.8516167575237099, "grad_norm": 16.37853240966797, "kl": 3.203125, "learning_rate": 4.7666437358514664e-08, "loss": 0.2477, "reward": 0.623883955180645, "reward_std": 0.2716706655919552, "rewards/accuracy_reward": 0.1808035832364112, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4430803805589676, "step": 2851 }, { "clip_ratio": 0.0, "completion_length": 1383.3013916015625, "epoch": 0.8519154656112314, "grad_norm": 3.150066614151001, "kl": 2.8125, "learning_rate": 4.759686149288605e-08, "loss": 0.2432, "reward": 0.5150669887661934, "reward_std": 0.2871357500553131, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443638414144516, "step": 2852 }, { "clip_ratio": 0.0, "completion_length": 1404.5826416015625, "epoch": 0.8522141736987529, "grad_norm": 2.581782579421997, "kl": 2.53125, "learning_rate": 4.7527413350365374e-08, "loss": 0.1729, "reward": 0.5691964477300644, "reward_std": 0.2668979801237583, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178805589676, "step": 2853 }, { "clip_ratio": 0.0, "completion_length": 1302.8929138183594, "epoch": 0.8525128817862744, "grad_norm": 3.217109203338623, "kl": 2.78125, "learning_rate": 4.745809300650542e-08, "loss": 0.2116, "reward": 0.5189732387661934, "reward_std": 0.32915472984313965, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696715950966, "step": 2854 }, { "clip_ratio": 0.0, "completion_length": 1441.5469360351562, "epoch": 0.8528115898737958, "grad_norm": 3.770112991333008, "kl": 2.765625, "learning_rate": 4.7388900536719955e-08, "loss": 0.192, "reward": 0.5329241305589676, "reward_std": 0.2925625443458557, "rewards/accuracy_reward": 0.06473214458674192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919813156128, "step": 2855 }, { "clip_ratio": 0.0, "completion_length": 1396.5090026855469, "epoch": 0.8531102979613173, "grad_norm": 4.329217433929443, "kl": 2.640625, "learning_rate": 4.7319836016283544e-08, "loss": 0.1862, "reward": 0.580357164144516, "reward_std": 0.277300700545311, "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785895228386, "step": 2856 }, { "clip_ratio": 0.0, "completion_length": 1535.9755249023438, "epoch": 0.8534090060488387, "grad_norm": 9.202827453613281, "kl": 2.9609375, "learning_rate": 4.725089952033166e-08, "loss": 0.2138, "reward": 0.5044643059372902, "reward_std": 0.29236097633838654, "rewards/accuracy_reward": 0.042410717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462053582072258, "step": 2857 }, { "clip_ratio": 0.0, "completion_length": 1516.2545471191406, "epoch": 0.8537077141363603, "grad_norm": 6.010167598724365, "kl": 2.94921875, "learning_rate": 4.718209112386049e-08, "loss": 0.2039, "reward": 0.505580373108387, "reward_std": 0.25150177627801895, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732387661934, "step": 2858 }, { "clip_ratio": 0.0, "completion_length": 1379.33935546875, "epoch": 0.8540064222238817, "grad_norm": 3.23736310005188, "kl": 2.921875, "learning_rate": 4.7113410901726716e-08, "loss": 0.2376, "reward": 0.5446428805589676, "reward_std": 0.2696176506578922, "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4419642984867096, "step": 2859 }, { "clip_ratio": 0.0, "completion_length": 1382.5826416015625, "epoch": 0.8543051303114032, "grad_norm": 4.317378997802734, "kl": 2.73828125, "learning_rate": 4.704485892864778e-08, "loss": 0.1981, "reward": 0.5502232313156128, "reward_std": 0.24608387798070908, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660895228386, "step": 2860 }, { "clip_ratio": 0.0, "completion_length": 1326.4888916015625, "epoch": 0.8546038383989246, "grad_norm": 4.789716720581055, "kl": 2.703125, "learning_rate": 4.69764352792015e-08, "loss": 0.1766, "reward": 0.5195312649011612, "reward_std": 0.266147892922163, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312798023224, "step": 2861 }, { "clip_ratio": 0.0, "completion_length": 1317.1250610351562, "epoch": 0.8549025464864461, "grad_norm": 11.078301429748535, "kl": 2.22265625, "learning_rate": 4.6908140027826055e-08, "loss": 0.1939, "reward": 0.5546875149011612, "reward_std": 0.2454010508954525, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625223517418, "step": 2862 }, { "clip_ratio": 0.0, "completion_length": 1409.0692443847656, "epoch": 0.8552012545739676, "grad_norm": 5.244504928588867, "kl": 2.18359375, "learning_rate": 4.6839973248820053e-08, "loss": 0.1698, "reward": 0.5334821715950966, "reward_std": 0.2737981453537941, "rewards/accuracy_reward": 0.05580357299186289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785895228386, "step": 2863 }, { "clip_ratio": 0.0, "completion_length": 1380.3527221679688, "epoch": 0.8554999626614891, "grad_norm": 7.636904239654541, "kl": 2.41796875, "learning_rate": 4.677193501634219e-08, "loss": 0.2013, "reward": 0.5279018133878708, "reward_std": 0.2586685083806515, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660969734192, "step": 2864 }, { "clip_ratio": 0.0, "completion_length": 1388.8192443847656, "epoch": 0.8557986707490105, "grad_norm": 4.885064601898193, "kl": 2.275390625, "learning_rate": 4.6704025404411444e-08, "loss": 0.1402, "reward": 0.5736607313156128, "reward_std": 0.2597111761569977, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107238650322, "step": 2865 }, { "clip_ratio": 0.0, "completion_length": 1424.7969360351562, "epoch": 0.856097378836532, "grad_norm": 4.2342963218688965, "kl": 2.55859375, "learning_rate": 4.663624448690683e-08, "loss": 0.1909, "reward": 0.4944196566939354, "reward_std": 0.24163566902279854, "rewards/accuracy_reward": 0.05133928684517741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4430803805589676, "step": 2866 }, { "clip_ratio": 0.0, "completion_length": 1429.8906860351562, "epoch": 0.8563960869240534, "grad_norm": 2.9976601600646973, "kl": 2.26953125, "learning_rate": 4.65685923375673e-08, "loss": 0.1493, "reward": 0.5295759290456772, "reward_std": 0.2678900435566902, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.460379496216774, "step": 2867 }, { "clip_ratio": 0.0, "completion_length": 1438.4732971191406, "epoch": 0.856694795011575, "grad_norm": 2.8107693195343018, "kl": 2.26953125, "learning_rate": 4.650106902999177e-08, "loss": 0.135, "reward": 0.616629496216774, "reward_std": 0.2723708339035511, "rewards/accuracy_reward": 0.16741071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187798023224, "step": 2868 }, { "clip_ratio": 0.0, "completion_length": 1405.58935546875, "epoch": 0.8569935030990964, "grad_norm": 3.260572910308838, "kl": 2.48046875, "learning_rate": 4.643367463763902e-08, "loss": 0.2034, "reward": 0.4966518059372902, "reward_std": 0.255341786891222, "rewards/accuracy_reward": 0.051339287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125149011612, "step": 2869 }, { "clip_ratio": 0.0, "completion_length": 1414.2701416015625, "epoch": 0.8572922111866179, "grad_norm": 3.724201202392578, "kl": 2.1640625, "learning_rate": 4.636640923382748e-08, "loss": 0.1538, "reward": 0.4893973469734192, "reward_std": 0.29030996561050415, "rewards/accuracy_reward": 0.03571428754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4536830559372902, "step": 2870 }, { "clip_ratio": 0.0, "completion_length": 1312.8214721679688, "epoch": 0.8575909192741393, "grad_norm": 5.04609489440918, "kl": 2.15625, "learning_rate": 4.6299272891735366e-08, "loss": 0.1622, "reward": 0.5161830708384514, "reward_std": 0.27046553045511246, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4447544887661934, "step": 2871 }, { "clip_ratio": 0.0, "completion_length": 1375.4621276855469, "epoch": 0.8578896273616609, "grad_norm": 4.369451999664307, "kl": 2.03515625, "learning_rate": 4.6232265684400446e-08, "loss": 0.1148, "reward": 0.577566996216774, "reward_std": 0.2699868232011795, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812723517418, "step": 2872 }, { "clip_ratio": 0.0, "completion_length": 1459.2188415527344, "epoch": 0.8581883354491823, "grad_norm": 5.938235759735107, "kl": 2.31640625, "learning_rate": 4.6165387684719945e-08, "loss": 0.1523, "reward": 0.545758955180645, "reward_std": 0.2727008946239948, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732387661934, "step": 2873 }, { "clip_ratio": 0.0, "completion_length": 1472.8192443847656, "epoch": 0.8584870435367038, "grad_norm": 5.750783920288086, "kl": 2.7265625, "learning_rate": 4.6098638965450594e-08, "loss": 0.2246, "reward": 0.5625000298023224, "reward_std": 0.25955916196107864, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4352678805589676, "step": 2874 }, { "clip_ratio": 0.0, "completion_length": 1425.9085388183594, "epoch": 0.8587857516242252, "grad_norm": 2.565192937850952, "kl": 2.1796875, "learning_rate": 4.603201959920851e-08, "loss": 0.1667, "reward": 0.5111607313156128, "reward_std": 0.2502126134932041, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 2875 }, { "clip_ratio": 0.0, "completion_length": 1428.9978332519531, "epoch": 0.8590844597117467, "grad_norm": 5.75710916519165, "kl": 2.48046875, "learning_rate": 4.596552965846893e-08, "loss": 0.1778, "reward": 0.4988839626312256, "reward_std": 0.29154304414987564, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375298023224, "step": 2876 }, { "clip_ratio": 0.0, "completion_length": 1357.7009582519531, "epoch": 0.8593831677992682, "grad_norm": 5.156015872955322, "kl": 2.0234375, "learning_rate": 4.5899169215566494e-08, "loss": 0.1286, "reward": 0.5954241305589676, "reward_std": 0.2878236882388592, "rewards/accuracy_reward": 0.15625000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.439174123108387, "step": 2877 }, { "clip_ratio": 0.0, "completion_length": 1419.9621276855469, "epoch": 0.8596818758867897, "grad_norm": 2.0853195190429688, "kl": 2.34375, "learning_rate": 4.583293834269479e-08, "loss": 0.1484, "reward": 0.5926339626312256, "reward_std": 0.2722231522202492, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4430803805589676, "step": 2878 }, { "clip_ratio": 0.0, "completion_length": 1438.3259582519531, "epoch": 0.8599805839743111, "grad_norm": 4.590451717376709, "kl": 2.052734375, "learning_rate": 4.5766837111906545e-08, "loss": 0.1393, "reward": 0.5005580708384514, "reward_std": 0.2931620925664902, "rewards/accuracy_reward": 0.04241071594879031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473469734192, "step": 2879 }, { "clip_ratio": 0.0, "completion_length": 1355.450927734375, "epoch": 0.8602792920618326, "grad_norm": 4.078364849090576, "kl": 2.2421875, "learning_rate": 4.570086559511343e-08, "loss": 0.1608, "reward": 0.5167411044239998, "reward_std": 0.2626497894525528, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732313156128, "step": 2880 }, { "clip_ratio": 0.0, "completion_length": 1418.4465026855469, "epoch": 0.860578000149354, "grad_norm": 4.486968994140625, "kl": 2.59765625, "learning_rate": 4.563502386408595e-08, "loss": 0.2182, "reward": 0.491071455180645, "reward_std": 0.27111920714378357, "rewards/accuracy_reward": 0.03571428591385484, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 2881 }, { "clip_ratio": 0.0, "completion_length": 1444.7322082519531, "epoch": 0.8608767082368756, "grad_norm": 6.769111156463623, "kl": 2.671875, "learning_rate": 4.55693119904535e-08, "loss": 0.1616, "reward": 0.4765625149011612, "reward_std": 0.2687050849199295, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4497768059372902, "step": 2882 }, { "clip_ratio": 0.0, "completion_length": 1363.4933776855469, "epoch": 0.861175416324397, "grad_norm": 9.516194343566895, "kl": 2.35546875, "learning_rate": 4.550373004570414e-08, "loss": 0.2151, "reward": 0.5318080559372902, "reward_std": 0.3032205030322075, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187723517418, "step": 2883 }, { "clip_ratio": 0.0, "completion_length": 1329.21435546875, "epoch": 0.8614741244119185, "grad_norm": 3.327559471130371, "kl": 2.87109375, "learning_rate": 4.543827810118459e-08, "loss": 0.2215, "reward": 0.577566996216774, "reward_std": 0.30362867563962936, "rewards/accuracy_reward": 0.12276786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4547991305589676, "step": 2884 }, { "clip_ratio": 0.0, "completion_length": 1371.0536193847656, "epoch": 0.8617728324994399, "grad_norm": 3.9060451984405518, "kl": 2.62109375, "learning_rate": 4.537295622810015e-08, "loss": 0.1893, "reward": 0.566964328289032, "reward_std": 0.344548337161541, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 2885 }, { "clip_ratio": 0.0, "completion_length": 1356.9442443847656, "epoch": 0.8620715405869613, "grad_norm": 4.358765602111816, "kl": 2.046875, "learning_rate": 4.5307764497514664e-08, "loss": 0.1444, "reward": 0.5334821790456772, "reward_std": 0.2998164966702461, "rewards/accuracy_reward": 0.037946431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4955357238650322, "step": 2886 }, { "clip_ratio": 0.0, "completion_length": 1476.2121276855469, "epoch": 0.8623702486744829, "grad_norm": 2.7801144123077393, "kl": 2.55078125, "learning_rate": 4.5242702980350257e-08, "loss": 0.1969, "reward": 0.5580357313156128, "reward_std": 0.2770370952785015, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.450892873108387, "step": 2887 }, { "clip_ratio": 0.0, "completion_length": 1349.3594360351562, "epoch": 0.8626689567620043, "grad_norm": 2.839618444442749, "kl": 2.43359375, "learning_rate": 4.5177771747387534e-08, "loss": 0.2223, "reward": 0.5708705633878708, "reward_std": 0.2657773979008198, "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4436384066939354, "step": 2888 }, { "clip_ratio": 0.0, "completion_length": 1373.2969360351562, "epoch": 0.8629676648495258, "grad_norm": 18.944284439086914, "kl": 2.083984375, "learning_rate": 4.511297086926534e-08, "loss": 0.1945, "reward": 0.5345982387661934, "reward_std": 0.2888212278485298, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483258955180645, "step": 2889 }, { "clip_ratio": 0.0, "completion_length": 1393.7723693847656, "epoch": 0.8632663729370472, "grad_norm": 3.1010167598724365, "kl": 2.23828125, "learning_rate": 4.504830041648062e-08, "loss": 0.1502, "reward": 0.5435267984867096, "reward_std": 0.2739120200276375, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660969734192, "step": 2890 }, { "clip_ratio": 0.0, "completion_length": 1456.52685546875, "epoch": 0.8635650810245687, "grad_norm": 2.4240949153900146, "kl": 2.5859375, "learning_rate": 4.4983760459388554e-08, "loss": 0.2133, "reward": 0.5385044887661934, "reward_std": 0.27257463335990906, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4335937723517418, "step": 2891 }, { "clip_ratio": 0.0, "completion_length": 1391.7120971679688, "epoch": 0.8638637891120902, "grad_norm": 4.119161605834961, "kl": 2.28125, "learning_rate": 4.491935106820221e-08, "loss": 0.1755, "reward": 0.4899553805589676, "reward_std": 0.2692538797855377, "rewards/accuracy_reward": 0.02678571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696715950966, "step": 2892 }, { "clip_ratio": 0.0, "completion_length": 1384.477783203125, "epoch": 0.8641624971996117, "grad_norm": 3.4694552421569824, "kl": 2.146484375, "learning_rate": 4.4855072312992746e-08, "loss": 0.1148, "reward": 0.5468750149011612, "reward_std": 0.31951281428337097, "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393059372902, "step": 2893 }, { "clip_ratio": 0.0, "completion_length": 1431.3148193359375, "epoch": 0.8644612052871331, "grad_norm": 2.231614351272583, "kl": 2.5078125, "learning_rate": 4.479092426368916e-08, "loss": 0.1982, "reward": 0.5139508992433548, "reward_std": 0.2871709018945694, "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 2894 }, { "clip_ratio": 0.0, "completion_length": 1442.841552734375, "epoch": 0.8647599133746546, "grad_norm": 5.87947416305542, "kl": 2.5546875, "learning_rate": 4.472690699007818e-08, "loss": 0.1782, "reward": 0.5747767984867096, "reward_std": 0.28273075819015503, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982313156128, "step": 2895 }, { "clip_ratio": 0.0, "completion_length": 1381.62060546875, "epoch": 0.865058621462176, "grad_norm": 6.50895881652832, "kl": 2.443359375, "learning_rate": 4.466302056180439e-08, "loss": 0.1632, "reward": 0.526227705180645, "reward_std": 0.24609433859586716, "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470424123108387, "step": 2896 }, { "clip_ratio": 0.0, "completion_length": 1448.7143859863281, "epoch": 0.8653573295496976, "grad_norm": 3.8824644088745117, "kl": 2.64453125, "learning_rate": 4.459926504836995e-08, "loss": 0.1599, "reward": 0.5452009215950966, "reward_std": 0.24349985644221306, "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794813156128, "step": 2897 }, { "clip_ratio": 0.0, "completion_length": 1403.9755249023438, "epoch": 0.865656037637219, "grad_norm": 4.401116847991943, "kl": 2.4375, "learning_rate": 4.453564051913458e-08, "loss": 0.1994, "reward": 0.4955357387661934, "reward_std": 0.2216249443590641, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750223517418, "step": 2898 }, { "clip_ratio": 0.0, "completion_length": 1459.2656555175781, "epoch": 0.8659547457247405, "grad_norm": 4.067502975463867, "kl": 2.89453125, "learning_rate": 4.447214704331553e-08, "loss": 0.2175, "reward": 0.5719866380095482, "reward_std": 0.2974594309926033, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451450914144516, "step": 2899 }, { "clip_ratio": 0.0, "completion_length": 1394.0960083007812, "epoch": 0.8662534538122619, "grad_norm": 8.130118370056152, "kl": 2.306640625, "learning_rate": 4.4408784689987535e-08, "loss": 0.1758, "reward": 0.5563616380095482, "reward_std": 0.30481675267219543, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366380095482, "step": 2900 }, { "clip_ratio": 0.0, "completion_length": 1408.6719360351562, "epoch": 0.8665521618997835, "grad_norm": 4.198693752288818, "kl": 2.4921875, "learning_rate": 4.4345553528082564e-08, "loss": 0.2091, "reward": 0.5440848544239998, "reward_std": 0.27389493957161903, "rewards/accuracy_reward": 0.10044643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4436384066939354, "step": 2901 }, { "clip_ratio": 0.0, "completion_length": 1385.1942749023438, "epoch": 0.8668508699873049, "grad_norm": 4.0893449783325195, "kl": 2.4375, "learning_rate": 4.428245362638996e-08, "loss": 0.175, "reward": 0.5496652126312256, "reward_std": 0.3073012977838516, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366305589676, "step": 2902 }, { "clip_ratio": 0.0, "completion_length": 1355.7880249023438, "epoch": 0.8671495780748264, "grad_norm": 3.1080949306488037, "kl": 1.966796875, "learning_rate": 4.421948505355622e-08, "loss": 0.1267, "reward": 0.530691996216774, "reward_std": 0.297591432929039, "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455708384514, "step": 2903 }, { "clip_ratio": 0.0, "completion_length": 1458.1786499023438, "epoch": 0.8674482861623478, "grad_norm": 3.431678533554077, "kl": 2.80078125, "learning_rate": 4.415664787808497e-08, "loss": 0.2069, "reward": 0.545200914144516, "reward_std": 0.2541237808763981, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4536830559372902, "step": 2904 }, { "clip_ratio": 0.0, "completion_length": 1411.9308471679688, "epoch": 0.8677469942498693, "grad_norm": 2.83799409866333, "kl": 2.244140625, "learning_rate": 4.409394216833693e-08, "loss": 0.136, "reward": 0.522321455180645, "reward_std": 0.25076668336987495, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 2905 }, { "clip_ratio": 0.0, "completion_length": 1390.9933776855469, "epoch": 0.8680457023373908, "grad_norm": 3.3913662433624268, "kl": 2.48828125, "learning_rate": 4.4031367992529716e-08, "loss": 0.1738, "reward": 0.6227678954601288, "reward_std": 0.29145878553390503, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000223517418, "step": 2906 }, { "clip_ratio": 0.0, "completion_length": 1478.0915832519531, "epoch": 0.8683444104249123, "grad_norm": 4.4875807762146, "kl": 2.73828125, "learning_rate": 4.396892541873792e-08, "loss": 0.1568, "reward": 0.4771205559372902, "reward_std": 0.25636229664087296, "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4213169887661934, "step": 2907 }, { "clip_ratio": 0.0, "completion_length": 1462.3840026855469, "epoch": 0.8686431185124337, "grad_norm": 3.979013681411743, "kl": 2.453125, "learning_rate": 4.390661451489298e-08, "loss": 0.2126, "reward": 0.4693080484867096, "reward_std": 0.27515827119350433, "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4447544887661934, "step": 2908 }, { "clip_ratio": 0.0, "completion_length": 1448.2545471191406, "epoch": 0.8689418265999552, "grad_norm": 2.076425313949585, "kl": 2.5, "learning_rate": 4.3844435348782997e-08, "loss": 0.1585, "reward": 0.5200893208384514, "reward_std": 0.26543042436242104, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 2909 }, { "clip_ratio": 0.0, "completion_length": 1484.4375610351562, "epoch": 0.8692405346874766, "grad_norm": 4.710153579711914, "kl": 2.283203125, "learning_rate": 4.378238798805283e-08, "loss": 0.1645, "reward": 0.538504496216774, "reward_std": 0.2614324502646923, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4737723544239998, "step": 2910 }, { "clip_ratio": 0.0, "completion_length": 1432.0492248535156, "epoch": 0.8695392427749982, "grad_norm": 3.1144001483917236, "kl": 2.50390625, "learning_rate": 4.3720472500203953e-08, "loss": 0.1833, "reward": 0.4799107387661934, "reward_std": 0.23767465725541115, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500149011612, "step": 2911 }, { "clip_ratio": 0.0, "completion_length": 1461.1139221191406, "epoch": 0.8698379508625196, "grad_norm": 4.1606831550598145, "kl": 2.181640625, "learning_rate": 4.36586889525943e-08, "loss": 0.1205, "reward": 0.553571455180645, "reward_std": 0.2694348655641079, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 2912 }, { "clip_ratio": 0.0, "completion_length": 1545.5245971679688, "epoch": 0.8701366589500411, "grad_norm": 4.299087047576904, "kl": 2.84765625, "learning_rate": 4.359703741243834e-08, "loss": 0.187, "reward": 0.556919664144516, "reward_std": 0.2562304958701134, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732313156128, "step": 2913 }, { "clip_ratio": 0.0, "completion_length": 1492.77685546875, "epoch": 0.8704353670375625, "grad_norm": 7.032388687133789, "kl": 2.3359375, "learning_rate": 4.3535517946806925e-08, "loss": 0.1891, "reward": 0.5357143133878708, "reward_std": 0.2558184452354908, "rewards/accuracy_reward": 0.08705357415601611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607313156128, "step": 2914 }, { "clip_ratio": 0.0, "completion_length": 1407.2232666015625, "epoch": 0.870734075125084, "grad_norm": 4.605353355407715, "kl": 2.47265625, "learning_rate": 4.347413062262718e-08, "loss": 0.1792, "reward": 0.5926339626312256, "reward_std": 0.2931572422385216, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018059372902, "step": 2915 }, { "clip_ratio": 0.0, "completion_length": 1292.96435546875, "epoch": 0.8710327832126055, "grad_norm": 16.99807357788086, "kl": 2.0625, "learning_rate": 4.3412875506682486e-08, "loss": 0.2031, "reward": 0.544084832072258, "reward_std": 0.25510599464178085, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491380095482, "step": 2916 }, { "clip_ratio": 0.0, "completion_length": 1411.2143249511719, "epoch": 0.871331491300127, "grad_norm": 2.4059104919433594, "kl": 2.48828125, "learning_rate": 4.3351752665612456e-08, "loss": 0.1562, "reward": 0.5781250298023224, "reward_std": 0.2909705713391304, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4709821566939354, "step": 2917 }, { "clip_ratio": 0.0, "completion_length": 1476.7522888183594, "epoch": 0.8716301993876484, "grad_norm": 2.665724277496338, "kl": 2.181640625, "learning_rate": 4.3290762165912675e-08, "loss": 0.1407, "reward": 0.4614955559372902, "reward_std": 0.25243620574474335, "rewards/accuracy_reward": 0.017857143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4436384066939354, "step": 2918 }, { "clip_ratio": 0.0, "completion_length": 1481.7188110351562, "epoch": 0.8719289074751699, "grad_norm": 7.4701924324035645, "kl": 2.48828125, "learning_rate": 4.322990407393491e-08, "loss": 0.2324, "reward": 0.4972098469734192, "reward_std": 0.2831525281071663, "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 2919 }, { "clip_ratio": 0.0, "completion_length": 1579.6898193359375, "epoch": 0.8722276155626913, "grad_norm": 2.2055776119232178, "kl": 2.5625, "learning_rate": 4.316917845588673e-08, "loss": 0.1168, "reward": 0.474888414144516, "reward_std": 0.2886270433664322, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.439174123108387, "step": 2920 }, { "clip_ratio": 0.0, "completion_length": 1320.1384582519531, "epoch": 0.8725263236502129, "grad_norm": 6.3050537109375, "kl": 1.98046875, "learning_rate": 4.310858537783168e-08, "loss": 0.1432, "reward": 0.6082589477300644, "reward_std": 0.3104020655155182, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122767984867096, "step": 2921 }, { "clip_ratio": 0.0, "completion_length": 1443.8907165527344, "epoch": 0.8728250317377343, "grad_norm": 3.4340455532073975, "kl": 2.5859375, "learning_rate": 4.3048124905689134e-08, "loss": 0.1628, "reward": 0.5357143208384514, "reward_std": 0.27140703424811363, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000223517418, "step": 2922 }, { "clip_ratio": 0.0, "completion_length": 1463.341552734375, "epoch": 0.8731237398252558, "grad_norm": 6.1412129402160645, "kl": 2.86328125, "learning_rate": 4.29877971052341e-08, "loss": 0.2213, "reward": 0.5234375223517418, "reward_std": 0.25454414263367653, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4341518059372902, "step": 2923 }, { "clip_ratio": 0.0, "completion_length": 1422.7233276367188, "epoch": 0.8734224479127772, "grad_norm": 2.3442580699920654, "kl": 2.56640625, "learning_rate": 4.292760204209731e-08, "loss": 0.1778, "reward": 0.503348246216774, "reward_std": 0.3016170933842659, "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463169664144516, "step": 2924 }, { "clip_ratio": 0.0, "completion_length": 1409.76123046875, "epoch": 0.8737211560002988, "grad_norm": 8.09238052368164, "kl": 2.40234375, "learning_rate": 4.286753978176517e-08, "loss": 0.1935, "reward": 0.5507812798023224, "reward_std": 0.2787320017814636, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 2925 }, { "clip_ratio": 0.0, "completion_length": 1337.4286193847656, "epoch": 0.8740198640878202, "grad_norm": 4.324495792388916, "kl": 2.44140625, "learning_rate": 4.280761038957944e-08, "loss": 0.1811, "reward": 0.5133928805589676, "reward_std": 0.2936323508620262, "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4642857313156128, "step": 2926 }, { "clip_ratio": 0.0, "completion_length": 1421.26123046875, "epoch": 0.8743185721753417, "grad_norm": 3.146738290786743, "kl": 2.6640625, "learning_rate": 4.274781393073747e-08, "loss": 0.1879, "reward": 0.5039062649011612, "reward_std": 0.26125428080558777, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705484867096, "step": 2927 }, { "clip_ratio": 0.0, "completion_length": 1420.904052734375, "epoch": 0.8746172802628631, "grad_norm": 4.011467456817627, "kl": 2.65625, "learning_rate": 4.2688150470291975e-08, "loss": 0.1952, "reward": 0.5463169813156128, "reward_std": 0.3086441904306412, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169813156128, "step": 2928 }, { "clip_ratio": 0.0, "completion_length": 1404.4420166015625, "epoch": 0.8749159883503845, "grad_norm": 4.470172882080078, "kl": 2.44921875, "learning_rate": 4.262862007315088e-08, "loss": 0.2276, "reward": 0.5318080633878708, "reward_std": 0.2801201827824116, "rewards/accuracy_reward": 0.07142857671715319, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794813156128, "step": 2929 }, { "clip_ratio": 0.0, "completion_length": 1476.5514221191406, "epoch": 0.8752146964379061, "grad_norm": 3.7263684272766113, "kl": 2.55078125, "learning_rate": 4.2569222804077485e-08, "loss": 0.2078, "reward": 0.577008955180645, "reward_std": 0.29595842957496643, "rewards/accuracy_reward": 0.11607143608853221, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 2930 }, { "clip_ratio": 0.0, "completion_length": 1447.8482666015625, "epoch": 0.8755134045254275, "grad_norm": 4.08577823638916, "kl": 2.77734375, "learning_rate": 4.250995872769016e-08, "loss": 0.198, "reward": 0.486049123108387, "reward_std": 0.264996737241745, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459263414144516, "step": 2931 }, { "clip_ratio": 0.0, "completion_length": 1357.2322082519531, "epoch": 0.875812112612949, "grad_norm": 4.519721984863281, "kl": 2.5234375, "learning_rate": 4.245082790846241e-08, "loss": 0.2016, "reward": 0.6032366454601288, "reward_std": 0.3070860952138901, "rewards/accuracy_reward": 0.13169643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715402126312256, "step": 2932 }, { "clip_ratio": 0.0, "completion_length": 1403.7121276855469, "epoch": 0.8761108207004704, "grad_norm": 3.4703238010406494, "kl": 2.515625, "learning_rate": 4.239183041072282e-08, "loss": 0.1809, "reward": 0.5323660969734192, "reward_std": 0.2985917180776596, "rewards/accuracy_reward": 0.04017857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875223517418, "step": 2933 }, { "clip_ratio": 0.0, "completion_length": 1412.4442138671875, "epoch": 0.8764095287879919, "grad_norm": 4.761364936828613, "kl": 2.73046875, "learning_rate": 4.233296629865482e-08, "loss": 0.1678, "reward": 0.493303582072258, "reward_std": 0.2853805534541607, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.450892873108387, "step": 2934 }, { "clip_ratio": 0.0, "completion_length": 1505.3237609863281, "epoch": 0.8767082368755134, "grad_norm": 5.03891658782959, "kl": 2.421875, "learning_rate": 4.227423563629683e-08, "loss": 0.1671, "reward": 0.4860491305589676, "reward_std": 0.2771409675478935, "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4525669813156128, "step": 2935 }, { "clip_ratio": 0.0, "completion_length": 1487.3572387695312, "epoch": 0.8770069449630349, "grad_norm": 3.845522165298462, "kl": 2.49609375, "learning_rate": 4.221563848754208e-08, "loss": 0.1769, "reward": 0.6104910895228386, "reward_std": 0.221270103007555, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696715950966, "step": 2936 }, { "clip_ratio": 0.0, "completion_length": 1469.935302734375, "epoch": 0.8773056530505563, "grad_norm": 2.432983636856079, "kl": 2.494140625, "learning_rate": 4.215717491613846e-08, "loss": 0.1835, "reward": 0.5245535895228386, "reward_std": 0.2702382691204548, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.4665178656578064, "step": 2937 }, { "clip_ratio": 0.0, "completion_length": 1498.9754943847656, "epoch": 0.8776043611380778, "grad_norm": 3.0776660442352295, "kl": 2.69921875, "learning_rate": 4.209884498568865e-08, "loss": 0.196, "reward": 0.5150669813156128, "reward_std": 0.2635659947991371, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955484867096, "step": 2938 }, { "clip_ratio": 0.0, "completion_length": 1490.6138916015625, "epoch": 0.8779030692255992, "grad_norm": 14.019775390625, "kl": 2.3125, "learning_rate": 4.204064875964992e-08, "loss": 0.1991, "reward": 0.5251116454601288, "reward_std": 0.25747140869498253, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4916294887661934, "step": 2939 }, { "clip_ratio": 0.0, "completion_length": 1476.0759582519531, "epoch": 0.8782017773131208, "grad_norm": 5.375280857086182, "kl": 2.623046875, "learning_rate": 4.1982586301333976e-08, "loss": 0.1706, "reward": 0.5306920036673546, "reward_std": 0.2743063047528267, "rewards/accuracy_reward": 0.05580357299186289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884215950966, "step": 2940 }, { "clip_ratio": 0.0, "completion_length": 1426.7433776855469, "epoch": 0.8785004854006422, "grad_norm": 5.677239418029785, "kl": 2.3359375, "learning_rate": 4.1924657673907155e-08, "loss": 0.1891, "reward": 0.6707589626312256, "reward_std": 0.30025614798069, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 2941 }, { "clip_ratio": 0.0, "completion_length": 1454.0402526855469, "epoch": 0.8787991934881637, "grad_norm": 6.0459208488464355, "kl": 2.44921875, "learning_rate": 4.186686294039013e-08, "loss": 0.1936, "reward": 0.4793526977300644, "reward_std": 0.2846919037401676, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705559372902, "step": 2942 }, { "clip_ratio": 0.0, "completion_length": 1461.6473693847656, "epoch": 0.8790979015756851, "grad_norm": 6.779751300811768, "kl": 2.953125, "learning_rate": 4.180920216365785e-08, "loss": 0.1815, "reward": 0.4748884066939354, "reward_std": 0.27098462358117104, "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443638414144516, "step": 2943 }, { "clip_ratio": 0.0, "completion_length": 1387.5134582519531, "epoch": 0.8793966096632067, "grad_norm": 2.737172842025757, "kl": 2.87890625, "learning_rate": 4.175167540643965e-08, "loss": 0.2239, "reward": 0.5479911044239998, "reward_std": 0.29376664012670517, "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732313156128, "step": 2944 }, { "clip_ratio": 0.0, "completion_length": 1396.2925109863281, "epoch": 0.8796953177507281, "grad_norm": 2.925930976867676, "kl": 2.6015625, "learning_rate": 4.1694282731318936e-08, "loss": 0.1456, "reward": 0.517857164144516, "reward_std": 0.2522348426282406, "rewards/accuracy_reward": 0.058035718044266105, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459821455180645, "step": 2945 }, { "clip_ratio": 0.0, "completion_length": 1484.3415832519531, "epoch": 0.8799940258382496, "grad_norm": 3.428406000137329, "kl": 2.76953125, "learning_rate": 4.163702420073335e-08, "loss": 0.1759, "reward": 0.5239955633878708, "reward_std": 0.2529898174107075, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.439174123108387, "step": 2946 }, { "clip_ratio": 0.0, "completion_length": 1382.24560546875, "epoch": 0.880292733925771, "grad_norm": 4.6280436515808105, "kl": 2.43359375, "learning_rate": 4.157989987697461e-08, "loss": 0.1819, "reward": 0.513392873108387, "reward_std": 0.2849043868482113, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4642857387661934, "step": 2947 }, { "clip_ratio": 0.0, "completion_length": 1479.3973693847656, "epoch": 0.8805914420132925, "grad_norm": 3.522235155105591, "kl": 2.5234375, "learning_rate": 4.152290982218829e-08, "loss": 0.1473, "reward": 0.5234375223517418, "reward_std": 0.28634610399603844, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 2948 }, { "clip_ratio": 0.0, "completion_length": 1400.97998046875, "epoch": 0.880890150100814, "grad_norm": 11.735478401184082, "kl": 2.0859375, "learning_rate": 4.146605409837403e-08, "loss": 0.1815, "reward": 0.5513393059372902, "reward_std": 0.3034211955964565, "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 2949 }, { "clip_ratio": 0.0, "completion_length": 1473.0134582519531, "epoch": 0.8811888581883355, "grad_norm": 3.916386842727661, "kl": 2.474609375, "learning_rate": 4.1409332767385305e-08, "loss": 0.1722, "reward": 0.6116071864962578, "reward_std": 0.2814069241285324, "rewards/accuracy_reward": 0.13392857764847577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785895228386, "step": 2950 }, { "clip_ratio": 0.0, "completion_length": 1436.1116943359375, "epoch": 0.8814875662758569, "grad_norm": 3.411412477493286, "kl": 2.57421875, "learning_rate": 4.135274589092934e-08, "loss": 0.2067, "reward": 0.5625000298023224, "reward_std": 0.29672956094145775, "rewards/accuracy_reward": 0.07812500209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750223517418, "step": 2951 }, { "clip_ratio": 0.0, "completion_length": 1463.6741943359375, "epoch": 0.8817862743633784, "grad_norm": 8.278216361999512, "kl": 2.158203125, "learning_rate": 4.129629353056709e-08, "loss": 0.1583, "reward": 0.4838169887661934, "reward_std": 0.2592548839747906, "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919813156128, "step": 2952 }, { "clip_ratio": 0.0, "completion_length": 1434.44873046875, "epoch": 0.8820849824508998, "grad_norm": 3.9260380268096924, "kl": 2.53125, "learning_rate": 4.123997574771324e-08, "loss": 0.1956, "reward": 0.617187537252903, "reward_std": 0.3236777186393738, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625298023224, "step": 2953 }, { "clip_ratio": 0.0, "completion_length": 1444.2924499511719, "epoch": 0.8823836905384214, "grad_norm": 2.164625883102417, "kl": 2.75390625, "learning_rate": 4.118379260363597e-08, "loss": 0.2081, "reward": 0.4815848469734192, "reward_std": 0.26658329367637634, "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312649011612, "step": 2954 }, { "clip_ratio": 0.0, "completion_length": 1442.4509887695312, "epoch": 0.8826823986259428, "grad_norm": 6.748205661773682, "kl": 2.5078125, "learning_rate": 4.112774415945705e-08, "loss": 0.2026, "reward": 0.5742187798023224, "reward_std": 0.27163639664649963, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.471540205180645, "step": 2955 }, { "clip_ratio": 0.0, "completion_length": 1410.2388916015625, "epoch": 0.8829811067134643, "grad_norm": 10.157845497131348, "kl": 2.216796875, "learning_rate": 4.1071830476151734e-08, "loss": 0.1663, "reward": 0.545200914144516, "reward_std": 0.2795798182487488, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.489397332072258, "step": 2956 }, { "clip_ratio": 0.0, "completion_length": 1423.9063110351562, "epoch": 0.8832798148009857, "grad_norm": 6.070518970489502, "kl": 2.83984375, "learning_rate": 4.101605161454856e-08, "loss": 0.2126, "reward": 0.5273437798023224, "reward_std": 0.2749021574854851, "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473469734192, "step": 2957 }, { "clip_ratio": 0.0, "completion_length": 1424.6674499511719, "epoch": 0.8835785228885072, "grad_norm": 5.1196417808532715, "kl": 2.302734375, "learning_rate": 4.096040763532953e-08, "loss": 0.1374, "reward": 0.524553582072258, "reward_std": 0.2565108649432659, "rewards/accuracy_reward": 0.040178572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750149011612, "step": 2958 }, { "clip_ratio": 0.0, "completion_length": 1379.0469360351562, "epoch": 0.8838772309760287, "grad_norm": 2.4004950523376465, "kl": 2.251953125, "learning_rate": 4.090489859902976e-08, "loss": 0.162, "reward": 0.6138393133878708, "reward_std": 0.29943014681339264, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.497767873108387, "step": 2959 }, { "clip_ratio": 0.0, "completion_length": 1500.4107971191406, "epoch": 0.8841759390635502, "grad_norm": 8.261034965515137, "kl": 2.6484375, "learning_rate": 4.0849524566037714e-08, "loss": 0.1785, "reward": 0.4893973469734192, "reward_std": 0.26156584918498993, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4380580559372902, "step": 2960 }, { "clip_ratio": 0.0, "completion_length": 1454.4063415527344, "epoch": 0.8844746471510716, "grad_norm": 3.348639488220215, "kl": 2.45703125, "learning_rate": 4.07942855965949e-08, "loss": 0.1245, "reward": 0.5424107387661934, "reward_std": 0.33301711454987526, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107387661934, "step": 2961 }, { "clip_ratio": 0.0, "completion_length": 1381.4509582519531, "epoch": 0.8847733552385931, "grad_norm": 5.460599899291992, "kl": 2.6953125, "learning_rate": 4.0739181750795856e-08, "loss": 0.216, "reward": 0.6171875149011612, "reward_std": 0.2697641924023628, "rewards/accuracy_reward": 0.13392857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4832589477300644, "step": 2962 }, { "clip_ratio": 0.0, "completion_length": 1449.1808776855469, "epoch": 0.8850720633261145, "grad_norm": 8.849095344543457, "kl": 2.9453125, "learning_rate": 4.0684213088588205e-08, "loss": 0.2188, "reward": 0.4637276977300644, "reward_std": 0.2572999447584152, "rewards/accuracy_reward": 0.015625000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4481026977300644, "step": 2963 }, { "clip_ratio": 0.0, "completion_length": 1345.3661499023438, "epoch": 0.8853707714136361, "grad_norm": 4.845189094543457, "kl": 2.46484375, "learning_rate": 4.062937966977249e-08, "loss": 0.2236, "reward": 0.651785746216774, "reward_std": 0.32399916648864746, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466517873108387, "step": 2964 }, { "clip_ratio": 0.0, "completion_length": 1524.40185546875, "epoch": 0.8856694795011575, "grad_norm": 9.83930778503418, "kl": 2.2109375, "learning_rate": 4.057468155400203e-08, "loss": 0.1452, "reward": 0.5011160969734192, "reward_std": 0.2875378876924515, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696715950966, "step": 2965 }, { "clip_ratio": 0.0, "completion_length": 1397.8795471191406, "epoch": 0.885968187588679, "grad_norm": 3.4356203079223633, "kl": 2.38671875, "learning_rate": 4.0520118800783035e-08, "loss": 0.1829, "reward": 0.522879496216774, "reward_std": 0.28105374425649643, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4469866305589676, "step": 2966 }, { "clip_ratio": 0.0, "completion_length": 1472.3772888183594, "epoch": 0.8862668956762004, "grad_norm": 4.594440460205078, "kl": 2.89453125, "learning_rate": 4.046569146947448e-08, "loss": 0.225, "reward": 0.560825914144516, "reward_std": 0.28197256475687027, "rewards/accuracy_reward": 0.09375000582076609, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467075914144516, "step": 2967 }, { "clip_ratio": 0.0, "completion_length": 1325.1920471191406, "epoch": 0.886565603763722, "grad_norm": 7.545547008514404, "kl": 2.330078125, "learning_rate": 4.041139961928791e-08, "loss": 0.1987, "reward": 0.5524553805589676, "reward_std": 0.2923591062426567, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5033482238650322, "step": 2968 }, { "clip_ratio": 0.0, "completion_length": 1426.1473693847656, "epoch": 0.8868643118512434, "grad_norm": 10.082645416259766, "kl": 2.3125, "learning_rate": 4.0357243309287566e-08, "loss": 0.1669, "reward": 0.5485491380095482, "reward_std": 0.24143293872475624, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919813156128, "step": 2969 }, { "clip_ratio": 0.0, "completion_length": 1412.62060546875, "epoch": 0.8871630199387649, "grad_norm": 2.893876314163208, "kl": 2.796875, "learning_rate": 4.0303222598390216e-08, "loss": 0.2023, "reward": 0.4916294887661934, "reward_std": 0.2968399301171303, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473395228386, "step": 2970 }, { "clip_ratio": 0.0, "completion_length": 1415.6295166015625, "epoch": 0.8874617280262863, "grad_norm": 3.2781596183776855, "kl": 2.6875, "learning_rate": 4.0249337545365045e-08, "loss": 0.2158, "reward": 0.5189732387661934, "reward_std": 0.2681536339223385, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732313156128, "step": 2971 }, { "clip_ratio": 0.0, "completion_length": 1527.0357971191406, "epoch": 0.8877604361138077, "grad_norm": 5.790205478668213, "kl": 3.01171875, "learning_rate": 4.019558820883379e-08, "loss": 0.1825, "reward": 0.5139508992433548, "reward_std": 0.2724435366690159, "rewards/accuracy_reward": 0.051339287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462611623108387, "step": 2972 }, { "clip_ratio": 0.0, "completion_length": 1380.232177734375, "epoch": 0.8880591442013293, "grad_norm": 3.0067152976989746, "kl": 2.7578125, "learning_rate": 4.0141974647270386e-08, "loss": 0.2046, "reward": 0.6049107238650322, "reward_std": 0.2572841979563236, "rewards/accuracy_reward": 0.14732143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575892984867096, "step": 2973 }, { "clip_ratio": 0.0, "completion_length": 1470.8215026855469, "epoch": 0.8883578522888507, "grad_norm": 9.318809509277344, "kl": 2.4921875, "learning_rate": 4.008849691900116e-08, "loss": 0.2013, "reward": 0.6450893133878708, "reward_std": 0.34925247728824615, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535895228386, "step": 2974 }, { "clip_ratio": 0.0, "completion_length": 1365.7746276855469, "epoch": 0.8886565603763722, "grad_norm": 18.051515579223633, "kl": 2.3359375, "learning_rate": 4.003515508220466e-08, "loss": 0.2241, "reward": 0.5279017984867096, "reward_std": 0.3178066536784172, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982387661934, "step": 2975 }, { "clip_ratio": 0.0, "completion_length": 1421.7679138183594, "epoch": 0.8889552684638936, "grad_norm": 4.020099639892578, "kl": 2.46875, "learning_rate": 3.998194919491155e-08, "loss": 0.1613, "reward": 0.5993303880095482, "reward_std": 0.3078905940055847, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018059372902, "step": 2976 }, { "clip_ratio": 0.0, "completion_length": 1375.0804138183594, "epoch": 0.8892539765514151, "grad_norm": 2.9575307369232178, "kl": 2.453125, "learning_rate": 3.992887931500463e-08, "loss": 0.1812, "reward": 0.6400669813156128, "reward_std": 0.2957182079553604, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949776977300644, "step": 2977 }, { "clip_ratio": 0.0, "completion_length": 1336.4576416015625, "epoch": 0.8895526846389366, "grad_norm": 3.598834991455078, "kl": 2.734375, "learning_rate": 3.9875945500218746e-08, "loss": 0.2081, "reward": 0.5937500298023224, "reward_std": 0.3183174431324005, "rewards/accuracy_reward": 0.13839286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 2978 }, { "clip_ratio": 0.0, "completion_length": 1482.1697082519531, "epoch": 0.8898513927264581, "grad_norm": 6.121829509735107, "kl": 2.53125, "learning_rate": 3.982314780814066e-08, "loss": 0.2057, "reward": 0.5412946790456772, "reward_std": 0.2895388677716255, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018059372902, "step": 2979 }, { "clip_ratio": 0.0, "completion_length": 1492.4353332519531, "epoch": 0.8901501008139795, "grad_norm": 4.14646577835083, "kl": 2.45703125, "learning_rate": 3.9770486296209117e-08, "loss": 0.1888, "reward": 0.4949776977300644, "reward_std": 0.24550705775618553, "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884215950966, "step": 2980 }, { "clip_ratio": 0.0, "completion_length": 1501.5402526855469, "epoch": 0.890448808901501, "grad_norm": 9.350455284118652, "kl": 2.90625, "learning_rate": 3.9717961021714685e-08, "loss": 0.1927, "reward": 0.479910746216774, "reward_std": 0.2298441007733345, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000223517418, "step": 2981 }, { "clip_ratio": 0.0, "completion_length": 1528.0536499023438, "epoch": 0.8907475169890224, "grad_norm": 6.776817321777344, "kl": 3.1328125, "learning_rate": 3.966557204179968e-08, "loss": 0.2288, "reward": 0.555245578289032, "reward_std": 0.3366238996386528, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4525669887661934, "step": 2982 }, { "clip_ratio": 0.0, "completion_length": 1414.2388916015625, "epoch": 0.891046225076544, "grad_norm": 5.916048526763916, "kl": 2.33984375, "learning_rate": 3.961331941345819e-08, "loss": 0.11, "reward": 0.5066964477300644, "reward_std": 0.2582663707435131, "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750149011612, "step": 2983 }, { "clip_ratio": 0.0, "completion_length": 1350.5491638183594, "epoch": 0.8913449331640654, "grad_norm": 6.399316310882568, "kl": 2.216796875, "learning_rate": 3.9561203193536e-08, "loss": 0.138, "reward": 0.5664062798023224, "reward_std": 0.3211803659796715, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.499441996216774, "step": 2984 }, { "clip_ratio": 0.0, "completion_length": 1411.7991943359375, "epoch": 0.8916436412515869, "grad_norm": 5.4094390869140625, "kl": 2.515625, "learning_rate": 3.950922343873038e-08, "loss": 0.1575, "reward": 0.5206473544239998, "reward_std": 0.2863358333706856, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4827009066939354, "step": 2985 }, { "clip_ratio": 0.0, "completion_length": 1460.91748046875, "epoch": 0.8919423493391083, "grad_norm": 4.964884281158447, "kl": 2.8046875, "learning_rate": 3.9457380205590265e-08, "loss": 0.1819, "reward": 0.5033482238650322, "reward_std": 0.2970062643289566, "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732387661934, "step": 2986 }, { "clip_ratio": 0.0, "completion_length": 1419.9353332519531, "epoch": 0.8922410574266298, "grad_norm": 5.7102370262146, "kl": 2.640625, "learning_rate": 3.940567355051597e-08, "loss": 0.1713, "reward": 0.5072544887661934, "reward_std": 0.302678644657135, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473395228386, "step": 2987 }, { "clip_ratio": 0.0, "completion_length": 1408.0023193359375, "epoch": 0.8925397655141513, "grad_norm": 3.9952118396759033, "kl": 2.41796875, "learning_rate": 3.935410352975928e-08, "loss": 0.1477, "reward": 0.6300223618745804, "reward_std": 0.2956625297665596, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 2988 }, { "clip_ratio": 0.0, "completion_length": 1422.2121276855469, "epoch": 0.8928384736016728, "grad_norm": 16.150318145751953, "kl": 2.3671875, "learning_rate": 3.9302670199423364e-08, "loss": 0.2082, "reward": 0.5390625223517418, "reward_std": 0.2556300722062588, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982387661934, "step": 2989 }, { "clip_ratio": 0.0, "completion_length": 1501.3884582519531, "epoch": 0.8931371816891942, "grad_norm": 3.9687094688415527, "kl": 2.38671875, "learning_rate": 3.9251373615462565e-08, "loss": 0.1511, "reward": 0.5474330484867096, "reward_std": 0.23758703097701073, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437649011612, "step": 2990 }, { "clip_ratio": 0.0, "completion_length": 1401.7857971191406, "epoch": 0.8934358897767157, "grad_norm": 3.3562419414520264, "kl": 2.4765625, "learning_rate": 3.9200213833682595e-08, "loss": 0.1498, "reward": 0.564174123108387, "reward_std": 0.2531716898083687, "rewards/accuracy_reward": 0.08482143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4793527126312256, "step": 2991 }, { "clip_ratio": 0.0, "completion_length": 1363.0513610839844, "epoch": 0.8937345978642371, "grad_norm": 4.636507034301758, "kl": 2.423828125, "learning_rate": 3.914919090974028e-08, "loss": 0.191, "reward": 0.5145089626312256, "reward_std": 0.28996872901916504, "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 2992 }, { "clip_ratio": 0.0, "completion_length": 1459.6138916015625, "epoch": 0.8940333059517587, "grad_norm": 6.102733135223389, "kl": 2.61328125, "learning_rate": 3.909830489914355e-08, "loss": 0.2053, "reward": 0.474330373108387, "reward_std": 0.275189645588398, "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125223517418, "step": 2993 }, { "clip_ratio": 0.0, "completion_length": 1548.9978332519531, "epoch": 0.8943320140392801, "grad_norm": 7.700137615203857, "kl": 2.6953125, "learning_rate": 3.9047555857251374e-08, "loss": 0.1681, "reward": 0.4838169887661934, "reward_std": 0.25143786892294884, "rewards/accuracy_reward": 0.05580357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.428013414144516, "step": 2994 }, { "clip_ratio": 0.0, "completion_length": 1394.9241943359375, "epoch": 0.8946307221268016, "grad_norm": 3.218276023864746, "kl": 2.1875, "learning_rate": 3.8996943839273795e-08, "loss": 0.1685, "reward": 0.510044664144516, "reward_std": 0.2819991558790207, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 2995 }, { "clip_ratio": 0.0, "completion_length": 1438.3750610351562, "epoch": 0.894929430214323, "grad_norm": 7.122566223144531, "kl": 1.99609375, "learning_rate": 3.8946468900271675e-08, "loss": 0.1166, "reward": 0.6093750298023224, "reward_std": 0.2655246928334236, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.497767873108387, "step": 2996 }, { "clip_ratio": 0.0, "completion_length": 1408.5759582519531, "epoch": 0.8952281383018446, "grad_norm": 4.544902324676514, "kl": 2.15234375, "learning_rate": 3.8896131095156805e-08, "loss": 0.1307, "reward": 0.5128348469734192, "reward_std": 0.2782754749059677, "rewards/accuracy_reward": 0.03125000046566129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4815848469734192, "step": 2997 }, { "clip_ratio": 0.0, "completion_length": 1507.6763916015625, "epoch": 0.895526846389366, "grad_norm": 5.437558650970459, "kl": 2.259765625, "learning_rate": 3.884593047869183e-08, "loss": 0.1318, "reward": 0.517857164144516, "reward_std": 0.2799318805336952, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776786044239998, "step": 2998 }, { "clip_ratio": 0.0, "completion_length": 1378.9308471679688, "epoch": 0.8958255544768875, "grad_norm": 2.924499750137329, "kl": 2.2890625, "learning_rate": 3.879586710549003e-08, "loss": 0.1706, "reward": 0.5039062723517418, "reward_std": 0.28927209973335266, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241305589676, "step": 2999 }, { "clip_ratio": 0.0, "completion_length": 1425.0781555175781, "epoch": 0.8961242625644089, "grad_norm": 2.2597920894622803, "kl": 2.65625, "learning_rate": 3.874594103001551e-08, "loss": 0.1911, "reward": 0.5797991454601288, "reward_std": 0.28990424051880836, "rewards/accuracy_reward": 0.12053572246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4592634215950966, "step": 3000 }, { "clip_ratio": 0.0, "completion_length": 1476.5134582519531, "epoch": 0.8964229706519304, "grad_norm": 7.877499103546143, "kl": 2.44140625, "learning_rate": 3.8696152306582914e-08, "loss": 0.1555, "reward": 0.5993303805589676, "reward_std": 0.3050009682774544, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660969734192, "step": 3001 }, { "clip_ratio": 0.0, "completion_length": 1463.10498046875, "epoch": 0.8967216787394519, "grad_norm": 4.5420918464660645, "kl": 2.60546875, "learning_rate": 3.864650098935749e-08, "loss": 0.1778, "reward": 0.5117187798023224, "reward_std": 0.2664567716419697, "rewards/accuracy_reward": 0.06473214458674192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4469866305589676, "step": 3002 }, { "clip_ratio": 0.0, "completion_length": 1495.3147888183594, "epoch": 0.8970203868269734, "grad_norm": 4.769266128540039, "kl": 2.4765625, "learning_rate": 3.859698713235503e-08, "loss": 0.2056, "reward": 0.5301339477300644, "reward_std": 0.32163604348897934, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 3003 }, { "clip_ratio": 0.0, "completion_length": 1436.22998046875, "epoch": 0.8973190949144948, "grad_norm": 1.9136807918548584, "kl": 2.3046875, "learning_rate": 3.854761078944174e-08, "loss": 0.1318, "reward": 0.5212053880095482, "reward_std": 0.2727942280471325, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463169664144516, "step": 3004 }, { "clip_ratio": 0.0, "completion_length": 1315.4888916015625, "epoch": 0.8976178030020163, "grad_norm": 3.5505032539367676, "kl": 2.453125, "learning_rate": 3.849837201433424e-08, "loss": 0.1568, "reward": 0.4849330633878708, "reward_std": 0.23488854989409447, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4737723544239998, "step": 3005 }, { "clip_ratio": 0.0, "completion_length": 1468.9531860351562, "epoch": 0.8979165110895377, "grad_norm": 10.464487075805664, "kl": 2.328125, "learning_rate": 3.844927086059951e-08, "loss": 0.2243, "reward": 0.5736607387661934, "reward_std": 0.3165236860513687, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143133878708, "step": 3006 }, { "clip_ratio": 0.0, "completion_length": 1465.1116638183594, "epoch": 0.8982152191770593, "grad_norm": 4.895223617553711, "kl": 2.640625, "learning_rate": 3.8400307381654764e-08, "loss": 0.2009, "reward": 0.4955357387661934, "reward_std": 0.2519381679594517, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4441964477300644, "step": 3007 }, { "clip_ratio": 0.0, "completion_length": 1459.7924499511719, "epoch": 0.8985139272645807, "grad_norm": 2.8797006607055664, "kl": 2.4765625, "learning_rate": 3.835148163076748e-08, "loss": 0.1461, "reward": 0.5831473469734192, "reward_std": 0.26785196363925934, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366305589676, "step": 3008 }, { "clip_ratio": 0.0, "completion_length": 1387.4531555175781, "epoch": 0.8988126353521022, "grad_norm": 2.3916192054748535, "kl": 2.6484375, "learning_rate": 3.830279366105531e-08, "loss": 0.197, "reward": 0.5267857387661934, "reward_std": 0.26723217591643333, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535969734192, "step": 3009 }, { "clip_ratio": 0.0, "completion_length": 1529.0982971191406, "epoch": 0.8991113434396236, "grad_norm": 2.860006093978882, "kl": 2.626953125, "learning_rate": 3.825424352548596e-08, "loss": 0.1999, "reward": 0.5100446715950966, "reward_std": 0.30229806154966354, "rewards/accuracy_reward": 0.049107145983725786, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 3010 }, { "clip_ratio": 0.0, "completion_length": 1441.5670471191406, "epoch": 0.8994100515271451, "grad_norm": 3.701406240463257, "kl": 2.75390625, "learning_rate": 3.8205831276877224e-08, "loss": 0.2316, "reward": 0.555803582072258, "reward_std": 0.2607399895787239, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250223517418, "step": 3011 }, { "clip_ratio": 0.0, "completion_length": 1457.05810546875, "epoch": 0.8997087596146666, "grad_norm": 3.8428313732147217, "kl": 2.54296875, "learning_rate": 3.8157556967896926e-08, "loss": 0.1886, "reward": 0.6077009290456772, "reward_std": 0.2873695343732834, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223395228386, "step": 3012 }, { "clip_ratio": 0.0, "completion_length": 1415.0179138183594, "epoch": 0.9000074677021881, "grad_norm": 4.203301906585693, "kl": 2.5546875, "learning_rate": 3.810942065106273e-08, "loss": 0.188, "reward": 0.6367187723517418, "reward_std": 0.29364581778645515, "rewards/accuracy_reward": 0.18303572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4536830633878708, "step": 3013 }, { "clip_ratio": 0.0, "completion_length": 1393.1786499023438, "epoch": 0.9003061757897095, "grad_norm": 6.615682601928711, "kl": 2.3671875, "learning_rate": 3.806142237874228e-08, "loss": 0.1966, "reward": 0.525669664144516, "reward_std": 0.28188861906528473, "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494419664144516, "step": 3014 }, { "clip_ratio": 0.0, "completion_length": 1405.841552734375, "epoch": 0.9006048838772309, "grad_norm": 8.898879051208496, "kl": 2.8515625, "learning_rate": 3.8013562203152936e-08, "loss": 0.1739, "reward": 0.5167411044239998, "reward_std": 0.24736790359020233, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660969734192, "step": 3015 }, { "clip_ratio": 0.0, "completion_length": 1467.1630249023438, "epoch": 0.9009035919647524, "grad_norm": 5.325218677520752, "kl": 2.80078125, "learning_rate": 3.7965840176361914e-08, "loss": 0.2028, "reward": 0.5647321715950966, "reward_std": 0.28090863674879074, "rewards/accuracy_reward": 0.10714285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893133878708, "step": 3016 }, { "clip_ratio": 0.0, "completion_length": 1484.3192749023438, "epoch": 0.9012023000522739, "grad_norm": 3.6003901958465576, "kl": 2.708984375, "learning_rate": 3.7918256350286106e-08, "loss": 0.1837, "reward": 0.558035746216774, "reward_std": 0.2730547711253166, "rewards/accuracy_reward": 0.10044643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 3017 }, { "clip_ratio": 0.0, "completion_length": 1423.97998046875, "epoch": 0.9015010081397954, "grad_norm": 17.62794303894043, "kl": 2.14453125, "learning_rate": 3.787081077669201e-08, "loss": 0.1903, "reward": 0.6171875447034836, "reward_std": 0.3018953166902065, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 3018 }, { "clip_ratio": 0.0, "completion_length": 1365.6027526855469, "epoch": 0.9017997162273168, "grad_norm": 12.378169059753418, "kl": 2.3046875, "learning_rate": 3.7823503507195804e-08, "loss": 0.1836, "reward": 0.5318080708384514, "reward_std": 0.2553177960216999, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 3019 }, { "clip_ratio": 0.0, "completion_length": 1392.6541137695312, "epoch": 0.9020984243148383, "grad_norm": 2.8717737197875977, "kl": 2.38671875, "learning_rate": 3.777633459326316e-08, "loss": 0.1632, "reward": 0.683035746216774, "reward_std": 0.3095628246665001, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250149011612, "step": 3020 }, { "clip_ratio": 0.0, "completion_length": 1494.3192443847656, "epoch": 0.9023971324023597, "grad_norm": 7.668806552886963, "kl": 2.31640625, "learning_rate": 3.7729304086209204e-08, "loss": 0.1529, "reward": 0.5602678954601288, "reward_std": 0.28093162178993225, "rewards/accuracy_reward": 0.09821428940631449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535895228386, "step": 3021 }, { "clip_ratio": 0.0, "completion_length": 1469.65185546875, "epoch": 0.9026958404898813, "grad_norm": 4.99358606338501, "kl": 2.984375, "learning_rate": 3.7682412037198536e-08, "loss": 0.2119, "reward": 0.5172991380095482, "reward_std": 0.25615132227540016, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 3022 }, { "clip_ratio": 0.0, "completion_length": 1376.7746276855469, "epoch": 0.9029945485774027, "grad_norm": 6.790584087371826, "kl": 2.73828125, "learning_rate": 3.7635658497245124e-08, "loss": 0.189, "reward": 0.5837053805589676, "reward_std": 0.27687161043286324, "rewards/accuracy_reward": 0.10937500651925802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 3023 }, { "clip_ratio": 0.0, "completion_length": 1418.1406555175781, "epoch": 0.9032932566649242, "grad_norm": 8.051871299743652, "kl": 2.49609375, "learning_rate": 3.758904351721219e-08, "loss": 0.1934, "reward": 0.5150669738650322, "reward_std": 0.2781341187655926, "rewards/accuracy_reward": 0.04687500325962901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919887661934, "step": 3024 }, { "clip_ratio": 0.0, "completion_length": 1412.7277526855469, "epoch": 0.9035919647524456, "grad_norm": 2.680983781814575, "kl": 2.3671875, "learning_rate": 3.754256714781233e-08, "loss": 0.1734, "reward": 0.5797991380095482, "reward_std": 0.3107425905764103, "rewards/accuracy_reward": 0.12053571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4592634066939354, "step": 3025 }, { "clip_ratio": 0.0, "completion_length": 1490.6005249023438, "epoch": 0.9038906728399672, "grad_norm": 9.460180282592773, "kl": 3.03125, "learning_rate": 3.749622943960721e-08, "loss": 0.218, "reward": 0.4949776977300644, "reward_std": 0.26097121834754944, "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562649011612, "step": 3026 }, { "clip_ratio": 0.0, "completion_length": 1443.7098999023438, "epoch": 0.9041893809274886, "grad_norm": 6.955436706542969, "kl": 2.5, "learning_rate": 3.745003044300774e-08, "loss": 0.2128, "reward": 0.5234375298023224, "reward_std": 0.28592555969953537, "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875223517418, "step": 3027 }, { "clip_ratio": 0.0, "completion_length": 1413.6763916015625, "epoch": 0.9044880890150101, "grad_norm": 8.356184005737305, "kl": 2.2890625, "learning_rate": 3.740397020827395e-08, "loss": 0.1632, "reward": 0.5669643133878708, "reward_std": 0.2705033868551254, "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 3028 }, { "clip_ratio": 0.0, "completion_length": 1445.6496276855469, "epoch": 0.9047867971025315, "grad_norm": 10.375825881958008, "kl": 3.046875, "learning_rate": 3.735804878551479e-08, "loss": 0.1979, "reward": 0.498883955180645, "reward_std": 0.2803350165486336, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4520089477300644, "step": 3029 }, { "clip_ratio": 0.0, "completion_length": 1440.5960083007812, "epoch": 0.905085505190053, "grad_norm": 5.484809875488281, "kl": 2.734375, "learning_rate": 3.7312266224688276e-08, "loss": 0.2204, "reward": 0.541294664144516, "reward_std": 0.2612360380589962, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 3030 }, { "clip_ratio": 0.0, "completion_length": 1390.6183776855469, "epoch": 0.9053842132775745, "grad_norm": 7.492177486419678, "kl": 2.427734375, "learning_rate": 3.726662257560141e-08, "loss": 0.2277, "reward": 0.6222098469734192, "reward_std": 0.27185046672821045, "rewards/accuracy_reward": 0.14955357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562798023224, "step": 3031 }, { "clip_ratio": 0.0, "completion_length": 1421.5179138183594, "epoch": 0.905682921365096, "grad_norm": 5.801004886627197, "kl": 2.55078125, "learning_rate": 3.722111788790994e-08, "loss": 0.2151, "reward": 0.4921875298023224, "reward_std": 0.2772926054894924, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458705373108387, "step": 3032 }, { "clip_ratio": 0.0, "completion_length": 1424.2523193359375, "epoch": 0.9059816294526174, "grad_norm": 3.7513411045074463, "kl": 2.53515625, "learning_rate": 3.717575221111852e-08, "loss": 0.1724, "reward": 0.4960937649011612, "reward_std": 0.27293746173381805, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4670759066939354, "step": 3033 }, { "clip_ratio": 0.0, "completion_length": 1537.1607971191406, "epoch": 0.9062803375401389, "grad_norm": 6.100399494171143, "kl": 2.359375, "learning_rate": 3.713052559458061e-08, "loss": 0.1715, "reward": 0.5223214477300644, "reward_std": 0.28921379894018173, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732142984867096, "step": 3034 }, { "clip_ratio": 0.0, "completion_length": 1428.6786499023438, "epoch": 0.9065790456276603, "grad_norm": 4.7932515144348145, "kl": 2.7421875, "learning_rate": 3.7085438087498263e-08, "loss": 0.2408, "reward": 0.573660746216774, "reward_std": 0.25320233777165413, "rewards/accuracy_reward": 0.08705357438884676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4866071566939354, "step": 3035 }, { "clip_ratio": 0.0, "completion_length": 1461.5134582519531, "epoch": 0.9068777537151819, "grad_norm": 7.162165641784668, "kl": 2.70703125, "learning_rate": 3.7040489738922314e-08, "loss": 0.1653, "reward": 0.5334821566939354, "reward_std": 0.2650851234793663, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4330357313156128, "step": 3036 }, { "clip_ratio": 0.0, "completion_length": 1414.8014221191406, "epoch": 0.9071764618027033, "grad_norm": 7.262499809265137, "kl": 2.62109375, "learning_rate": 3.699568059775215e-08, "loss": 0.1517, "reward": 0.5513393208384514, "reward_std": 0.2962154448032379, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535895228386, "step": 3037 }, { "clip_ratio": 0.0, "completion_length": 1425.65185546875, "epoch": 0.9074751698902248, "grad_norm": 18.07046890258789, "kl": 3.171875, "learning_rate": 3.695101071273572e-08, "loss": 0.1743, "reward": 0.6272321790456772, "reward_std": 0.26750317960977554, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4910714477300644, "step": 3038 }, { "clip_ratio": 0.0, "completion_length": 1447.857177734375, "epoch": 0.9077738779777462, "grad_norm": 3.441465139389038, "kl": 2.314453125, "learning_rate": 3.69064801324695e-08, "loss": 0.1557, "reward": 0.5351562649011612, "reward_std": 0.2234749048948288, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 3039 }, { "clip_ratio": 0.0, "completion_length": 1414.4844665527344, "epoch": 0.9080725860652678, "grad_norm": 3.888309955596924, "kl": 2.64453125, "learning_rate": 3.6862088905398366e-08, "loss": 0.1805, "reward": 0.5463169887661934, "reward_std": 0.2908196672797203, "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919813156128, "step": 3040 }, { "clip_ratio": 0.0, "completion_length": 1365.4241333007812, "epoch": 0.9083712941527892, "grad_norm": 7.118712425231934, "kl": 2.84375, "learning_rate": 3.681783707981564e-08, "loss": 0.2039, "reward": 0.5167410895228386, "reward_std": 0.24751991033554077, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4542410895228386, "step": 3041 }, { "clip_ratio": 0.0, "completion_length": 1394.3639221191406, "epoch": 0.9086700022403107, "grad_norm": 3.9439713954925537, "kl": 2.41796875, "learning_rate": 3.677372470386298e-08, "loss": 0.1778, "reward": 0.6406250149011612, "reward_std": 0.35209525376558304, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486607164144516, "step": 3042 }, { "clip_ratio": 0.0, "completion_length": 1443.2433776855469, "epoch": 0.9089687103278321, "grad_norm": 9.849898338317871, "kl": 2.279296875, "learning_rate": 3.67297518255303e-08, "loss": 0.1838, "reward": 0.5993303880095482, "reward_std": 0.28296447917819023, "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487723246216774, "step": 3043 }, { "clip_ratio": 0.0, "completion_length": 1394.7389221191406, "epoch": 0.9092674184153536, "grad_norm": 7.097021102905273, "kl": 2.5078125, "learning_rate": 3.668591849265581e-08, "loss": 0.2329, "reward": 0.524553582072258, "reward_std": 0.3097272776067257, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 3044 }, { "clip_ratio": 0.0, "completion_length": 1415.1540832519531, "epoch": 0.909566126502875, "grad_norm": 5.890965938568115, "kl": 2.494140625, "learning_rate": 3.664222475292587e-08, "loss": 0.2204, "reward": 0.5228794813156128, "reward_std": 0.2624056860804558, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080484867096, "step": 3045 }, { "clip_ratio": 0.0, "completion_length": 1473.5715026855469, "epoch": 0.9098648345903966, "grad_norm": 4.249211311340332, "kl": 2.41796875, "learning_rate": 3.6598670653874975e-08, "loss": 0.1685, "reward": 0.482142873108387, "reward_std": 0.23064041137695312, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.475446455180645, "step": 3046 }, { "clip_ratio": 0.0, "completion_length": 1545.6853332519531, "epoch": 0.910163542677918, "grad_norm": 3.5298750400543213, "kl": 3.01953125, "learning_rate": 3.655525624288573e-08, "loss": 0.1954, "reward": 0.4659598469734192, "reward_std": 0.24989687278866768, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4481026977300644, "step": 3047 }, { "clip_ratio": 0.0, "completion_length": 1309.5714721679688, "epoch": 0.9104622507654395, "grad_norm": 13.7044095993042, "kl": 2.119140625, "learning_rate": 3.651198156718875e-08, "loss": 0.1902, "reward": 0.6227678954601288, "reward_std": 0.32200518250465393, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393133878708, "step": 3048 }, { "clip_ratio": 0.0, "completion_length": 1501.3370971679688, "epoch": 0.9107609588529609, "grad_norm": 3.932826042175293, "kl": 2.8125, "learning_rate": 3.646884667386265e-08, "loss": 0.1505, "reward": 0.550781287252903, "reward_std": 0.24339864775538445, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459263414144516, "step": 3049 }, { "clip_ratio": 0.0, "completion_length": 1456.4397888183594, "epoch": 0.9110596669404825, "grad_norm": 4.980598449707031, "kl": 2.64453125, "learning_rate": 3.642585160983396e-08, "loss": 0.1777, "reward": 0.599888414144516, "reward_std": 0.2409662939608097, "rewards/accuracy_reward": 0.12723214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562649011612, "step": 3050 }, { "clip_ratio": 0.0, "completion_length": 1401.8013610839844, "epoch": 0.9113583750280039, "grad_norm": 6.4377760887146, "kl": 2.59375, "learning_rate": 3.638299642187712e-08, "loss": 0.2199, "reward": 0.526227705180645, "reward_std": 0.2613772414624691, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 3051 }, { "clip_ratio": 0.0, "completion_length": 1552.6295471191406, "epoch": 0.9116570831155254, "grad_norm": 3.160402774810791, "kl": 2.83984375, "learning_rate": 3.6340281156614314e-08, "loss": 0.2119, "reward": 0.4687500223517418, "reward_std": 0.2718455381691456, "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464285895228386, "step": 3052 }, { "clip_ratio": 0.0, "completion_length": 1386.1027526855469, "epoch": 0.9119557912030468, "grad_norm": 3.559133768081665, "kl": 2.625, "learning_rate": 3.6297705860515634e-08, "loss": 0.1577, "reward": 0.6294643133878708, "reward_std": 0.2876538261771202, "rewards/accuracy_reward": 0.14508929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750223517418, "step": 3053 }, { "clip_ratio": 0.0, "completion_length": 1468.0380249023438, "epoch": 0.9122544992905683, "grad_norm": 2.704960346221924, "kl": 2.75390625, "learning_rate": 3.625527057989878e-08, "loss": 0.1661, "reward": 0.5133928954601288, "reward_std": 0.31043073162436485, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466517873108387, "step": 3054 }, { "clip_ratio": 0.0, "completion_length": 1510.8393249511719, "epoch": 0.9125532073780898, "grad_norm": 6.0154948234558105, "kl": 2.96484375, "learning_rate": 3.621297536092919e-08, "loss": 0.1883, "reward": 0.5982143133878708, "reward_std": 0.2666654735803604, "rewards/accuracy_reward": 0.13616072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535895228386, "step": 3055 }, { "clip_ratio": 0.0, "completion_length": 1437.5089721679688, "epoch": 0.9128519154656113, "grad_norm": 3.496431589126587, "kl": 2.9375, "learning_rate": 3.617082024961995e-08, "loss": 0.2681, "reward": 0.5513392984867096, "reward_std": 0.2689623385667801, "rewards/accuracy_reward": 0.09598214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 3056 }, { "clip_ratio": 0.0, "completion_length": 1385.82373046875, "epoch": 0.9131506235531327, "grad_norm": 4.991671562194824, "kl": 2.125, "learning_rate": 3.612880529183162e-08, "loss": 0.1409, "reward": 0.657366082072258, "reward_std": 0.28985925018787384, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4988839626312256, "step": 3057 }, { "clip_ratio": 0.0, "completion_length": 1411.8103332519531, "epoch": 0.9134493316406541, "grad_norm": 6.594745635986328, "kl": 2.1328125, "learning_rate": 3.6086930533272424e-08, "loss": 0.1546, "reward": 0.546316996216774, "reward_std": 0.25865986943244934, "rewards/accuracy_reward": 0.058035718044266105, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4882812723517418, "step": 3058 }, { "clip_ratio": 0.0, "completion_length": 1509.8973693847656, "epoch": 0.9137480397281756, "grad_norm": 2.933013677597046, "kl": 2.39453125, "learning_rate": 3.604519601949798e-08, "loss": 0.1674, "reward": 0.4938616305589676, "reward_std": 0.279225904494524, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116380095482, "step": 3059 }, { "clip_ratio": 0.0, "completion_length": 1403.5558471679688, "epoch": 0.9140467478156971, "grad_norm": 6.3589701652526855, "kl": 2.662109375, "learning_rate": 3.600360179591132e-08, "loss": 0.232, "reward": 0.5898437798023224, "reward_std": 0.294145368039608, "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366305589676, "step": 3060 }, { "clip_ratio": 0.0, "completion_length": 1343.591552734375, "epoch": 0.9143454559032186, "grad_norm": 10.082201957702637, "kl": 2.60546875, "learning_rate": 3.59621479077629e-08, "loss": 0.2176, "reward": 0.552455373108387, "reward_std": 0.33382246643304825, "rewards/accuracy_reward": 0.07142857182770967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810267984867096, "step": 3061 }, { "clip_ratio": 0.0, "completion_length": 1470.5871276855469, "epoch": 0.91464416399074, "grad_norm": 3.5316221714019775, "kl": 2.814453125, "learning_rate": 3.592083440015049e-08, "loss": 0.2041, "reward": 0.498325914144516, "reward_std": 0.2533336617052555, "rewards/accuracy_reward": 0.017857143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687798023224, "step": 3062 }, { "clip_ratio": 0.0, "completion_length": 1444.2857666015625, "epoch": 0.9149428720782615, "grad_norm": 3.2309019565582275, "kl": 2.302734375, "learning_rate": 3.587966131801913e-08, "loss": 0.1463, "reward": 0.5245535969734192, "reward_std": 0.24016597494482994, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 3063 }, { "clip_ratio": 0.0, "completion_length": 1391.6361999511719, "epoch": 0.9152415801657829, "grad_norm": 7.560273170471191, "kl": 2.3671875, "learning_rate": 3.58386287061611e-08, "loss": 0.1948, "reward": 0.5379464477300644, "reward_std": 0.3004927486181259, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.491071455180645, "step": 3064 }, { "clip_ratio": 0.0, "completion_length": 1541.2255249023438, "epoch": 0.9155402882533045, "grad_norm": 2.782742500305176, "kl": 2.6015625, "learning_rate": 3.579773660921589e-08, "loss": 0.1579, "reward": 0.5597098469734192, "reward_std": 0.29639818519353867, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312649011612, "step": 3065 }, { "clip_ratio": 0.0, "completion_length": 1439.4375610351562, "epoch": 0.9158389963408259, "grad_norm": 5.611033916473389, "kl": 3.02734375, "learning_rate": 3.575698507167004e-08, "loss": 0.2197, "reward": 0.5329241380095482, "reward_std": 0.29161201417446136, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312649011612, "step": 3066 }, { "clip_ratio": 0.0, "completion_length": 1430.0201721191406, "epoch": 0.9161377044283474, "grad_norm": 11.061599731445312, "kl": 3.19921875, "learning_rate": 3.5716374137857266e-08, "loss": 0.2116, "reward": 0.595982164144516, "reward_std": 0.24780163913965225, "rewards/accuracy_reward": 0.1250000020954758, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 3067 }, { "clip_ratio": 0.0, "completion_length": 1430.1763916015625, "epoch": 0.9164364125158688, "grad_norm": 7.858548164367676, "kl": 2.703125, "learning_rate": 3.567590385195828e-08, "loss": 0.1914, "reward": 0.5212053880095482, "reward_std": 0.31684084236621857, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018059372902, "step": 3068 }, { "clip_ratio": 0.0, "completion_length": 1441.1362609863281, "epoch": 0.9167351206033904, "grad_norm": 4.666031360626221, "kl": 3.033203125, "learning_rate": 3.563557425800075e-08, "loss": 0.2245, "reward": 0.5256696715950966, "reward_std": 0.28587891533970833, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 3069 }, { "clip_ratio": 0.0, "completion_length": 1445.2098693847656, "epoch": 0.9170338286909118, "grad_norm": 15.729207992553711, "kl": 2.54296875, "learning_rate": 3.559538539985938e-08, "loss": 0.2053, "reward": 0.544084832072258, "reward_std": 0.28952478617429733, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.468191996216774, "step": 3070 }, { "clip_ratio": 0.0, "completion_length": 1514.12060546875, "epoch": 0.9173325367784333, "grad_norm": 3.7852623462677, "kl": 2.53125, "learning_rate": 3.555533732125567e-08, "loss": 0.1511, "reward": 0.5558036118745804, "reward_std": 0.31261296570301056, "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4709821715950966, "step": 3071 }, { "clip_ratio": 0.0, "completion_length": 1475.4263916015625, "epoch": 0.9176312448659547, "grad_norm": 10.118858337402344, "kl": 2.216796875, "learning_rate": 3.5515430065758015e-08, "loss": 0.1376, "reward": 0.4927455484867096, "reward_std": 0.25264014303684235, "rewards/accuracy_reward": 0.017857143422588706, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884215950966, "step": 3072 }, { "clip_ratio": 0.0, "completion_length": 1480.5447387695312, "epoch": 0.9179299529534762, "grad_norm": 5.294891834259033, "kl": 2.70703125, "learning_rate": 3.5475663676781596e-08, "loss": 0.1645, "reward": 0.5546875149011612, "reward_std": 0.24136748909950256, "rewards/accuracy_reward": 0.09375000582076609, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375149011612, "step": 3073 }, { "clip_ratio": 0.0, "completion_length": 1491.1027526855469, "epoch": 0.9182286610409977, "grad_norm": 3.849975824356079, "kl": 2.71875, "learning_rate": 3.5436038197588334e-08, "loss": 0.1765, "reward": 0.5652901977300644, "reward_std": 0.2858643904328346, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794813156128, "step": 3074 }, { "clip_ratio": 0.0, "completion_length": 1472.7879943847656, "epoch": 0.9185273691285192, "grad_norm": 6.154011249542236, "kl": 2.435546875, "learning_rate": 3.53965536712869e-08, "loss": 0.2031, "reward": 0.5351562798023224, "reward_std": 0.25866954401135445, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 3075 }, { "clip_ratio": 0.0, "completion_length": 1415.7031860351562, "epoch": 0.9188260772160406, "grad_norm": 4.690402030944824, "kl": 2.68359375, "learning_rate": 3.535721014083256e-08, "loss": 0.192, "reward": 0.5898437947034836, "reward_std": 0.26172588393092155, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455915205180645, "step": 3076 }, { "clip_ratio": 0.0, "completion_length": 1377.6451416015625, "epoch": 0.9191247853035621, "grad_norm": 2.4833602905273438, "kl": 2.55078125, "learning_rate": 3.531800764902723e-08, "loss": 0.2011, "reward": 0.5948660969734192, "reward_std": 0.3255440294742584, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4988839402794838, "step": 3077 }, { "clip_ratio": 0.0, "completion_length": 1472.0313110351562, "epoch": 0.9194234933910835, "grad_norm": 3.724416971206665, "kl": 2.70703125, "learning_rate": 3.527894623851937e-08, "loss": 0.2084, "reward": 0.5128348469734192, "reward_std": 0.28591011092066765, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4503348544239998, "step": 3078 }, { "clip_ratio": 0.0, "completion_length": 1418.8215026855469, "epoch": 0.9197222014786051, "grad_norm": 11.51362419128418, "kl": 2.259765625, "learning_rate": 3.5240025951803996e-08, "loss": 0.1948, "reward": 0.5474330633878708, "reward_std": 0.30576443672180176, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 3079 }, { "clip_ratio": 0.0, "completion_length": 1388.7813110351562, "epoch": 0.9200209095661265, "grad_norm": 13.342612266540527, "kl": 2.44921875, "learning_rate": 3.520124683122251e-08, "loss": 0.2344, "reward": 0.6300223544239998, "reward_std": 0.27361468970775604, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027901902794838, "step": 3080 }, { "clip_ratio": 0.0, "completion_length": 1566.6518859863281, "epoch": 0.920319617653648, "grad_norm": 4.225193023681641, "kl": 2.546875, "learning_rate": 3.5162608918962855e-08, "loss": 0.1844, "reward": 0.4620535969734192, "reward_std": 0.24180793017148972, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4508928805589676, "step": 3081 }, { "clip_ratio": 0.0, "completion_length": 1448.3058776855469, "epoch": 0.9206183257411694, "grad_norm": 6.087406635284424, "kl": 2.61328125, "learning_rate": 3.512411225705925e-08, "loss": 0.2016, "reward": 0.5502232313156128, "reward_std": 0.26750001311302185, "rewards/accuracy_reward": 0.06919643352739513, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268133878708, "step": 3082 }, { "clip_ratio": 0.0, "completion_length": 1423.8147888183594, "epoch": 0.920917033828691, "grad_norm": 4.91497278213501, "kl": 2.81640625, "learning_rate": 3.5085756887392296e-08, "loss": 0.179, "reward": 0.5301339477300644, "reward_std": 0.2541780285537243, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4430803805589676, "step": 3083 }, { "clip_ratio": 0.0, "completion_length": 1446.8214721679688, "epoch": 0.9212157419162124, "grad_norm": 5.041876316070557, "kl": 2.5234375, "learning_rate": 3.5047542851688906e-08, "loss": 0.1947, "reward": 0.5837053805589676, "reward_std": 0.28574465960264206, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660895228386, "step": 3084 }, { "clip_ratio": 0.0, "completion_length": 1433.4598999023438, "epoch": 0.9215144500037339, "grad_norm": 7.955806732177734, "kl": 2.5703125, "learning_rate": 3.500947019152217e-08, "loss": 0.228, "reward": 0.604910746216774, "reward_std": 0.29156796634197235, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 3085 }, { "clip_ratio": 0.0, "completion_length": 1367.4688110351562, "epoch": 0.9218131580912553, "grad_norm": 4.996347427368164, "kl": 2.58984375, "learning_rate": 3.49715389483114e-08, "loss": 0.2168, "reward": 0.5943080708384514, "reward_std": 0.2806132324039936, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4916294813156128, "step": 3086 }, { "clip_ratio": 0.0, "completion_length": 1488.0045471191406, "epoch": 0.9221118661787768, "grad_norm": 9.552310943603516, "kl": 2.8203125, "learning_rate": 3.4933749163322136e-08, "loss": 0.1989, "reward": 0.4709821715950966, "reward_std": 0.23941325023770332, "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464285895228386, "step": 3087 }, { "clip_ratio": 0.0, "completion_length": 1447.9598693847656, "epoch": 0.9224105742662982, "grad_norm": 5.5039777755737305, "kl": 3.02734375, "learning_rate": 3.48961008776659e-08, "loss": 0.2155, "reward": 0.4866071715950966, "reward_std": 0.282045591622591, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4419643133878708, "step": 3088 }, { "clip_ratio": 0.0, "completion_length": 1421.4598693847656, "epoch": 0.9227092823538198, "grad_norm": 6.401374340057373, "kl": 3.1328125, "learning_rate": 3.485859413230036e-08, "loss": 0.1965, "reward": 0.474888414144516, "reward_std": 0.25554919615387917, "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4525669813156128, "step": 3089 }, { "clip_ratio": 0.0, "completion_length": 1450.8281860351562, "epoch": 0.9230079904413412, "grad_norm": 17.416728973388672, "kl": 2.23828125, "learning_rate": 3.4821228968029186e-08, "loss": 0.1777, "reward": 0.5680803954601288, "reward_std": 0.27406686544418335, "rewards/accuracy_reward": 0.060267861699685454, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125298023224, "step": 3090 }, { "clip_ratio": 0.0, "completion_length": 1431.85498046875, "epoch": 0.9233066985288627, "grad_norm": 7.043900489807129, "kl": 2.703125, "learning_rate": 3.478400542550199e-08, "loss": 0.1607, "reward": 0.5435268133878708, "reward_std": 0.2812582030892372, "rewards/accuracy_reward": 0.09151786030270159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4520089477300644, "step": 3091 }, { "clip_ratio": 0.0, "completion_length": 1388.2701721191406, "epoch": 0.9236054066163841, "grad_norm": 3.3002560138702393, "kl": 2.63671875, "learning_rate": 3.4746923545214355e-08, "loss": 0.1908, "reward": 0.518973246216774, "reward_std": 0.29796336218714714, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474330373108387, "step": 3092 }, { "clip_ratio": 0.0, "completion_length": 1433.9732971191406, "epoch": 0.9239041147039057, "grad_norm": 3.446383476257324, "kl": 2.5625, "learning_rate": 3.470998336750773e-08, "loss": 0.1862, "reward": 0.5619419813156128, "reward_std": 0.28093619644641876, "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705559372902, "step": 3093 }, { "clip_ratio": 0.0, "completion_length": 1431.5246276855469, "epoch": 0.9242028227914271, "grad_norm": 10.381686210632324, "kl": 3.12109375, "learning_rate": 3.46731849325694e-08, "loss": 0.2529, "reward": 0.5401785895228386, "reward_std": 0.27788741514086723, "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178805589676, "step": 3094 }, { "clip_ratio": 0.0, "completion_length": 1381.76123046875, "epoch": 0.9245015308789486, "grad_norm": 4.284051418304443, "kl": 2.416015625, "learning_rate": 3.463652828043249e-08, "loss": 0.1952, "reward": 0.5251116305589676, "reward_std": 0.2167663685977459, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482700914144516, "step": 3095 }, { "clip_ratio": 0.0, "completion_length": 1482.0938110351562, "epoch": 0.92480023896647, "grad_norm": 10.857874870300293, "kl": 2.57421875, "learning_rate": 3.460001345097579e-08, "loss": 0.1579, "reward": 0.488281287252903, "reward_std": 0.25768495723605156, "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276977300644, "step": 3096 }, { "clip_ratio": 0.0, "completion_length": 1427.0558776855469, "epoch": 0.9250989470539915, "grad_norm": 4.284520626068115, "kl": 2.64453125, "learning_rate": 3.456364048392388e-08, "loss": 0.1822, "reward": 0.5435268208384514, "reward_std": 0.2556290999054909, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 3097 }, { "clip_ratio": 0.0, "completion_length": 1464.8371276855469, "epoch": 0.925397655141513, "grad_norm": 7.9286909103393555, "kl": 2.640625, "learning_rate": 3.4527409418847016e-08, "loss": 0.1622, "reward": 0.4508928880095482, "reward_std": 0.23093654960393906, "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000223517418, "step": 3098 }, { "clip_ratio": 0.0, "completion_length": 1426.0737609863281, "epoch": 0.9256963632290345, "grad_norm": 7.0578742027282715, "kl": 2.72265625, "learning_rate": 3.449132029516099e-08, "loss": 0.209, "reward": 0.6037946790456772, "reward_std": 0.26114626973867416, "rewards/accuracy_reward": 0.16517857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4386160895228386, "step": 3099 }, { "clip_ratio": 0.0, "completion_length": 1451.7076721191406, "epoch": 0.9259950713165559, "grad_norm": 5.354708194732666, "kl": 2.44921875, "learning_rate": 3.445537315212725e-08, "loss": 0.1638, "reward": 0.5943080633878708, "reward_std": 0.2468419335782528, "rewards/accuracy_reward": 0.16071429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4335937649011612, "step": 3100 }, { "clip_ratio": 0.0, "completion_length": 1448.5514221191406, "epoch": 0.9262937794040773, "grad_norm": 3.6806201934814453, "kl": 2.115234375, "learning_rate": 3.441956802885281e-08, "loss": 0.1319, "reward": 0.554129496216774, "reward_std": 0.28679290786385536, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4514509215950966, "step": 3101 }, { "clip_ratio": 0.0, "completion_length": 1464.37060546875, "epoch": 0.9265924874915988, "grad_norm": 4.048487186431885, "kl": 2.56640625, "learning_rate": 3.4383904964290055e-08, "loss": 0.1583, "reward": 0.5156250149011612, "reward_std": 0.25645051896572113, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.450892873108387, "step": 3102 }, { "clip_ratio": 0.0, "completion_length": 1561.5804443359375, "epoch": 0.9268911955791203, "grad_norm": 2.6570022106170654, "kl": 2.15234375, "learning_rate": 3.4348383997236966e-08, "loss": 0.1294, "reward": 0.4960937649011612, "reward_std": 0.2710064761340618, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794813156128, "step": 3103 }, { "clip_ratio": 0.0, "completion_length": 1462.6808776855469, "epoch": 0.9271899036666418, "grad_norm": 10.295455932617188, "kl": 2.19140625, "learning_rate": 3.4313005166336865e-08, "loss": 0.1503, "reward": 0.5954241305589676, "reward_std": 0.26084403693675995, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919887661934, "step": 3104 }, { "clip_ratio": 0.0, "completion_length": 1404.0290832519531, "epoch": 0.9274886117541632, "grad_norm": 10.016919136047363, "kl": 2.08984375, "learning_rate": 3.427776851007842e-08, "loss": 0.1656, "reward": 0.4994419813156128, "reward_std": 0.3127823695540428, "rewards/accuracy_reward": 0.04687500325962901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4525669887661934, "step": 3105 }, { "clip_ratio": 0.0, "completion_length": 1485.4286499023438, "epoch": 0.9277873198416847, "grad_norm": 6.744906902313232, "kl": 2.0, "learning_rate": 3.424267406679569e-08, "loss": 0.1706, "reward": 0.5864955633878708, "reward_std": 0.2955678179860115, "rewards/accuracy_reward": 0.11160714854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 3106 }, { "clip_ratio": 0.0, "completion_length": 1410.1273193359375, "epoch": 0.9280860279292061, "grad_norm": 5.601937770843506, "kl": 2.49609375, "learning_rate": 3.4207721874667984e-08, "loss": 0.1964, "reward": 0.5625000223517418, "reward_std": 0.2518373914062977, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4754464477300644, "step": 3107 }, { "clip_ratio": 0.0, "completion_length": 1441.6585388183594, "epoch": 0.9283847360167277, "grad_norm": 5.488917827606201, "kl": 2.14453125, "learning_rate": 3.417291197171984e-08, "loss": 0.1732, "reward": 0.5055803805589676, "reward_std": 0.2608025260269642, "rewards/accuracy_reward": 0.06250000395812094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443080373108387, "step": 3108 }, { "clip_ratio": 0.0, "completion_length": 1387.4509887695312, "epoch": 0.9286834441042491, "grad_norm": 5.3404541015625, "kl": 2.28125, "learning_rate": 3.413824439582106e-08, "loss": 0.1961, "reward": 0.5245535895228386, "reward_std": 0.2992234155535698, "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4419643133878708, "step": 3109 }, { "clip_ratio": 0.0, "completion_length": 1493.1205749511719, "epoch": 0.9289821521917706, "grad_norm": 5.552827835083008, "kl": 2.099609375, "learning_rate": 3.410371918468653e-08, "loss": 0.1008, "reward": 0.5753348618745804, "reward_std": 0.26824599504470825, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5083705633878708, "step": 3110 }, { "clip_ratio": 0.0, "completion_length": 1390.65185546875, "epoch": 0.929280860279292, "grad_norm": 5.506743907928467, "kl": 2.18359375, "learning_rate": 3.406933637587631e-08, "loss": 0.1965, "reward": 0.5407366380095482, "reward_std": 0.2593271993100643, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4670759066939354, "step": 3111 }, { "clip_ratio": 0.0, "completion_length": 1397.7723999023438, "epoch": 0.9295795683668135, "grad_norm": 4.154637336730957, "kl": 2.298828125, "learning_rate": 3.4035096006795544e-08, "loss": 0.1486, "reward": 0.631138414144516, "reward_std": 0.26001350954174995, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5039062723517418, "step": 3112 }, { "clip_ratio": 0.0, "completion_length": 1449.9107666015625, "epoch": 0.929878276454335, "grad_norm": 3.787842273712158, "kl": 2.60546875, "learning_rate": 3.400099811469437e-08, "loss": 0.1812, "reward": 0.6021205633878708, "reward_std": 0.28040437027812004, "rewards/accuracy_reward": 0.12723214831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 3113 }, { "clip_ratio": 0.0, "completion_length": 1433.8705749511719, "epoch": 0.9301769845418565, "grad_norm": 7.242470741271973, "kl": 2.451171875, "learning_rate": 3.396704273666797e-08, "loss": 0.1931, "reward": 0.5703125298023224, "reward_std": 0.31173665076494217, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4787946566939354, "step": 3114 }, { "clip_ratio": 0.0, "completion_length": 1438.6384887695312, "epoch": 0.9304756926293779, "grad_norm": 5.790591239929199, "kl": 2.392578125, "learning_rate": 3.3933229909656476e-08, "loss": 0.1672, "reward": 0.5591518133878708, "reward_std": 0.2751925513148308, "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4944196715950966, "step": 3115 }, { "clip_ratio": 0.0, "completion_length": 1429.716552734375, "epoch": 0.9307744007168994, "grad_norm": 15.408222198486328, "kl": 2.5078125, "learning_rate": 3.38995596704449e-08, "loss": 0.2487, "reward": 0.642857164144516, "reward_std": 0.29914747923612595, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4843750149011612, "step": 3116 }, { "clip_ratio": 0.0, "completion_length": 1407.9107666015625, "epoch": 0.9310731088044208, "grad_norm": 8.426671981811523, "kl": 2.65625, "learning_rate": 3.386603205566317e-08, "loss": 0.2343, "reward": 0.6037946790456772, "reward_std": 0.2803165353834629, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4877232387661934, "step": 3117 }, { "clip_ratio": 0.0, "completion_length": 1518.9531860351562, "epoch": 0.9313718168919424, "grad_norm": 12.34610652923584, "kl": 3.02734375, "learning_rate": 3.383264710178608e-08, "loss": 0.2087, "reward": 0.5039062723517418, "reward_std": 0.2598712518811226, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4481026977300644, "step": 3118 }, { "clip_ratio": 0.0, "completion_length": 1461.3371276855469, "epoch": 0.9316705249794638, "grad_norm": 3.1986546516418457, "kl": 2.85546875, "learning_rate": 3.3799404845133145e-08, "loss": 0.2153, "reward": 0.576450914144516, "reward_std": 0.31574732810258865, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4670759066939354, "step": 3119 }, { "clip_ratio": 0.0, "completion_length": 1511.5268249511719, "epoch": 0.9319692330669853, "grad_norm": 8.577537536621094, "kl": 3.09375, "learning_rate": 3.3766305321868716e-08, "loss": 0.2321, "reward": 0.5156250223517418, "reward_std": 0.2354440689086914, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4330357387661934, "step": 3120 }, { "clip_ratio": 0.0, "completion_length": 1485.3281860351562, "epoch": 0.9322679411545067, "grad_norm": 10.870013236999512, "kl": 2.6796875, "learning_rate": 3.3733348568001786e-08, "loss": 0.1577, "reward": 0.4737723395228386, "reward_std": 0.24637659639120102, "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187649011612, "step": 3121 }, { "clip_ratio": 0.0, "completion_length": 1530.22998046875, "epoch": 0.9325666492420283, "grad_norm": 3.6059377193450928, "kl": 2.8203125, "learning_rate": 3.37005346193861e-08, "loss": 0.198, "reward": 0.5245535895228386, "reward_std": 0.2851436696946621, "rewards/accuracy_reward": 0.06919643329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 3122 }, { "clip_ratio": 0.0, "completion_length": 1510.962158203125, "epoch": 0.9328653573295497, "grad_norm": 4.464468002319336, "kl": 2.57421875, "learning_rate": 3.366786351172004e-08, "loss": 0.1969, "reward": 0.510044664144516, "reward_std": 0.25845275819301605, "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732313156128, "step": 3123 }, { "clip_ratio": 0.0, "completion_length": 1396.65185546875, "epoch": 0.9331640654170712, "grad_norm": 13.349549293518066, "kl": 2.60546875, "learning_rate": 3.363533528054652e-08, "loss": 0.2221, "reward": 0.5747768059372902, "reward_std": 0.2865094915032387, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854911044239998, "step": 3124 }, { "clip_ratio": 0.0, "completion_length": 1439.0603332519531, "epoch": 0.9334627735045926, "grad_norm": 4.395313262939453, "kl": 2.74609375, "learning_rate": 3.360294996125311e-08, "loss": 0.1796, "reward": 0.601562537252903, "reward_std": 0.261079590767622, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375298023224, "step": 3125 }, { "clip_ratio": 0.0, "completion_length": 1484.3750610351562, "epoch": 0.9337614815921141, "grad_norm": 6.968785762786865, "kl": 2.76171875, "learning_rate": 3.357070758907185e-08, "loss": 0.1972, "reward": 0.5546875149011612, "reward_std": 0.26781709492206573, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4542410895228386, "step": 3126 }, { "clip_ratio": 0.0, "completion_length": 1512.0848999023438, "epoch": 0.9340601896796356, "grad_norm": 7.9962310791015625, "kl": 3.04296875, "learning_rate": 3.353860819907927e-08, "loss": 0.199, "reward": 0.464285746216774, "reward_std": 0.25553376972675323, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464285969734192, "step": 3127 }, { "clip_ratio": 0.0, "completion_length": 1397.9197387695312, "epoch": 0.9343588977671571, "grad_norm": 8.04548168182373, "kl": 2.5078125, "learning_rate": 3.3506651826196375e-08, "loss": 0.1892, "reward": 0.6763392984867096, "reward_std": 0.2651873156428337, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4642857313156128, "step": 3128 }, { "clip_ratio": 0.0, "completion_length": 1443.0893859863281, "epoch": 0.9346576058546785, "grad_norm": 9.466154098510742, "kl": 3.04296875, "learning_rate": 3.3474838505188557e-08, "loss": 0.2208, "reward": 0.4916294813156128, "reward_std": 0.299260426312685, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4380580559372902, "step": 3129 }, { "clip_ratio": 0.0, "completion_length": 1441.8504943847656, "epoch": 0.9349563139422, "grad_norm": 6.74104118347168, "kl": 2.67578125, "learning_rate": 3.3443168270665584e-08, "loss": 0.1744, "reward": 0.5178571790456772, "reward_std": 0.25845517963171005, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 3130 }, { "clip_ratio": 0.0, "completion_length": 1484.8348693847656, "epoch": 0.9352550220297214, "grad_norm": 6.203705310821533, "kl": 2.81640625, "learning_rate": 3.341164115708159e-08, "loss": 0.2164, "reward": 0.463727705180645, "reward_std": 0.25566353648900986, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4481026977300644, "step": 3131 }, { "clip_ratio": 0.0, "completion_length": 1554.3170471191406, "epoch": 0.935553730117243, "grad_norm": 3.104274272918701, "kl": 2.576171875, "learning_rate": 3.338025719873497e-08, "loss": 0.1708, "reward": 0.4860491305589676, "reward_std": 0.24030332267284393, "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312649011612, "step": 3132 }, { "clip_ratio": 0.0, "completion_length": 1403.1942443847656, "epoch": 0.9358524382047644, "grad_norm": 4.086523532867432, "kl": 2.345703125, "learning_rate": 3.33490164297684e-08, "loss": 0.1537, "reward": 0.6523437798023224, "reward_std": 0.3196733370423317, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223395228386, "step": 3133 }, { "clip_ratio": 0.0, "completion_length": 1494.9822082519531, "epoch": 0.9361511462922859, "grad_norm": 7.0004119873046875, "kl": 3.03515625, "learning_rate": 3.331791888416877e-08, "loss": 0.2146, "reward": 0.5747767984867096, "reward_std": 0.26713868230581284, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268059372902, "step": 3134 }, { "clip_ratio": 0.0, "completion_length": 1419.5670471191406, "epoch": 0.9364498543798073, "grad_norm": 2.585314989089966, "kl": 2.78125, "learning_rate": 3.328696459576715e-08, "loss": 0.202, "reward": 0.5401785895228386, "reward_std": 0.25383206456899643, "rewards/accuracy_reward": 0.08928571967408061, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.450892873108387, "step": 3135 }, { "clip_ratio": 0.0, "completion_length": 1525.6719360351562, "epoch": 0.9367485624673288, "grad_norm": 7.9224419593811035, "kl": 2.58203125, "learning_rate": 3.325615359823879e-08, "loss": 0.142, "reward": 0.5474330559372902, "reward_std": 0.26003750786185265, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044887661934, "step": 3136 }, { "clip_ratio": 0.0, "completion_length": 1402.4866638183594, "epoch": 0.9370472705548503, "grad_norm": 3.854868173599243, "kl": 2.75, "learning_rate": 3.322548592510305e-08, "loss": 0.222, "reward": 0.6060268133878708, "reward_std": 0.2656874395906925, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 3137 }, { "clip_ratio": 0.0, "completion_length": 1541.2947387695312, "epoch": 0.9373459786423718, "grad_norm": 4.243083953857422, "kl": 2.57421875, "learning_rate": 3.31949616097233e-08, "loss": 0.1806, "reward": 0.5284598395228386, "reward_std": 0.26844679936766624, "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241305589676, "step": 3138 }, { "clip_ratio": 0.0, "completion_length": 1472.7969360351562, "epoch": 0.9376446867298932, "grad_norm": 8.333685874938965, "kl": 2.40234375, "learning_rate": 3.316458068530705e-08, "loss": 0.1888, "reward": 0.5518973395228386, "reward_std": 0.28683367371559143, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330633878708, "step": 3139 }, { "clip_ratio": 0.0, "completion_length": 1410.0425109863281, "epoch": 0.9379433948174147, "grad_norm": 3.9134480953216553, "kl": 2.484375, "learning_rate": 3.3134343184905775e-08, "loss": 0.1353, "reward": 0.5089285895228386, "reward_std": 0.25039346516132355, "rewards/accuracy_reward": 0.05133928684517741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 3140 }, { "clip_ratio": 0.0, "completion_length": 1428.6183776855469, "epoch": 0.9382421029049361, "grad_norm": 5.631182670593262, "kl": 2.6328125, "learning_rate": 3.310424914141487e-08, "loss": 0.1838, "reward": 0.4916294813156128, "reward_std": 0.21982476115226746, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187723517418, "step": 3141 }, { "clip_ratio": 0.0, "completion_length": 1372.9107971191406, "epoch": 0.9385408109924577, "grad_norm": 2.7128474712371826, "kl": 2.14453125, "learning_rate": 3.3074298587573724e-08, "loss": 0.1355, "reward": 0.654575914144516, "reward_std": 0.26424986124038696, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044813156128, "step": 3142 }, { "clip_ratio": 0.0, "completion_length": 1514.1161193847656, "epoch": 0.9388395190799791, "grad_norm": 3.0983712673187256, "kl": 2.228515625, "learning_rate": 3.304449155596562e-08, "loss": 0.1322, "reward": 0.4860491305589676, "reward_std": 0.2652283310890198, "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 3143 }, { "clip_ratio": 0.0, "completion_length": 1512.6250610351562, "epoch": 0.9391382271675005, "grad_norm": 4.180562973022461, "kl": 2.083984375, "learning_rate": 3.3014828079017665e-08, "loss": 0.161, "reward": 0.5982143133878708, "reward_std": 0.28361066430807114, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888392984867096, "step": 3144 }, { "clip_ratio": 0.0, "completion_length": 1498.7009582519531, "epoch": 0.939436935255022, "grad_norm": 2.9830291271209717, "kl": 2.48828125, "learning_rate": 3.298530818900081e-08, "loss": 0.1297, "reward": 0.4905134066939354, "reward_std": 0.26683924347162247, "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.459263414144516, "step": 3145 }, { "clip_ratio": 0.0, "completion_length": 1309.2254943847656, "epoch": 0.9397356433425434, "grad_norm": 10.20754337310791, "kl": 2.05859375, "learning_rate": 3.295593191802984e-08, "loss": 0.2053, "reward": 0.6205357313156128, "reward_std": 0.2557844892144203, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393059372902, "step": 3146 }, { "clip_ratio": 0.0, "completion_length": 1561.2188110351562, "epoch": 0.940034351430065, "grad_norm": 3.986483335494995, "kl": 2.4296875, "learning_rate": 3.2926699298063225e-08, "loss": 0.1652, "reward": 0.5396205559372902, "reward_std": 0.2706926167011261, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4793526828289032, "step": 3147 }, { "clip_ratio": 0.0, "completion_length": 1351.5804138183594, "epoch": 0.9403330595175864, "grad_norm": 8.29495906829834, "kl": 2.181640625, "learning_rate": 3.2897610360903205e-08, "loss": 0.1947, "reward": 0.5429687723517418, "reward_std": 0.2678261548280716, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687798023224, "step": 3148 }, { "clip_ratio": 0.0, "completion_length": 1515.8460388183594, "epoch": 0.9406317676051079, "grad_norm": 2.368577480316162, "kl": 2.36328125, "learning_rate": 3.286866513819567e-08, "loss": 0.1692, "reward": 0.497767873108387, "reward_std": 0.27864545583724976, "rewards/accuracy_reward": 0.04464285867288709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250149011612, "step": 3149 }, { "clip_ratio": 0.0, "completion_length": 1430.8772888183594, "epoch": 0.9409304756926293, "grad_norm": 8.606961250305176, "kl": 2.3203125, "learning_rate": 3.283986366143021e-08, "loss": 0.2079, "reward": 0.5764509215950966, "reward_std": 0.3177759423851967, "rewards/accuracy_reward": 0.10267857764847577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4737723544239998, "step": 3150 }, { "clip_ratio": 0.0, "completion_length": 1478.5134887695312, "epoch": 0.9412291837801509, "grad_norm": 9.485764503479004, "kl": 2.0390625, "learning_rate": 3.281120596194001e-08, "loss": 0.1263, "reward": 0.5457589477300644, "reward_std": 0.29636046662926674, "rewards/accuracy_reward": 0.049107144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4966518059372902, "step": 3151 }, { "clip_ratio": 0.0, "completion_length": 1476.40185546875, "epoch": 0.9415278918676723, "grad_norm": 5.901967525482178, "kl": 2.17578125, "learning_rate": 3.2782692070901804e-08, "loss": 0.161, "reward": 0.6093750298023224, "reward_std": 0.26319262385368347, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 3152 }, { "clip_ratio": 0.0, "completion_length": 1507.6094360351562, "epoch": 0.9418265999551938, "grad_norm": 3.111905336380005, "kl": 2.71484375, "learning_rate": 3.275432201933596e-08, "loss": 0.2168, "reward": 0.478236623108387, "reward_std": 0.2819332741200924, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4425223395228386, "step": 3153 }, { "clip_ratio": 0.0, "completion_length": 1515.6138916015625, "epoch": 0.9421253080427152, "grad_norm": 4.146934986114502, "kl": 2.4296875, "learning_rate": 3.272609583810629e-08, "loss": 0.1881, "reward": 0.5262276977300644, "reward_std": 0.2618788927793503, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463727705180645, "step": 3154 }, { "clip_ratio": 0.0, "completion_length": 1469.08935546875, "epoch": 0.9424240161302367, "grad_norm": 7.254668712615967, "kl": 2.25390625, "learning_rate": 3.26980135579201e-08, "loss": 0.1273, "reward": 0.5764509215950966, "reward_std": 0.3067385032773018, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4983259215950966, "step": 3155 }, { "clip_ratio": 0.0, "completion_length": 1479.0692749023438, "epoch": 0.9427227242177582, "grad_norm": 4.539575099945068, "kl": 2.33984375, "learning_rate": 3.267007520932817e-08, "loss": 0.1356, "reward": 0.549107164144516, "reward_std": 0.29431138560175896, "rewards/accuracy_reward": 0.0669642889406532, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 3156 }, { "clip_ratio": 0.0, "completion_length": 1459.2969360351562, "epoch": 0.9430214323052797, "grad_norm": 3.3327910900115967, "kl": 2.474609375, "learning_rate": 3.26422808227247e-08, "loss": 0.1904, "reward": 0.5446428805589676, "reward_std": 0.26225726678967476, "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178805589676, "step": 3157 }, { "clip_ratio": 0.0, "completion_length": 1382.4018859863281, "epoch": 0.9433201403928011, "grad_norm": 5.916254043579102, "kl": 2.287109375, "learning_rate": 3.261463042834722e-08, "loss": 0.1616, "reward": 0.5401786044239998, "reward_std": 0.2881895937025547, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 3158 }, { "clip_ratio": 0.0, "completion_length": 1475.8661193847656, "epoch": 0.9436188484803226, "grad_norm": 2.43259596824646, "kl": 2.515625, "learning_rate": 3.258712405627669e-08, "loss": 0.2011, "reward": 0.603794664144516, "reward_std": 0.2863945923745632, "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982387661934, "step": 3159 }, { "clip_ratio": 0.0, "completion_length": 1439.5625915527344, "epoch": 0.943917556567844, "grad_norm": 2.2255470752716064, "kl": 2.57421875, "learning_rate": 3.255976173643733e-08, "loss": 0.2082, "reward": 0.588169664144516, "reward_std": 0.24067040532827377, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4720982387661934, "step": 3160 }, { "clip_ratio": 0.0, "completion_length": 1441.27685546875, "epoch": 0.9442162646553656, "grad_norm": 2.928894519805908, "kl": 2.57421875, "learning_rate": 3.253254349859666e-08, "loss": 0.1965, "reward": 0.5133928880095482, "reward_std": 0.28161338344216347, "rewards/accuracy_reward": 0.051339288940653205, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462053582072258, "step": 3161 }, { "clip_ratio": 0.0, "completion_length": 1452.3125305175781, "epoch": 0.944514972742887, "grad_norm": 4.073805332183838, "kl": 2.70703125, "learning_rate": 3.250546937236545e-08, "loss": 0.1649, "reward": 0.5234375298023224, "reward_std": 0.26402610540390015, "rewards/accuracy_reward": 0.06250000442378223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 3162 }, { "clip_ratio": 0.0, "completion_length": 1518.4085388183594, "epoch": 0.9448136808304085, "grad_norm": 4.189696311950684, "kl": 2.80859375, "learning_rate": 3.2478539387197724e-08, "loss": 0.1981, "reward": 0.5457589477300644, "reward_std": 0.2584156505763531, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 3163 }, { "clip_ratio": 0.0, "completion_length": 1424.8861999511719, "epoch": 0.9451123889179299, "grad_norm": 3.616835355758667, "kl": 3.0625, "learning_rate": 3.245175357239062e-08, "loss": 0.2502, "reward": 0.489955373108387, "reward_std": 0.2975064218044281, "rewards/accuracy_reward": 0.03571428800933063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4542410895228386, "step": 3164 }, { "clip_ratio": 0.0, "completion_length": 1456.415283203125, "epoch": 0.9454110970054515, "grad_norm": 6.82573127746582, "kl": 2.8046875, "learning_rate": 3.242511195708453e-08, "loss": 0.1991, "reward": 0.4938616305589676, "reward_std": 0.2414160594344139, "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080633878708, "step": 3165 }, { "clip_ratio": 0.0, "completion_length": 1443.5447387695312, "epoch": 0.9457098050929729, "grad_norm": 5.4159979820251465, "kl": 2.859375, "learning_rate": 3.2398614570262874e-08, "loss": 0.2019, "reward": 0.545758955180645, "reward_std": 0.2766606956720352, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4810268133878708, "step": 3166 }, { "clip_ratio": 0.0, "completion_length": 1529.1072082519531, "epoch": 0.9460085131804944, "grad_norm": 12.15856647491455, "kl": 2.80859375, "learning_rate": 3.237226144075225e-08, "loss": 0.1186, "reward": 0.4704241380095482, "reward_std": 0.24986978247761726, "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4458705559372902, "step": 3167 }, { "clip_ratio": 0.0, "completion_length": 1452.7723693847656, "epoch": 0.9463072212680158, "grad_norm": 8.24917221069336, "kl": 2.59375, "learning_rate": 3.234605259722225e-08, "loss": 0.198, "reward": 0.5239955633878708, "reward_std": 0.26753300055861473, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463727705180645, "step": 3168 }, { "clip_ratio": 0.0, "completion_length": 1426.2411193847656, "epoch": 0.9466059293555373, "grad_norm": 3.624840021133423, "kl": 2.68359375, "learning_rate": 3.231998806818554e-08, "loss": 0.2492, "reward": 0.584263414144516, "reward_std": 0.29937708377838135, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4637276902794838, "step": 3169 }, { "clip_ratio": 0.0, "completion_length": 1451.9375305175781, "epoch": 0.9469046374430587, "grad_norm": 3.4515652656555176, "kl": 2.55859375, "learning_rate": 3.229406788199779e-08, "loss": 0.217, "reward": 0.4732143059372902, "reward_std": 0.25276877358555794, "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535969734192, "step": 3170 }, { "clip_ratio": 0.0, "completion_length": 1466.3460693359375, "epoch": 0.9472033455305803, "grad_norm": 7.428613185882568, "kl": 2.53515625, "learning_rate": 3.2268292066857596e-08, "loss": 0.1755, "reward": 0.538504496216774, "reward_std": 0.26885468140244484, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473469734192, "step": 3171 }, { "clip_ratio": 0.0, "completion_length": 1435.3236999511719, "epoch": 0.9475020536181017, "grad_norm": 4.539495944976807, "kl": 2.431640625, "learning_rate": 3.224266065080652e-08, "loss": 0.1803, "reward": 0.5697544887661934, "reward_std": 0.2972545027732849, "rewards/accuracy_reward": 0.11160714901052415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473395228386, "step": 3172 }, { "clip_ratio": 0.0, "completion_length": 1397.5380249023438, "epoch": 0.9478007617056232, "grad_norm": 4.6674370765686035, "kl": 2.169921875, "learning_rate": 3.221717366172904e-08, "loss": 0.1944, "reward": 0.602120578289032, "reward_std": 0.27318356186151505, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.499441996216774, "step": 3173 }, { "clip_ratio": 0.0, "completion_length": 1447.05810546875, "epoch": 0.9480994697931446, "grad_norm": 5.775225639343262, "kl": 2.4765625, "learning_rate": 3.21918311273525e-08, "loss": 0.1845, "reward": 0.5279018059372902, "reward_std": 0.2968464195728302, "rewards/accuracy_reward": 0.04464285750873387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4832589402794838, "step": 3174 }, { "clip_ratio": 0.0, "completion_length": 1428.2835388183594, "epoch": 0.9483981778806662, "grad_norm": 5.288299560546875, "kl": 2.546875, "learning_rate": 3.21666330752471e-08, "loss": 0.1855, "reward": 0.5675223544239998, "reward_std": 0.29337115958333015, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4425223469734192, "step": 3175 }, { "clip_ratio": 0.0, "completion_length": 1381.2746276855469, "epoch": 0.9486968859681876, "grad_norm": 4.37476110458374, "kl": 2.1328125, "learning_rate": 3.2141579532825856e-08, "loss": 0.1452, "reward": 0.5251116380095482, "reward_std": 0.3002312704920769, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 3176 }, { "clip_ratio": 0.0, "completion_length": 1452.74560546875, "epoch": 0.9489955940557091, "grad_norm": 3.0487236976623535, "kl": 2.48828125, "learning_rate": 3.211667052734454e-08, "loss": 0.1485, "reward": 0.5418526977300644, "reward_std": 0.3129049763083458, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4815848469734192, "step": 3177 }, { "clip_ratio": 0.0, "completion_length": 1430.7054443359375, "epoch": 0.9492943021432305, "grad_norm": 7.643002986907959, "kl": 2.064453125, "learning_rate": 3.2091906085901724e-08, "loss": 0.1656, "reward": 0.5424107387661934, "reward_std": 0.2747529372572899, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482142873108387, "step": 3178 }, { "clip_ratio": 0.0, "completion_length": 1425.0871276855469, "epoch": 0.949593010230752, "grad_norm": 5.605353832244873, "kl": 2.36328125, "learning_rate": 3.2067286235438694e-08, "loss": 0.184, "reward": 0.5379464477300644, "reward_std": 0.2559508942067623, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888393059372902, "step": 3179 }, { "clip_ratio": 0.0, "completion_length": 1433.3773193359375, "epoch": 0.9498917183182735, "grad_norm": 5.101319789886475, "kl": 2.796875, "learning_rate": 3.204281100273943e-08, "loss": 0.2233, "reward": 0.6032366305589676, "reward_std": 0.2986522242426872, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187649011612, "step": 3180 }, { "clip_ratio": 0.0, "completion_length": 1445.3214721679688, "epoch": 0.950190426405795, "grad_norm": 6.116573810577393, "kl": 2.48828125, "learning_rate": 3.201848041443061e-08, "loss": 0.1694, "reward": 0.5775669887661934, "reward_std": 0.2656748332083225, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562723517418, "step": 3181 }, { "clip_ratio": 0.0, "completion_length": 1478.10498046875, "epoch": 0.9504891344933164, "grad_norm": 3.5653934478759766, "kl": 2.2890625, "learning_rate": 3.199429449698148e-08, "loss": 0.2028, "reward": 0.5853794813156128, "reward_std": 0.24394144862890244, "rewards/accuracy_reward": 0.1250000058207661, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794887661934, "step": 3182 }, { "clip_ratio": 0.0, "completion_length": 1442.6897888183594, "epoch": 0.9507878425808379, "grad_norm": 4.702702045440674, "kl": 2.376953125, "learning_rate": 3.197025327670399e-08, "loss": 0.1502, "reward": 0.611607164144516, "reward_std": 0.27609871700406075, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428805589676, "step": 3183 }, { "clip_ratio": 0.0, "completion_length": 1376.4911193847656, "epoch": 0.9510865506683593, "grad_norm": 4.875103950500488, "kl": 2.3671875, "learning_rate": 3.1946356779752625e-08, "loss": 0.1918, "reward": 0.545200914144516, "reward_std": 0.30705200508236885, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080559372902, "step": 3184 }, { "clip_ratio": 0.0, "completion_length": 1494.8951721191406, "epoch": 0.9513852587558809, "grad_norm": 3.737011432647705, "kl": 2.63671875, "learning_rate": 3.1922605032124417e-08, "loss": 0.2001, "reward": 0.569754496216774, "reward_std": 0.30088406801223755, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482700914144516, "step": 3185 }, { "clip_ratio": 0.0, "completion_length": 1426.482177734375, "epoch": 0.9516839668434023, "grad_norm": 12.366004943847656, "kl": 2.40625, "learning_rate": 3.189899805965895e-08, "loss": 0.2062, "reward": 0.5094866380095482, "reward_std": 0.2799772247672081, "rewards/accuracy_reward": 0.040178572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080633878708, "step": 3186 }, { "clip_ratio": 0.0, "completion_length": 1428.0603332519531, "epoch": 0.9519826749309237, "grad_norm": 17.572662353515625, "kl": 2.298828125, "learning_rate": 3.187553588803831e-08, "loss": 0.2288, "reward": 0.5195312649011612, "reward_std": 0.2871435582637787, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.479352705180645, "step": 3187 }, { "clip_ratio": 0.0, "completion_length": 1431.9152526855469, "epoch": 0.9522813830184452, "grad_norm": 5.442008972167969, "kl": 2.5, "learning_rate": 3.185221854278701e-08, "loss": 0.1707, "reward": 0.5206473469734192, "reward_std": 0.29249856621026993, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 3188 }, { "clip_ratio": 0.0, "completion_length": 1443.9330749511719, "epoch": 0.9525800911059666, "grad_norm": 5.2465901374816895, "kl": 2.578125, "learning_rate": 3.182904604927204e-08, "loss": 0.1624, "reward": 0.5753348618745804, "reward_std": 0.267628725618124, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463727705180645, "step": 3189 }, { "clip_ratio": 0.0, "completion_length": 1403.3973693847656, "epoch": 0.9528787991934882, "grad_norm": 3.3850698471069336, "kl": 2.76171875, "learning_rate": 3.1806018432702823e-08, "loss": 0.208, "reward": 0.5669642984867096, "reward_std": 0.28101206570863724, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178805589676, "step": 3190 }, { "clip_ratio": 0.0, "completion_length": 1408.5425109863281, "epoch": 0.9531775072810096, "grad_norm": 7.077630996704102, "kl": 2.47265625, "learning_rate": 3.178313571813114e-08, "loss": 0.1857, "reward": 0.618861623108387, "reward_std": 0.27963635325431824, "rewards/accuracy_reward": 0.15178571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467075914144516, "step": 3191 }, { "clip_ratio": 0.0, "completion_length": 1449.029052734375, "epoch": 0.9534762153685311, "grad_norm": 7.259594917297363, "kl": 2.8046875, "learning_rate": 3.176039793045114e-08, "loss": 0.203, "reward": 0.577566996216774, "reward_std": 0.2939932942390442, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4481026977300644, "step": 3192 }, { "clip_ratio": 0.0, "completion_length": 1493.08935546875, "epoch": 0.9537749234560525, "grad_norm": 3.3568778038024902, "kl": 2.52734375, "learning_rate": 3.1737805094399304e-08, "loss": 0.142, "reward": 0.6155134290456772, "reward_std": 0.2635037638247013, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483816996216774, "step": 3193 }, { "clip_ratio": 0.0, "completion_length": 1420.5937805175781, "epoch": 0.954073631543574, "grad_norm": 2.657125949859619, "kl": 2.33203125, "learning_rate": 3.1715357234554416e-08, "loss": 0.1633, "reward": 0.5619419813156128, "reward_std": 0.27326958253979683, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491305589676, "step": 3194 }, { "clip_ratio": 0.0, "completion_length": 1475.7433776855469, "epoch": 0.9543723396310955, "grad_norm": 4.071237564086914, "kl": 2.50390625, "learning_rate": 3.169305437533758e-08, "loss": 0.1993, "reward": 0.6104910969734192, "reward_std": 0.2890111021697521, "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339402794838, "step": 3195 }, { "clip_ratio": 0.0, "completion_length": 1495.8147888183594, "epoch": 0.954671047718617, "grad_norm": 3.0704712867736816, "kl": 2.55859375, "learning_rate": 3.1670896541012086e-08, "loss": 0.158, "reward": 0.546875037252903, "reward_std": 0.27196021378040314, "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535969734192, "step": 3196 }, { "clip_ratio": 0.0, "completion_length": 1411.0692443847656, "epoch": 0.9549697558061384, "grad_norm": 4.304591178894043, "kl": 2.515625, "learning_rate": 3.164888375568349e-08, "loss": 0.1917, "reward": 0.6110491380095482, "reward_std": 0.2904520258307457, "rewards/accuracy_reward": 0.14285715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4681919887661934, "step": 3197 }, { "clip_ratio": 0.0, "completion_length": 1462.6964721679688, "epoch": 0.9552684638936599, "grad_norm": 4.741172790527344, "kl": 2.66796875, "learning_rate": 3.162701604329958e-08, "loss": 0.179, "reward": 0.5563616305589676, "reward_std": 0.26653778925538063, "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4559151977300644, "step": 3198 }, { "clip_ratio": 0.0, "completion_length": 1396.7053833007812, "epoch": 0.9555671719811814, "grad_norm": 12.283018112182617, "kl": 2.2109375, "learning_rate": 3.160529342765023e-08, "loss": 0.1788, "reward": 0.6478794813156128, "reward_std": 0.3225867822766304, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5027901977300644, "step": 3199 }, { "clip_ratio": 0.0, "completion_length": 1538.1965026855469, "epoch": 0.9558658800687029, "grad_norm": 3.9319264888763428, "kl": 2.81640625, "learning_rate": 3.158371593236755e-08, "loss": 0.1883, "reward": 0.529575914144516, "reward_std": 0.21373334527015686, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187798023224, "step": 3200 }, { "clip_ratio": 0.0, "completion_length": 1537.0982971191406, "epoch": 0.9561645881562243, "grad_norm": 5.358796119689941, "kl": 2.66796875, "learning_rate": 3.156228358092574e-08, "loss": 0.1381, "reward": 0.5200892984867096, "reward_std": 0.23073268681764603, "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4754464477300644, "step": 3201 }, { "clip_ratio": 0.0, "completion_length": 1445.7612609863281, "epoch": 0.9564632962437458, "grad_norm": 2.8505845069885254, "kl": 2.65625, "learning_rate": 3.1540996396641094e-08, "loss": 0.1768, "reward": 0.5368303805589676, "reward_std": 0.2584696263074875, "rewards/accuracy_reward": 0.07142857206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654018059372902, "step": 3202 }, { "clip_ratio": 0.0, "completion_length": 1377.15185546875, "epoch": 0.9567620043312672, "grad_norm": 5.341412544250488, "kl": 2.80078125, "learning_rate": 3.1519854402671966e-08, "loss": 0.2036, "reward": 0.5719866305589676, "reward_std": 0.30907678604125977, "rewards/accuracy_reward": 0.09598214970901608, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044887661934, "step": 3203 }, { "clip_ratio": 0.0, "completion_length": 1408.4688415527344, "epoch": 0.9570607124187888, "grad_norm": 2.693453788757324, "kl": 2.4375, "learning_rate": 3.149885762201882e-08, "loss": 0.1731, "reward": 0.5128348395228386, "reward_std": 0.27698221057653427, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884066939354, "step": 3204 }, { "clip_ratio": 0.0, "completion_length": 1427.4710388183594, "epoch": 0.9573594205063102, "grad_norm": 3.302361249923706, "kl": 2.392578125, "learning_rate": 3.1478006077524036e-08, "loss": 0.1509, "reward": 0.5468750223517418, "reward_std": 0.27309511601924896, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178805589676, "step": 3205 }, { "clip_ratio": 0.0, "completion_length": 1418.2210693359375, "epoch": 0.9576581285938317, "grad_norm": 12.413537979125977, "kl": 2.28515625, "learning_rate": 3.145729979187209e-08, "loss": 0.2007, "reward": 0.510044664144516, "reward_std": 0.2530633173882961, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4988839477300644, "step": 3206 }, { "clip_ratio": 0.0, "completion_length": 1515.5938110351562, "epoch": 0.9579568366813531, "grad_norm": 2.6120388507843018, "kl": 2.51171875, "learning_rate": 3.143673878758936e-08, "loss": 0.1403, "reward": 0.4938616305589676, "reward_std": 0.27892304211854935, "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467075914144516, "step": 3207 }, { "clip_ratio": 0.0, "completion_length": 1445.7098999023438, "epoch": 0.9582555447688746, "grad_norm": 3.1319570541381836, "kl": 2.6484375, "learning_rate": 3.141632308704424e-08, "loss": 0.1859, "reward": 0.5602678805589676, "reward_std": 0.267105408012867, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4776785969734192, "step": 3208 }, { "clip_ratio": 0.0, "completion_length": 1485.85498046875, "epoch": 0.9585542528563961, "grad_norm": 5.3204851150512695, "kl": 2.6015625, "learning_rate": 3.139605271244699e-08, "loss": 0.2054, "reward": 0.6099330633878708, "reward_std": 0.2906583100557327, "rewards/accuracy_reward": 0.11383929150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4960937649011612, "step": 3209 }, { "clip_ratio": 0.0, "completion_length": 1449.87060546875, "epoch": 0.9588529609439176, "grad_norm": 4.025735378265381, "kl": 2.828125, "learning_rate": 3.137592768584979e-08, "loss": 0.2299, "reward": 0.5563616305589676, "reward_std": 0.23233938589692116, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4402901977300644, "step": 3210 }, { "clip_ratio": 0.0, "completion_length": 1468.3661499023438, "epoch": 0.959151669031439, "grad_norm": 7.386535167694092, "kl": 2.953125, "learning_rate": 3.135594802914672e-08, "loss": 0.1477, "reward": 0.5078125223517418, "reward_std": 0.24241125956177711, "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 3211 }, { "clip_ratio": 0.0, "completion_length": 1482.2478637695312, "epoch": 0.9594503771189605, "grad_norm": 6.639090538024902, "kl": 2.337890625, "learning_rate": 3.133611376407365e-08, "loss": 0.1974, "reward": 0.5412946715950966, "reward_std": 0.3201925456523895, "rewards/accuracy_reward": 0.06696428661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303880095482, "step": 3212 }, { "clip_ratio": 0.0, "completion_length": 1448.1161499023438, "epoch": 0.9597490852064819, "grad_norm": 2.996692657470703, "kl": 2.5703125, "learning_rate": 3.131642491220833e-08, "loss": 0.1519, "reward": 0.5558035969734192, "reward_std": 0.2794121950864792, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 3213 }, { "clip_ratio": 0.0, "completion_length": 1500.4800109863281, "epoch": 0.9600477932940035, "grad_norm": 6.965978145599365, "kl": 2.796875, "learning_rate": 3.129688149497032e-08, "loss": 0.2079, "reward": 0.5541294813156128, "reward_std": 0.24604981392621994, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437723517418, "step": 3214 }, { "clip_ratio": 0.0, "completion_length": 1445.3170166015625, "epoch": 0.9603465013815249, "grad_norm": 5.524154186248779, "kl": 2.80859375, "learning_rate": 3.127748353362093e-08, "loss": 0.2277, "reward": 0.5329241380095482, "reward_std": 0.24771517887711525, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4503348469734192, "step": 3215 }, { "clip_ratio": 0.0, "completion_length": 1461.2366943359375, "epoch": 0.9606452094690464, "grad_norm": 9.80479621887207, "kl": 2.921875, "learning_rate": 3.125823104926324e-08, "loss": 0.1805, "reward": 0.5239955484867096, "reward_std": 0.24589737877249718, "rewards/accuracy_reward": 0.06919643259607255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.454799123108387, "step": 3216 }, { "clip_ratio": 0.0, "completion_length": 1483.1072082519531, "epoch": 0.9609439175565678, "grad_norm": 3.886624574661255, "kl": 2.65234375, "learning_rate": 3.123912406284206e-08, "loss": 0.186, "reward": 0.5630580633878708, "reward_std": 0.24922561645507812, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866380095482, "step": 3217 }, { "clip_ratio": 0.0, "completion_length": 1436.7991638183594, "epoch": 0.9612426256440894, "grad_norm": 8.049198150634766, "kl": 2.96875, "learning_rate": 3.122016259514395e-08, "loss": 0.204, "reward": 0.5234375223517418, "reward_std": 0.2782001867890358, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4542410969734192, "step": 3218 }, { "clip_ratio": 0.0, "completion_length": 1405.7210693359375, "epoch": 0.9615413337316108, "grad_norm": 4.174561023712158, "kl": 2.81640625, "learning_rate": 3.1201346666797075e-08, "loss": 0.1761, "reward": 0.4882812723517418, "reward_std": 0.25033745914697647, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241305589676, "step": 3219 }, { "clip_ratio": 0.0, "completion_length": 1440.5647583007812, "epoch": 0.9618400418191323, "grad_norm": 6.080018520355225, "kl": 2.703125, "learning_rate": 3.118267629827136e-08, "loss": 0.1761, "reward": 0.5094866305589676, "reward_std": 0.2532144971191883, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4536830559372902, "step": 3220 }, { "clip_ratio": 0.0, "completion_length": 1448.7857971191406, "epoch": 0.9621387499066537, "grad_norm": 12.078797340393066, "kl": 2.99609375, "learning_rate": 3.116415150987832e-08, "loss": 0.1542, "reward": 0.5312500074505806, "reward_std": 0.28626131266355515, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466517873108387, "step": 3221 }, { "clip_ratio": 0.0, "completion_length": 1423.4085388183594, "epoch": 0.9624374579941752, "grad_norm": 2.2962517738342285, "kl": 2.56640625, "learning_rate": 3.11457723217711e-08, "loss": 0.1859, "reward": 0.5931919813156128, "reward_std": 0.23853809013962746, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884215950966, "step": 3222 }, { "clip_ratio": 0.0, "completion_length": 1401.83935546875, "epoch": 0.9627361660816967, "grad_norm": 8.138810157775879, "kl": 2.921875, "learning_rate": 3.1127538753944494e-08, "loss": 0.1737, "reward": 0.4921875149011612, "reward_std": 0.2586720995604992, "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 3223 }, { "clip_ratio": 0.0, "completion_length": 1367.0982666015625, "epoch": 0.9630348741692182, "grad_norm": 7.426176071166992, "kl": 2.6484375, "learning_rate": 3.110945082623481e-08, "loss": 0.1936, "reward": 0.5803571715950966, "reward_std": 0.2767837420105934, "rewards/accuracy_reward": 0.10267857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.477678582072258, "step": 3224 }, { "clip_ratio": 0.0, "completion_length": 1432.1116638183594, "epoch": 0.9633335822567396, "grad_norm": 3.6598854064941406, "kl": 2.345703125, "learning_rate": 3.109150855831993e-08, "loss": 0.1272, "reward": 0.6540178805589676, "reward_std": 0.32148072123527527, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517857164144516, "step": 3225 }, { "clip_ratio": 0.0, "completion_length": 1495.1741638183594, "epoch": 0.9636322903442611, "grad_norm": 6.0612640380859375, "kl": 2.515625, "learning_rate": 3.107371196971931e-08, "loss": 0.157, "reward": 0.5429687798023224, "reward_std": 0.25735069438815117, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 3226 }, { "clip_ratio": 0.0, "completion_length": 1426.8125610351562, "epoch": 0.9639309984317825, "grad_norm": 4.350166320800781, "kl": 2.2421875, "learning_rate": 3.1056061079793865e-08, "loss": 0.1428, "reward": 0.5987723469734192, "reward_std": 0.26653479784727097, "rewards/accuracy_reward": 0.10937500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.489397332072258, "step": 3227 }, { "clip_ratio": 0.0, "completion_length": 1466.3259887695312, "epoch": 0.9642297065193041, "grad_norm": 3.966756582260132, "kl": 2.40234375, "learning_rate": 3.103855590774605e-08, "loss": 0.1653, "reward": 0.6010044813156128, "reward_std": 0.29853595793247223, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.471540205180645, "step": 3228 }, { "clip_ratio": 0.0, "completion_length": 1365.7031860351562, "epoch": 0.9645284146068255, "grad_norm": 4.106782913208008, "kl": 2.6796875, "learning_rate": 3.102119647261979e-08, "loss": 0.1948, "reward": 0.554129496216774, "reward_std": 0.26408498734235764, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4893973469734192, "step": 3229 }, { "clip_ratio": 0.0, "completion_length": 1476.5915832519531, "epoch": 0.9648271226943469, "grad_norm": 14.419795989990234, "kl": 2.49609375, "learning_rate": 3.100398279330042e-08, "loss": 0.2234, "reward": 0.5446428805589676, "reward_std": 0.2967846095561981, "rewards/accuracy_reward": 0.06919643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.475446455180645, "step": 3230 }, { "clip_ratio": 0.0, "completion_length": 1561.2009582519531, "epoch": 0.9651258307818684, "grad_norm": 7.2439422607421875, "kl": 2.7734375, "learning_rate": 3.0986914888514755e-08, "loss": 0.2157, "reward": 0.5574777126312256, "reward_std": 0.2651301398873329, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598395228386, "step": 3231 }, { "clip_ratio": 0.0, "completion_length": 1423.6496276855469, "epoch": 0.9654245388693898, "grad_norm": 3.853492498397827, "kl": 2.4375, "learning_rate": 3.0969992776831e-08, "loss": 0.2016, "reward": 0.498325914144516, "reward_std": 0.24828174710273743, "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4827009066939354, "step": 3232 }, { "clip_ratio": 0.0, "completion_length": 1392.0313110351562, "epoch": 0.9657232469569114, "grad_norm": 10.050911903381348, "kl": 2.2578125, "learning_rate": 3.095321647665875e-08, "loss": 0.1811, "reward": 0.5546875149011612, "reward_std": 0.29587752372026443, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4988839477300644, "step": 3233 }, { "clip_ratio": 0.0, "completion_length": 1475.5000610351562, "epoch": 0.9660219550444328, "grad_norm": 4.185423374176025, "kl": 2.73828125, "learning_rate": 3.093658600624897e-08, "loss": 0.1895, "reward": 0.5329241380095482, "reward_std": 0.28894853591918945, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312723517418, "step": 3234 }, { "clip_ratio": 0.0, "completion_length": 1490.0000610351562, "epoch": 0.9663206631319543, "grad_norm": 4.513205051422119, "kl": 2.640625, "learning_rate": 3.092010138369399e-08, "loss": 0.1744, "reward": 0.5731026977300644, "reward_std": 0.27521076798439026, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598469734192, "step": 3235 }, { "clip_ratio": 0.0, "completion_length": 1383.2857666015625, "epoch": 0.9666193712194757, "grad_norm": 5.037080764770508, "kl": 2.625, "learning_rate": 3.090376262692747e-08, "loss": 0.2451, "reward": 0.5691964477300644, "reward_std": 0.27229293063282967, "rewards/accuracy_reward": 0.10491071990691125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4642857387661934, "step": 3236 }, { "clip_ratio": 0.0, "completion_length": 1395.29248046875, "epoch": 0.9669180793069972, "grad_norm": 2.643789768218994, "kl": 2.3203125, "learning_rate": 3.088756975372436e-08, "loss": 0.176, "reward": 0.5797991305589676, "reward_std": 0.275678850710392, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4815848469734192, "step": 3237 }, { "clip_ratio": 0.0, "completion_length": 1434.5268859863281, "epoch": 0.9672167873945187, "grad_norm": 5.25883150100708, "kl": 2.25390625, "learning_rate": 3.0871522781700925e-08, "loss": 0.1553, "reward": 0.502232164144516, "reward_std": 0.25569648668169975, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4866071566939354, "step": 3238 }, { "clip_ratio": 0.0, "completion_length": 1458.352783203125, "epoch": 0.9675154954820402, "grad_norm": 5.269351482391357, "kl": 2.5078125, "learning_rate": 3.0855621728314705e-08, "loss": 0.1754, "reward": 0.5401785969734192, "reward_std": 0.2817278951406479, "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.475446455180645, "step": 3239 }, { "clip_ratio": 0.0, "completion_length": 1425.7277526855469, "epoch": 0.9678142035695616, "grad_norm": 4.277035713195801, "kl": 2.44140625, "learning_rate": 3.083986661086449e-08, "loss": 0.1427, "reward": 0.4949776977300644, "reward_std": 0.2583452798426151, "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562649011612, "step": 3240 }, { "clip_ratio": 0.0, "completion_length": 1544.2679443359375, "epoch": 0.9681129116570831, "grad_norm": 6.03294563293457, "kl": 2.83984375, "learning_rate": 3.082425744649028e-08, "loss": 0.1763, "reward": 0.4910714477300644, "reward_std": 0.2706146091222763, "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 3241 }, { "clip_ratio": 0.0, "completion_length": 1414.6384582519531, "epoch": 0.9684116197446045, "grad_norm": 5.378229141235352, "kl": 2.404296875, "learning_rate": 3.080879425217335e-08, "loss": 0.1889, "reward": 0.572544664144516, "reward_std": 0.24099378660321236, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4899553805589676, "step": 3242 }, { "clip_ratio": 0.0, "completion_length": 1422.8906860351562, "epoch": 0.9687103278321261, "grad_norm": 3.383380651473999, "kl": 2.62890625, "learning_rate": 3.079347704473611e-08, "loss": 0.174, "reward": 0.5887277200818062, "reward_std": 0.28715093433856964, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.483816996216774, "step": 3243 }, { "clip_ratio": 0.0, "completion_length": 1536.3840026855469, "epoch": 0.9690090359196475, "grad_norm": 5.400080680847168, "kl": 2.73828125, "learning_rate": 3.0778305840842196e-08, "loss": 0.193, "reward": 0.6037946790456772, "reward_std": 0.27070479467511177, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660969734192, "step": 3244 }, { "clip_ratio": 0.0, "completion_length": 1483.8304138183594, "epoch": 0.969307744007169, "grad_norm": 2.681622266769409, "kl": 2.67578125, "learning_rate": 3.0763280656996386e-08, "loss": 0.1623, "reward": 0.5072544887661934, "reward_std": 0.27665942907333374, "rewards/accuracy_reward": 0.06473214691504836, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4425223469734192, "step": 3245 }, { "clip_ratio": 0.0, "completion_length": 1462.6116638183594, "epoch": 0.9696064520946904, "grad_norm": 4.437602519989014, "kl": 2.453125, "learning_rate": 3.074840150954461e-08, "loss": 0.1654, "reward": 0.5334821715950966, "reward_std": 0.2857673466205597, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 3246 }, { "clip_ratio": 0.0, "completion_length": 1506.2344055175781, "epoch": 0.969905160182212, "grad_norm": 3.145455837249756, "kl": 2.37109375, "learning_rate": 3.073366841467391e-08, "loss": 0.1853, "reward": 0.5334821790456772, "reward_std": 0.2567288987338543, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 3247 }, { "clip_ratio": 0.0, "completion_length": 1458.2656860351562, "epoch": 0.9702038682697334, "grad_norm": 9.533458709716797, "kl": 2.5234375, "learning_rate": 3.071908138841248e-08, "loss": 0.1857, "reward": 0.5184151977300644, "reward_std": 0.2674972712993622, "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794813156128, "step": 3248 }, { "clip_ratio": 0.0, "completion_length": 1409.97998046875, "epoch": 0.9705025763572549, "grad_norm": 6.588200569152832, "kl": 2.197265625, "learning_rate": 3.070464044662955e-08, "loss": 0.1785, "reward": 0.4983259215950966, "reward_std": 0.2695155739784241, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080484867096, "step": 3249 }, { "clip_ratio": 0.0, "completion_length": 1484.9866943359375, "epoch": 0.9708012844447763, "grad_norm": 2.876185894012451, "kl": 2.6171875, "learning_rate": 3.069034560503544e-08, "loss": 0.1814, "reward": 0.5351562649011612, "reward_std": 0.2919646427035332, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4949776902794838, "step": 3250 }, { "clip_ratio": 0.0, "completion_length": 1432.9911193847656, "epoch": 0.9710999925322978, "grad_norm": 11.33430290222168, "kl": 2.6171875, "learning_rate": 3.067619687918156e-08, "loss": 0.1508, "reward": 0.5926339626312256, "reward_std": 0.27194420248270035, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 3251 }, { "clip_ratio": 0.0, "completion_length": 1453.9754943847656, "epoch": 0.9713987006198193, "grad_norm": 2.820138692855835, "kl": 2.34375, "learning_rate": 3.066219428446033e-08, "loss": 0.1578, "reward": 0.5652902126312256, "reward_std": 0.24001264572143555, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4469866305589676, "step": 3252 }, { "clip_ratio": 0.0, "completion_length": 1413.6786193847656, "epoch": 0.9716974087073408, "grad_norm": 3.3607091903686523, "kl": 2.40625, "learning_rate": 3.064833783610519e-08, "loss": 0.1804, "reward": 0.5948660969734192, "reward_std": 0.2852509655058384, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4899553805589676, "step": 3253 }, { "clip_ratio": 0.0, "completion_length": 1361.7411499023438, "epoch": 0.9719961167948622, "grad_norm": 2.4452602863311768, "kl": 2.271484375, "learning_rate": 3.0634627549190626e-08, "loss": 0.1596, "reward": 0.5295759066939354, "reward_std": 0.28985659033060074, "rewards/accuracy_reward": 0.06473214458674192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437723517418, "step": 3254 }, { "clip_ratio": 0.0, "completion_length": 1470.5067749023438, "epoch": 0.9722948248823837, "grad_norm": 4.285000801086426, "kl": 2.4140625, "learning_rate": 3.062106343863204e-08, "loss": 0.1707, "reward": 0.5016741305589676, "reward_std": 0.24180680140852928, "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562649011612, "step": 3255 }, { "clip_ratio": 0.0, "completion_length": 1531.4688110351562, "epoch": 0.9725935329699051, "grad_norm": 11.338885307312012, "kl": 2.6328125, "learning_rate": 3.0607645519185895e-08, "loss": 0.1639, "reward": 0.5368303954601288, "reward_std": 0.24271558597683907, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 3256 }, { "clip_ratio": 0.0, "completion_length": 1531.1161499023438, "epoch": 0.9728922410574267, "grad_norm": 9.737259864807129, "kl": 2.37109375, "learning_rate": 3.059437380544957e-08, "loss": 0.1997, "reward": 0.5156250223517418, "reward_std": 0.2621733844280243, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4598214477300644, "step": 3257 }, { "clip_ratio": 0.0, "completion_length": 1434.2478637695312, "epoch": 0.9731909491449481, "grad_norm": 4.161248207092285, "kl": 2.5234375, "learning_rate": 3.058124831186136e-08, "loss": 0.1851, "reward": 0.5128348469734192, "reward_std": 0.2817784883081913, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241380095482, "step": 3258 }, { "clip_ratio": 0.0, "completion_length": 1522.38623046875, "epoch": 0.9734896572324696, "grad_norm": 4.58559513092041, "kl": 2.46484375, "learning_rate": 3.056826905270053e-08, "loss": 0.1806, "reward": 0.522879496216774, "reward_std": 0.24676504731178284, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473469734192, "step": 3259 }, { "clip_ratio": 0.0, "completion_length": 1482.37060546875, "epoch": 0.973788365319991, "grad_norm": 4.094880104064941, "kl": 2.38671875, "learning_rate": 3.055543604208726e-08, "loss": 0.1675, "reward": 0.5781250298023224, "reward_std": 0.29662156105041504, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893133878708, "step": 3260 }, { "clip_ratio": 0.0, "completion_length": 1463.1050109863281, "epoch": 0.9740870734075125, "grad_norm": 3.6394412517547607, "kl": 2.44921875, "learning_rate": 3.054274929398259e-08, "loss": 0.1357, "reward": 0.5876116305589676, "reward_std": 0.2935869172215462, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5117187723517418, "step": 3261 }, { "clip_ratio": 0.0, "completion_length": 1413.4554138183594, "epoch": 0.974385781495034, "grad_norm": 4.720171928405762, "kl": 2.72265625, "learning_rate": 3.053020882218845e-08, "loss": 0.2105, "reward": 0.516741082072258, "reward_std": 0.29759524390101433, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696715950966, "step": 3262 }, { "clip_ratio": 0.0, "completion_length": 1389.4576721191406, "epoch": 0.9746844895825555, "grad_norm": 9.604052543640137, "kl": 2.5, "learning_rate": 3.0517814640347666e-08, "loss": 0.2194, "reward": 0.4888393133878708, "reward_std": 0.2558048479259014, "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4575893059372902, "step": 3263 }, { "clip_ratio": 0.0, "completion_length": 1461.7054443359375, "epoch": 0.9749831976700769, "grad_norm": 9.285935401916504, "kl": 2.51953125, "learning_rate": 3.050556676194388e-08, "loss": 0.2055, "reward": 0.5546875298023224, "reward_std": 0.24941996112465858, "rewards/accuracy_reward": 0.08482143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4698660895228386, "step": 3264 }, { "clip_ratio": 0.0, "completion_length": 1374.5826721191406, "epoch": 0.9752819057575984, "grad_norm": 6.888237953186035, "kl": 2.38671875, "learning_rate": 3.0493465200301587e-08, "loss": 0.2251, "reward": 0.5948660969734192, "reward_std": 0.2744561694562435, "rewards/accuracy_reward": 0.10937500395812094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854910895228386, "step": 3265 }, { "clip_ratio": 0.0, "completion_length": 1472.3594360351562, "epoch": 0.9755806138451198, "grad_norm": 4.531765460968018, "kl": 2.86328125, "learning_rate": 3.04815099685861e-08, "loss": 0.1952, "reward": 0.5306919887661934, "reward_std": 0.2668744660913944, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955708384514, "step": 3266 }, { "clip_ratio": 0.0, "completion_length": 1400.8415832519531, "epoch": 0.9758793219326414, "grad_norm": 6.010406017303467, "kl": 2.53125, "learning_rate": 3.046970107980353e-08, "loss": 0.1492, "reward": 0.5357143059372902, "reward_std": 0.2657856084406376, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500298023224, "step": 3267 }, { "clip_ratio": 0.0, "completion_length": 1421.0246276855469, "epoch": 0.9761780300201628, "grad_norm": 5.895253658294678, "kl": 2.3984375, "learning_rate": 3.045803854680081e-08, "loss": 0.1813, "reward": 0.498325914144516, "reward_std": 0.2551261708140373, "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4760044887661934, "step": 3268 }, { "clip_ratio": 0.0, "completion_length": 1452.21435546875, "epoch": 0.9764767381076843, "grad_norm": 4.705284118652344, "kl": 2.37890625, "learning_rate": 3.044652238226561e-08, "loss": 0.1783, "reward": 0.5797991305589676, "reward_std": 0.30683116242289543, "rewards/accuracy_reward": 0.0959821508731693, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169887661934, "step": 3269 }, { "clip_ratio": 0.0, "completion_length": 1475.2098999023438, "epoch": 0.9767754461952057, "grad_norm": 4.505090236663818, "kl": 2.63671875, "learning_rate": 3.043515259872641e-08, "loss": 0.2108, "reward": 0.6434151977300644, "reward_std": 0.253260038793087, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478236623108387, "step": 3270 }, { "clip_ratio": 0.0, "completion_length": 1473.3282165527344, "epoch": 0.9770741542827273, "grad_norm": 10.768470764160156, "kl": 2.828125, "learning_rate": 3.0423929208552405e-08, "loss": 0.1825, "reward": 0.5809151977300644, "reward_std": 0.2493545040488243, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4425223395228386, "step": 3271 }, { "clip_ratio": 0.0, "completion_length": 1464.1518859863281, "epoch": 0.9773728623702487, "grad_norm": 3.8692476749420166, "kl": 2.7421875, "learning_rate": 3.041285222395355e-08, "loss": 0.2026, "reward": 0.5223214477300644, "reward_std": 0.2833050414919853, "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500149011612, "step": 3272 }, { "clip_ratio": 0.0, "completion_length": 1489.10498046875, "epoch": 0.9776715704577701, "grad_norm": 3.3168718814849854, "kl": 2.4921875, "learning_rate": 3.040192165698052e-08, "loss": 0.1588, "reward": 0.511718787252903, "reward_std": 0.29310544580221176, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473395228386, "step": 3273 }, { "clip_ratio": 0.0, "completion_length": 1386.29248046875, "epoch": 0.9779702785452916, "grad_norm": 4.302709102630615, "kl": 2.59765625, "learning_rate": 3.0391137519524705e-08, "loss": 0.1428, "reward": 0.5106026977300644, "reward_std": 0.3043283745646477, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463727705180645, "step": 3274 }, { "clip_ratio": 0.0, "completion_length": 1413.2813110351562, "epoch": 0.978268986632813, "grad_norm": 5.201013565063477, "kl": 2.83984375, "learning_rate": 3.038049982331816e-08, "loss": 0.2442, "reward": 0.555245578289032, "reward_std": 0.3073942959308624, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955633878708, "step": 3275 }, { "clip_ratio": 0.0, "completion_length": 1561.6652221679688, "epoch": 0.9785676947203346, "grad_norm": 7.299087047576904, "kl": 2.91796875, "learning_rate": 3.0370008579933696e-08, "loss": 0.1708, "reward": 0.5050223469734192, "reward_std": 0.2515399232506752, "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4469866305589676, "step": 3276 }, { "clip_ratio": 0.0, "completion_length": 1511.9665832519531, "epoch": 0.978866402807856, "grad_norm": 3.896212100982666, "kl": 2.396484375, "learning_rate": 3.0359663800784734e-08, "loss": 0.155, "reward": 0.6160714775323868, "reward_std": 0.2774003818631172, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5022321715950966, "step": 3277 }, { "clip_ratio": 0.0, "completion_length": 1507.3973999023438, "epoch": 0.9791651108953775, "grad_norm": 4.085843563079834, "kl": 2.3515625, "learning_rate": 3.034946549712538e-08, "loss": 0.1822, "reward": 0.5563616305589676, "reward_std": 0.3044721707701683, "rewards/accuracy_reward": 0.09151786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437649011612, "step": 3278 }, { "clip_ratio": 0.0, "completion_length": 1452.3973999023438, "epoch": 0.9794638189828989, "grad_norm": 3.1966300010681152, "kl": 2.3203125, "learning_rate": 3.033941368005041e-08, "loss": 0.1958, "reward": 0.5362723544239998, "reward_std": 0.24664441496133804, "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4827009215950966, "step": 3279 }, { "clip_ratio": 0.0, "completion_length": 1476.5313110351562, "epoch": 0.9797625270704204, "grad_norm": 3.714277744293213, "kl": 2.271484375, "learning_rate": 3.032950836049518e-08, "loss": 0.1455, "reward": 0.5574777126312256, "reward_std": 0.23203562200069427, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562649011612, "step": 3280 }, { "clip_ratio": 0.0, "completion_length": 1494.3594360351562, "epoch": 0.9800612351579419, "grad_norm": 2.8435699939727783, "kl": 2.51171875, "learning_rate": 3.0319749549235736e-08, "loss": 0.1995, "reward": 0.4732143133878708, "reward_std": 0.24404239654541016, "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4620535895228386, "step": 3281 }, { "clip_ratio": 0.0, "completion_length": 1415.2076110839844, "epoch": 0.9803599432454634, "grad_norm": 10.045074462890625, "kl": 2.23828125, "learning_rate": 3.031013725688871e-08, "loss": 0.1891, "reward": 0.5569196715950966, "reward_std": 0.24084026366472244, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4944196715950966, "step": 3282 }, { "clip_ratio": 0.0, "completion_length": 1406.2433471679688, "epoch": 0.9806586513329848, "grad_norm": 11.285782814025879, "kl": 2.23046875, "learning_rate": 3.03006714939113e-08, "loss": 0.187, "reward": 0.5993303805589676, "reward_std": 0.2957450821995735, "rewards/accuracy_reward": 0.11383929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854910895228386, "step": 3283 }, { "clip_ratio": 0.0, "completion_length": 1330.075927734375, "epoch": 0.9809573594205063, "grad_norm": 6.0888519287109375, "kl": 2.158203125, "learning_rate": 3.0291352270601374e-08, "loss": 0.1334, "reward": 0.5424107387661934, "reward_std": 0.25040822476148605, "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964402794838, "step": 3284 }, { "clip_ratio": 0.0, "completion_length": 1454.0491638183594, "epoch": 0.9812560675080277, "grad_norm": 7.4475812911987305, "kl": 2.3984375, "learning_rate": 3.0282179597097295e-08, "loss": 0.1941, "reward": 0.5468750298023224, "reward_std": 0.2849079631268978, "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 3285 }, { "clip_ratio": 0.0, "completion_length": 1539.7098999023438, "epoch": 0.9815547755955493, "grad_norm": 3.1913352012634277, "kl": 2.671875, "learning_rate": 3.027315348337807e-08, "loss": 0.1641, "reward": 0.5055803805589676, "reward_std": 0.28586407005786896, "rewards/accuracy_reward": 0.04687500139698386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4587053805589676, "step": 3286 }, { "clip_ratio": 0.0, "completion_length": 1465.6384887695312, "epoch": 0.9818534836830707, "grad_norm": 9.18374252319336, "kl": 2.40625, "learning_rate": 3.0264273939263185e-08, "loss": 0.1998, "reward": 0.4771205559372902, "reward_std": 0.2696148231625557, "rewards/accuracy_reward": 0.029017857741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4481026902794838, "step": 3287 }, { "clip_ratio": 0.0, "completion_length": 1436.9152526855469, "epoch": 0.9821521917705922, "grad_norm": 3.7658441066741943, "kl": 2.31640625, "learning_rate": 3.0255540974412735e-08, "loss": 0.1671, "reward": 0.541294664144516, "reward_std": 0.2747498042881489, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625223517418, "step": 3288 }, { "clip_ratio": 0.0, "completion_length": 1436.6250915527344, "epoch": 0.9824508998581136, "grad_norm": 4.827023506164551, "kl": 2.625, "learning_rate": 3.024695459832734e-08, "loss": 0.1617, "reward": 0.5161830633878708, "reward_std": 0.25005901977419853, "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4648437649011612, "step": 3289 }, { "clip_ratio": 0.0, "completion_length": 1383.3058776855469, "epoch": 0.9827496079456352, "grad_norm": 5.442416667938232, "kl": 2.8203125, "learning_rate": 3.0238514820348115e-08, "loss": 0.1895, "reward": 0.4804687798023224, "reward_std": 0.2625104747712612, "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455915205180645, "step": 3290 }, { "clip_ratio": 0.0, "completion_length": 1432.122802734375, "epoch": 0.9830483160331566, "grad_norm": 3.1376352310180664, "kl": 2.763671875, "learning_rate": 3.0230221649656715e-08, "loss": 0.172, "reward": 0.5725446715950966, "reward_std": 0.2769414223730564, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474330373108387, "step": 3291 }, { "clip_ratio": 0.0, "completion_length": 1492.2054443359375, "epoch": 0.9833470241206781, "grad_norm": 3.274275541305542, "kl": 2.89453125, "learning_rate": 3.022207509527532e-08, "loss": 0.2364, "reward": 0.5412946715950966, "reward_std": 0.26304449886083603, "rewards/accuracy_reward": 0.08928571967408061, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4520089477300644, "step": 3292 }, { "clip_ratio": 0.0, "completion_length": 1427.8326721191406, "epoch": 0.9836457322081995, "grad_norm": 7.249435901641846, "kl": 2.21484375, "learning_rate": 3.0214075166066556e-08, "loss": 0.1382, "reward": 0.595982164144516, "reward_std": 0.2909509241580963, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4910714626312256, "step": 3293 }, { "clip_ratio": 0.0, "completion_length": 1487.2098999023438, "epoch": 0.983944440295721, "grad_norm": 5.565303802490234, "kl": 2.28125, "learning_rate": 3.020622187073357e-08, "loss": 0.1466, "reward": 0.491071455180645, "reward_std": 0.278594221919775, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250223517418, "step": 3294 }, { "clip_ratio": 0.0, "completion_length": 1458.9576416015625, "epoch": 0.9842431483832424, "grad_norm": 3.658068895339966, "kl": 2.369140625, "learning_rate": 3.0198515217819974e-08, "loss": 0.1894, "reward": 0.5675223395228386, "reward_std": 0.2637890875339508, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223469734192, "step": 3295 }, { "clip_ratio": 0.0, "completion_length": 1547.8125915527344, "epoch": 0.984541856470764, "grad_norm": 4.155376434326172, "kl": 2.611328125, "learning_rate": 3.019095521570987e-08, "loss": 0.1835, "reward": 0.5212053954601288, "reward_std": 0.22364426404237747, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303805589676, "step": 3296 }, { "clip_ratio": 0.0, "completion_length": 1514.227783203125, "epoch": 0.9848405645582854, "grad_norm": 5.369364261627197, "kl": 2.671875, "learning_rate": 3.0183541872627765e-08, "loss": 0.1807, "reward": 0.584263414144516, "reward_std": 0.2977071702480316, "rewards/accuracy_reward": 0.08928571594879031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494977705180645, "step": 3297 }, { "clip_ratio": 0.0, "completion_length": 1457.357177734375, "epoch": 0.9851392726458069, "grad_norm": 4.452979564666748, "kl": 2.671875, "learning_rate": 3.017627519663869e-08, "loss": 0.2104, "reward": 0.5200893059372902, "reward_std": 0.25660115852952003, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455357164144516, "step": 3298 }, { "clip_ratio": 0.0, "completion_length": 1476.1630554199219, "epoch": 0.9854379807333283, "grad_norm": 4.120031833648682, "kl": 2.65625, "learning_rate": 3.016915519564803e-08, "loss": 0.1843, "reward": 0.5267857387661934, "reward_std": 0.23747417703270912, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 3299 }, { "clip_ratio": 0.0, "completion_length": 1475.6116638183594, "epoch": 0.9857366888208499, "grad_norm": 4.1395649909973145, "kl": 2.439453125, "learning_rate": 3.0162181877401696e-08, "loss": 0.1552, "reward": 0.5217634215950966, "reward_std": 0.28295962512493134, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4726562723517418, "step": 3300 }, { "clip_ratio": 0.0, "completion_length": 1406.294677734375, "epoch": 0.9860353969083713, "grad_norm": 4.28291654586792, "kl": 2.5859375, "learning_rate": 3.0155355249485956e-08, "loss": 0.205, "reward": 0.6210937798023224, "reward_std": 0.2556511461734772, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366305589676, "step": 3301 }, { "clip_ratio": 0.0, "completion_length": 1509.2366638183594, "epoch": 0.9863341049958928, "grad_norm": 4.604603290557861, "kl": 2.5859375, "learning_rate": 3.0148675319327484e-08, "loss": 0.1995, "reward": 0.4676339477300644, "reward_std": 0.25382040813565254, "rewards/accuracy_reward": 0.02678571525029838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4408482313156128, "step": 3302 }, { "clip_ratio": 0.0, "completion_length": 1427.6072082519531, "epoch": 0.9866328130834142, "grad_norm": 20.122413635253906, "kl": 2.1953125, "learning_rate": 3.0142142094193444e-08, "loss": 0.202, "reward": 0.5619419887661934, "reward_std": 0.28606799989938736, "rewards/accuracy_reward": 0.06919643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927455633878708, "step": 3303 }, { "clip_ratio": 0.0, "completion_length": 1476.58935546875, "epoch": 0.9869315211709357, "grad_norm": 6.230081558227539, "kl": 2.5234375, "learning_rate": 3.01357555811913e-08, "loss": 0.1965, "reward": 0.571986623108387, "reward_std": 0.2432853803038597, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330559372902, "step": 3304 }, { "clip_ratio": 0.0, "completion_length": 1497.0112609863281, "epoch": 0.9872302292584572, "grad_norm": 13.915817260742188, "kl": 2.89453125, "learning_rate": 3.0129515787268956e-08, "loss": 0.1734, "reward": 0.5223214477300644, "reward_std": 0.23075947165489197, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4709821715950966, "step": 3305 }, { "clip_ratio": 0.0, "completion_length": 1545.4554138183594, "epoch": 0.9875289373459787, "grad_norm": 5.355803966522217, "kl": 2.86328125, "learning_rate": 3.012342271921472e-08, "loss": 0.209, "reward": 0.537388414144516, "reward_std": 0.26869161799550056, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4614955559372902, "step": 3306 }, { "clip_ratio": 0.0, "completion_length": 1434.8995971679688, "epoch": 0.9878276454335001, "grad_norm": 6.2631940841674805, "kl": 2.76171875, "learning_rate": 3.011747638365724e-08, "loss": 0.2037, "reward": 0.5239955559372902, "reward_std": 0.28925689682364464, "rewards/accuracy_reward": 0.06919643003493547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.454799123108387, "step": 3307 }, { "clip_ratio": 0.0, "completion_length": 1471.7054138183594, "epoch": 0.9881263535210216, "grad_norm": 4.653446197509766, "kl": 2.4453125, "learning_rate": 3.011167678706555e-08, "loss": 0.1895, "reward": 0.531808078289032, "reward_std": 0.2848854884505272, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794738650322, "step": 3308 }, { "clip_ratio": 0.0, "completion_length": 1324.4978332519531, "epoch": 0.988425061608543, "grad_norm": 4.8081560134887695, "kl": 2.666015625, "learning_rate": 3.010602393574903e-08, "loss": 0.2449, "reward": 0.6372768133878708, "reward_std": 0.28013139218091965, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160895228386, "step": 3309 }, { "clip_ratio": 0.0, "completion_length": 1420.5312805175781, "epoch": 0.9887237696960646, "grad_norm": 5.3170342445373535, "kl": 2.53125, "learning_rate": 3.0100517835857454e-08, "loss": 0.2, "reward": 0.5770089626312256, "reward_std": 0.30569836869835854, "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4787946492433548, "step": 3310 }, { "clip_ratio": 0.0, "completion_length": 1446.2701721191406, "epoch": 0.989022477783586, "grad_norm": 18.89107894897461, "kl": 2.16796875, "learning_rate": 3.0095158493380886e-08, "loss": 0.1744, "reward": 0.5513393059372902, "reward_std": 0.3001447468996048, "rewards/accuracy_reward": 0.049107145983725786, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5022321566939354, "step": 3311 }, { "clip_ratio": 0.0, "completion_length": 1439.4955749511719, "epoch": 0.9893211858711075, "grad_norm": 5.3701324462890625, "kl": 2.4765625, "learning_rate": 3.0089945914149786e-08, "loss": 0.1361, "reward": 0.4933035895228386, "reward_std": 0.23102691024541855, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486607164144516, "step": 3312 }, { "clip_ratio": 0.0, "completion_length": 1419.49560546875, "epoch": 0.9896198939586289, "grad_norm": 4.2401628494262695, "kl": 1.998046875, "learning_rate": 3.0084880103834926e-08, "loss": 0.12, "reward": 0.5831473469734192, "reward_std": 0.2847651317715645, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580559372902, "step": 3313 }, { "clip_ratio": 0.0, "completion_length": 1488.1205749511719, "epoch": 0.9899186020461505, "grad_norm": 6.813846111297607, "kl": 2.890625, "learning_rate": 3.007996106794741e-08, "loss": 0.1819, "reward": 0.5072544887661934, "reward_std": 0.24607884138822556, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4492187649011612, "step": 3314 }, { "clip_ratio": 0.0, "completion_length": 1523.6116943359375, "epoch": 0.9902173101336719, "grad_norm": 5.417466163635254, "kl": 2.6875, "learning_rate": 3.007518881183867e-08, "loss": 0.1554, "reward": 0.5117187723517418, "reward_std": 0.24431534484028816, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116305589676, "step": 3315 }, { "clip_ratio": 0.0, "completion_length": 1467.15185546875, "epoch": 0.9905160182211933, "grad_norm": 3.3481054306030273, "kl": 2.31640625, "learning_rate": 3.007056334070044e-08, "loss": 0.1326, "reward": 0.561941996216774, "reward_std": 0.2975269742310047, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491305589676, "step": 3316 }, { "clip_ratio": 0.0, "completion_length": 1490.6652526855469, "epoch": 0.9908147263087148, "grad_norm": 8.100129127502441, "kl": 2.84765625, "learning_rate": 3.0066084659564796e-08, "loss": 0.1981, "reward": 0.4994419887661934, "reward_std": 0.23948604613542557, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4436384066939354, "step": 3317 }, { "clip_ratio": 0.0, "completion_length": 1473.4576416015625, "epoch": 0.9911134343962362, "grad_norm": 3.8322105407714844, "kl": 2.6328125, "learning_rate": 3.00617527733041e-08, "loss": 0.1751, "reward": 0.5569196715950966, "reward_std": 0.23604775592684746, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4743303656578064, "step": 3318 }, { "clip_ratio": 0.0, "completion_length": 1422.5469055175781, "epoch": 0.9914121424837578, "grad_norm": 3.9023683071136475, "kl": 2.39453125, "learning_rate": 3.005756768663101e-08, "loss": 0.1703, "reward": 0.5251116305589676, "reward_std": 0.23394690454006195, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.476004496216774, "step": 3319 }, { "clip_ratio": 0.0, "completion_length": 1486.8081359863281, "epoch": 0.9917108505712792, "grad_norm": 4.105138301849365, "kl": 2.46484375, "learning_rate": 3.00535294040985e-08, "loss": 0.1829, "reward": 0.5697544887661934, "reward_std": 0.2558258883655071, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4469866305589676, "step": 3320 }, { "clip_ratio": 0.0, "completion_length": 1413.2076721191406, "epoch": 0.9920095586588007, "grad_norm": 5.042075157165527, "kl": 2.240234375, "learning_rate": 3.004963793009982e-08, "loss": 0.1803, "reward": 0.5714286118745804, "reward_std": 0.28741026669740677, "rewards/accuracy_reward": 0.07366071781143546, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678880095482, "step": 3321 }, { "clip_ratio": 0.0, "completion_length": 1394.7299499511719, "epoch": 0.9923082667463221, "grad_norm": 12.254508972167969, "kl": 2.22265625, "learning_rate": 3.0045893268868526e-08, "loss": 0.1813, "reward": 0.5429687649011612, "reward_std": 0.24594613164663315, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687649011612, "step": 3322 }, { "clip_ratio": 0.0, "completion_length": 1363.5715026855469, "epoch": 0.9926069748338436, "grad_norm": 5.6813740730285645, "kl": 2.20703125, "learning_rate": 3.004229542447842e-08, "loss": 0.1698, "reward": 0.6160714477300644, "reward_std": 0.3351222574710846, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4955357313156128, "step": 3323 }, { "clip_ratio": 0.0, "completion_length": 1478.9420166015625, "epoch": 0.992905682921365, "grad_norm": 14.015979766845703, "kl": 2.421875, "learning_rate": 3.003884440084363e-08, "loss": 0.185, "reward": 0.5011160969734192, "reward_std": 0.27344826608896255, "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4787946715950966, "step": 3324 }, { "clip_ratio": 0.0, "completion_length": 1395.2835083007812, "epoch": 0.9932043910088866, "grad_norm": 6.886924743652344, "kl": 2.17578125, "learning_rate": 3.0035540201718505e-08, "loss": 0.1708, "reward": 0.4933035969734192, "reward_std": 0.27011607587337494, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178805589676, "step": 3325 }, { "clip_ratio": 0.0, "completion_length": 1429.0982666015625, "epoch": 0.993503099096408, "grad_norm": 3.652326822280884, "kl": 2.380859375, "learning_rate": 3.00323828306977e-08, "loss": 0.1904, "reward": 0.5747768133878708, "reward_std": 0.2551848851144314, "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625223517418, "step": 3326 }, { "clip_ratio": 0.0, "completion_length": 1518.5291137695312, "epoch": 0.9938018071839295, "grad_norm": 4.867300987243652, "kl": 2.53515625, "learning_rate": 3.0029372291216124e-08, "loss": 0.1683, "reward": 0.5167410969734192, "reward_std": 0.24433667585253716, "rewards/accuracy_reward": 0.05133928684517741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4654017984867096, "step": 3327 }, { "clip_ratio": 0.0, "completion_length": 1368.2388916015625, "epoch": 0.9941005152714509, "grad_norm": 4.401246547698975, "kl": 2.171875, "learning_rate": 3.0026508586548965e-08, "loss": 0.1779, "reward": 0.5368303954601288, "reward_std": 0.29430899769067764, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4877232313156128, "step": 3328 }, { "clip_ratio": 0.0, "completion_length": 1491.6250610351562, "epoch": 0.9943992233589725, "grad_norm": 7.199917793273926, "kl": 2.220703125, "learning_rate": 3.0023791719811644e-08, "loss": 0.1366, "reward": 0.585379496216774, "reward_std": 0.30137407034635544, "rewards/accuracy_reward": 0.11830357369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467075914144516, "step": 3329 }, { "clip_ratio": 0.0, "completion_length": 1488.0916137695312, "epoch": 0.9946979314464939, "grad_norm": 3.472311496734619, "kl": 2.44921875, "learning_rate": 3.002122169395982e-08, "loss": 0.1614, "reward": 0.5262277126312256, "reward_std": 0.3050765097141266, "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4748884066939354, "step": 3330 }, { "clip_ratio": 0.0, "completion_length": 1414.3170166015625, "epoch": 0.9949966395340154, "grad_norm": 6.337196350097656, "kl": 2.0703125, "learning_rate": 3.0018798511789464e-08, "loss": 0.1178, "reward": 0.5150669887661934, "reward_std": 0.28897804021835327, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474888414144516, "step": 3331 }, { "clip_ratio": 0.0, "completion_length": 1524.4241638183594, "epoch": 0.9952953476215368, "grad_norm": 3.8562142848968506, "kl": 2.48828125, "learning_rate": 3.001652217593675e-08, "loss": 0.1257, "reward": 0.5563616305589676, "reward_std": 0.25369075685739517, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4849330633878708, "step": 3332 }, { "clip_ratio": 0.0, "completion_length": 1585.8995971679688, "epoch": 0.9955940557090583, "grad_norm": 7.351052284240723, "kl": 2.2734375, "learning_rate": 3.0014392688878104e-08, "loss": 0.1561, "reward": 0.5636160969734192, "reward_std": 0.26217906549572945, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4564732313156128, "step": 3333 }, { "clip_ratio": 0.0, "completion_length": 1420.404052734375, "epoch": 0.9958927637965798, "grad_norm": 10.565464973449707, "kl": 2.685546875, "learning_rate": 3.00124100529302e-08, "loss": 0.1814, "reward": 0.5647321715950966, "reward_std": 0.2537923790514469, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470982164144516, "step": 3334 }, { "clip_ratio": 0.0, "completion_length": 1424.2656860351562, "epoch": 0.9961914718841013, "grad_norm": 5.462636470794678, "kl": 2.36328125, "learning_rate": 3.001057427024995e-08, "loss": 0.1351, "reward": 0.5496652126312256, "reward_std": 0.27770889922976494, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 3335 }, { "clip_ratio": 0.0, "completion_length": 1533.1786193847656, "epoch": 0.9964901799716227, "grad_norm": 5.104606628417969, "kl": 3.14453125, "learning_rate": 3.00088853428345e-08, "loss": 0.2074, "reward": 0.5926339626312256, "reward_std": 0.2540331333875656, "rewards/accuracy_reward": 0.125000003259629, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339477300644, "step": 3336 }, { "clip_ratio": 0.0, "completion_length": 1541.7322082519531, "epoch": 0.9967888880591442, "grad_norm": 3.5976932048797607, "kl": 2.59375, "learning_rate": 3.0007343272521256e-08, "loss": 0.1742, "reward": 0.5502232536673546, "reward_std": 0.23611819744110107, "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463169664144516, "step": 3337 }, { "clip_ratio": 0.0, "completion_length": 1470.8058776855469, "epoch": 0.9970875961466656, "grad_norm": 9.176896095275879, "kl": 2.560546875, "learning_rate": 3.000594806098783e-08, "loss": 0.1993, "reward": 0.5859375447034836, "reward_std": 0.27115510404109955, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4966518133878708, "step": 3338 }, { "clip_ratio": 0.0, "completion_length": 1407.2255554199219, "epoch": 0.9973863042341872, "grad_norm": 19.693614959716797, "kl": 2.013671875, "learning_rate": 3.000469970975207e-08, "loss": 0.1563, "reward": 0.608816996216774, "reward_std": 0.2736416347324848, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5128348469734192, "step": 3339 }, { "clip_ratio": 0.0, "completion_length": 1418.2165832519531, "epoch": 0.9976850123217086, "grad_norm": 8.878547668457031, "kl": 2.41015625, "learning_rate": 3.000359822017207e-08, "loss": 0.1887, "reward": 0.5558035969734192, "reward_std": 0.29776983708143234, "rewards/accuracy_reward": 0.08258929126895964, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4732143059372902, "step": 3340 }, { "clip_ratio": 0.0, "completion_length": 1444.4063110351562, "epoch": 0.9979837204092301, "grad_norm": 3.899360179901123, "kl": 2.52734375, "learning_rate": 3.000264359344615e-08, "loss": 0.1726, "reward": 0.5033482313156128, "reward_std": 0.23715965077280998, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4520089477300644, "step": 3341 }, { "clip_ratio": 0.0, "completion_length": 1457.0647583007812, "epoch": 0.9982824284967515, "grad_norm": 6.4891037940979, "kl": 2.240234375, "learning_rate": 3.0001835830612807e-08, "loss": 0.166, "reward": 0.6422991305589676, "reward_std": 0.329277940094471, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5016741156578064, "step": 3342 }, { "clip_ratio": 0.0, "completion_length": 1414.9107971191406, "epoch": 0.998581136584273, "grad_norm": 5.683371067047119, "kl": 2.69921875, "learning_rate": 3.000117493255086e-08, "loss": 0.1789, "reward": 0.588727705180645, "reward_std": 0.3061058185994625, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4860491305589676, "step": 3343 }, { "clip_ratio": 0.0, "completion_length": 1369.1094360351562, "epoch": 0.9988798446717945, "grad_norm": 4.522483825683594, "kl": 2.6484375, "learning_rate": 3.000066089997928e-08, "loss": 0.1634, "reward": 0.546316996216774, "reward_std": 0.25991878286004066, "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169813156128, "step": 3344 }, { "clip_ratio": 0.0, "completion_length": 1526.0379943847656, "epoch": 0.999178552759316, "grad_norm": 8.557799339294434, "kl": 2.8046875, "learning_rate": 3.000029373345726e-08, "loss": 0.1496, "reward": 0.5301339626312256, "reward_std": 0.20869986712932587, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478794664144516, "step": 3345 }, { "clip_ratio": 0.0, "completion_length": 1454.3125915527344, "epoch": 0.9994772608468374, "grad_norm": 8.543980598449707, "kl": 2.63671875, "learning_rate": 3.000007343338428e-08, "loss": 0.186, "reward": 0.5407366305589676, "reward_std": 0.2533954605460167, "rewards/accuracy_reward": 0.0937500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.446986623108387, "step": 3346 }, { "clip_ratio": 0.0, "completion_length": 1526.5000915527344, "epoch": 0.9997759689343589, "grad_norm": 3.87550950050354, "kl": 2.765625, "learning_rate": 3e-08, "loss": 0.2047, "reward": 0.5362723469734192, "reward_std": 0.25890685617923737, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4603794887661934, "step": 3347 }, { "epoch": 0.9997759689343589, "step": 3347, "total_flos": 0.0, "train_loss": 0.055864378488265046, "train_runtime": 77081.5309, "train_samples_per_second": 1.216, "train_steps_per_second": 0.043 } ], "logging_steps": 1, "max_steps": 3347, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }