{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.856898029134533, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1549.7083435058594, "epoch": 0.001713796058269066, "grad_norm": 0.2566092312335968, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0569, "reward": 0.6208184361457825, "reward_std": 0.9855608642101288, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.18897756189107895, "rewards/format_reward": 0.2916666753590107, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1414.0417175292969, "epoch": 0.003427592116538132, "grad_norm": 0.29864758253097534, "kl": 0.0, "learning_rate": 4e-08, "loss": -0.0809, "reward": 0.1427215114235878, "reward_std": 0.6335150748491287, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.25154703110456467, "rewards/format_reward": 0.2291666753590107, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 1390.2917175292969, "epoch": 0.005141388174807198, "grad_norm": 0.29091691970825195, "kl": 9.447336196899414e-05, "learning_rate": 6e-08, "loss": 0.0646, "reward": 0.4808660186827183, "reward_std": 0.47989463061094284, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.20031768828630447, "rewards/format_reward": 0.2812500102445483, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 1567.9375305175781, "epoch": 0.006855184233076264, "grad_norm": 0.26566898822784424, "kl": 7.075071334838867e-05, "learning_rate": 8e-08, "loss": -0.0258, "reward": 0.18869880307465792, "reward_std": 0.7589461207389832, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.21264867670834064, "rewards/format_reward": 0.1979166716337204, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 1479.9167175292969, "epoch": 0.00856898029134533, "grad_norm": 0.2975413203239441, "kl": 7.289648056030273e-05, "learning_rate": 1e-07, "loss": -0.0589, "reward": 0.8189541846513748, "reward_std": 0.8007446154952049, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.1580449752509594, "rewards/format_reward": 0.2604166716337204, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1646.7917175292969, "epoch": 0.010282776349614395, "grad_norm": 0.23834176361560822, "kl": 8.058547973632812e-05, "learning_rate": 1.2e-07, "loss": 0.0029, "reward": -0.04225504118949175, "reward_std": 0.553499698638916, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2690293677151203, "rewards/format_reward": 0.17708334140479565, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 1165.6458435058594, "epoch": 0.011996572407883462, "grad_norm": 0.3262133002281189, "kl": 0.00012004375457763672, "learning_rate": 1.4e-07, "loss": -0.0617, "reward": 1.1578989699482918, "reward_std": 0.9958820343017578, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": -0.11706694308668375, "rewards/format_reward": 0.4583333507180214, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 1534.3958740234375, "epoch": 0.013710368466152529, "grad_norm": 0.24637284874916077, "kl": 6.645917892456055e-05, "learning_rate": 1.6e-07, "loss": -0.0325, "reward": 0.790864821523428, "reward_std": 1.2760685980319977, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.1795810554176569, "rewards/format_reward": 0.3229166753590107, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 1406.0833740234375, "epoch": 0.015424164524421594, "grad_norm": 0.3252864181995392, "kl": 6.35385513305664e-05, "learning_rate": 1.8e-07, "loss": 0.0611, "reward": 0.46893399208784103, "reward_std": 0.7476691864430904, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.21526261791586876, "rewards/format_reward": 0.2812500111758709, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 1481.1042022705078, "epoch": 0.01713796058269066, "grad_norm": 0.3649034798145294, "kl": 0.00012791156768798828, "learning_rate": 2e-07, "loss": 0.0039, "reward": 0.5377852376550436, "reward_std": 0.8327888622879982, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.17093832325190306, "rewards/format_reward": 0.2187500111758709, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1507.2500305175781, "epoch": 0.018851756640959727, "grad_norm": 0.3549154996871948, "kl": 8.279085159301758e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0096, "reward": 0.625231646001339, "reward_std": 0.7666523866355419, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.17873376421630383, "rewards/format_reward": 0.2291666753590107, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 1587.4167175292969, "epoch": 0.02056555269922879, "grad_norm": 0.251277357339859, "kl": 7.414817810058594e-05, "learning_rate": 2.4e-07, "loss": -0.031, "reward": 0.2848171964287758, "reward_std": 0.8330317437648773, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.2284027300775051, "rewards/format_reward": 0.2187500074505806, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1385.5000305175781, "epoch": 0.022279348757497857, "grad_norm": 0.3666461408138275, "kl": 8.702278137207031e-05, "learning_rate": 2.6e-07, "loss": 0.104, "reward": 0.657318189740181, "reward_std": 0.7007318809628487, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.23300136625766754, "rewards/format_reward": 0.3854166744276881, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 1418.2917175292969, "epoch": 0.023993144815766924, "grad_norm": 0.30520063638687134, "kl": 0.00011420249938964844, "learning_rate": 2.8e-07, "loss": -0.0065, "reward": 0.4347895681858063, "reward_std": 0.5189282819628716, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.1626796661876142, "rewards/format_reward": 0.1666666716337204, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 1485.1875305175781, "epoch": 0.02570694087403599, "grad_norm": 0.2740783393383026, "kl": 0.00011640787124633789, "learning_rate": 3e-07, "loss": -0.0048, "reward": -0.019001017324626446, "reward_std": 0.6500238478183746, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.26481105387210846, "rewards/format_reward": 0.13541667442768812, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1309.9167175292969, "epoch": 0.027420736932305057, "grad_norm": 0.3026612102985382, "kl": 7.218122482299805e-05, "learning_rate": 3.2e-07, "loss": 0.023, "reward": 0.9514074325561523, "reward_std": 0.8899444937705994, "rewards/accuracy_reward": 0.09375, "rewards/cosine_scaled_reward": -0.17226803209632635, "rewards/format_reward": 0.4270833358168602, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 1449.9166870117188, "epoch": 0.02913453299057412, "grad_norm": 0.27961981296539307, "kl": 7.82012939453125e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0057, "reward": 0.5791401639580727, "reward_std": 0.6529509723186493, "rewards/accuracy_reward": 0.08333333488553762, "rewards/cosine_scaled_reward": -0.19404403865337372, "rewards/format_reward": 0.18750000651925802, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1511.6250305175781, "epoch": 0.030848329048843187, "grad_norm": 0.2333238124847412, "kl": 8.535385131835938e-05, "learning_rate": 3.6e-07, "loss": 0.0385, "reward": 0.3788021163782105, "reward_std": 0.9533726572990417, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.20547283068299294, "rewards/format_reward": 0.1770833395421505, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 1607.5833740234375, "epoch": 0.032562125107112254, "grad_norm": 0.3147532343864441, "kl": 8.320808410644531e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0074, "reward": 0.3463876098394394, "reward_std": 0.7631522864103317, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2376747578382492, "rewards/format_reward": 0.3020833432674408, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1680.8333740234375, "epoch": 0.03427592116538132, "grad_norm": 0.3314201533794403, "kl": 6.365776062011719e-05, "learning_rate": 4e-07, "loss": 0.0129, "reward": 0.08592129033058882, "reward_std": 0.8577414005994797, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.26088032126426697, "rewards/format_reward": 0.1354166716337204, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1422.8541870117188, "epoch": 0.03598971722365039, "grad_norm": 0.2986367642879486, "kl": 8.559226989746094e-05, "learning_rate": 4.1999999999999995e-07, "loss": -0.1152, "reward": 0.545179876498878, "reward_std": 0.853977307677269, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.23388315364718437, "rewards/format_reward": 0.2812500176951289, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 1561.0833740234375, "epoch": 0.037703513281919454, "grad_norm": 0.33064112067222595, "kl": 9.429454803466797e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0381, "reward": -0.020190313458442688, "reward_std": 0.42865169048309326, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2501176781952381, "rewards/format_reward": 0.15625000558793545, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1617.5000305175781, "epoch": 0.03941730934018852, "grad_norm": 0.24645616114139557, "kl": 5.990266799926758e-05, "learning_rate": 4.6e-07, "loss": 0.0022, "reward": -0.061072273179888725, "reward_std": 0.5299023687839508, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.32925406843423843, "rewards/format_reward": 0.2500000111758709, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 1456.9375305175781, "epoch": 0.04113110539845758, "grad_norm": 0.2407306581735611, "kl": 7.05718994140625e-05, "learning_rate": 4.8e-07, "loss": -0.0391, "reward": 1.0003801509737968, "reward_std": 0.8640259802341461, "rewards/accuracy_reward": 0.11458333488553762, "rewards/cosine_scaled_reward": -0.15404087863862514, "rewards/format_reward": 0.32291666977107525, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1410.9792175292969, "epoch": 0.04284490145672665, "grad_norm": 0.2708369791507721, "kl": 8.213520050048828e-05, "learning_rate": 5e-07, "loss": 0.0349, "reward": 0.4050113819539547, "reward_std": 0.7914649695158005, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.19529718905687332, "rewards/format_reward": 0.31250000558793545, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1593.2917175292969, "epoch": 0.044558697514995714, "grad_norm": 0.27268657088279724, "kl": 6.699562072753906e-05, "learning_rate": 5.2e-07, "loss": 0.0223, "reward": 0.3962297812104225, "reward_std": 0.945412665605545, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.24394675344228745, "rewards/format_reward": 0.2604166753590107, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 1592.9375305175781, "epoch": 0.04627249357326478, "grad_norm": 0.25189143419265747, "kl": 7.450580596923828e-05, "learning_rate": 5.4e-07, "loss": -0.035, "reward": 0.3084355629980564, "reward_std": 0.7038140445947647, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.19312112405896187, "rewards/format_reward": 0.19791667442768812, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1407.7500305175781, "epoch": 0.04798628963153385, "grad_norm": 0.276398241519928, "kl": 9.196996688842773e-05, "learning_rate": 5.6e-07, "loss": -0.0933, "reward": 1.1628794074058533, "reward_std": 1.145683228969574, "rewards/accuracy_reward": 0.10416666697710752, "rewards/cosine_scaled_reward": -0.15041928738355637, "rewards/format_reward": 0.4062500149011612, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 1545.0000305175781, "epoch": 0.049700085689802914, "grad_norm": 0.2882435619831085, "kl": 6.723403930664062e-05, "learning_rate": 5.8e-07, "loss": 0.0013, "reward": 0.19917990267276764, "reward_std": 0.7036862522363663, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2851256728172302, "rewards/format_reward": 0.22916666977107525, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 1446.9375, "epoch": 0.05141388174807198, "grad_norm": 0.3235947787761688, "kl": 9.739398956298828e-05, "learning_rate": 6e-07, "loss": -0.0015, "reward": 0.8554540276527405, "reward_std": 0.9236274063587189, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.1764096152037382, "rewards/format_reward": 0.385416679084301, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1410.3750305175781, "epoch": 0.05312767780634105, "grad_norm": 0.35821714997291565, "kl": 6.109476089477539e-05, "learning_rate": 6.2e-07, "loss": 0.1537, "reward": 0.8505711704492569, "reward_std": 0.775887742638588, "rewards/accuracy_reward": 0.10416666697710752, "rewards/cosine_scaled_reward": -0.18543021008372307, "rewards/format_reward": 0.27083333767950535, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 1427.0000610351562, "epoch": 0.054841473864610114, "grad_norm": 0.29756107926368713, "kl": 8.952617645263672e-05, "learning_rate": 6.4e-07, "loss": 0.0014, "reward": 0.2789556197822094, "reward_std": 0.6537229269742966, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.25284862518310547, "rewards/format_reward": 0.28125000558793545, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 1226.0625457763672, "epoch": 0.056555269922879174, "grad_norm": 0.311769038438797, "kl": 5.3822994232177734e-05, "learning_rate": 6.6e-07, "loss": 0.0657, "reward": 1.2427364438772202, "reward_std": 0.8947374746203423, "rewards/accuracy_reward": 0.12500000465661287, "rewards/cosine_scaled_reward": -0.14077210240066051, "rewards/format_reward": 0.4062500149011612, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 1463.5625610351562, "epoch": 0.05826906598114824, "grad_norm": 0.2749904990196228, "kl": 7.456541061401367e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0151, "reward": 0.2089916095137596, "reward_std": 0.7645938321948051, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2636757530272007, "rewards/format_reward": 0.2083333358168602, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 1574.1250610351562, "epoch": 0.05998286203941731, "grad_norm": 0.2814057469367981, "kl": 6.365776062011719e-05, "learning_rate": 7e-07, "loss": -0.0354, "reward": 0.4698517946526408, "reward_std": 0.7770050093531609, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.2284439541399479, "rewards/format_reward": 0.3125000149011612, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1329.6458740234375, "epoch": 0.061696658097686374, "grad_norm": 0.29900285601615906, "kl": 0.00013148784637451172, "learning_rate": 7.2e-07, "loss": 0.0355, "reward": 0.38275275379419327, "reward_std": 0.5484570488333702, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.2323998138308525, "rewards/format_reward": 0.291666679084301, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 1505.4583740234375, "epoch": 0.06341045415595545, "grad_norm": 0.23520199954509735, "kl": 8.672475814819336e-05, "learning_rate": 7.4e-07, "loss": 0.0243, "reward": 0.3799854665994644, "reward_std": 0.775396078824997, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.2143558170646429, "rewards/format_reward": 0.16666667442768812, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 1575.1041870117188, "epoch": 0.06512425021422451, "grad_norm": 0.28803354501724243, "kl": 6.490945816040039e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0042, "reward": 0.08564997464418411, "reward_std": 0.524849109351635, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2424239031970501, "rewards/format_reward": 0.20833333767950535, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 1587.0625305175781, "epoch": 0.06683804627249357, "grad_norm": 0.2588130235671997, "kl": 7.474422454833984e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0004, "reward": 0.31661053746938705, "reward_std": 0.6547420993447304, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.211146367713809, "rewards/format_reward": 0.17708333861082792, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 1419.9375610351562, "epoch": 0.06855184233076264, "grad_norm": 0.27477696537971497, "kl": 6.985664367675781e-05, "learning_rate": 8e-07, "loss": -0.1003, "reward": 1.1732516959309578, "reward_std": 0.8123161420226097, "rewards/accuracy_reward": 0.1250000037252903, "rewards/cosine_scaled_reward": -0.08519345941022038, "rewards/format_reward": 0.2708333386108279, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 1474.6458740234375, "epoch": 0.0702656383890317, "grad_norm": 0.27839529514312744, "kl": 7.69495964050293e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0605, "reward": 1.1124837547540665, "reward_std": 1.0182685256004333, "rewards/accuracy_reward": 0.11458333488553762, "rewards/cosine_scaled_reward": -0.15390180423855782, "rewards/format_reward": 0.3125000074505806, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 1352.8333740234375, "epoch": 0.07197943444730077, "grad_norm": 0.2626473307609558, "kl": 5.447864532470703e-05, "learning_rate": 8.399999999999999e-07, "loss": -0.0291, "reward": 0.4686532625928521, "reward_std": 0.6282949075102806, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.24288901686668396, "rewards/format_reward": 0.4479166753590107, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 1543.5625610351562, "epoch": 0.07369323050556983, "grad_norm": 0.2575068473815918, "kl": 6.371736526489258e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0766, "reward": 1.0126795023679733, "reward_std": 0.9010422676801682, "rewards/accuracy_reward": 0.1145833358168602, "rewards/cosine_scaled_reward": -0.14877209067344666, "rewards/format_reward": 0.3333333386108279, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 1446.5833740234375, "epoch": 0.07540702656383891, "grad_norm": 0.2544824481010437, "kl": 5.650520324707031e-05, "learning_rate": 8.799999999999999e-07, "loss": -0.0808, "reward": 0.5173629745841026, "reward_std": 0.7816298678517342, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.21039017662405968, "rewards/format_reward": 0.3750000037252903, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 1529.8542175292969, "epoch": 0.07712082262210797, "grad_norm": 0.26859909296035767, "kl": 0.00012755393981933594, "learning_rate": 9e-07, "loss": 0.0018, "reward": 0.5219013094902039, "reward_std": 0.7768488749861717, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.14432941935956478, "rewards/format_reward": 0.2083333432674408, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1248.5000610351562, "epoch": 0.07883461868037704, "grad_norm": 0.37795937061309814, "kl": 0.00010341405868530273, "learning_rate": 9.2e-07, "loss": 0.0897, "reward": 0.8892228305339813, "reward_std": 0.9948980510234833, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.18321672268211842, "rewards/format_reward": 0.4791666865348816, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 1435.1875305175781, "epoch": 0.0805484147386461, "grad_norm": 0.262204110622406, "kl": 9.328126907348633e-05, "learning_rate": 9.399999999999999e-07, "loss": -0.0252, "reward": 0.5409297198057175, "reward_std": 0.761884793639183, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.2008819542825222, "rewards/format_reward": 0.322916679084301, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1400.5000305175781, "epoch": 0.08226221079691516, "grad_norm": 0.2843024432659149, "kl": 7.283687591552734e-05, "learning_rate": 9.6e-07, "loss": 0.0258, "reward": 0.81534643471241, "reward_std": 1.0338954292237759, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.15888767689466476, "rewards/format_reward": 0.3750000074505806, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 1199.875, "epoch": 0.08397600685518423, "grad_norm": 0.35097768902778625, "kl": 0.00010097026824951172, "learning_rate": 9.8e-07, "loss": -0.0198, "reward": 1.327251985669136, "reward_std": 1.300454005599022, "rewards/accuracy_reward": 0.11458333488553762, "rewards/cosine_scaled_reward": -0.12861149292439222, "rewards/format_reward": 0.510416679084301, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 1303.0, "epoch": 0.0856898029134533, "grad_norm": 0.34222906827926636, "kl": 0.00014340877532958984, "learning_rate": 1e-06, "loss": 0.1216, "reward": 0.4692706950008869, "reward_std": 0.7595674097537994, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2138059325516224, "rewards/format_reward": 0.4270833507180214, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1519.9166870117188, "epoch": 0.08740359897172237, "grad_norm": 0.24351148307323456, "kl": 6.276369094848633e-05, "learning_rate": 9.999890338174275e-07, "loss": -0.0212, "reward": 0.6564827468246222, "reward_std": 0.5184785276651382, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.18480780348181725, "rewards/format_reward": 0.2500000037252903, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 1529.5833740234375, "epoch": 0.08911739502999143, "grad_norm": 0.2733688950538635, "kl": 9.131431579589844e-05, "learning_rate": 9.999561358041868e-07, "loss": -0.0305, "reward": 0.31063079461455345, "reward_std": 0.9850436672568321, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2166740708053112, "rewards/format_reward": 0.17708333861082792, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 1467.9166870117188, "epoch": 0.0908311910882605, "grad_norm": 0.25451886653900146, "kl": 0.00011324882507324219, "learning_rate": 9.999013075636804e-07, "loss": 0.0124, "reward": 0.25967272371053696, "reward_std": 0.6069408655166626, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.25990210846066475, "rewards/format_reward": 0.28125, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 1637.4791870117188, "epoch": 0.09254498714652956, "grad_norm": 0.23521429300308228, "kl": 4.8041343688964844e-05, "learning_rate": 9.998245517681593e-07, "loss": -0.0419, "reward": -0.009522376582026482, "reward_std": 0.6713765487074852, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.25045548751950264, "rewards/format_reward": 0.15625000558793545, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1639.5625305175781, "epoch": 0.09425878320479864, "grad_norm": 0.2962999641895294, "kl": 6.145238876342773e-05, "learning_rate": 9.997258721585931e-07, "loss": 0.0274, "reward": 0.05718616710510105, "reward_std": 0.5747075229883194, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.23509091138839722, "rewards/format_reward": 0.1666666679084301, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1558.6458740234375, "epoch": 0.0959725792630677, "grad_norm": 0.3224487602710724, "kl": 0.0001246929168701172, "learning_rate": 9.996052735444862e-07, "loss": -0.0361, "reward": 0.555650869384408, "reward_std": 0.9909381493926048, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.18972268514335155, "rewards/format_reward": 0.27083334140479565, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1691.5833740234375, "epoch": 0.09768637532133675, "grad_norm": 0.25908270478248596, "kl": 5.307793617248535e-05, "learning_rate": 9.994627618036452e-07, "loss": 0.0389, "reward": 0.4178902134299278, "reward_std": 0.6143478825688362, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.16642580553889275, "rewards/format_reward": 0.1875000074505806, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1550.7083435058594, "epoch": 0.09940017137960583, "grad_norm": 0.26480039954185486, "kl": 9.274482727050781e-05, "learning_rate": 9.992983438818915e-07, "loss": 0.0277, "reward": 0.3206222988665104, "reward_std": 0.7064706832170486, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.19392597675323486, "rewards/format_reward": 0.23958333395421505, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 1367.5208435058594, "epoch": 0.10111396743787489, "grad_norm": 0.3325769007205963, "kl": 0.00011909008026123047, "learning_rate": 9.991120277927223e-07, "loss": -0.0285, "reward": 0.725213622674346, "reward_std": 0.788853645324707, "rewards/accuracy_reward": 0.08333333488553762, "rewards/cosine_scaled_reward": -0.21251679956912994, "rewards/format_reward": 0.2812500149011612, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 1617.6041870117188, "epoch": 0.10282776349614396, "grad_norm": 0.26701754331588745, "kl": 7.194280624389648e-05, "learning_rate": 9.989038226169207e-07, "loss": 0.0283, "reward": 0.17363526113331318, "reward_std": 0.38850264623761177, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.1875241920351982, "rewards/format_reward": 0.13541666977107525, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1648.0416870117188, "epoch": 0.10454155955441302, "grad_norm": 0.33412033319473267, "kl": 6.657838821411133e-05, "learning_rate": 9.98673738502114e-07, "loss": -0.0192, "reward": 0.20339430589228868, "reward_std": 0.7374705746769905, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.23108146712183952, "rewards/format_reward": 0.2187500074505806, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1308.2500305175781, "epoch": 0.1062553556126821, "grad_norm": 0.3093053102493286, "kl": 0.0001016855239868164, "learning_rate": 9.98421786662277e-07, "loss": -0.1007, "reward": 1.0369166433811188, "reward_std": 1.086327701807022, "rewards/accuracy_reward": 0.10416666977107525, "rewards/cosine_scaled_reward": -0.15324707701802254, "rewards/format_reward": 0.354166679084301, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 1690.1458740234375, "epoch": 0.10796915167095116, "grad_norm": 0.2637367248535156, "kl": 7.319450378417969e-05, "learning_rate": 9.981479793771866e-07, "loss": 0.0201, "reward": -0.04099972918629646, "reward_std": 0.5225711427628994, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2726781591773033, "rewards/format_reward": 0.1354166716337204, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1428.8333740234375, "epoch": 0.10968294772922023, "grad_norm": 0.31131911277770996, "kl": 0.00010216236114501953, "learning_rate": 9.97852329991824e-07, "loss": 0.0963, "reward": 0.3072669580578804, "reward_std": 0.6717003732919693, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.24537866562604904, "rewards/format_reward": 0.250000006519258, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 1547.7917175292969, "epoch": 0.11139674378748929, "grad_norm": 0.3028079569339752, "kl": 9.393692016601562e-05, "learning_rate": 9.975348529157229e-07, "loss": -0.0456, "reward": 0.39046264067292213, "reward_std": 0.4245295375585556, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.18120174296200275, "rewards/format_reward": 0.09375000279396772, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1437.0417175292969, "epoch": 0.11311053984575835, "grad_norm": 0.34703078866004944, "kl": 7.510185241699219e-05, "learning_rate": 9.971955636222684e-07, "loss": -0.0542, "reward": 1.3410741090774536, "reward_std": 0.6576060205698013, "rewards/accuracy_reward": 0.13541667070239782, "rewards/cosine_scaled_reward": -0.11065312474966049, "rewards/format_reward": 0.354166679084301, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1505.8542175292969, "epoch": 0.11482433590402742, "grad_norm": 0.2783399224281311, "kl": 7.730722427368164e-05, "learning_rate": 9.968344786479415e-07, "loss": 0.0006, "reward": 0.2658543325960636, "reward_std": 0.807809866964817, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2537504807114601, "rewards/format_reward": 0.250000006519258, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1534.6666870117188, "epoch": 0.11653813196229648, "grad_norm": 0.2961093783378601, "kl": 9.751319885253906e-05, "learning_rate": 9.964516155915151e-07, "loss": 0.0957, "reward": 0.23031030222773552, "reward_std": 0.6089238375425339, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.20732926949858665, "rewards/format_reward": 0.22916667349636555, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1538.8750610351562, "epoch": 0.11825192802056556, "grad_norm": 0.2661342918872833, "kl": 9.822845458984375e-05, "learning_rate": 9.960469931131936e-07, "loss": -0.0247, "reward": 0.4905584305524826, "reward_std": 1.0583526492118835, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.20718198269605637, "rewards/format_reward": 0.2708333395421505, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 1440.1875610351562, "epoch": 0.11996572407883462, "grad_norm": 0.28563302755355835, "kl": 9.071826934814453e-05, "learning_rate": 9.956206309337066e-07, "loss": -0.0214, "reward": 1.1916613206267357, "reward_std": 0.6305195689201355, "rewards/accuracy_reward": 0.12500000651925802, "rewards/cosine_scaled_reward": -0.1197041841223836, "rewards/format_reward": 0.354166679084301, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1498.0000610351562, "epoch": 0.12167952013710369, "grad_norm": 0.3835605978965759, "kl": 7.128715515136719e-05, "learning_rate": 9.951725498333448e-07, "loss": 0.0491, "reward": 0.7751804813742638, "reward_std": 1.0542180240154266, "rewards/accuracy_reward": 0.08333333395421505, "rewards/cosine_scaled_reward": -0.17253165692090988, "rewards/format_reward": 0.2291666716337204, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1348.8125610351562, "epoch": 0.12339331619537275, "grad_norm": 0.3369770646095276, "kl": 0.00010955333709716797, "learning_rate": 9.947027716509488e-07, "loss": -0.0223, "reward": 1.3596730416174978, "reward_std": 1.071122221648693, "rewards/accuracy_reward": 0.1354166679084301, "rewards/cosine_scaled_reward": -0.1284898892045021, "rewards/format_reward": 0.4270833469927311, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1523.4167175292969, "epoch": 0.12510711225364182, "grad_norm": 0.38462695479393005, "kl": 0.00013840198516845703, "learning_rate": 9.942113192828444e-07, "loss": 0.0507, "reward": 0.7955479570664465, "reward_std": 0.7627622112631798, "rewards/accuracy_reward": 0.09375, "rewards/cosine_scaled_reward": -0.18039103038609028, "rewards/format_reward": 0.2708333460614085, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 1232.5625457763672, "epoch": 0.1268209083119109, "grad_norm": 0.3747916519641876, "kl": 0.00013077259063720703, "learning_rate": 9.93698216681727e-07, "loss": -0.0132, "reward": 1.1943221688270569, "reward_std": 1.380107969045639, "rewards/accuracy_reward": 0.11458333395421505, "rewards/cosine_scaled_reward": -0.1479830238968134, "rewards/format_reward": 0.46875, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1585.6250305175781, "epoch": 0.12853470437017994, "grad_norm": 0.2654974162578583, "kl": 0.0001055002212524414, "learning_rate": 9.931634888554935e-07, "loss": 0.0357, "reward": 0.0786585733294487, "reward_std": 0.7322740331292152, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2772916667163372, "rewards/format_reward": 0.15625000558793545, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 1137.7708435058594, "epoch": 0.13024850042844902, "grad_norm": 0.33660879731178284, "kl": 0.0001277923583984375, "learning_rate": 9.926071618660237e-07, "loss": 0.0421, "reward": 1.1408534049987793, "reward_std": 0.6549174636602402, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.1487407712265849, "rewards/format_reward": 0.5000000111758709, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 1504.2708740234375, "epoch": 0.1319622964867181, "grad_norm": 0.292646586894989, "kl": 0.00012028217315673828, "learning_rate": 9.9202926282791e-07, "loss": 0.1113, "reward": 0.42379511147737503, "reward_std": 0.9228705763816833, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.18780511617660522, "rewards/format_reward": 0.23958334140479565, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1620.0416870117188, "epoch": 0.13367609254498714, "grad_norm": 0.2709222435951233, "kl": 0.00012493133544921875, "learning_rate": 9.91429819907136e-07, "loss": -0.0088, "reward": -0.11346174217760563, "reward_std": 0.24242284521460533, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2262607365846634, "rewards/format_reward": 0.06250000186264515, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1215.3750457763672, "epoch": 0.1353898886032562, "grad_norm": 0.371949166059494, "kl": 0.0008922815322875977, "learning_rate": 9.908088623197048e-07, "loss": -0.0848, "reward": 0.8136582225561142, "reward_std": 0.652880847454071, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.15616203658282757, "rewards/format_reward": 0.4270833432674408, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 1669.0625305175781, "epoch": 0.13710368466152528, "grad_norm": 0.26468425989151, "kl": 6.92605972290039e-05, "learning_rate": 9.901664203302124e-07, "loss": 0.0218, "reward": 0.18439925089478493, "reward_std": 0.6725399419665337, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.22035246714949608, "rewards/format_reward": 0.19791666883975267, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 1511.5208435058594, "epoch": 0.13881748071979436, "grad_norm": 0.2657768726348877, "kl": 9.560585021972656e-05, "learning_rate": 9.895025252503755e-07, "loss": 0.0318, "reward": 0.5059427544474602, "reward_std": 0.5254486501216888, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.18378128111362457, "rewards/format_reward": 0.2395833358168602, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 1469.8958740234375, "epoch": 0.1405312767780634, "grad_norm": 0.3196522891521454, "kl": 0.00023925304412841797, "learning_rate": 9.888172094375033e-07, "loss": 0.0289, "reward": 0.5352756977081299, "reward_std": 0.9369603246450424, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.17629460990428925, "rewards/format_reward": 0.2916666753590107, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1491.2708435058594, "epoch": 0.14224507283633248, "grad_norm": 0.2970998287200928, "kl": 0.0001386404037475586, "learning_rate": 9.881105062929221e-07, "loss": 0.0842, "reward": 0.5559089332818985, "reward_std": 0.516811840236187, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.20787525177001953, "rewards/format_reward": 0.21875000558793545, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 1434.6666870117188, "epoch": 0.14395886889460155, "grad_norm": 0.2803492844104767, "kl": 8.761882781982422e-05, "learning_rate": 9.873824502603459e-07, "loss": 0.0154, "reward": 0.6808804646134377, "reward_std": 0.6223056390881538, "rewards/accuracy_reward": 0.11458333861082792, "rewards/cosine_scaled_reward": -0.14863869547843933, "rewards/format_reward": 0.2708333386108279, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1502.0417175292969, "epoch": 0.1456726649528706, "grad_norm": 0.2663383483886719, "kl": 7.605552673339844e-05, "learning_rate": 9.866330768241983e-07, "loss": -0.0259, "reward": 0.7719027325510979, "reward_std": 0.6865327507257462, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.13260741718113422, "rewards/format_reward": 0.2708333358168602, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 1383.9583740234375, "epoch": 0.14738646101113967, "grad_norm": 0.26092609763145447, "kl": 9.548664093017578e-05, "learning_rate": 9.85862422507884e-07, "loss": -0.0524, "reward": 0.25671922299079597, "reward_std": 0.6172372698783875, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.28932496905326843, "rewards/format_reward": 0.34375000558793545, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 1728.7291870117188, "epoch": 0.14910025706940874, "grad_norm": 0.24404188990592957, "kl": 6.109476089477539e-05, "learning_rate": 9.850705248720068e-07, "loss": 0.0018, "reward": -0.22325069084763527, "reward_std": 0.5693457275629044, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.29550234228372574, "rewards/format_reward": 0.07291666697710752, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 1664.0833740234375, "epoch": 0.15081405312767782, "grad_norm": 0.29350191354751587, "kl": 6.55055046081543e-05, "learning_rate": 9.8425742251254e-07, "loss": 0.0256, "reward": 0.012098066508769989, "reward_std": 0.5351692587137222, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2370474822819233, "rewards/format_reward": 0.11458333488553762, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1432.4792175292969, "epoch": 0.15252784918594686, "grad_norm": 0.2622717022895813, "kl": 0.00010013580322265625, "learning_rate": 9.83423155058946e-07, "loss": 0.0146, "reward": 0.28211013600230217, "reward_std": 0.6477071270346642, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.26215092837810516, "rewards/format_reward": 0.30208334513008595, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 1464.2291870117188, "epoch": 0.15424164524421594, "grad_norm": 0.33814817667007446, "kl": 0.00013047456741333008, "learning_rate": 9.825677631722435e-07, "loss": -0.0532, "reward": 0.2572305239737034, "reward_std": 0.387471754103899, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.22083472087979317, "rewards/format_reward": 0.17708333861082792, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 1426.6458435058594, "epoch": 0.155955441302485, "grad_norm": 0.2609995901584625, "kl": 0.0002288818359375, "learning_rate": 9.816912885430258e-07, "loss": 0.0247, "reward": 0.14339020662009716, "reward_std": 0.5801211446523666, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2412719689309597, "rewards/format_reward": 0.22916666697710752, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 1398.1250610351562, "epoch": 0.15766923736075408, "grad_norm": 0.2580174505710602, "kl": 7.218122482299805e-05, "learning_rate": 9.807937738894303e-07, "loss": 0.0258, "reward": 0.5541771352291107, "reward_std": 0.5360789932310581, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.20377103984355927, "rewards/format_reward": 0.26041667349636555, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1538.2708740234375, "epoch": 0.15938303341902313, "grad_norm": 0.25805678963661194, "kl": 8.285045623779297e-05, "learning_rate": 9.798752629550546e-07, "loss": -0.0287, "reward": 0.5447847582399845, "reward_std": 0.9511096179485321, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.21756825502961874, "rewards/format_reward": 0.2812500111758709, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 1539.3958740234375, "epoch": 0.1610968294772922, "grad_norm": 0.33063074946403503, "kl": 0.0005064010620117188, "learning_rate": 9.78935800506826e-07, "loss": 0.0871, "reward": 0.9277739748358727, "reward_std": 0.7628872096538544, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.15467804670333862, "rewards/format_reward": 0.37500001303851604, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1465.6041870117188, "epoch": 0.16281062553556128, "grad_norm": 0.2793846130371094, "kl": 0.00011479854583740234, "learning_rate": 9.779754323328192e-07, "loss": 0.0828, "reward": 0.27345613669604063, "reward_std": 0.5890941619873047, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.23359986767172813, "rewards/format_reward": 0.22916666977107525, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1562.0833740234375, "epoch": 0.16452442159383032, "grad_norm": 0.2997168004512787, "kl": 0.00010025501251220703, "learning_rate": 9.769942052400235e-07, "loss": -0.0651, "reward": 0.11648451164364815, "reward_std": 0.7505641058087349, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.27028296515345573, "rewards/format_reward": 0.18750000558793545, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1477.6666870117188, "epoch": 0.1662382176520994, "grad_norm": 0.3399166762828827, "kl": 0.0001264810562133789, "learning_rate": 9.759921670520634e-07, "loss": 0.0139, "reward": 0.28585558384656906, "reward_std": 0.4958939217031002, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.21860605105757713, "rewards/format_reward": 0.3125000074505806, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1550.0000305175781, "epoch": 0.16795201371036847, "grad_norm": 0.2669260799884796, "kl": 0.00010633468627929688, "learning_rate": 9.749693666068663e-07, "loss": 0.0197, "reward": 0.5891378601081669, "reward_std": 0.794895127415657, "rewards/accuracy_reward": 0.0520833358168602, "rewards/cosine_scaled_reward": -0.24003576394170523, "rewards/format_reward": 0.375, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 1576.2083740234375, "epoch": 0.16966580976863754, "grad_norm": 0.2624787986278534, "kl": 0.0001227855682373047, "learning_rate": 9.739258537542835e-07, "loss": -0.0209, "reward": 0.16898822411894798, "reward_std": 0.6391639858484268, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2093992941081524, "rewards/format_reward": 0.1562500037252903, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1460.6875305175781, "epoch": 0.1713796058269066, "grad_norm": 0.34687182307243347, "kl": 0.0001291036605834961, "learning_rate": 9.728616793536587e-07, "loss": -0.0869, "reward": 0.3641693815588951, "reward_std": 0.7006602585315704, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.2502584457397461, "rewards/format_reward": 0.32291666977107525, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 1485.9792175292969, "epoch": 0.17309340188517566, "grad_norm": 0.2527833580970764, "kl": 9.036064147949219e-05, "learning_rate": 9.717768952713511e-07, "loss": 0.0023, "reward": 0.4851684862514958, "reward_std": 0.5674109682440758, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.20687074586749077, "rewards/format_reward": 0.2500000074505806, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1580.2708740234375, "epoch": 0.17480719794344474, "grad_norm": 0.28545081615448, "kl": 9.453296661376953e-05, "learning_rate": 9.706715543782064e-07, "loss": 0.0331, "reward": 0.4691831008531153, "reward_std": 0.6173286661505699, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.18831564113497734, "rewards/format_reward": 0.25000000558793545, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1568.5625305175781, "epoch": 0.17652099400171378, "grad_norm": 0.24966423213481903, "kl": 0.0001354813575744629, "learning_rate": 9.695457105469804e-07, "loss": 0.0499, "reward": 0.6344753429293633, "reward_std": 0.8610301837325096, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.1816388964653015, "rewards/format_reward": 0.20833334419876337, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1620.4583435058594, "epoch": 0.17823479005998286, "grad_norm": 0.2664543390274048, "kl": 0.00010859966278076172, "learning_rate": 9.683994186497132e-07, "loss": 0.018, "reward": 0.17410985194146633, "reward_std": 0.44621556997299194, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.18129736836999655, "rewards/format_reward": 0.09375000186264515, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1476.4791870117188, "epoch": 0.17994858611825193, "grad_norm": 0.30209964513778687, "kl": 0.00010883808135986328, "learning_rate": 9.672327345550543e-07, "loss": -0.0613, "reward": 0.41774739080574363, "reward_std": 0.7054343000054359, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2176332138478756, "rewards/format_reward": 0.354166679084301, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 1575.7083740234375, "epoch": 0.181662382176521, "grad_norm": 0.305949866771698, "kl": 8.207559585571289e-05, "learning_rate": 9.66045715125541e-07, "loss": -0.0178, "reward": 0.19928239285945892, "reward_std": 0.5839960649609566, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.24334679171442986, "rewards/format_reward": 0.22916667442768812, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 1595.4375305175781, "epoch": 0.18337617823479005, "grad_norm": 0.25899356603622437, "kl": 0.00011038780212402344, "learning_rate": 9.648384182148252e-07, "loss": -0.0196, "reward": 0.3261868259869516, "reward_std": 0.7823206260800362, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.2219230905175209, "rewards/format_reward": 0.2291666753590107, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 1706.9166870117188, "epoch": 0.18508997429305912, "grad_norm": 0.24813760817050934, "kl": 6.186962127685547e-05, "learning_rate": 9.636109026648554e-07, "loss": -0.0167, "reward": 0.09848582930862904, "reward_std": 0.49404577910900116, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.2548905946314335, "rewards/format_reward": 0.1250000037252903, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 1367.3333435058594, "epoch": 0.1868037703513282, "grad_norm": 0.310843825340271, "kl": 0.00025588274002075195, "learning_rate": 9.623632283030077e-07, "loss": 0.0524, "reward": 0.2563808672130108, "reward_std": 0.5945833437144756, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.1962066236883402, "rewards/format_reward": 0.21875001024454832, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 1545.8542175292969, "epoch": 0.18851756640959727, "grad_norm": 0.3281257748603821, "kl": 7.37905502319336e-05, "learning_rate": 9.610954559391704e-07, "loss": -0.0966, "reward": 0.5111359301954508, "reward_std": 1.0251065343618393, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.23871351778507233, "rewards/format_reward": 0.2604166744276881, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 1352.9791870117188, "epoch": 0.19023136246786632, "grad_norm": 0.29662254452705383, "kl": 8.809566497802734e-05, "learning_rate": 9.598076473627796e-07, "loss": 0.0491, "reward": 0.5007766149938107, "reward_std": 0.5933823436498642, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.219840994104743, "rewards/format_reward": 0.3020833469927311, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 1521.0833740234375, "epoch": 0.1919451585261354, "grad_norm": 0.22664791345596313, "kl": 9.524822235107422e-05, "learning_rate": 9.58499865339809e-07, "loss": 0.0133, "reward": 0.9934604372829199, "reward_std": 0.8234051987528801, "rewards/accuracy_reward": 0.09375, "rewards/cosine_scaled_reward": -0.12148096412420273, "rewards/format_reward": 0.3645833358168602, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 1488.0000610351562, "epoch": 0.19365895458440446, "grad_norm": 0.26795610785484314, "kl": 0.00011110305786132812, "learning_rate": 9.571721736097088e-07, "loss": -0.0702, "reward": 0.3580288141965866, "reward_std": 0.5283620953559875, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.22576149180531502, "rewards/format_reward": 0.2395833358168602, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 1324.2916870117188, "epoch": 0.1953727506426735, "grad_norm": 0.2916397452354431, "kl": 0.00011074542999267578, "learning_rate": 9.55824636882301e-07, "loss": -0.0276, "reward": 0.9822416566312313, "reward_std": 0.8637243807315826, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.17669491842389107, "rewards/format_reward": 0.447916679084301, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 1271.9166870117188, "epoch": 0.19708654670094258, "grad_norm": 0.36266040802001953, "kl": 0.00022935867309570312, "learning_rate": 9.54457320834625e-07, "loss": 0.0726, "reward": 1.2695526629686356, "reward_std": 1.0137326195836067, "rewards/accuracy_reward": 0.13541666977107525, "rewards/cosine_scaled_reward": -0.16991223488003016, "rewards/format_reward": 0.489583358168602, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 1590.9791870117188, "epoch": 0.19880034275921166, "grad_norm": 0.258552223443985, "kl": 0.0002028942108154297, "learning_rate": 9.530702921077358e-07, "loss": 0.0004, "reward": 0.2832749206572771, "reward_std": 0.37558774650096893, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.23772768676280975, "rewards/format_reward": 0.21875000651925802, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1381.0000610351562, "epoch": 0.20051413881748073, "grad_norm": 0.30766648054122925, "kl": 0.0001800060272216797, "learning_rate": 9.516636183034564e-07, "loss": 0.0235, "reward": 0.7005061060190201, "reward_std": 0.545831985771656, "rewards/accuracy_reward": 0.10416666977107525, "rewards/cosine_scaled_reward": -0.10087039694190025, "rewards/format_reward": 0.30208334140479565, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1369.0208740234375, "epoch": 0.20222793487574978, "grad_norm": 0.2912021577358246, "kl": 0.00012183189392089844, "learning_rate": 9.502373679810839e-07, "loss": 0.0264, "reward": 0.8078554645180702, "reward_std": 0.9160244166851044, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.1643010601401329, "rewards/format_reward": 0.3854166716337204, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 1497.5833740234375, "epoch": 0.20394173093401885, "grad_norm": 0.3184635043144226, "kl": 7.218122482299805e-05, "learning_rate": 9.487916106540465e-07, "loss": 0.0635, "reward": 0.756352175027132, "reward_std": 0.9685456156730652, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.202363814227283, "rewards/format_reward": 0.354166679084301, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 1514.4583740234375, "epoch": 0.20565552699228792, "grad_norm": 0.301592618227005, "kl": 7.611513137817383e-05, "learning_rate": 9.473264167865171e-07, "loss": 0.0527, "reward": 0.6178061664104462, "reward_std": 1.1512691602110863, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.16802122257649899, "rewards/format_reward": 0.26041666977107525, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 1497.7500610351562, "epoch": 0.207369323050557, "grad_norm": 0.2697899639606476, "kl": 0.00011616945266723633, "learning_rate": 9.458418577899774e-07, "loss": -0.0769, "reward": 0.5254677496850491, "reward_std": 0.8822471648454666, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.17704324051737785, "rewards/format_reward": 0.23958334140479565, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 1671.2083740234375, "epoch": 0.20908311910882604, "grad_norm": 0.2589820623397827, "kl": 8.726119995117188e-05, "learning_rate": 9.443380060197385e-07, "loss": 0.0032, "reward": 0.12582964450120926, "reward_std": 0.6176033914089203, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.24815166369080544, "rewards/format_reward": 0.2187500037252903, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 1429.4166870117188, "epoch": 0.21079691516709512, "grad_norm": 0.3138475716114044, "kl": 0.0001118779182434082, "learning_rate": 9.428149347714143e-07, "loss": 0.0055, "reward": 0.5503488332033157, "reward_std": 1.1257527768611908, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.19736815616488457, "rewards/format_reward": 0.23958333861082792, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 1601.3333740234375, "epoch": 0.2125107112253642, "grad_norm": 0.305877685546875, "kl": 8.064508438110352e-05, "learning_rate": 9.412727182773486e-07, "loss": -0.0732, "reward": 0.2043483555316925, "reward_std": 0.25610289350152016, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.21675095334649086, "rewards/format_reward": 0.1458333395421505, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 1594.25, "epoch": 0.21422450728363324, "grad_norm": 0.32135531306266785, "kl": 9.369850158691406e-05, "learning_rate": 9.397114317029974e-07, "loss": -0.0312, "reward": 0.7128554601222277, "reward_std": 0.7205516248941422, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.1431606486439705, "rewards/format_reward": 0.2083333358168602, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1565.0000305175781, "epoch": 0.2159383033419023, "grad_norm": 0.30996406078338623, "kl": 0.00023823976516723633, "learning_rate": 9.381311511432658e-07, "loss": -0.057, "reward": 0.22010673861950636, "reward_std": 0.7089016139507294, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.23909669369459152, "rewards/format_reward": 0.26041667722165585, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 1514.2708740234375, "epoch": 0.21765209940017138, "grad_norm": 0.2887818515300751, "kl": 0.00017833709716796875, "learning_rate": 9.36531953618799e-07, "loss": -0.0256, "reward": 0.40598735213279724, "reward_std": 0.668736569583416, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.21378641575574875, "rewards/format_reward": 0.1979166716337204, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 1636.3958740234375, "epoch": 0.21936589545844046, "grad_norm": 0.3046552836894989, "kl": 0.0016319751739501953, "learning_rate": 9.34913917072228e-07, "loss": -0.0277, "reward": 0.14084632135927677, "reward_std": 0.5165095031261444, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.265597328543663, "rewards/format_reward": 0.17708333488553762, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 1377.1666870117188, "epoch": 0.2210796915167095, "grad_norm": 0.30127936601638794, "kl": 0.00021696090698242188, "learning_rate": 9.332771203643714e-07, "loss": 0.0466, "reward": 0.6318524926900864, "reward_std": 0.8379577845335007, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.20013105496764183, "rewards/format_reward": 0.3437500074505806, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 1281.7917175292969, "epoch": 0.22279348757497858, "grad_norm": 0.33132994174957275, "kl": 0.00022983551025390625, "learning_rate": 9.316216432703916e-07, "loss": -0.0199, "reward": 1.4362520650029182, "reward_std": 0.7704244181513786, "rewards/accuracy_reward": 0.16666666977107525, "rewards/cosine_scaled_reward": -0.1344580352306366, "rewards/format_reward": 0.4166666818782687, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 1669.8125610351562, "epoch": 0.22450728363324765, "grad_norm": 0.26186782121658325, "kl": 0.00011074542999267578, "learning_rate": 9.299475664759068e-07, "loss": -0.0367, "reward": 0.0669812560081482, "reward_std": 0.6163998395204544, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2444169856607914, "rewards/format_reward": 0.17708333674818277, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1598.8541870117188, "epoch": 0.2262210796915167, "grad_norm": 0.27530592679977417, "kl": 8.606910705566406e-05, "learning_rate": 9.282549715730579e-07, "loss": -0.022, "reward": 0.4553011879324913, "reward_std": 0.7698710784316063, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.19618215784430504, "rewards/format_reward": 0.2812500111758709, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 1476.7916870117188, "epoch": 0.22793487574978577, "grad_norm": 0.31282302737236023, "kl": 0.00011754035949707031, "learning_rate": 9.265439410565328e-07, "loss": 0.0149, "reward": 0.5651843696832657, "reward_std": 0.8456990644335747, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.21052773855626583, "rewards/format_reward": 0.2291666716337204, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 1257.2500457763672, "epoch": 0.22964867180805484, "grad_norm": 0.3299978971481323, "kl": 0.0005350112915039062, "learning_rate": 9.248145583195447e-07, "loss": -0.0553, "reward": 0.9383534267544746, "reward_std": 0.8029176890850067, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.1500500962138176, "rewards/format_reward": 0.4375000111758709, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 1279.9583740234375, "epoch": 0.23136246786632392, "grad_norm": 0.26606979966163635, "kl": 0.0002206563949584961, "learning_rate": 9.230669076497687e-07, "loss": -0.064, "reward": 0.8971398249268532, "reward_std": 0.8118537589907646, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": -0.1715338509529829, "rewards/format_reward": 0.3437500111758709, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1403.9167175292969, "epoch": 0.23307626392459296, "grad_norm": 0.33854958415031433, "kl": 0.0002644062042236328, "learning_rate": 9.213010742252327e-07, "loss": 0.0447, "reward": 0.45938077941536903, "reward_std": 0.87321837246418, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.2194330394268036, "rewards/format_reward": 0.2395833395421505, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 1600.7291870117188, "epoch": 0.23479005998286204, "grad_norm": 0.2964058220386505, "kl": 7.575750350952148e-05, "learning_rate": 9.195171441101668e-07, "loss": 0.0798, "reward": -0.00549701415002346, "reward_std": 0.4839354231953621, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.26485851779580116, "rewards/format_reward": 0.197916679084301, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 1569.2500305175781, "epoch": 0.2365038560411311, "grad_norm": 0.27017030119895935, "kl": 6.553530693054199e-05, "learning_rate": 9.177152042508077e-07, "loss": 0.0473, "reward": 0.36223573237657547, "reward_std": 0.5121153928339481, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.2260081209242344, "rewards/format_reward": 0.1770833395421505, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 1500.1250305175781, "epoch": 0.23821765209940018, "grad_norm": 0.3094533681869507, "kl": 0.00014731287956237793, "learning_rate": 9.158953424711624e-07, "loss": -0.0638, "reward": 0.3372825998812914, "reward_std": 0.35855112969875336, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2001120790373534, "rewards/format_reward": 0.19791667256504297, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 1184.2708587646484, "epoch": 0.23993144815766923, "grad_norm": 0.35570886731147766, "kl": 0.00015413761138916016, "learning_rate": 9.140576474687263e-07, "loss": -0.0027, "reward": 0.796763252466917, "reward_std": 0.8294687867164612, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.18409650027751923, "rewards/format_reward": 0.4375000149011612, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 1546.2917175292969, "epoch": 0.2416452442159383, "grad_norm": 0.3957357406616211, "kl": 0.0001176595687866211, "learning_rate": 9.122022088101613e-07, "loss": 0.0978, "reward": 0.26207078620791435, "reward_std": 0.5963894873857498, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.24103039875626564, "rewards/format_reward": 0.22916667070239782, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1646.3750305175781, "epoch": 0.24335904027420738, "grad_norm": 0.29940950870513916, "kl": 0.00010442733764648438, "learning_rate": 9.103291169269299e-07, "loss": 0.0115, "reward": 0.12744126096367836, "reward_std": 0.7610968500375748, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.2749045342206955, "rewards/format_reward": 0.16666667070239782, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 1623.1250610351562, "epoch": 0.24507283633247642, "grad_norm": 0.23946928977966309, "kl": 0.00012135505676269531, "learning_rate": 9.084384631108882e-07, "loss": -0.0226, "reward": 0.05879242718219757, "reward_std": 0.7065651714801788, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2815237008035183, "rewards/format_reward": 0.2291666679084301, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1717.0417175292969, "epoch": 0.2467866323907455, "grad_norm": 0.30980589985847473, "kl": 5.7578086853027344e-05, "learning_rate": 9.065303395098358e-07, "loss": 0.0031, "reward": -0.03822072362527251, "reward_std": 0.41779977828264236, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.27143828570842743, "rewards/format_reward": 0.17708334047347307, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1459.7083740234375, "epoch": 0.24850042844901457, "grad_norm": 0.26551711559295654, "kl": 9.238719940185547e-05, "learning_rate": 9.046048391230247e-07, "loss": 0.0405, "reward": 0.9580619968473911, "reward_std": 0.65601646900177, "rewards/accuracy_reward": 0.1145833358168602, "rewards/cosine_scaled_reward": -0.1509431917220354, "rewards/format_reward": 0.27083334140479565, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 1350.8333740234375, "epoch": 0.25021422450728364, "grad_norm": 0.22373563051223755, "kl": 0.0002359151840209961, "learning_rate": 9.026620557966279e-07, "loss": -0.0072, "reward": 0.5567201105877757, "reward_std": 0.7935501113533974, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.20699115842580795, "rewards/format_reward": 0.3020833358168602, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 1449.2291870117188, "epoch": 0.2519280205655527, "grad_norm": 0.24441158771514893, "kl": 0.00011551380157470703, "learning_rate": 9.007020842191634e-07, "loss": -0.0188, "reward": 0.7537737488746643, "reward_std": 0.6216295659542084, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.1462303502485156, "rewards/format_reward": 0.3229166716337204, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 1600.7084045410156, "epoch": 0.2536418166238218, "grad_norm": 0.26157715916633606, "kl": 8.0108642578125e-05, "learning_rate": 8.987250199168808e-07, "loss": -0.005, "reward": 0.2785342447459698, "reward_std": 0.6226839050650597, "rewards/accuracy_reward": 0.0520833358168602, "rewards/cosine_scaled_reward": -0.2609979063272476, "rewards/format_reward": 0.17708333767950535, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 1360.8750305175781, "epoch": 0.25535561268209084, "grad_norm": 0.2786451578140259, "kl": 0.000179290771484375, "learning_rate": 8.967309592491052e-07, "loss": -0.0155, "reward": 0.3341223709285259, "reward_std": 0.6954998485744, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.21542707458138466, "rewards/format_reward": 0.3229166753590107, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 1405.9166870117188, "epoch": 0.2570694087403599, "grad_norm": 0.22569335997104645, "kl": 0.0001271963119506836, "learning_rate": 8.9471999940354e-07, "loss": 0.0332, "reward": 1.1879419684410095, "reward_std": 0.849453404545784, "rewards/accuracy_reward": 0.15625000465661287, "rewards/cosine_scaled_reward": -0.07610182277858257, "rewards/format_reward": 0.3645833469927311, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1398.062515258789, "epoch": 0.258783204798629, "grad_norm": 0.2917872667312622, "kl": 0.0003751516342163086, "learning_rate": 8.926922383915315e-07, "loss": 0.0708, "reward": 0.2504497095942497, "reward_std": 0.7372590154409409, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.29317696392536163, "rewards/format_reward": 0.2812500139698386, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 1530.8958740234375, "epoch": 0.26049700085689803, "grad_norm": 0.29970836639404297, "kl": 0.00013697147369384766, "learning_rate": 8.906477750432903e-07, "loss": 0.0914, "reward": 0.3063769284635782, "reward_std": 0.6683933213353157, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.22888047248125076, "rewards/format_reward": 0.2083333395421505, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 1557.4583740234375, "epoch": 0.2622107969151671, "grad_norm": 0.31202468276023865, "kl": 0.00016236305236816406, "learning_rate": 8.88586709003076e-07, "loss": -0.0161, "reward": 0.23547574784606695, "reward_std": 0.623480673879385, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.212197445333004, "rewards/format_reward": 0.2395833432674408, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 1524.0000305175781, "epoch": 0.2639245929734362, "grad_norm": 0.2832154631614685, "kl": 0.00022339820861816406, "learning_rate": 8.865091407243394e-07, "loss": 0.0183, "reward": 0.8646783344447613, "reward_std": 0.8572295308113098, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.16477943025529385, "rewards/format_reward": 0.26041667722165585, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 1595.1041870117188, "epoch": 0.2656383890317052, "grad_norm": 0.25693395733833313, "kl": 0.00016927719116210938, "learning_rate": 8.844151714648274e-07, "loss": -0.023, "reward": -0.12136609526351094, "reward_std": 0.3815436065196991, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.26241349801421165, "rewards/format_reward": 0.13541667070239782, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1576.6250610351562, "epoch": 0.26735218508997427, "grad_norm": 0.25725266337394714, "kl": 0.00014984607696533203, "learning_rate": 8.823049032816478e-07, "loss": 0.0012, "reward": 0.24715298786759377, "reward_std": 0.7283157333731651, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2703519072383642, "rewards/format_reward": 0.27083334140479565, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 1352.2917022705078, "epoch": 0.26906598114824337, "grad_norm": 0.3138567805290222, "kl": 0.00035691261291503906, "learning_rate": 8.801784390262943e-07, "loss": 0.1456, "reward": 0.3800088334828615, "reward_std": 0.6559219062328339, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.2365667223930359, "rewards/format_reward": 0.20833334233611822, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 1562.9375305175781, "epoch": 0.2707797772065124, "grad_norm": 0.34799516201019287, "kl": 0.0002155303955078125, "learning_rate": 8.780358823396352e-07, "loss": 0.028, "reward": 0.3663053174968809, "reward_std": 0.5830123052001, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.18629762902855873, "rewards/format_reward": 0.19791667256504297, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 1593.7708740234375, "epoch": 0.27249357326478146, "grad_norm": 0.2532951831817627, "kl": 0.00032138824462890625, "learning_rate": 8.758773376468604e-07, "loss": 0.0147, "reward": 0.010641081258654594, "reward_std": 0.6984475031495094, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2668936438858509, "rewards/format_reward": 0.15625000651925802, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 1554.9583435058594, "epoch": 0.27420736932305056, "grad_norm": 0.24631713330745697, "kl": 0.0002465248107910156, "learning_rate": 8.737029101523929e-07, "loss": -0.0377, "reward": 0.3181496039032936, "reward_std": 0.3356131613254547, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.13599055260419846, "rewards/format_reward": 0.15625, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 1536.9375305175781, "epoch": 0.2759211653813196, "grad_norm": 0.29518961906433105, "kl": 0.0003285408020019531, "learning_rate": 8.715127058347614e-07, "loss": -0.0098, "reward": 0.450228420086205, "reward_std": 0.8069275915622711, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.20325740799307823, "rewards/format_reward": 0.2083333395421505, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 1397.5625305175781, "epoch": 0.2776349614395887, "grad_norm": 0.32521313428878784, "kl": 0.0004208087921142578, "learning_rate": 8.693068314414344e-07, "loss": -0.0538, "reward": 0.2773311994969845, "reward_std": 0.5018344074487686, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2300034947693348, "rewards/format_reward": 0.2604166716337204, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 1482.354248046875, "epoch": 0.27934875749785776, "grad_norm": 0.26695311069488525, "kl": 0.0003781318664550781, "learning_rate": 8.670853944836176e-07, "loss": 0.0127, "reward": 0.5796077568084002, "reward_std": 0.947176143527031, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.21064921002835035, "rewards/format_reward": 0.2604166716337204, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 1325.2083740234375, "epoch": 0.2810625535561268, "grad_norm": 0.29990634322166443, "kl": 0.0003497600555419922, "learning_rate": 8.648485032310144e-07, "loss": -0.0518, "reward": 0.8711699396371841, "reward_std": 0.6968246772885323, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.18954484723508358, "rewards/format_reward": 0.3750000186264515, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 1571.4375305175781, "epoch": 0.2827763496143959, "grad_norm": 0.2641393542289734, "kl": 0.00020003318786621094, "learning_rate": 8.625962667065487e-07, "loss": 0.0589, "reward": 0.3537735566496849, "reward_std": 0.6288779303431511, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.20627648755908012, "rewards/format_reward": 0.22916667722165585, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 1436.1458740234375, "epoch": 0.28449014567266495, "grad_norm": 0.3507601320743561, "kl": 0.0003981590270996094, "learning_rate": 8.603287946810513e-07, "loss": 0.0933, "reward": 0.4801969490945339, "reward_std": 0.6994242109358311, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.18304483219981194, "rewards/format_reward": 0.2708333469927311, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 1492.7500305175781, "epoch": 0.286203941730934, "grad_norm": 0.26833590865135193, "kl": 0.000438690185546875, "learning_rate": 8.580461976679099e-07, "loss": -0.0162, "reward": 0.6301786154508591, "reward_std": 0.9764036163687706, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.15223283600062132, "rewards/format_reward": 0.29166667722165585, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 1572.6458740234375, "epoch": 0.2879177377892031, "grad_norm": 0.25694307684898376, "kl": 0.0003559589385986328, "learning_rate": 8.557485869176825e-07, "loss": 0.0152, "reward": 0.27175838500261307, "reward_std": 0.6918256878852844, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.221031554043293, "rewards/format_reward": 0.1979166716337204, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1476.6666870117188, "epoch": 0.28963153384747214, "grad_norm": 0.3062116801738739, "kl": 0.0006361007690429688, "learning_rate": 8.534360744126753e-07, "loss": -0.1259, "reward": 0.41318080946803093, "reward_std": 0.4180409014225006, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.1710204929113388, "rewards/format_reward": 0.19791667256504297, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 1407.8958587646484, "epoch": 0.2913453299057412, "grad_norm": 0.3516060411930084, "kl": 0.0011429786682128906, "learning_rate": 8.511087728614862e-07, "loss": 0.0441, "reward": 0.9393245745450258, "reward_std": 0.6329760327935219, "rewards/accuracy_reward": 0.1145833358168602, "rewards/cosine_scaled_reward": -0.15719182137399912, "rewards/format_reward": 0.2708333432674408, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 1431.0208740234375, "epoch": 0.2930591259640103, "grad_norm": 0.3069850206375122, "kl": 0.0004830360412597656, "learning_rate": 8.487667956935087e-07, "loss": 0.0076, "reward": 0.7211491465568542, "reward_std": 0.9154210761189461, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.18316034972667694, "rewards/format_reward": 0.3645833507180214, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 1627.6041870117188, "epoch": 0.29477292202227934, "grad_norm": 0.24381308257579803, "kl": 0.00026917457580566406, "learning_rate": 8.464102570534061e-07, "loss": -0.0342, "reward": 0.39864223077893257, "reward_std": 0.8442265018820763, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.22351143136620522, "rewards/format_reward": 0.1979166679084301, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1377.0416870117188, "epoch": 0.29648671808054844, "grad_norm": 0.2742233872413635, "kl": 0.0032143592834472656, "learning_rate": 8.440392717955475e-07, "loss": 0.0075, "reward": 0.43526358902454376, "reward_std": 0.5948851481080055, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.23395783826708794, "rewards/format_reward": 0.322916679084301, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 1395.9167175292969, "epoch": 0.2982005141388175, "grad_norm": 0.48359552025794983, "kl": 0.0013885498046875, "learning_rate": 8.416539554784089e-07, "loss": 0.0137, "reward": 0.2430977914482355, "reward_std": 0.5702231302857399, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.20221445709466934, "rewards/format_reward": 0.2604166679084301, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 1477.0208740234375, "epoch": 0.29991431019708653, "grad_norm": 0.34349390864372253, "kl": 0.000823974609375, "learning_rate": 8.392544243589427e-07, "loss": 0.1064, "reward": 0.09682288765907288, "reward_std": 0.3903738856315613, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2545960769057274, "rewards/format_reward": 0.2708333358168602, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 1413.6666870117188, "epoch": 0.30162810625535563, "grad_norm": 0.4631779193878174, "kl": 0.0005354881286621094, "learning_rate": 8.368407953869103e-07, "loss": -0.1349, "reward": 0.6249970365315676, "reward_std": 1.1163012832403183, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.1966523379087448, "rewards/format_reward": 0.3750000176951289, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 1708.0625305175781, "epoch": 0.3033419023136247, "grad_norm": 0.2329261600971222, "kl": 0.00026154518127441406, "learning_rate": 8.344131861991828e-07, "loss": -0.0058, "reward": 0.36284567788243294, "reward_std": 0.4691716283559799, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.21771202981472015, "rewards/format_reward": 0.1979166753590107, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 1524.9583435058594, "epoch": 0.3050556983718937, "grad_norm": 0.28604593873023987, "kl": 0.0003364086151123047, "learning_rate": 8.319717151140072e-07, "loss": -0.0166, "reward": 0.6543274968862534, "reward_std": 0.9791368544101715, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.17932307720184326, "rewards/format_reward": 0.2604166716337204, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 1677.9583435058594, "epoch": 0.3067694944301628, "grad_norm": 0.30129143595695496, "kl": 0.0004076957702636719, "learning_rate": 8.295165011252396e-07, "loss": 0.007, "reward": 0.243562500923872, "reward_std": 0.7496606484055519, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.219083484262228, "rewards/format_reward": 0.18750000651925802, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1274.062515258789, "epoch": 0.30848329048843187, "grad_norm": 0.3613395690917969, "kl": 0.0006461143493652344, "learning_rate": 8.270476638965461e-07, "loss": -0.0415, "reward": 0.8658371195197105, "reward_std": 1.10177643597126, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.1902889758348465, "rewards/format_reward": 0.447916679084301, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 1594.0000305175781, "epoch": 0.3101970865467009, "grad_norm": 0.3060204088687897, "kl": 0.0006518363952636719, "learning_rate": 8.245653237555705e-07, "loss": 0.066, "reward": 0.6866211239248514, "reward_std": 1.0839852541685104, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.17629671841859818, "rewards/format_reward": 0.29166667722165585, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 1632.5208740234375, "epoch": 0.31191088260497, "grad_norm": 0.3030488193035126, "kl": 0.00047004222869873047, "learning_rate": 8.220696016880687e-07, "loss": 0.0201, "reward": -0.2194829210639, "reward_std": 0.4896611124277115, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.3070765510201454, "rewards/format_reward": 0.11458333767950535, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 1587.1458740234375, "epoch": 0.31362467866323906, "grad_norm": 0.2954941987991333, "kl": 0.0003554821014404297, "learning_rate": 8.195606193320136e-07, "loss": 0.0997, "reward": 0.5128022143617272, "reward_std": 0.7717390954494476, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.18813828378915787, "rewards/format_reward": 0.22916666977107525, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 1508.0000305175781, "epoch": 0.31533847472150817, "grad_norm": 0.3083325922489166, "kl": 0.0015592575073242188, "learning_rate": 8.170384989716657e-07, "loss": -0.0724, "reward": 0.4367760196328163, "reward_std": 0.8800807446241379, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.19818584620952606, "rewards/format_reward": 0.250000006519258, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1524.9375305175781, "epoch": 0.3170522707797772, "grad_norm": 0.286538302898407, "kl": 0.0006296634674072266, "learning_rate": 8.145033635316128e-07, "loss": 0.0208, "reward": 0.22530196234583855, "reward_std": 0.7582524865865707, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.22987183183431625, "rewards/format_reward": 0.19791667442768812, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 1506.3958740234375, "epoch": 0.31876606683804626, "grad_norm": 0.3232147991657257, "kl": 0.00032401084899902344, "learning_rate": 8.119553365707802e-07, "loss": 0.0208, "reward": 0.4753815531730652, "reward_std": 0.7288932427763939, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2013268917798996, "rewards/format_reward": 0.2500000111758709, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 1399.0833740234375, "epoch": 0.32047986289631536, "grad_norm": 0.2806786894798279, "kl": 0.0003571510314941406, "learning_rate": 8.093945422764069e-07, "loss": 0.0452, "reward": 1.078583400696516, "reward_std": 1.0053385198116302, "rewards/accuracy_reward": 0.13541667349636555, "rewards/cosine_scaled_reward": -0.1937819980084896, "rewards/format_reward": 0.3854166865348816, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 1659.1041870117188, "epoch": 0.3221936589545844, "grad_norm": 0.27923351526260376, "kl": 0.0006756782531738281, "learning_rate": 8.068211054579943e-07, "loss": 0.0204, "reward": 0.07308734953403473, "reward_std": 0.5209638699889183, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.21014533191919327, "rewards/format_reward": 0.14583333861082792, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 1439.4166870117188, "epoch": 0.32390745501285345, "grad_norm": 0.337918221950531, "kl": 0.0007171630859375, "learning_rate": 8.04235151541222e-07, "loss": 0.0703, "reward": 0.15663141384720802, "reward_std": 0.5696099773049355, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.19109883531928062, "rewards/format_reward": 0.197916679084301, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 1567.0833435058594, "epoch": 0.32562125107112255, "grad_norm": 0.265076220035553, "kl": 0.002460002899169922, "learning_rate": 8.01636806561836e-07, "loss": -0.0344, "reward": 0.11269698292016983, "reward_std": 0.5739325433969498, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.23342406563460827, "rewards/format_reward": 0.2083333395421505, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 1663.8541870117188, "epoch": 0.3273350471293916, "grad_norm": 0.2943641245365143, "kl": 0.0004372596740722656, "learning_rate": 7.990261971595048e-07, "loss": 0.0196, "reward": 0.16959162894636393, "reward_std": 0.7194448634982109, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2530036121606827, "rewards/format_reward": 0.1979166753590107, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 1417.1458740234375, "epoch": 0.32904884318766064, "grad_norm": 0.3483695685863495, "kl": 0.0011310577392578125, "learning_rate": 7.964034505716476e-07, "loss": -0.0292, "reward": 0.28315007127821445, "reward_std": 0.7390157654881477, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2552552279084921, "rewards/format_reward": 0.2916666716337204, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 1268.9375457763672, "epoch": 0.33076263924592975, "grad_norm": 0.329458624124527, "kl": 0.0010833740234375, "learning_rate": 7.93768694627233e-07, "loss": 0.0303, "reward": 0.5155042503029108, "reward_std": 0.8000799417495728, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.2303585298359394, "rewards/format_reward": 0.3750000149011612, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1264.2292175292969, "epoch": 0.3324764353041988, "grad_norm": 0.35155826807022095, "kl": 0.0013065338134765625, "learning_rate": 7.911220577405484e-07, "loss": 0.0121, "reward": 0.6516094729304314, "reward_std": 0.6152506731450558, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.18962617963552475, "rewards/format_reward": 0.3854166753590107, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 1593.8125305175781, "epoch": 0.3341902313624679, "grad_norm": 0.30138924717903137, "kl": 0.0005452632904052734, "learning_rate": 7.884636689049422e-07, "loss": -0.0596, "reward": 0.268291431479156, "reward_std": 0.5906689390540123, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.17487580329179764, "rewards/format_reward": 0.19791667070239782, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 1491.8958740234375, "epoch": 0.33590402742073694, "grad_norm": 0.28991609811782837, "kl": 0.0017118453979492188, "learning_rate": 7.857936576865356e-07, "loss": -0.0337, "reward": 0.5647330663632601, "reward_std": 0.8028706908226013, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.18598925322294235, "rewards/format_reward": 0.3125000074505806, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 1520.1667175292969, "epoch": 0.337617823479006, "grad_norm": 0.30366936326026917, "kl": 0.0009660720825195312, "learning_rate": 7.831121542179086e-07, "loss": -0.0631, "reward": 0.5051862448453903, "reward_std": 0.6083739511668682, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.21957962960004807, "rewards/format_reward": 0.26041666977107525, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 1455.9583740234375, "epoch": 0.3393316195372751, "grad_norm": 0.28774720430374146, "kl": 0.0010623931884765625, "learning_rate": 7.804192891917571e-07, "loss": -0.0166, "reward": 0.43268176168203354, "reward_std": 0.5119665786623955, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.23898062482476234, "rewards/format_reward": 0.3333333432674408, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 1552.5000305175781, "epoch": 0.34104541559554413, "grad_norm": 0.27825096249580383, "kl": 0.0020427703857421875, "learning_rate": 7.777151938545235e-07, "loss": 0.0261, "reward": 0.457376591861248, "reward_std": 0.711308553814888, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.1980813667178154, "rewards/format_reward": 0.2395833432674408, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 1537.6667175292969, "epoch": 0.3427592116538132, "grad_norm": 0.2760595977306366, "kl": 0.0009918212890625, "learning_rate": 7.75e-07, "loss": 0.0019, "reward": 0.6058689560741186, "reward_std": 0.8575867787003517, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.1764862439595163, "rewards/format_reward": 0.3333333507180214, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1451.8958740234375, "epoch": 0.3444730077120823, "grad_norm": 0.29227250814437866, "kl": 0.0012784004211425781, "learning_rate": 7.72273839962904e-07, "loss": -0.0181, "reward": 0.5321098193526268, "reward_std": 0.5402198471128941, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.18361522071063519, "rewards/format_reward": 0.2500000027939677, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1395.0000305175781, "epoch": 0.3461868037703513, "grad_norm": 0.26160165667533875, "kl": 0.0015106201171875, "learning_rate": 7.695368466124296e-07, "loss": 0.0488, "reward": 0.9036880284547806, "reward_std": 0.8476225659251213, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": -0.18844211474061012, "rewards/format_reward": 0.3854166716337204, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 1409.1458740234375, "epoch": 0.34790059982862037, "grad_norm": 0.3121320605278015, "kl": 0.0012507438659667969, "learning_rate": 7.667891533457718e-07, "loss": 0.0627, "reward": 1.0111969839781523, "reward_std": 1.1430891081690788, "rewards/accuracy_reward": 0.11458333488553762, "rewards/cosine_scaled_reward": -0.21238414198160172, "rewards/format_reward": 0.3854166753590107, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 1446.9167175292969, "epoch": 0.3496143958868895, "grad_norm": 0.27098214626312256, "kl": 0.0015354156494140625, "learning_rate": 7.640308940816239e-07, "loss": 0.0206, "reward": 0.42572443559765816, "reward_std": 0.3084217198193073, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.18036598339676857, "rewards/format_reward": 0.2187500037252903, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 1411.2708740234375, "epoch": 0.3513281919451585, "grad_norm": 0.28694698214530945, "kl": 0.0013275146484375, "learning_rate": 7.612622032536507e-07, "loss": 0.1296, "reward": 0.1224188543856144, "reward_std": 0.6760578900575638, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.3061027526855469, "rewards/format_reward": 0.2604166744276881, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 1423.5209045410156, "epoch": 0.35304198800342756, "grad_norm": 0.355116605758667, "kl": 0.0011377334594726562, "learning_rate": 7.584832158039378e-07, "loss": 0.0178, "reward": 0.30912495171651244, "reward_std": 0.7893542125821114, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.20777561515569687, "rewards/format_reward": 0.2500000102445483, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 1428.8125305175781, "epoch": 0.35475578406169667, "grad_norm": 0.2700238823890686, "kl": 0.0029344558715820312, "learning_rate": 7.556940671764124e-07, "loss": 0.0719, "reward": 0.4809890575706959, "reward_std": 0.7781959772109985, "rewards/accuracy_reward": 0.06250000093132257, "rewards/cosine_scaled_reward": -0.22299901768565178, "rewards/format_reward": 0.2395833395421505, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1418.7500305175781, "epoch": 0.3564695801199657, "grad_norm": 0.3213181495666504, "kl": 0.0018987655639648438, "learning_rate": 7.528948933102438e-07, "loss": -0.1139, "reward": 0.30493446439504623, "reward_std": 0.75405253469944, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.22520049661397934, "rewards/format_reward": 0.22916667442768812, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 1472.1458740234375, "epoch": 0.3581833761782348, "grad_norm": 0.295888215303421, "kl": 0.001293182373046875, "learning_rate": 7.500858306332172e-07, "loss": 0.0205, "reward": 0.7049083486199379, "reward_std": 0.7757160514593124, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.14897773665143177, "rewards/format_reward": 0.26041667722165585, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 1384.4583740234375, "epoch": 0.35989717223650386, "grad_norm": 0.3134666681289673, "kl": 0.0015621185302734375, "learning_rate": 7.472670160550848e-07, "loss": 0.0936, "reward": 0.6157244890928268, "reward_std": 0.9224071279168129, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.1860289741307497, "rewards/format_reward": 0.3750000149011612, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1649.0625610351562, "epoch": 0.3616109682947729, "grad_norm": 0.23310723900794983, "kl": 0.0006375312805175781, "learning_rate": 7.444385869608921e-07, "loss": 0.0379, "reward": -0.043950184248387814, "reward_std": 0.3435293883085251, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2602936290204525, "rewards/format_reward": 0.19791666977107525, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1412.1667175292969, "epoch": 0.363324764353042, "grad_norm": 0.27200737595558167, "kl": 0.0014286041259765625, "learning_rate": 7.416006812042827e-07, "loss": 0.0497, "reward": 0.9091778621077538, "reward_std": 0.7571589723229408, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.11946619441732764, "rewards/format_reward": 0.2708333386108279, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 1543.1458435058594, "epoch": 0.36503856041131105, "grad_norm": 0.24152739346027374, "kl": 0.0015354156494140625, "learning_rate": 7.387534371007797e-07, "loss": -0.0126, "reward": 0.011342905461788177, "reward_std": 0.37173977866768837, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2708110474050045, "rewards/format_reward": 0.25000000186264515, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 1498.9583740234375, "epoch": 0.3667523564695801, "grad_norm": 0.29263201355934143, "kl": 0.001026153564453125, "learning_rate": 7.358969934210438e-07, "loss": 0.0561, "reward": 0.2628788137808442, "reward_std": 0.3592139929533005, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2303277626633644, "rewards/format_reward": 0.19791666977107525, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 1469.6250305175781, "epoch": 0.3684661525278492, "grad_norm": 0.24959833920001984, "kl": 0.0009860992431640625, "learning_rate": 7.330314893841101e-07, "loss": 0.0211, "reward": 0.5021949373185635, "reward_std": 0.937240332365036, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.21530525013804436, "rewards/format_reward": 0.3125000111758709, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1592.7708435058594, "epoch": 0.37017994858611825, "grad_norm": 0.281783789396286, "kl": 0.0015850067138671875, "learning_rate": 7.301570646506027e-07, "loss": -0.0691, "reward": 0.4340474624186754, "reward_std": 0.7465260550379753, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.2362687811255455, "rewards/format_reward": 0.19791667349636555, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 1485.6250305175781, "epoch": 0.3718937446443873, "grad_norm": 0.2756541967391968, "kl": 0.0015964508056640625, "learning_rate": 7.27273859315928e-07, "loss": -0.113, "reward": 0.5143733620643616, "reward_std": 0.8895893320441246, "rewards/accuracy_reward": 0.06250000093132257, "rewards/cosine_scaled_reward": -0.1793924793601036, "rewards/format_reward": 0.21875000279396772, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 1517.3750305175781, "epoch": 0.3736075407026564, "grad_norm": 0.29070937633514404, "kl": 0.0009374618530273438, "learning_rate": 7.243820139034464e-07, "loss": -0.0237, "reward": 0.8206158205866814, "reward_std": 1.0968997925519943, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.12784741912037134, "rewards/format_reward": 0.27083334140479565, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1627.6458740234375, "epoch": 0.37532133676092544, "grad_norm": 0.3091233968734741, "kl": 0.0029497146606445312, "learning_rate": 7.214816693576234e-07, "loss": -0.0081, "reward": 0.5958885625004768, "reward_std": 0.9548313468694687, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.165425858926028, "rewards/format_reward": 0.23958333861082792, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 1657.9583740234375, "epoch": 0.37703513281919454, "grad_norm": 0.30196475982666016, "kl": 0.0020732879638671875, "learning_rate": 7.185729670371604e-07, "loss": -0.0144, "reward": -0.12533910013735294, "reward_std": 0.3947201892733574, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2612520717084408, "rewards/format_reward": 0.14583333767950535, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1646.6458740234375, "epoch": 0.3787489288774636, "grad_norm": 0.26658761501312256, "kl": 0.0012149810791015625, "learning_rate": 7.156560487081051e-07, "loss": -0.0114, "reward": 0.18843666091561317, "reward_std": 0.7580625228583813, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2518426850438118, "rewards/format_reward": 0.2187500074505806, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1469.4791870117188, "epoch": 0.38046272493573263, "grad_norm": 0.29353833198547363, "kl": 0.0029439926147460938, "learning_rate": 7.127310565369415e-07, "loss": 0.0103, "reward": 0.5816408507525921, "reward_std": 0.511856883764267, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.19628694094717503, "rewards/format_reward": 0.291666672565043, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 1677.5417175292969, "epoch": 0.38217652099400173, "grad_norm": 0.24983949959278107, "kl": 0.00212860107421875, "learning_rate": 7.097981330836616e-07, "loss": -0.0388, "reward": 0.10370806977152824, "reward_std": 0.7805009186267853, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2444031797349453, "rewards/format_reward": 0.12500000186264515, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1588.0417175292969, "epoch": 0.3838903170522708, "grad_norm": 0.3074623942375183, "kl": 0.0014591217041015625, "learning_rate": 7.068574212948169e-07, "loss": -0.0364, "reward": -0.21326511725783348, "reward_std": 0.40029212832450867, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.29444269090890884, "rewards/format_reward": 0.10416667070239782, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 1470.4166870117188, "epoch": 0.3856041131105398, "grad_norm": 0.2668305039405823, "kl": 0.004240512847900391, "learning_rate": 7.039090644965509e-07, "loss": -0.0444, "reward": 0.43112599570304155, "reward_std": 1.025677740573883, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.2224056925624609, "rewards/format_reward": 0.21875000651925802, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 1524.3542175292969, "epoch": 0.3873179091688089, "grad_norm": 0.24207906424999237, "kl": 0.001285552978515625, "learning_rate": 7.009532063876148e-07, "loss": -0.0133, "reward": 0.20098010450601578, "reward_std": 0.4968443959951401, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.21850886940956116, "rewards/format_reward": 0.2083333395421505, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 1687.1250305175781, "epoch": 0.389031705227078, "grad_norm": 0.2755184769630432, "kl": 0.0006885528564453125, "learning_rate": 6.979899910323624e-07, "loss": 0.0313, "reward": 0.1163886021822691, "reward_std": 0.5962348952889442, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.23180323839187622, "rewards/format_reward": 0.16666666977107525, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1521.3541870117188, "epoch": 0.390745501285347, "grad_norm": 0.2736872136592865, "kl": 0.00174713134765625, "learning_rate": 6.950195628537299e-07, "loss": 0.0192, "reward": 0.6056804731488228, "reward_std": 0.9653726145625114, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.16172956302762032, "rewards/format_reward": 0.2812500037252903, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 1430.7709045410156, "epoch": 0.3924592973436161, "grad_norm": 0.2898569703102112, "kl": 0.003772735595703125, "learning_rate": 6.920420666261961e-07, "loss": -0.1013, "reward": 0.16834751144051552, "reward_std": 0.8317932933568954, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.259422168135643, "rewards/format_reward": 0.2187500037252903, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 1527.5625, "epoch": 0.39417309340188517, "grad_norm": 0.2458394318819046, "kl": 0.00273895263671875, "learning_rate": 6.890576474687263e-07, "loss": 0.0225, "reward": 0.36850229650735855, "reward_std": 0.44148915261030197, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.22217821329832077, "rewards/format_reward": 0.27083334419876337, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 1635.2292175292969, "epoch": 0.39588688946015427, "grad_norm": 0.24855723977088928, "kl": 0.001384735107421875, "learning_rate": 6.860664508377001e-07, "loss": -0.0123, "reward": 0.3638776633888483, "reward_std": 0.9782525599002838, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.19856508448719978, "rewards/format_reward": 0.21875000558793545, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 1465.6875, "epoch": 0.3976006855184233, "grad_norm": 0.2706748843193054, "kl": 0.0015106201171875, "learning_rate": 6.83068622519821e-07, "loss": -0.0079, "reward": 0.6824747771024704, "reward_std": 1.202951930463314, "rewards/accuracy_reward": 0.08333333674818277, "rewards/cosine_scaled_reward": -0.1890408918261528, "rewards/format_reward": 0.2708333386108279, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1183.6458587646484, "epoch": 0.39931448157669236, "grad_norm": 0.3087099492549896, "kl": 0.0015783309936523438, "learning_rate": 6.800643086250121e-07, "loss": -0.0562, "reward": 1.6357879862189293, "reward_std": 0.9204326570034027, "rewards/accuracy_reward": 0.1562500037252903, "rewards/cosine_scaled_reward": -0.08350442518712953, "rewards/format_reward": 0.5625000223517418, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 1545.1458740234375, "epoch": 0.40102827763496146, "grad_norm": 0.30824851989746094, "kl": 0.0018053054809570312, "learning_rate": 6.770536555792944e-07, "loss": -0.0356, "reward": 0.3720626160502434, "reward_std": 0.5978400260210037, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.2530326321721077, "rewards/format_reward": 0.2187500074505806, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1724.4166870117188, "epoch": 0.4027420736932305, "grad_norm": 0.21939213573932648, "kl": 0.0011310577392578125, "learning_rate": 6.740368101176495e-07, "loss": -0.0541, "reward": -0.09954050742089748, "reward_std": 0.5700792819261551, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.25520598143339157, "rewards/format_reward": 0.07291666697710752, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1656.7708740234375, "epoch": 0.40445586975149955, "grad_norm": 0.24315086007118225, "kl": 0.0017242431640625, "learning_rate": 6.710139192768694e-07, "loss": 0.0181, "reward": -0.08899100869894028, "reward_std": 0.567382175475359, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2840782068669796, "rewards/format_reward": 0.0937500037252903, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1391.2917175292969, "epoch": 0.40616966580976865, "grad_norm": 0.284648060798645, "kl": 0.004982471466064453, "learning_rate": 6.679851303883891e-07, "loss": -0.0752, "reward": 0.4182049632072449, "reward_std": 0.6335307955741882, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.21466262266039848, "rewards/format_reward": 0.31250000558793545, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 1576.8542175292969, "epoch": 0.4078834618680377, "grad_norm": 0.37806493043899536, "kl": 0.0016613006591796875, "learning_rate": 6.649505910711058e-07, "loss": -0.0262, "reward": 0.23228356195613742, "reward_std": 0.5114509798586369, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.20781026035547256, "rewards/format_reward": 0.14583333674818277, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 1438.1250305175781, "epoch": 0.40959725792630675, "grad_norm": 0.2724723517894745, "kl": 0.00337982177734375, "learning_rate": 6.619104492241847e-07, "loss": -0.0632, "reward": 0.2005984801799059, "reward_std": 0.46372513473033905, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2599928639829159, "rewards/format_reward": 0.28125000838190317, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 1323.2708587646484, "epoch": 0.41131105398457585, "grad_norm": 0.29104921221733093, "kl": 0.0015172958374023438, "learning_rate": 6.588648530198504e-07, "loss": 0.0754, "reward": 1.0371160432696342, "reward_std": 1.0028871893882751, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.16367125883698463, "rewards/format_reward": 0.4166666753590107, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 1315.8541870117188, "epoch": 0.4130248500428449, "grad_norm": 0.3675242066383362, "kl": 0.00168609619140625, "learning_rate": 6.558139508961654e-07, "loss": -0.0468, "reward": 0.6890245275571942, "reward_std": 0.829789325594902, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.20996215008199215, "rewards/format_reward": 0.385416679084301, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 1711.6250305175781, "epoch": 0.414738646101114, "grad_norm": 0.27449020743370056, "kl": 0.0006427764892578125, "learning_rate": 6.527578915497951e-07, "loss": -0.0047, "reward": 0.0678191315382719, "reward_std": 0.6721196174621582, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2533269338309765, "rewards/format_reward": 0.1666666679084301, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 1564.2292175292969, "epoch": 0.41645244215938304, "grad_norm": 0.2841816842556, "kl": 0.001232147216796875, "learning_rate": 6.496968239287603e-07, "loss": 0.0798, "reward": -0.010489307343959808, "reward_std": 0.6119135469198227, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.31749165803194046, "rewards/format_reward": 0.21875000931322575, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 1542.7083740234375, "epoch": 0.4181662382176521, "grad_norm": 0.2418055236339569, "kl": 0.0013904571533203125, "learning_rate": 6.466308972251785e-07, "loss": 0.0006, "reward": 0.42967063235118985, "reward_std": 0.5240038335323334, "rewards/accuracy_reward": 0.0520833358168602, "rewards/cosine_scaled_reward": -0.22576111694797873, "rewards/format_reward": 0.22916667256504297, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 1163.6875305175781, "epoch": 0.4198800342759212, "grad_norm": 0.3541621267795563, "kl": 0.0019626617431640625, "learning_rate": 6.435602608679916e-07, "loss": -0.0503, "reward": 1.0524671040475368, "reward_std": 0.8917372450232506, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.17973799630999565, "rewards/format_reward": 0.6145833432674408, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 1451.6250610351562, "epoch": 0.42159383033419023, "grad_norm": 0.30440303683280945, "kl": 0.0012979507446289062, "learning_rate": 6.404850645156841e-07, "loss": -0.0257, "reward": 0.6202996233478189, "reward_std": 0.7275250256061554, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.16593201458454132, "rewards/format_reward": 0.31250001676380634, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 1685.4583435058594, "epoch": 0.4233076263924593, "grad_norm": 0.28518855571746826, "kl": 0.0011157989501953125, "learning_rate": 6.374054580489873e-07, "loss": -0.025, "reward": -0.02131518442183733, "reward_std": 0.5307684168219566, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.23101667314767838, "rewards/format_reward": 0.11458333488553762, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1473.8125305175781, "epoch": 0.4250214224507284, "grad_norm": 0.23807476460933685, "kl": 0.0013265609741210938, "learning_rate": 6.343215915635761e-07, "loss": -0.0614, "reward": 0.7047823406755924, "reward_std": 0.7143717184662819, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.1652532685548067, "rewards/format_reward": 0.2812500027939677, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 1546.3541870117188, "epoch": 0.4267352185089974, "grad_norm": 0.24748165905475616, "kl": 0.0014638900756835938, "learning_rate": 6.31233615362752e-07, "loss": 0.0185, "reward": 0.6113807391375303, "reward_std": 0.5387562289834023, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.19036031886935234, "rewards/format_reward": 0.2187500037252903, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 1507.4375305175781, "epoch": 0.4284490145672665, "grad_norm": 0.2894173860549927, "kl": 0.0014209747314453125, "learning_rate": 6.281416799501187e-07, "loss": -0.0473, "reward": 0.19241389445960522, "reward_std": 0.7252123057842255, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2510671392083168, "rewards/format_reward": 0.25000000186264515, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1443.2083740234375, "epoch": 0.4301628106255356, "grad_norm": 0.4095551669597626, "kl": 0.0017261505126953125, "learning_rate": 6.25045936022246e-07, "loss": -0.1353, "reward": 0.6435118420049548, "reward_std": 0.934768907725811, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.1795162484049797, "rewards/format_reward": 0.3020833358168602, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 1465.9375610351562, "epoch": 0.4318766066838046, "grad_norm": 0.275666743516922, "kl": 0.0014324188232421875, "learning_rate": 6.219465344613258e-07, "loss": -0.018, "reward": 0.12436750531196594, "reward_std": 0.7275985479354858, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.26183638721704483, "rewards/format_reward": 0.18750000279396772, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 1510.0417175292969, "epoch": 0.43359040274207367, "grad_norm": 0.25496789813041687, "kl": 0.0009136199951171875, "learning_rate": 6.188436263278172e-07, "loss": -0.016, "reward": 0.5188035815954208, "reward_std": 0.9701652638614178, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.2066810093820095, "rewards/format_reward": 0.1770833395421505, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 1549.9792175292969, "epoch": 0.43530419880034277, "grad_norm": 0.27471932768821716, "kl": 0.0010223388671875, "learning_rate": 6.157373628530852e-07, "loss": -0.0365, "reward": 0.5654481025412679, "reward_std": 1.0282247960567474, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.1806558482348919, "rewards/format_reward": 0.2812500037252903, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 1348.2083740234375, "epoch": 0.4370179948586118, "grad_norm": 0.30511415004730225, "kl": 0.0012774467468261719, "learning_rate": 6.126278954320294e-07, "loss": 0.0483, "reward": 0.8795673847198486, "reward_std": 0.8337312787771225, "rewards/accuracy_reward": 0.08333333395421505, "rewards/cosine_scaled_reward": -0.1382422624155879, "rewards/format_reward": 0.3333333432674408, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 1538.2083740234375, "epoch": 0.4387317909168809, "grad_norm": 0.2755931615829468, "kl": 0.003383636474609375, "learning_rate": 6.095153756157051e-07, "loss": 0.0742, "reward": 0.2700856775045395, "reward_std": 0.3797752112150192, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.25267578288912773, "rewards/format_reward": 0.2187500074505806, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 1461.8959045410156, "epoch": 0.44044558697514996, "grad_norm": 0.3680107891559601, "kl": 0.002166748046875, "learning_rate": 6.06399955103937e-07, "loss": 0.1579, "reward": 0.39618358900770545, "reward_std": 0.7873297780752182, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.23296307399868965, "rewards/format_reward": 0.2395833395421505, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1543.0208740234375, "epoch": 0.442159383033419, "grad_norm": 0.27821409702301025, "kl": 0.0012140274047851562, "learning_rate": 6.032817857379256e-07, "loss": 0.0219, "reward": 0.36168060451745987, "reward_std": 0.5424905493855476, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.24902323260903358, "rewards/format_reward": 0.22916667070239782, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 1361.4375305175781, "epoch": 0.4438731790916881, "grad_norm": 0.3345107436180115, "kl": 0.0014476776123046875, "learning_rate": 6.001610194928464e-07, "loss": 0.0802, "reward": 1.179038206115365, "reward_std": 1.1666965633630753, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.1345681119710207, "rewards/format_reward": 0.43750001303851604, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 1471.3958740234375, "epoch": 0.44558697514995715, "grad_norm": 0.283227801322937, "kl": 0.0009584426879882812, "learning_rate": 5.97037808470444e-07, "loss": 0.0181, "reward": 0.19981666887179017, "reward_std": 0.7603968679904938, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.28081241995096207, "rewards/format_reward": 0.2708333386108279, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 1480.2292175292969, "epoch": 0.4473007712082262, "grad_norm": 0.2809438705444336, "kl": 0.00098419189453125, "learning_rate": 5.939123048916173e-07, "loss": -0.0626, "reward": 0.6192706078290939, "reward_std": 0.9330656826496124, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.22592685744166374, "rewards/format_reward": 0.3750000074505806, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 1499.7917175292969, "epoch": 0.4490145672664953, "grad_norm": 0.2977607846260071, "kl": 0.0025501251220703125, "learning_rate": 5.907846610890011e-07, "loss": 0.0765, "reward": 0.40138836577534676, "reward_std": 1.058370865881443, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.18452044390141964, "rewards/format_reward": 0.21875000558793545, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 1516.2291870117188, "epoch": 0.45072836332476435, "grad_norm": 0.2541120648384094, "kl": 0.0012226104736328125, "learning_rate": 5.87655029499542e-07, "loss": 0.0006, "reward": 0.20188701339066029, "reward_std": 0.5907558128237724, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.215114988386631, "rewards/format_reward": 0.2395833395421505, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 1588.2083435058594, "epoch": 0.4524421593830334, "grad_norm": 0.369785338640213, "kl": 0.003452301025390625, "learning_rate": 5.845235626570683e-07, "loss": 0.1067, "reward": -0.17164241895079613, "reward_std": 0.37897656112909317, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2791437990963459, "rewards/format_reward": 0.15625000651925802, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 1688.2083740234375, "epoch": 0.4541559554413025, "grad_norm": 0.25646331906318665, "kl": 0.0008211135864257812, "learning_rate": 5.813904131848564e-07, "loss": 0.0496, "reward": 0.40768040530383587, "reward_std": 0.47176536452025175, "rewards/accuracy_reward": 0.10416666977107525, "rewards/cosine_scaled_reward": -0.11216204985976219, "rewards/format_reward": 0.0833333358168602, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 1518.7916870117188, "epoch": 0.45586975149957154, "grad_norm": 0.2357741892337799, "kl": 0.000965118408203125, "learning_rate": 5.78255733788191e-07, "loss": -0.0472, "reward": 0.5058878529816866, "reward_std": 0.9048085063695908, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.1941013392060995, "rewards/format_reward": 0.2291666753590107, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1370.0208740234375, "epoch": 0.45758354755784064, "grad_norm": 0.2968303859233856, "kl": 0.00202178955078125, "learning_rate": 5.751196772469237e-07, "loss": 0.0396, "reward": 0.4842541292309761, "reward_std": 0.7571203708648682, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2155583193525672, "rewards/format_reward": 0.3333333358168602, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1385.8333740234375, "epoch": 0.4592973436161097, "grad_norm": 0.2968428432941437, "kl": 0.0023469924926757812, "learning_rate": 5.71982396408026e-07, "loss": -0.017, "reward": 0.6936724968254566, "reward_std": 0.8312683999538422, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.22146771964617074, "rewards/format_reward": 0.3645833469927311, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 1570.6667175292969, "epoch": 0.46101113967437873, "grad_norm": 0.2661321461200714, "kl": 0.0008296966552734375, "learning_rate": 5.688440441781398e-07, "loss": 0.0054, "reward": 0.13332478515803814, "reward_std": 0.4149567186832428, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.23616338148713112, "rewards/format_reward": 0.2187500111758709, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 1490.6875610351562, "epoch": 0.46272493573264784, "grad_norm": 0.2545984089374542, "kl": 0.00135040283203125, "learning_rate": 5.657047735161255e-07, "loss": 0.0919, "reward": 0.7821074835956097, "reward_std": 0.6425863802433014, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.1767166256904602, "rewards/format_reward": 0.3125000111758709, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 1521.8750610351562, "epoch": 0.4644387317909169, "grad_norm": 0.2771352231502533, "kl": 0.0013113021850585938, "learning_rate": 5.625647374256061e-07, "loss": -0.0375, "reward": 0.3960751476697624, "reward_std": 0.46416520327329636, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2203481704927981, "rewards/format_reward": 0.2500000111758709, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1412.1042175292969, "epoch": 0.4661525278491859, "grad_norm": 0.27568578720092773, "kl": 0.00176239013671875, "learning_rate": 5.594240889475106e-07, "loss": 0.0309, "reward": 0.44331623800098896, "reward_std": 0.6773833259940147, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.21556473523378372, "rewards/format_reward": 0.3020833432674408, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1356.6041870117188, "epoch": 0.46786632390745503, "grad_norm": 0.27975478768348694, "kl": 0.0014009475708007812, "learning_rate": 5.562829811526154e-07, "loss": -0.0274, "reward": 0.7240446954965591, "reward_std": 1.045374572277069, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.21347414329648018, "rewards/format_reward": 0.354166679084301, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1636.6667175292969, "epoch": 0.4695801199657241, "grad_norm": 0.2683631479740143, "kl": 0.0010700225830078125, "learning_rate": 5.531415671340826e-07, "loss": -0.0025, "reward": 0.17355282232165337, "reward_std": 0.3705536052584648, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2301059477031231, "rewards/format_reward": 0.2812500074505806, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1553.2083740234375, "epoch": 0.4712939160239931, "grad_norm": 0.33635562658309937, "kl": 0.0011768341064453125, "learning_rate": 5.5e-07, "loss": 0.0588, "reward": 0.36671384796500206, "reward_std": 0.39920416846871376, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.19434788078069687, "rewards/format_reward": 0.21875, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1501.6042175292969, "epoch": 0.4730077120822622, "grad_norm": 0.288104385137558, "kl": 0.0024690628051757812, "learning_rate": 5.468584328659172e-07, "loss": -0.0367, "reward": 0.35132842278108, "reward_std": 0.8972981795668602, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2235843949019909, "rewards/format_reward": 0.2500000111758709, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 1590.3333740234375, "epoch": 0.47472150814053127, "grad_norm": 0.24080416560173035, "kl": 0.0013065338134765625, "learning_rate": 5.437170188473847e-07, "loss": 0.0462, "reward": -0.03010409977287054, "reward_std": 0.49339213222265244, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.24666466563940048, "rewards/format_reward": 0.15625000186264515, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 1475.8333435058594, "epoch": 0.47643530419880037, "grad_norm": 0.25690314173698425, "kl": 0.0014476776123046875, "learning_rate": 5.405759110524894e-07, "loss": 0.0637, "reward": 0.5569645212963223, "reward_std": 0.9936123788356781, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.23540310934185982, "rewards/format_reward": 0.3437500074505806, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1678.9791870117188, "epoch": 0.4781491002570694, "grad_norm": 0.24415026605129242, "kl": 0.00095367431640625, "learning_rate": 5.37435262574394e-07, "loss": 0.0144, "reward": 0.16477025300264359, "reward_std": 0.6051457226276398, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.20591148734092712, "rewards/format_reward": 0.1979166716337204, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1506.6250305175781, "epoch": 0.47986289631533846, "grad_norm": 0.26392680406570435, "kl": 0.0023336410522460938, "learning_rate": 5.342952264838747e-07, "loss": 0.0013, "reward": 0.4733018707484007, "reward_std": 0.7523324713110924, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.2140564490109682, "rewards/format_reward": 0.25000000931322575, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1287.0000305175781, "epoch": 0.48157669237360756, "grad_norm": 0.29941684007644653, "kl": 0.0029144287109375, "learning_rate": 5.311559558218603e-07, "loss": -0.0345, "reward": 0.7630709037184715, "reward_std": 0.6651666909456253, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.15811054781079292, "rewards/format_reward": 0.3229166716337204, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 1531.1667175292969, "epoch": 0.4832904884318766, "grad_norm": 0.2747533917427063, "kl": 0.0010986328125, "learning_rate": 5.28017603591974e-07, "loss": 0.0652, "reward": 0.2705502174794674, "reward_std": 0.5016405843198299, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.2651446685194969, "rewards/format_reward": 0.23958334419876337, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 1673.5625305175781, "epoch": 0.48500428449014565, "grad_norm": 0.22642135620117188, "kl": 0.0008697509765625, "learning_rate": 5.248803227530763e-07, "loss": -0.0168, "reward": 0.13469044864177704, "reward_std": 0.5786846578121185, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.21285491064190865, "rewards/format_reward": 0.17708333488553762, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 1504.7916870117188, "epoch": 0.48671808054841476, "grad_norm": 0.2727656662464142, "kl": 0.0019969940185546875, "learning_rate": 5.21744266211809e-07, "loss": 0.0987, "reward": 0.17584533244371414, "reward_std": 0.9530358016490936, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.26050129532814026, "rewards/format_reward": 0.13541666977107525, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 1625.1042175292969, "epoch": 0.4884318766066838, "grad_norm": 0.2627033293247223, "kl": 0.00135040283203125, "learning_rate": 5.186095868151436e-07, "loss": -0.0045, "reward": 0.33440714702010155, "reward_std": 0.4926134943962097, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.23611152917146683, "rewards/format_reward": 0.2291666716337204, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 1458.3333435058594, "epoch": 0.49014567266495285, "grad_norm": 0.2648809850215912, "kl": 0.0019407272338867188, "learning_rate": 5.154764373429315e-07, "loss": -0.0038, "reward": 0.43773437407799065, "reward_std": 0.8458793424069881, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.17636944695550483, "rewards/format_reward": 0.2291666679084301, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1603.5416870117188, "epoch": 0.49185946872322195, "grad_norm": 0.2246415913105011, "kl": 0.0008897781372070312, "learning_rate": 5.123449705004581e-07, "loss": -0.0038, "reward": 0.3730207160115242, "reward_std": 0.6528768911957741, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.20618611853569746, "rewards/format_reward": 0.19791666977107525, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 1618.7917175292969, "epoch": 0.493573264781491, "grad_norm": 0.38273441791534424, "kl": 0.0009946823120117188, "learning_rate": 5.09215338910999e-07, "loss": -0.0405, "reward": 0.22999665327370167, "reward_std": 0.5767201408743858, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2371416799724102, "rewards/format_reward": 0.260416679084301, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 1577.8125305175781, "epoch": 0.4952870608397601, "grad_norm": 0.25025105476379395, "kl": 0.0012979507446289062, "learning_rate": 5.060876951083828e-07, "loss": -0.0036, "reward": 0.15548935439437628, "reward_std": 0.5783236026763916, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.20484132319688797, "rewards/format_reward": 0.17708333488553762, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1389.0834045410156, "epoch": 0.49700085689802914, "grad_norm": 0.35921505093574524, "kl": 0.0019588470458984375, "learning_rate": 5.02962191529556e-07, "loss": -0.0026, "reward": 0.5405751286889426, "reward_std": 0.9654501527547836, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.19835910387337208, "rewards/format_reward": 0.30208334140479565, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 1685.0, "epoch": 0.4987146529562982, "grad_norm": 0.26681071519851685, "kl": 0.0010046958923339844, "learning_rate": 4.998389805071536e-07, "loss": 0.0389, "reward": -0.14831870794296265, "reward_std": 0.35369744896888733, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.26506757736206055, "rewards/format_reward": 0.10416666977107525, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1354.5000305175781, "epoch": 0.5004284490145673, "grad_norm": 0.326076477766037, "kl": 0.00252532958984375, "learning_rate": 4.967182142620745e-07, "loss": 0.0549, "reward": 0.5435319095849991, "reward_std": 0.6741289496421814, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.22746334969997406, "rewards/format_reward": 0.3020833432674408, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1452.2292175292969, "epoch": 0.5021422450728363, "grad_norm": 0.2539391815662384, "kl": 0.001277923583984375, "learning_rate": 4.93600044896063e-07, "loss": 0.0331, "reward": 0.2068037036806345, "reward_std": 0.7527924031019211, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.24660906940698624, "rewards/format_reward": 0.2812500074505806, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1487.8125305175781, "epoch": 0.5038560411311054, "grad_norm": 0.28219902515411377, "kl": 0.0015411376953125, "learning_rate": 4.904846243842949e-07, "loss": 0.0807, "reward": 0.19793431088328362, "reward_std": 0.5930087864398956, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.20344459637999535, "rewards/format_reward": 0.2187500074505806, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 1381.0208740234375, "epoch": 0.5055698371893744, "grad_norm": 0.2740439176559448, "kl": 0.002559661865234375, "learning_rate": 4.873721045679706e-07, "loss": 0.0127, "reward": 0.519521371461451, "reward_std": 0.8600037917494774, "rewards/accuracy_reward": 0.06250000093132257, "rewards/cosine_scaled_reward": -0.20514550060033798, "rewards/format_reward": 0.2812500074505806, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 1623.0208740234375, "epoch": 0.5072836332476436, "grad_norm": 0.3007984161376953, "kl": 0.0010242462158203125, "learning_rate": 4.842626371469149e-07, "loss": -0.007, "reward": 0.4530015401542187, "reward_std": 0.9436793178319931, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.20549920108169317, "rewards/format_reward": 0.17708333767950535, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 1576.6250610351562, "epoch": 0.5089974293059126, "grad_norm": 0.2906862497329712, "kl": 0.0011758804321289062, "learning_rate": 4.811563736721829e-07, "loss": 0.0599, "reward": 0.49198414757847786, "reward_std": 1.0118756145238876, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.18148974049836397, "rewards/format_reward": 0.20833334140479565, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1487.7083740234375, "epoch": 0.5107112253641817, "grad_norm": 0.352023720741272, "kl": 0.00144195556640625, "learning_rate": 4.780534655386743e-07, "loss": -0.0307, "reward": 0.37683416716754436, "reward_std": 0.8631244823336601, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2052404135465622, "rewards/format_reward": 0.2812500074505806, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 1460.5416870117188, "epoch": 0.5124250214224507, "grad_norm": 0.32392027974128723, "kl": 0.00228118896484375, "learning_rate": 4.749540639777539e-07, "loss": -0.0335, "reward": 0.6682421006262302, "reward_std": 1.012038916349411, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.16558968648314476, "rewards/format_reward": 0.3333333358168602, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 1429.0208740234375, "epoch": 0.5141388174807198, "grad_norm": 0.2622582018375397, "kl": 0.0013561248779296875, "learning_rate": 4.7185832004988133e-07, "loss": 0.0258, "reward": 0.8314435705542564, "reward_std": 0.8176407516002655, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.13659141026437283, "rewards/format_reward": 0.2291666716337204, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1618.0208740234375, "epoch": 0.5158526135389888, "grad_norm": 0.33633002638816833, "kl": 0.00131988525390625, "learning_rate": 4.68766384637248e-07, "loss": -0.0229, "reward": 0.11395054124295712, "reward_std": 0.571219764649868, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.24826429784297943, "rewards/format_reward": 0.18750000558793545, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 1644.3958740234375, "epoch": 0.517566409597258, "grad_norm": 0.2794082462787628, "kl": 0.0025787353515625, "learning_rate": 4.656784084364238e-07, "loss": 0.0632, "reward": -0.022331595420837402, "reward_std": 0.543972559273243, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2196408323943615, "rewards/format_reward": 0.10416666977107525, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1646.5625305175781, "epoch": 0.519280205655527, "grad_norm": 0.3117695748806, "kl": 0.0012073516845703125, "learning_rate": 4.6259454195101267e-07, "loss": -0.0041, "reward": 0.27055755932815373, "reward_std": 0.7059797570109367, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2268887422978878, "rewards/format_reward": 0.2187500037252903, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 1473.8333740234375, "epoch": 0.5209940017137961, "grad_norm": 0.25882959365844727, "kl": 0.00136566162109375, "learning_rate": 4.59514935484316e-07, "loss": 0.0138, "reward": 1.1540717035531998, "reward_std": 0.7546401843428612, "rewards/accuracy_reward": 0.12500000186264515, "rewards/cosine_scaled_reward": -0.08102886518463492, "rewards/format_reward": 0.2395833358168602, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 1562.8125305175781, "epoch": 0.5227077977720651, "grad_norm": 0.26582640409469604, "kl": 0.0014476776123046875, "learning_rate": 4.5643973913200837e-07, "loss": -0.0198, "reward": 0.4465249180793762, "reward_std": 0.5320742838084698, "rewards/accuracy_reward": 0.0520833358168602, "rewards/cosine_scaled_reward": -0.18380979727953672, "rewards/format_reward": 0.19791667722165585, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1408.3750305175781, "epoch": 0.5244215938303342, "grad_norm": 0.26723894476890564, "kl": 0.00182342529296875, "learning_rate": 4.5336910277482155e-07, "loss": 0.0754, "reward": 0.5236281603574753, "reward_std": 0.5826352834701538, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.19684619456529617, "rewards/format_reward": 0.291666679084301, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 1548.9583740234375, "epoch": 0.5261353898886033, "grad_norm": 0.2609714865684509, "kl": 0.0017099380493164062, "learning_rate": 4.503031760712397e-07, "loss": -0.0152, "reward": 0.23132172971963882, "reward_std": 0.7677483707666397, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.2582585848867893, "rewards/format_reward": 0.2395833358168602, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 1513.5625305175781, "epoch": 0.5278491859468724, "grad_norm": 0.25365719199180603, "kl": 0.0029115676879882812, "learning_rate": 4.4724210845020494e-07, "loss": -0.027, "reward": 0.579220362007618, "reward_std": 0.5990113839507103, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.17437524069100618, "rewards/format_reward": 0.25000000931322575, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 1507.6042175292969, "epoch": 0.5295629820051414, "grad_norm": 0.31888437271118164, "kl": 0.002178192138671875, "learning_rate": 4.441860491038345e-07, "loss": 0.0247, "reward": 0.8379712477326393, "reward_std": 1.0112811923027039, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.2099217213690281, "rewards/format_reward": 0.3645833395421505, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 1639.1666870117188, "epoch": 0.5312767780634104, "grad_norm": 0.2766580283641815, "kl": 0.001255035400390625, "learning_rate": 4.4113514698014953e-07, "loss": -0.023, "reward": 0.06279195286333561, "reward_std": 0.5707031115889549, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2568861898034811, "rewards/format_reward": 0.1666666716337204, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 1546.6667175292969, "epoch": 0.5329905741216795, "grad_norm": 0.34961050748825073, "kl": 0.001983642578125, "learning_rate": 4.3808955077581546e-07, "loss": 0.0053, "reward": 0.027295149862766266, "reward_std": 0.6128209084272385, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2780274897813797, "rewards/format_reward": 0.21875000558793545, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 1541.3125305175781, "epoch": 0.5347043701799485, "grad_norm": 0.2850496172904968, "kl": 0.0015439987182617188, "learning_rate": 4.350494089288943e-07, "loss": 0.041, "reward": 0.895597904920578, "reward_std": 0.996664509177208, "rewards/accuracy_reward": 0.13541666697710752, "rewards/cosine_scaled_reward": -0.09330026619136333, "rewards/format_reward": 0.2916666716337204, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 1304.2708435058594, "epoch": 0.5364181662382177, "grad_norm": 0.34603336453437805, "kl": 0.0025577545166015625, "learning_rate": 4.3201486961161093e-07, "loss": 0.0074, "reward": 0.4570487830787897, "reward_std": 0.7667558044195175, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.24745379388332367, "rewards/format_reward": 0.3854166828095913, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 1478.7708740234375, "epoch": 0.5381319622964867, "grad_norm": 0.2592521905899048, "kl": 0.0016460418701171875, "learning_rate": 4.2898608072313045e-07, "loss": 0.0546, "reward": 0.4311096593737602, "reward_std": 0.862842507660389, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.19942990876734257, "rewards/format_reward": 0.19791667442768812, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 1462.8333740234375, "epoch": 0.5398457583547558, "grad_norm": 0.33211708068847656, "kl": 0.0047512054443359375, "learning_rate": 4.2596318988235037e-07, "loss": -0.0538, "reward": 0.3032863112166524, "reward_std": 0.7379888445138931, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.1883798986673355, "rewards/format_reward": 0.21875001303851604, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 1452.0625305175781, "epoch": 0.5415595544130248, "grad_norm": 0.27663302421569824, "kl": 0.0022258758544921875, "learning_rate": 4.2294634442070553e-07, "loss": 0.0233, "reward": 0.379562683403492, "reward_std": 0.8121780008077621, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.22090492397546768, "rewards/format_reward": 0.2916666744276881, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 1670.7083435058594, "epoch": 0.5432733504712939, "grad_norm": 0.25537002086639404, "kl": 0.0017232894897460938, "learning_rate": 4.1993569137498776e-07, "loss": -0.0021, "reward": -0.182917146012187, "reward_std": 0.4011043682694435, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2771887518465519, "rewards/format_reward": 0.10416666977107525, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 1319.6666870117188, "epoch": 0.5449871465295629, "grad_norm": 0.3191383481025696, "kl": 0.002109527587890625, "learning_rate": 4.1693137748017915e-07, "loss": -0.0518, "reward": 1.2622752636671066, "reward_std": 0.6648239344358444, "rewards/accuracy_reward": 0.13541667070239782, "rewards/cosine_scaled_reward": -0.15025184489786625, "rewards/format_reward": 0.3645833432674408, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 1539.3958435058594, "epoch": 0.5467009425878321, "grad_norm": 0.21360871195793152, "kl": 0.001598358154296875, "learning_rate": 4.1393354916230005e-07, "loss": 0.0642, "reward": 0.670879889279604, "reward_std": 0.5511768870055676, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.17239019460976124, "rewards/format_reward": 0.2604166716337204, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 1502.7500305175781, "epoch": 0.5484147386461011, "grad_norm": 0.30792051553726196, "kl": 0.004299163818359375, "learning_rate": 4.1094235253127374e-07, "loss": 0.0965, "reward": 0.5891358256340027, "reward_std": 0.7148416712880135, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.21276572160422802, "rewards/format_reward": 0.2395833358168602, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 1625.3542175292969, "epoch": 0.5501285347043702, "grad_norm": 0.2449854463338852, "kl": 0.0019092559814453125, "learning_rate": 4.079579333738039e-07, "loss": 0.0027, "reward": 0.09082813444547355, "reward_std": 0.5155207589268684, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.23538977652788162, "rewards/format_reward": 0.1666666679084301, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 1390.5625610351562, "epoch": 0.5518423307626392, "grad_norm": 0.29950419068336487, "kl": 0.0020055770874023438, "learning_rate": 4.0498043714627006e-07, "loss": 0.0094, "reward": 1.061479389667511, "reward_std": 1.1913686990737915, "rewards/accuracy_reward": 0.08333333674818277, "rewards/cosine_scaled_reward": -0.1262189457193017, "rewards/format_reward": 0.4583333507180214, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 1513.0208740234375, "epoch": 0.5535561268209083, "grad_norm": 0.2541593611240387, "kl": 0.0019054412841796875, "learning_rate": 4.020100089676376e-07, "loss": 0.0179, "reward": 0.48006347566843033, "reward_std": 0.8878462538123131, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.24122651480138302, "rewards/format_reward": 0.2500000074505806, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 1394.2917175292969, "epoch": 0.5552699228791774, "grad_norm": 0.28044670820236206, "kl": 0.0023059844970703125, "learning_rate": 3.9904679361238526e-07, "loss": -0.0235, "reward": 0.7225539498031139, "reward_std": 0.8518542274832726, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.15590969566255808, "rewards/format_reward": 0.36458334885537624, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 1490.8333740234375, "epoch": 0.5569837189374465, "grad_norm": 0.2727906405925751, "kl": 0.00678253173828125, "learning_rate": 3.9609093550344907e-07, "loss": -0.0087, "reward": -0.06165006849914789, "reward_std": 0.5125335156917572, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.298244871199131, "rewards/format_reward": 0.1770833358168602, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 1312.541732788086, "epoch": 0.5586975149957155, "grad_norm": 0.2801344394683838, "kl": 0.002471923828125, "learning_rate": 3.931425787051832e-07, "loss": 0.0511, "reward": 1.2787828594446182, "reward_std": 0.8359841406345367, "rewards/accuracy_reward": 0.14583333861082792, "rewards/cosine_scaled_reward": -0.16505897510796785, "rewards/format_reward": 0.4062500074505806, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 1458.1875610351562, "epoch": 0.5604113110539846, "grad_norm": 0.26847410202026367, "kl": 0.001903533935546875, "learning_rate": 3.902018669163384e-07, "loss": -0.0135, "reward": 0.42947175493463874, "reward_std": 0.6873602792620659, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.2103552147746086, "rewards/format_reward": 0.2500000037252903, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 1452.6875305175781, "epoch": 0.5621251071122536, "grad_norm": 0.3270891010761261, "kl": 0.0029392242431640625, "learning_rate": 3.872689434630585e-07, "loss": 0.0059, "reward": 0.6504845693707466, "reward_std": 0.658838301897049, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": -0.1178303137421608, "rewards/format_reward": 0.31250002048909664, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 1420.5625305175781, "epoch": 0.5638389031705227, "grad_norm": 0.2690076529979706, "kl": 0.0023937225341796875, "learning_rate": 3.843439512918949e-07, "loss": 0.0277, "reward": 0.7830097079277039, "reward_std": 1.0598141178488731, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.14712585415691137, "rewards/format_reward": 0.3437500074505806, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 1569.0416870117188, "epoch": 0.5655526992287918, "grad_norm": 0.25364336371421814, "kl": 0.0018033981323242188, "learning_rate": 3.8142703296283953e-07, "loss": 0.0415, "reward": 0.6755895912647247, "reward_std": 0.8362427651882172, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.17127279681153595, "rewards/format_reward": 0.31250001583248377, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1507.8333740234375, "epoch": 0.5672664952870609, "grad_norm": 0.2895563840866089, "kl": 0.0018024444580078125, "learning_rate": 3.785183306423767e-07, "loss": -0.0391, "reward": 0.2783973217010498, "reward_std": 0.8718981444835663, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.24730180948972702, "rewards/format_reward": 0.2500000037252903, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1425.8750305175781, "epoch": 0.5689802913453299, "grad_norm": 0.34960538148880005, "kl": 0.002040863037109375, "learning_rate": 3.7561798609655373e-07, "loss": -0.0507, "reward": 0.4344450933858752, "reward_std": 0.8201218247413635, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.20083919540047646, "rewards/format_reward": 0.2395833395421505, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 1487.5209045410156, "epoch": 0.570694087403599, "grad_norm": 0.24745923280715942, "kl": 0.0020313262939453125, "learning_rate": 3.72726140684072e-07, "loss": -0.0304, "reward": 0.18487435579299927, "reward_std": 0.5386057496070862, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.21758716367185116, "rewards/format_reward": 0.22916666977107525, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 1550.2916870117188, "epoch": 0.572407883461868, "grad_norm": 0.26442721486091614, "kl": 0.0026597976684570312, "learning_rate": 3.6984293534939737e-07, "loss": -0.0096, "reward": 0.0944853127002716, "reward_std": 0.6071152612566948, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2625133693218231, "rewards/format_reward": 0.19791667256504297, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 1411.2291870117188, "epoch": 0.5741216795201372, "grad_norm": 0.26867979764938354, "kl": 0.0038623809814453125, "learning_rate": 3.6696851061588994e-07, "loss": -0.0019, "reward": 0.35343707352876663, "reward_std": 0.7175738885998726, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2047328483313322, "rewards/format_reward": 0.2708333386108279, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 1435.2708435058594, "epoch": 0.5758354755784062, "grad_norm": 0.2698570489883423, "kl": 0.003559112548828125, "learning_rate": 3.641030065789562e-07, "loss": 0.0676, "reward": 0.19872682355344296, "reward_std": 0.5484345108270645, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2322644665837288, "rewards/format_reward": 0.2395833395421505, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 1453.2500610351562, "epoch": 0.5775492716366752, "grad_norm": 0.39643779397010803, "kl": 0.002422332763671875, "learning_rate": 3.612465628992203e-07, "loss": -0.0209, "reward": 0.5667788721621037, "reward_std": 0.9422286599874496, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.2142929658293724, "rewards/format_reward": 0.2500000074505806, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 1499.2708435058594, "epoch": 0.5792630676949443, "grad_norm": 0.31420981884002686, "kl": 0.0034742355346679688, "learning_rate": 3.5839931879571725e-07, "loss": -0.0471, "reward": 0.6687855117488652, "reward_std": 0.6993714831769466, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.17757828161120415, "rewards/format_reward": 0.229166679084301, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 1635.0416870117188, "epoch": 0.5809768637532133, "grad_norm": 0.3503028452396393, "kl": 0.002582550048828125, "learning_rate": 3.555614130391079e-07, "loss": 0.0312, "reward": 0.11836901400238276, "reward_std": 0.6972107961773872, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.25987397134304047, "rewards/format_reward": 0.18750000558793545, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 1518.3125305175781, "epoch": 0.5826906598114824, "grad_norm": 0.2905596196651459, "kl": 0.00196075439453125, "learning_rate": 3.5273298394491515e-07, "loss": 0.0276, "reward": -0.018636616878211498, "reward_std": 0.44363710284233093, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2682676538825035, "rewards/format_reward": 0.229166679084301, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1431.5833740234375, "epoch": 0.5844044558697515, "grad_norm": 0.38920703530311584, "kl": 0.0020751953125, "learning_rate": 3.4991416936678276e-07, "loss": -0.081, "reward": 0.17504378408193588, "reward_std": 0.44530437886714935, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.24066567048430443, "rewards/format_reward": 0.2604166716337204, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 1588.0833740234375, "epoch": 0.5861182519280206, "grad_norm": 0.2563221752643585, "kl": 0.0012540817260742188, "learning_rate": 3.471051066897562e-07, "loss": -0.0366, "reward": 0.24711184948682785, "reward_std": 0.6161708235740662, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.24378404393792152, "rewards/format_reward": 0.2708333432674408, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 1347.375015258789, "epoch": 0.5878320479862896, "grad_norm": 0.3236555755138397, "kl": 0.002685546875, "learning_rate": 3.4430593282358777e-07, "loss": -0.0737, "reward": 0.8499315101653337, "reward_std": 0.7836231589317322, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.1686049527488649, "rewards/format_reward": 0.4166666865348816, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 1482.1667175292969, "epoch": 0.5895458440445587, "grad_norm": 0.3159193694591522, "kl": 0.002262115478515625, "learning_rate": 3.4151678419606233e-07, "loss": 0.0313, "reward": 0.20483139622956514, "reward_std": 0.6143444329500198, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.23227821104228497, "rewards/format_reward": 0.2604166744276881, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1510.5625610351562, "epoch": 0.5912596401028277, "grad_norm": 0.2599591016769409, "kl": 0.0024280548095703125, "learning_rate": 3.387377967463493e-07, "loss": -0.1187, "reward": 0.49281424283981323, "reward_std": 0.833685964345932, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2351253591477871, "rewards/format_reward": 0.3437500111758709, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 1518.3125, "epoch": 0.5929734361610969, "grad_norm": 0.2566784620285034, "kl": 0.0026597976684570312, "learning_rate": 3.359691059183761e-07, "loss": 0.0576, "reward": 0.10886371694505215, "reward_std": 0.6810979098081589, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.25465162843465805, "rewards/format_reward": 0.22916666977107525, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 1464.5834045410156, "epoch": 0.5946872322193659, "grad_norm": 0.3026113212108612, "kl": 0.001850128173828125, "learning_rate": 3.3321084665422803e-07, "loss": -0.0558, "reward": 0.26063048280775547, "reward_std": 0.5907115042209625, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.26359253376722336, "rewards/format_reward": 0.3437500149011612, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 1412.1875457763672, "epoch": 0.596401028277635, "grad_norm": 0.34688422083854675, "kl": 0.00620269775390625, "learning_rate": 3.3046315338757026e-07, "loss": -0.0117, "reward": 0.12697511911392212, "reward_std": 0.4296252429485321, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2743792124092579, "rewards/format_reward": 0.28125000558793545, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 1557.2500305175781, "epoch": 0.598114824335904, "grad_norm": 0.2776733636856079, "kl": 0.00171661376953125, "learning_rate": 3.2772616003709616e-07, "loss": -0.0037, "reward": 0.3554678609070834, "reward_std": 0.9817065000534058, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.23372411355376244, "rewards/format_reward": 0.20833333861082792, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1624.9791870117188, "epoch": 0.5998286203941731, "grad_norm": 0.23979856073856354, "kl": 0.0014801025390625, "learning_rate": 3.250000000000001e-07, "loss": -0.0343, "reward": 0.3130597583949566, "reward_std": 1.1155104637145996, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.24837487936019897, "rewards/format_reward": 0.2291666716337204, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 1587.0625305175781, "epoch": 0.6015424164524421, "grad_norm": 0.2794056832790375, "kl": 0.0021200180053710938, "learning_rate": 3.222848061454764e-07, "loss": -0.0059, "reward": 0.17041855677962303, "reward_std": 0.7474512904882431, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2524133399128914, "rewards/format_reward": 0.20833333767950535, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 1703.5000305175781, "epoch": 0.6032562125107113, "grad_norm": 0.20891119539737701, "kl": 0.00339508056640625, "learning_rate": 3.195807108082429e-07, "loss": 0.032, "reward": -0.17395459115505219, "reward_std": 0.34111763164401054, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.25343700870871544, "rewards/format_reward": 0.08333333674818277, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 1624.5625305175781, "epoch": 0.6049700085689803, "grad_norm": 0.2826477885246277, "kl": 0.001903533935546875, "learning_rate": 3.168878457820915e-07, "loss": 0.0494, "reward": 0.03589773364365101, "reward_std": 0.6567730531096458, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.25958870351314545, "rewards/format_reward": 0.1875000037252903, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 1421.4375305175781, "epoch": 0.6066838046272494, "grad_norm": 0.2641639709472656, "kl": 0.002178192138671875, "learning_rate": 3.142063423134644e-07, "loss": 0.0997, "reward": 1.1788862720131874, "reward_std": 0.7796865254640579, "rewards/accuracy_reward": 0.12500000186264515, "rewards/cosine_scaled_reward": -0.1469529727473855, "rewards/format_reward": 0.4270833386108279, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 1403.5208740234375, "epoch": 0.6083976006855184, "grad_norm": 0.3317899703979492, "kl": 0.002410888671875, "learning_rate": 3.115363310950578e-07, "loss": -0.0302, "reward": 0.48206552490592003, "reward_std": 0.5036356523633003, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.2182038016617298, "rewards/format_reward": 0.2500000111758709, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 1523.6250305175781, "epoch": 0.6101113967437874, "grad_norm": 0.36468636989593506, "kl": 0.0030536651611328125, "learning_rate": 3.0887794225945143e-07, "loss": -0.077, "reward": 0.31238503009080887, "reward_std": 0.8116755038499832, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.24230190180242062, "rewards/format_reward": 0.2604166716337204, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 1531.3333740234375, "epoch": 0.6118251928020566, "grad_norm": 0.31790032982826233, "kl": 0.00177764892578125, "learning_rate": 3.062313053727671e-07, "loss": -0.0039, "reward": 0.07550723478198051, "reward_std": 0.39915400743484497, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2813524901866913, "rewards/format_reward": 0.2916666716337204, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 1640.6041870117188, "epoch": 0.6135389888603257, "grad_norm": 0.2601807415485382, "kl": 0.0016021728515625, "learning_rate": 3.0359654942835247e-07, "loss": -0.0006, "reward": 0.28700127080082893, "reward_std": 1.0524575859308243, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.23822485655546188, "rewards/format_reward": 0.17708333767950535, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1546.6250305175781, "epoch": 0.6152527849185947, "grad_norm": 0.3116397559642792, "kl": 0.003833770751953125, "learning_rate": 3.0097380284049523e-07, "loss": 0.0389, "reward": 0.19815253466367722, "reward_std": 0.7473566234111786, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.23686950653791428, "rewards/format_reward": 0.18750000186264515, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 1337.1875305175781, "epoch": 0.6169665809768637, "grad_norm": 0.2909329831600189, "kl": 0.0030670166015625, "learning_rate": 2.9836319343816397e-07, "loss": -0.0459, "reward": 0.3552706688642502, "reward_std": 0.7695464119315147, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.25929973274469376, "rewards/format_reward": 0.2500000027939677, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1522.6667175292969, "epoch": 0.6186803770351328, "grad_norm": 0.2280290722846985, "kl": 0.002048492431640625, "learning_rate": 2.9576484845877793e-07, "loss": 0.0055, "reward": 0.6520526558160782, "reward_std": 1.0060482174158096, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.22033316269516945, "rewards/format_reward": 0.2916666716337204, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 1703.8125305175781, "epoch": 0.6203941730934018, "grad_norm": 0.3011070191860199, "kl": 0.001598358154296875, "learning_rate": 2.931788945420058e-07, "loss": -0.0108, "reward": -0.06266473978757858, "reward_std": 0.5431503728032112, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2694770060479641, "rewards/format_reward": 0.11458333674818277, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 1513.3125305175781, "epoch": 0.622107969151671, "grad_norm": 0.3134266436100006, "kl": 0.0020580291748046875, "learning_rate": 2.9060545772359305e-07, "loss": 0.038, "reward": 0.2682766281068325, "reward_std": 0.7535002008080482, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.19206186942756176, "rewards/format_reward": 0.18750000558793545, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 1448.0417175292969, "epoch": 0.62382176520994, "grad_norm": 0.3311283588409424, "kl": 0.0020236968994140625, "learning_rate": 2.8804466342921987e-07, "loss": -0.0098, "reward": 0.15795885026454926, "reward_std": 0.30675796419382095, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1892208829522133, "rewards/format_reward": 0.2500000074505806, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1605.1667175292969, "epoch": 0.6255355612682091, "grad_norm": 0.36332038044929504, "kl": 0.0021076202392578125, "learning_rate": 2.854966364683872e-07, "loss": 0.0331, "reward": 0.1046671625226736, "reward_std": 0.5455575659871101, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.20939945615828037, "rewards/format_reward": 0.15625000838190317, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 1586.9791870117188, "epoch": 0.6272493573264781, "grad_norm": 0.29355624318122864, "kl": 0.009174346923828125, "learning_rate": 2.829615010283344e-07, "loss": -0.027, "reward": 0.5508327009156346, "reward_std": 0.434384074062109, "rewards/accuracy_reward": 0.10416666977107525, "rewards/cosine_scaled_reward": -0.08403514325618744, "rewards/format_reward": 0.1875000111758709, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 1256.0625305175781, "epoch": 0.6289631533847472, "grad_norm": 0.29369834065437317, "kl": 0.003662109375, "learning_rate": 2.8043938066798645e-07, "loss": 0.0073, "reward": 0.6286521656438708, "reward_std": 0.9221112877130508, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.20299134403467178, "rewards/format_reward": 0.3750000074505806, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 1618.8125610351562, "epoch": 0.6306769494430163, "grad_norm": 0.27679574489593506, "kl": 0.0018301010131835938, "learning_rate": 2.7793039831193133e-07, "loss": 0.0365, "reward": 0.1682092708069831, "reward_std": 0.6755241900682449, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.24134992808103561, "rewards/format_reward": 0.16666667349636555, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 1345.3542175292969, "epoch": 0.6323907455012854, "grad_norm": 0.2981966733932495, "kl": 0.0034046173095703125, "learning_rate": 2.7543467624442956e-07, "loss": -0.0287, "reward": 1.1676701717078686, "reward_std": 1.2293783277273178, "rewards/accuracy_reward": 0.12500000093132257, "rewards/cosine_scaled_reward": -0.10941352834925056, "rewards/format_reward": 0.35416667722165585, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 1419.2292022705078, "epoch": 0.6341045415595544, "grad_norm": 0.2704571485519409, "kl": 0.0016345977783203125, "learning_rate": 2.729523361034538e-07, "loss": 0.0931, "reward": 0.4965338706970215, "reward_std": 0.559646487236023, "rewards/accuracy_reward": 0.0520833358168602, "rewards/cosine_scaled_reward": -0.2247798889875412, "rewards/format_reward": 0.3125000074505806, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 1411.8541870117188, "epoch": 0.6358183376178235, "grad_norm": 0.3237265646457672, "kl": 0.0018434524536132812, "learning_rate": 2.7048349887476037e-07, "loss": 0.0369, "reward": 0.6508457884192467, "reward_std": 0.8934041857719421, "rewards/accuracy_reward": 0.06250000093132257, "rewards/cosine_scaled_reward": -0.21806739643216133, "rewards/format_reward": 0.3333333395421505, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 1625.5208740234375, "epoch": 0.6375321336760925, "grad_norm": 0.24745488166809082, "kl": 0.002635955810546875, "learning_rate": 2.6802828488599294e-07, "loss": -0.0212, "reward": 0.07880330551415682, "reward_std": 0.5076464600861073, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.26656462997198105, "rewards/format_reward": 0.2187500074505806, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 1609.5416870117188, "epoch": 0.6392459297343616, "grad_norm": 0.40871137380599976, "kl": 0.0014896392822265625, "learning_rate": 2.655868138008171e-07, "loss": -0.0124, "reward": 0.21406953036785126, "reward_std": 0.6213681772351265, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.30080828070640564, "rewards/format_reward": 0.2812500111758709, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 1501.5000305175781, "epoch": 0.6409597257926307, "grad_norm": 0.2766628563404083, "kl": 0.002811431884765625, "learning_rate": 2.631592046130896e-07, "loss": -0.0421, "reward": 0.3906727936118841, "reward_std": 0.7104802057147026, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.2238925527781248, "rewards/format_reward": 0.2500000111758709, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 1480.0625305175781, "epoch": 0.6426735218508998, "grad_norm": 0.3094847500324249, "kl": 0.0052032470703125, "learning_rate": 2.6074557564105724e-07, "loss": -0.0191, "reward": 0.15879609622061253, "reward_std": 0.5000172480940819, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2149198092520237, "rewards/format_reward": 0.2291666716337204, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 1327.0208740234375, "epoch": 0.6443873179091688, "grad_norm": 0.2825866639614105, "kl": 0.00589752197265625, "learning_rate": 2.583460445215911e-07, "loss": -0.0239, "reward": 0.5309350336901844, "reward_std": 0.8217562958598137, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.2182630728930235, "rewards/format_reward": 0.2708333432674408, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 1620.8333740234375, "epoch": 0.6461011139674379, "grad_norm": 0.2529444396495819, "kl": 0.001277923583984375, "learning_rate": 2.5596072820445254e-07, "loss": -0.0177, "reward": 0.24563428107649088, "reward_std": 0.8564099371433258, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2212253138422966, "rewards/format_reward": 0.2187500074505806, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 1585.9791870117188, "epoch": 0.6478149100257069, "grad_norm": 0.2812616527080536, "kl": 0.002666473388671875, "learning_rate": 2.5358974294659373e-07, "loss": -0.0114, "reward": 0.07188674993813038, "reward_std": 0.8066864013671875, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.24227309226989746, "rewards/format_reward": 0.14583333861082792, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 1236.2917175292969, "epoch": 0.6495287060839761, "grad_norm": 0.5107555985450745, "kl": 0.0024261474609375, "learning_rate": 2.512332043064913e-07, "loss": -0.1417, "reward": 0.7737401723861694, "reward_std": 1.0626971647143364, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.20454938523471355, "rewards/format_reward": 0.4583333358168602, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 1412.0208740234375, "epoch": 0.6512425021422451, "grad_norm": 0.5260189175605774, "kl": 0.00237274169921875, "learning_rate": 2.488912271385139e-07, "loss": -0.0082, "reward": 0.11665789876133204, "reward_std": 0.7751671895384789, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.29444097727537155, "rewards/format_reward": 0.2500000111758709, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 1650.5833740234375, "epoch": 0.6529562982005142, "grad_norm": 0.2257082313299179, "kl": 0.002079010009765625, "learning_rate": 2.465639255873246e-07, "loss": -0.002, "reward": -0.17455148696899414, "reward_std": 0.3701091632246971, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2487034685909748, "rewards/format_reward": 0.09375000186264515, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 1481.2083435058594, "epoch": 0.6546700942587832, "grad_norm": 0.30999627709388733, "kl": 0.0020589828491210938, "learning_rate": 2.4425141308231765e-07, "loss": -0.0818, "reward": 0.39649971574544907, "reward_std": 0.8797931224107742, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.23154840990900993, "rewards/format_reward": 0.20833333395421505, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 1477.8125305175781, "epoch": 0.6563838903170522, "grad_norm": 0.29673850536346436, "kl": 0.00428009033203125, "learning_rate": 2.4195380233209006e-07, "loss": -0.0766, "reward": 0.19455857016146183, "reward_std": 0.541585698723793, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.26186586171388626, "rewards/format_reward": 0.2708333358168602, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 1682.9583435058594, "epoch": 0.6580976863753213, "grad_norm": 0.22542622685432434, "kl": 0.0011224746704101562, "learning_rate": 2.3967120531894857e-07, "loss": 0.0513, "reward": -0.1359546771273017, "reward_std": 0.35826287791132927, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2534148283302784, "rewards/format_reward": 0.0937500037252903, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 1484.8541870117188, "epoch": 0.6598114824335904, "grad_norm": 0.27959033846855164, "kl": 0.0029392242431640625, "learning_rate": 2.374037332934512e-07, "loss": -0.098, "reward": -0.02571732923388481, "reward_std": 0.44074755907058716, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2364521324634552, "rewards/format_reward": 0.19791666977107525, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 1488.3958435058594, "epoch": 0.6615252784918595, "grad_norm": 0.2378430962562561, "kl": 0.0012645721435546875, "learning_rate": 2.3515149676898552e-07, "loss": -0.039, "reward": 0.07076568156480789, "reward_std": 0.43841054663062096, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.24396153166890144, "rewards/format_reward": 0.21875, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1451.5208740234375, "epoch": 0.6632390745501285, "grad_norm": 0.3240017294883728, "kl": 0.0023651123046875, "learning_rate": 2.3291460551638237e-07, "loss": 0.0978, "reward": 0.6527014970779419, "reward_std": 0.5769590213894844, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": -0.21393180266022682, "rewards/format_reward": 0.21875001024454832, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1418.1042175292969, "epoch": 0.6649528706083976, "grad_norm": 0.2752251625061035, "kl": 0.0028781890869140625, "learning_rate": 2.306931685585657e-07, "loss": -0.0529, "reward": 0.8440244849771261, "reward_std": 0.6565186530351639, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.2019278071820736, "rewards/format_reward": 0.3854166753590107, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1421.5208740234375, "epoch": 0.6666666666666666, "grad_norm": 0.27702751755714417, "kl": 0.0026226043701171875, "learning_rate": 2.2848729416523859e-07, "loss": -0.0539, "reward": 0.8581810109317303, "reward_std": 0.5204750150442123, "rewards/accuracy_reward": 0.1145833358168602, "rewards/cosine_scaled_reward": -0.09280841797590256, "rewards/format_reward": 0.2916666744276881, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 1378.8541870117188, "epoch": 0.6683804627249358, "grad_norm": 0.27244675159454346, "kl": 0.0025920867919921875, "learning_rate": 2.2629708984760706e-07, "loss": -0.0489, "reward": 0.8850560709834099, "reward_std": 1.1492140740156174, "rewards/accuracy_reward": 0.08333333395421505, "rewards/cosine_scaled_reward": -0.20179678685963154, "rewards/format_reward": 0.4270833432674408, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1637.8542175292969, "epoch": 0.6700942587832048, "grad_norm": 0.27187544107437134, "kl": 0.0049285888671875, "learning_rate": 2.2412266235313973e-07, "loss": 0.0146, "reward": -0.018726075068116188, "reward_std": 0.6156428083777428, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.274042472243309, "rewards/format_reward": 0.13541666977107525, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 1484.9166717529297, "epoch": 0.6718080548414739, "grad_norm": 0.34059178829193115, "kl": 0.002033233642578125, "learning_rate": 2.2196411766036487e-07, "loss": -0.0722, "reward": 0.4430566392838955, "reward_std": 0.6922470554709435, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.21358026936650276, "rewards/format_reward": 0.28125000558793545, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 1628.9166870117188, "epoch": 0.6735218508997429, "grad_norm": 0.23434066772460938, "kl": 0.00211334228515625, "learning_rate": 2.1982156097370557e-07, "loss": 0.0105, "reward": -0.09393579512834549, "reward_std": 0.3237507604062557, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.33932163566350937, "rewards/format_reward": 0.1250000074505806, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1315.3333740234375, "epoch": 0.675235646958012, "grad_norm": 0.29491373896598816, "kl": 0.0037288665771484375, "learning_rate": 2.1769509671835223e-07, "loss": -0.0701, "reward": 0.6508196443319321, "reward_std": 0.8796055167913437, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.19033638876862824, "rewards/format_reward": 0.3645833432674408, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 1452.7916870117188, "epoch": 0.676949443016281, "grad_norm": 0.27866828441619873, "kl": 0.0017375946044921875, "learning_rate": 2.1558482853517253e-07, "loss": 0.0756, "reward": 0.7415898907929659, "reward_std": 0.6144783869385719, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.16752544231712818, "rewards/format_reward": 0.3125, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 1655.9166870117188, "epoch": 0.6786632390745502, "grad_norm": 0.19680996239185333, "kl": 0.0015344619750976562, "learning_rate": 2.134908592756607e-07, "loss": 0.0498, "reward": 0.08432755898684263, "reward_std": 0.5908685103058815, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2236979342997074, "rewards/format_reward": 0.11458333674818277, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 1467.8125305175781, "epoch": 0.6803770351328192, "grad_norm": 0.2637212574481964, "kl": 0.0015411376953125, "learning_rate": 2.1141329099692406e-07, "loss": -0.0323, "reward": 0.3956471234560013, "reward_std": 0.8686472177505493, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.22639739885926247, "rewards/format_reward": 0.3020833358168602, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 1616.0209045410156, "epoch": 0.6820908311910883, "grad_norm": 0.2836722731590271, "kl": 0.0020313262939453125, "learning_rate": 2.0935222495670968e-07, "loss": 0.0571, "reward": 0.5134015195071697, "reward_std": 0.7734205052256584, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.1734911259263754, "rewards/format_reward": 0.16666666977107525, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 1232.1666870117188, "epoch": 0.6838046272493573, "grad_norm": 0.3226679861545563, "kl": 0.004119873046875, "learning_rate": 2.0730776160846853e-07, "loss": -0.0493, "reward": 1.2915575206279755, "reward_std": 1.2753455489873886, "rewards/accuracy_reward": 0.13541667349636555, "rewards/cosine_scaled_reward": -0.15988415479660034, "rewards/format_reward": 0.5208333507180214, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 1649.1667175292969, "epoch": 0.6855184233076264, "grad_norm": 0.2472459226846695, "kl": 0.001773834228515625, "learning_rate": 2.0528000059645995e-07, "loss": -0.0103, "reward": 0.06294973194599152, "reward_std": 0.6322297528386116, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2695380821824074, "rewards/format_reward": 0.15625000279396772, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 1584.5833740234375, "epoch": 0.6872322193658955, "grad_norm": 0.28003522753715515, "kl": 0.002376556396484375, "learning_rate": 2.032690407508949e-07, "loss": 0.0172, "reward": 0.29370687156915665, "reward_std": 0.8225355744361877, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2329692356288433, "rewards/format_reward": 0.20833333767950535, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 1528.4583435058594, "epoch": 0.6889460154241646, "grad_norm": 0.2969180643558502, "kl": 0.0026397705078125, "learning_rate": 2.0127498008311922e-07, "loss": -0.0051, "reward": -0.05280027166008949, "reward_std": 0.3903113231062889, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2595442086458206, "rewards/format_reward": 0.1770833358168602, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 1510.8333740234375, "epoch": 0.6906598114824336, "grad_norm": 0.26062414050102234, "kl": 0.0029163360595703125, "learning_rate": 1.9929791578083655e-07, "loss": -0.0638, "reward": 0.7391143590211868, "reward_std": 0.758337527513504, "rewards/accuracy_reward": 0.12500000279396772, "rewards/cosine_scaled_reward": -0.13690827786922455, "rewards/format_reward": 0.2500000037252903, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 1358.2917175292969, "epoch": 0.6923736075407027, "grad_norm": 0.3119449317455292, "kl": 0.002368927001953125, "learning_rate": 1.9733794420337213e-07, "loss": -0.0557, "reward": 0.17309055104851723, "reward_std": 0.5400930494070053, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.28718583285808563, "rewards/format_reward": 0.3541666679084301, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 1629.0208740234375, "epoch": 0.6940874035989717, "grad_norm": 0.23889358341693878, "kl": 0.0018787384033203125, "learning_rate": 1.9539516087697517e-07, "loss": 0.0018, "reward": 0.11960358917713165, "reward_std": 0.7777920588850975, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.278524037450552, "rewards/format_reward": 0.14583334140479565, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 1247.7083587646484, "epoch": 0.6958011996572407, "grad_norm": 0.36414635181427, "kl": 0.003208160400390625, "learning_rate": 1.934696604901642e-07, "loss": 0.0489, "reward": 0.6195550169795752, "reward_std": 0.7279501482844353, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.23867519572377205, "rewards/format_reward": 0.479166679084301, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 1667.6458435058594, "epoch": 0.6975149957155099, "grad_norm": 0.250451922416687, "kl": 0.00220489501953125, "learning_rate": 1.915615368891117e-07, "loss": 0.0308, "reward": 0.20595107227563858, "reward_std": 0.48369817435741425, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.2192402444779873, "rewards/format_reward": 0.1354166716337204, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 1319.9584045410156, "epoch": 0.699228791773779, "grad_norm": 0.27845609188079834, "kl": 0.0021209716796875, "learning_rate": 1.8967088307307e-07, "loss": 0.1217, "reward": 1.5674302279949188, "reward_std": 1.2716419026255608, "rewards/accuracy_reward": 0.16666667349636555, "rewards/cosine_scaled_reward": -0.0756095617543906, "rewards/format_reward": 0.3958333507180214, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 1454.5417175292969, "epoch": 0.700942587832048, "grad_norm": 0.3217247724533081, "kl": 0.004810333251953125, "learning_rate": 1.8779779118983867e-07, "loss": 0.1388, "reward": 0.5030364524573088, "reward_std": 0.9113668613135815, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.20980912074446678, "rewards/format_reward": 0.2500000074505806, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 1603.5625305175781, "epoch": 0.702656383890317, "grad_norm": 0.32288092374801636, "kl": 0.002399444580078125, "learning_rate": 1.8594235253127372e-07, "loss": 0.0354, "reward": 0.26620784075930715, "reward_std": 0.5254248827695847, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.20196618512272835, "rewards/format_reward": 0.20833334513008595, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 1488.3541870117188, "epoch": 0.7043701799485861, "grad_norm": 0.28620922565460205, "kl": 0.0020618438720703125, "learning_rate": 1.8410465752883758e-07, "loss": 0.0085, "reward": 0.3866404742002487, "reward_std": 0.544437862932682, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.20075462386012077, "rewards/format_reward": 0.3229166744276881, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 1503.9792175292969, "epoch": 0.7060839760068551, "grad_norm": 0.2854917347431183, "kl": 0.002117156982421875, "learning_rate": 1.822847957491922e-07, "loss": -0.0722, "reward": 0.17356615141034126, "reward_std": 0.5067817345261574, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2639676071703434, "rewards/format_reward": 0.3020833432674408, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 1527.3958435058594, "epoch": 0.7077977720651243, "grad_norm": 0.2824840843677521, "kl": 0.0019626617431640625, "learning_rate": 1.804828558898332e-07, "loss": 0.1088, "reward": 0.22076449799351394, "reward_std": 0.5867825299501419, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.22575581818819046, "rewards/format_reward": 0.2187500074505806, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 1580.7708740234375, "epoch": 0.7095115681233933, "grad_norm": 0.27595776319503784, "kl": 0.0015163421630859375, "learning_rate": 1.7869892577476722e-07, "loss": -0.0465, "reward": 0.38086844608187675, "reward_std": 0.8873961195349693, "rewards/accuracy_reward": 0.0520833358168602, "rewards/cosine_scaled_reward": -0.21333472314290702, "rewards/format_reward": 0.19791666977107525, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 1409.0000610351562, "epoch": 0.7112253641816624, "grad_norm": 0.28819894790649414, "kl": 0.0022373199462890625, "learning_rate": 1.7693309235023127e-07, "loss": -0.129, "reward": 0.3180142305791378, "reward_std": 0.5801991075277328, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.23581846058368683, "rewards/format_reward": 0.36458334885537624, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 1495.2500305175781, "epoch": 0.7129391602399314, "grad_norm": 0.375191867351532, "kl": 0.003185272216796875, "learning_rate": 1.7518544168045524e-07, "loss": -0.0128, "reward": 0.6192193161696196, "reward_std": 0.7108025848865509, "rewards/accuracy_reward": 0.08333333488553762, "rewards/cosine_scaled_reward": -0.1999379526823759, "rewards/format_reward": 0.22916666977107525, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 1419.6041870117188, "epoch": 0.7146529562982005, "grad_norm": 0.30786266922950745, "kl": 0.0026340484619140625, "learning_rate": 1.7345605894346726e-07, "loss": 0.0226, "reward": 0.545205045491457, "reward_std": 0.9132278263568878, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.1991914976388216, "rewards/format_reward": 0.30208334140479565, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 1369.125015258789, "epoch": 0.7163667523564696, "grad_norm": 0.34540000557899475, "kl": 0.0025634765625, "learning_rate": 1.7174502842694212e-07, "loss": 0.0079, "reward": 0.08001699112355709, "reward_std": 0.37368160486221313, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2843298986554146, "rewards/format_reward": 0.2916666753590107, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 1387.4583740234375, "epoch": 0.7180805484147387, "grad_norm": 0.339304119348526, "kl": 0.011465072631835938, "learning_rate": 1.7005243352409333e-07, "loss": -0.0638, "reward": 0.5061864908784628, "reward_std": 0.7231926918029785, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.18234312906861305, "rewards/format_reward": 0.3020833469927311, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 1450.6041870117188, "epoch": 0.7197943444730077, "grad_norm": 0.28949928283691406, "kl": 0.0025787353515625, "learning_rate": 1.6837835672960831e-07, "loss": -0.0293, "reward": 1.1634457781910896, "reward_std": 0.8921896517276764, "rewards/accuracy_reward": 0.11458333861082792, "rewards/cosine_scaled_reward": -0.108680484816432, "rewards/format_reward": 0.3333333469927311, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 1393.7709045410156, "epoch": 0.7215081405312768, "grad_norm": 0.33998945355415344, "kl": 0.0024509429931640625, "learning_rate": 1.6672287963562852e-07, "loss": 0.0491, "reward": 0.7930726930499077, "reward_std": 0.9040541276335716, "rewards/accuracy_reward": 0.06250000093132257, "rewards/cosine_scaled_reward": -0.15836179442703724, "rewards/format_reward": 0.322916679084301, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 1633.2916870117188, "epoch": 0.7232219365895458, "grad_norm": 0.23244516551494598, "kl": 0.001995086669921875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0037, "reward": 0.10263706487603486, "reward_std": 0.6201393157243729, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2293078862130642, "rewards/format_reward": 0.17708334047347307, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 1548.1250305175781, "epoch": 0.7249357326478149, "grad_norm": 0.25799766182899475, "kl": 0.0018787384033203125, "learning_rate": 1.6346804638120098e-07, "loss": -0.0096, "reward": 0.36351461336016655, "reward_std": 0.7988380044698715, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.18038328737020493, "rewards/format_reward": 0.25000001303851604, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 1579.3958740234375, "epoch": 0.726649528706084, "grad_norm": 0.261885404586792, "kl": 0.0017213821411132812, "learning_rate": 1.6186884885673413e-07, "loss": -0.0009, "reward": 0.202106274664402, "reward_std": 0.7989566773176193, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.19235052540898323, "rewards/format_reward": 0.15625000558793545, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 1439.7291870117188, "epoch": 0.7283633247643531, "grad_norm": 0.2712852656841278, "kl": 0.006488800048828125, "learning_rate": 1.6028856829700258e-07, "loss": 0.0239, "reward": 0.5689312815666199, "reward_std": 0.7213777899742126, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.18050884827971458, "rewards/format_reward": 0.21875, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 1277.6250305175781, "epoch": 0.7300771208226221, "grad_norm": 0.49752435088157654, "kl": 0.0027599334716796875, "learning_rate": 1.5872728172265146e-07, "loss": 0.076, "reward": 1.1127668991684914, "reward_std": 0.5749448239803314, "rewards/accuracy_reward": 0.13541667349636555, "rewards/cosine_scaled_reward": -0.14212442096322775, "rewards/format_reward": 0.3020833386108279, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 1445.0, "epoch": 0.7317909168808912, "grad_norm": 0.25714749097824097, "kl": 0.002696990966796875, "learning_rate": 1.5718506522858572e-07, "loss": -0.0134, "reward": 0.10718778986483812, "reward_std": 0.6740274652838707, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2864115312695503, "rewards/format_reward": 0.2187500074505806, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1624.6666870117188, "epoch": 0.7335047129391602, "grad_norm": 0.2530677318572998, "kl": 0.0018749237060546875, "learning_rate": 1.5566199398026147e-07, "loss": 0.022, "reward": 0.5430182013660669, "reward_std": 0.6560900881886482, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.2140037715435028, "rewards/format_reward": 0.2604166716337204, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 1622.2291870117188, "epoch": 0.7352185089974294, "grad_norm": 0.2503276765346527, "kl": 0.002094268798828125, "learning_rate": 1.5415814221002265e-07, "loss": 0.0301, "reward": -0.06853180006146431, "reward_std": 0.37937263399362564, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2777785621583462, "rewards/format_reward": 0.16666666697710752, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 1544.3750305175781, "epoch": 0.7369323050556984, "grad_norm": 0.250139981508255, "kl": 0.0017986297607421875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0141, "reward": 0.6108243186026812, "reward_std": 0.9471312090754509, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.17712798714637756, "rewards/format_reward": 0.20833334140479565, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 1434.1041870117188, "epoch": 0.7386461011139674, "grad_norm": 0.28267359733581543, "kl": 0.0019893646240234375, "learning_rate": 1.5120838934595337e-07, "loss": 0.0376, "reward": 0.6465346980839968, "reward_std": 1.2121013402938843, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.17127413116395473, "rewards/format_reward": 0.3437500074505806, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 1608.9167175292969, "epoch": 0.7403598971722365, "grad_norm": 0.26306575536727905, "kl": 0.0012712478637695312, "learning_rate": 1.4976263201891613e-07, "loss": 0.0194, "reward": 0.3465429022908211, "reward_std": 0.582036554813385, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.2412100676447153, "rewards/format_reward": 0.14583333395421505, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 1531.3750305175781, "epoch": 0.7420736932305055, "grad_norm": 0.34143468737602234, "kl": 0.0018463134765625, "learning_rate": 1.483363816965435e-07, "loss": 0.0579, "reward": 0.4884794354438782, "reward_std": 0.42937102913856506, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.13383641093969345, "rewards/format_reward": 0.2604166669771075, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 1432.6250610351562, "epoch": 0.7437874892887746, "grad_norm": 0.38804128766059875, "kl": 0.0039615631103515625, "learning_rate": 1.469297078922642e-07, "loss": -0.0035, "reward": 0.42194677516818047, "reward_std": 0.550103060901165, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.19477596133947372, "rewards/format_reward": 0.3333333358168602, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 1563.6458435058594, "epoch": 0.7455012853470437, "grad_norm": 0.29043349623680115, "kl": 0.001499176025390625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0231, "reward": 0.5839780867099762, "reward_std": 0.7055636867880821, "rewards/accuracy_reward": 0.06250000279396772, "rewards/cosine_scaled_reward": -0.18507066927850246, "rewards/format_reward": 0.2500000037252903, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 1419.2917175292969, "epoch": 0.7472150814053128, "grad_norm": 0.27879598736763, "kl": 0.004342079162597656, "learning_rate": 1.4417536311769885e-07, "loss": 0.0289, "reward": 0.8868619427084923, "reward_std": 0.8792257979512215, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": -0.1412948245415464, "rewards/format_reward": 0.34375000558793545, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1589.0208435058594, "epoch": 0.7489288774635818, "grad_norm": 0.2888442873954773, "kl": 0.0017032623291015625, "learning_rate": 1.4282782639029128e-07, "loss": 0.0339, "reward": 0.1717052087187767, "reward_std": 0.6946492567658424, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2423543520271778, "rewards/format_reward": 0.2604166716337204, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 1409.3958740234375, "epoch": 0.7506426735218509, "grad_norm": 0.2719264328479767, "kl": 0.002471923828125, "learning_rate": 1.4150013466019114e-07, "loss": -0.0168, "reward": 0.9626978933811188, "reward_std": 0.7359797842800617, "rewards/accuracy_reward": 0.12500000186264515, "rewards/cosine_scaled_reward": -0.166676202788949, "rewards/format_reward": 0.2708333432674408, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 1714.8750305175781, "epoch": 0.7523564695801199, "grad_norm": 0.24937574565410614, "kl": 0.0011339187622070312, "learning_rate": 1.4019235263722034e-07, "loss": 0.0502, "reward": -0.18545041419565678, "reward_std": 0.2978869043290615, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.23207932710647583, "rewards/format_reward": 0.052083334885537624, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1352.2917022705078, "epoch": 0.7540702656383891, "grad_norm": 0.2822205722332001, "kl": 0.0021915435791015625, "learning_rate": 1.3890454406082956e-07, "loss": -0.0195, "reward": 0.6252860352396965, "reward_std": 0.6529619544744492, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.20248195342719555, "rewards/format_reward": 0.3333333423361182, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 1641.6875305175781, "epoch": 0.7557840616966581, "grad_norm": 0.2570679783821106, "kl": 0.001667022705078125, "learning_rate": 1.3763677169699217e-07, "loss": -0.0778, "reward": 0.003832288086414337, "reward_std": 0.5638974718749523, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.25156646594405174, "rewards/format_reward": 0.11458333861082792, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 1419.1666870117188, "epoch": 0.7574978577549272, "grad_norm": 0.2740538716316223, "kl": 0.0019435882568359375, "learning_rate": 1.3638909733514452e-07, "loss": 0.0162, "reward": 1.0481588691473007, "reward_std": 0.7896323874592781, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.13111398927867413, "rewards/format_reward": 0.4687500186264515, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 1333.5417175292969, "epoch": 0.7592116538131962, "grad_norm": 0.3648242950439453, "kl": 0.0024566650390625, "learning_rate": 1.351615817851748e-07, "loss": 0.0397, "reward": 0.47288158535957336, "reward_std": 0.9505643546581268, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2298332378268242, "rewards/format_reward": 0.3125000149011612, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 1656.5208740234375, "epoch": 0.7609254498714653, "grad_norm": 0.2964666783809662, "kl": 0.0021820068359375, "learning_rate": 1.3395428487445914e-07, "loss": 0.0208, "reward": 0.3792496621608734, "reward_std": 0.8810505419969559, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.20247037336230278, "rewards/format_reward": 0.25000000558793545, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 1681.6042175292969, "epoch": 0.7626392459297343, "grad_norm": 0.2914186418056488, "kl": 0.00177001953125, "learning_rate": 1.3276726544494571e-07, "loss": 0.031, "reward": 0.23477049777284265, "reward_std": 0.5860272571444511, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.21530815958976746, "rewards/format_reward": 0.1562500074505806, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1518.3333435058594, "epoch": 0.7643530419880035, "grad_norm": 0.28216373920440674, "kl": 0.001369476318359375, "learning_rate": 1.316005813502869e-07, "loss": 0.0654, "reward": 0.6750286594033241, "reward_std": 0.5412532538175583, "rewards/accuracy_reward": 0.0729166679084301, "rewards/cosine_scaled_reward": -0.15599552262574434, "rewards/format_reward": 0.2708333432674408, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 1428.5208740234375, "epoch": 0.7660668380462725, "grad_norm": 0.30427539348602295, "kl": 0.0028285980224609375, "learning_rate": 1.3045428945301953e-07, "loss": 0.0126, "reward": 0.6193914785981178, "reward_std": 0.9180549010634422, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.19425869919359684, "rewards/format_reward": 0.2604166716337204, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 1671.0208740234375, "epoch": 0.7677806341045416, "grad_norm": 0.2518438398838043, "kl": 0.0018138885498046875, "learning_rate": 1.2932844562179352e-07, "loss": 0.003, "reward": 0.3041791720315814, "reward_std": 0.8534430414438248, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.2137564681470394, "rewards/format_reward": 0.21875000558793545, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 1607.8541870117188, "epoch": 0.7694944301628106, "grad_norm": 0.2538737952709198, "kl": 0.0041255950927734375, "learning_rate": 1.2822310472864885e-07, "loss": 0.0294, "reward": 0.08448375202715397, "reward_std": 0.47953518480062485, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.2132504116743803, "rewards/format_reward": 0.09375000186264515, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1300.3541870117188, "epoch": 0.7712082262210797, "grad_norm": 0.282230406999588, "kl": 0.0021991729736328125, "learning_rate": 1.2713832064634125e-07, "loss": 0.1019, "reward": 0.7629837021231651, "reward_std": 0.7489147633314133, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.15245907567441463, "rewards/format_reward": 0.3020833358168602, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 1650.8750305175781, "epoch": 0.7729220222793488, "grad_norm": 0.2622946798801422, "kl": 0.001873016357421875, "learning_rate": 1.260741462457165e-07, "loss": -0.016, "reward": -0.12438017874956131, "reward_std": 0.5109615363180637, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.28111129999160767, "rewards/format_reward": 0.125, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 1508.0208740234375, "epoch": 0.7746358183376179, "grad_norm": 0.303348571062088, "kl": 0.003978729248046875, "learning_rate": 1.2503063339313356e-07, "loss": -0.0728, "reward": 0.7896851152181625, "reward_std": 0.9815626963973045, "rewards/accuracy_reward": 0.07291666697710752, "rewards/cosine_scaled_reward": -0.17916808277368546, "rewards/format_reward": 0.3437500149011612, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 1580.4583740234375, "epoch": 0.7763496143958869, "grad_norm": 0.2578333914279938, "kl": 0.002101898193359375, "learning_rate": 1.2400783294793668e-07, "loss": -0.0507, "reward": 0.3240011315792799, "reward_std": 0.6389465481042862, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.17426074109971523, "rewards/format_reward": 0.1041666679084301, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 1233.6250305175781, "epoch": 0.778063410454156, "grad_norm": 0.2885809540748596, "kl": 0.00363922119140625, "learning_rate": 1.2300579475997657e-07, "loss": -0.0462, "reward": 0.8891761973500252, "reward_std": 1.197177767753601, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.19221091642975807, "rewards/format_reward": 0.3645833432674408, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 1532.8542175292969, "epoch": 0.779777206512425, "grad_norm": 0.28073832392692566, "kl": 0.0023908615112304688, "learning_rate": 1.220245676671809e-07, "loss": 0.0047, "reward": 0.23808614909648895, "reward_std": 0.5993708148598671, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.25339581072330475, "rewards/format_reward": 0.2291666716337204, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1470.7917175292969, "epoch": 0.781491002570694, "grad_norm": 0.2677416205406189, "kl": 0.0023288726806640625, "learning_rate": 1.2106419949317388e-07, "loss": 0.0206, "reward": 0.779910197481513, "reward_std": 0.7993754521012306, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.17951993457973003, "rewards/format_reward": 0.3020833395421505, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 1538.8541870117188, "epoch": 0.7832047986289632, "grad_norm": 0.3083825707435608, "kl": 0.0019845962524414062, "learning_rate": 1.2012473704494537e-07, "loss": 0.0633, "reward": 0.3929613158106804, "reward_std": 0.4508157819509506, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.20179006457328796, "rewards/format_reward": 0.14583333674818277, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 1352.1458740234375, "epoch": 0.7849185946872322, "grad_norm": 0.31749284267425537, "kl": 0.00440216064453125, "learning_rate": 1.1920622611056974e-07, "loss": 0.0838, "reward": 0.47300633788108826, "reward_std": 1.0741110667586327, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.22252168133854866, "rewards/format_reward": 0.3020833386108279, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 1414.8333740234375, "epoch": 0.7866323907455013, "grad_norm": 0.24783793091773987, "kl": 0.0043487548828125, "learning_rate": 1.1830871145697412e-07, "loss": -0.0238, "reward": 1.0414647981524467, "reward_std": 0.9714280366897583, "rewards/accuracy_reward": 0.09375000186264515, "rewards/cosine_scaled_reward": -0.12982673710212111, "rewards/format_reward": 0.3229166716337204, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 1585.8334045410156, "epoch": 0.7883461868037703, "grad_norm": 0.2753477394580841, "kl": 0.001529693603515625, "learning_rate": 1.1743223682775649e-07, "loss": -0.0595, "reward": -0.03680283762514591, "reward_std": 0.5208316072821617, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.27455031499266624, "rewards/format_reward": 0.1875000037252903, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 1480.3541870117188, "epoch": 0.7900599828620394, "grad_norm": 0.31823086738586426, "kl": 0.002536773681640625, "learning_rate": 1.1657684494105386e-07, "loss": -0.0441, "reward": 0.308930940926075, "reward_std": 0.4995516464114189, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.18408312648534775, "rewards/format_reward": 0.2812500074505806, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 1555.1041870117188, "epoch": 0.7917737789203085, "grad_norm": 0.27115893363952637, "kl": 0.0023899078369140625, "learning_rate": 1.1574257748745986e-07, "loss": 0.0457, "reward": 0.4185393452644348, "reward_std": 0.8871691823005676, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.22503817081451416, "rewards/format_reward": 0.28125001676380634, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 1492.3542175292969, "epoch": 0.7934875749785776, "grad_norm": 0.24115745723247528, "kl": 0.0012664794921875, "learning_rate": 1.1492947512799328e-07, "loss": 0.0246, "reward": 0.710046922788024, "reward_std": 0.9779789745807648, "rewards/accuracy_reward": 0.07291666977107525, "rewards/cosine_scaled_reward": -0.17298575304448605, "rewards/format_reward": 0.3020833460614085, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 1465.5000305175781, "epoch": 0.7952013710368466, "grad_norm": 0.3306126892566681, "kl": 0.0021991729736328125, "learning_rate": 1.1413757749211602e-07, "loss": -0.0701, "reward": 0.37903094850480556, "reward_std": 0.9197189137339592, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.259335994720459, "rewards/format_reward": 0.2916666753590107, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 1630.3333740234375, "epoch": 0.7969151670951157, "grad_norm": 0.30497926473617554, "kl": 0.00186920166015625, "learning_rate": 1.1336692317580158e-07, "loss": 0.0361, "reward": 0.021877296268939972, "reward_std": 0.6427770256996155, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2640891894698143, "rewards/format_reward": 0.14583333861082792, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 1671.7500305175781, "epoch": 0.7986289631533847, "grad_norm": 0.23499169945716858, "kl": 0.001720428466796875, "learning_rate": 1.1261754973965422e-07, "loss": 0.0375, "reward": -0.016727572306990623, "reward_std": 0.6210906617343426, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.27923891320824623, "rewards/format_reward": 0.12500000279396772, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 1690.8750305175781, "epoch": 0.8003427592116538, "grad_norm": 0.25364887714385986, "kl": 0.0014820098876953125, "learning_rate": 1.1188949370707787e-07, "loss": 0.0486, "reward": -0.025222748517990112, "reward_std": 0.5080794095993042, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.2602507211267948, "rewards/format_reward": 0.10416667070239782, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 1391.8750305175781, "epoch": 0.8020565552699229, "grad_norm": 0.3420558273792267, "kl": 0.0025119781494140625, "learning_rate": 1.1118279056249653e-07, "loss": 0.0859, "reward": 0.5714639741927385, "reward_std": 0.7158600762486458, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.2035766988992691, "rewards/format_reward": 0.28125000186264515, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 1536.4375610351562, "epoch": 0.803770351328192, "grad_norm": 0.2689160108566284, "kl": 0.0021257400512695312, "learning_rate": 1.1049747474962444e-07, "loss": -0.012, "reward": 0.007951326668262482, "reward_std": 0.603579930961132, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.25903622433543205, "rewards/format_reward": 0.18750000279396772, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 1389.8125305175781, "epoch": 0.805484147386461, "grad_norm": 0.2906559109687805, "kl": 0.0021419525146484375, "learning_rate": 1.0983357966978745e-07, "loss": -0.0294, "reward": 0.5438283756375313, "reward_std": 1.0326615944504738, "rewards/accuracy_reward": 0.06250000093132257, "rewards/cosine_scaled_reward": -0.23368197679519653, "rewards/format_reward": 0.3333333507180214, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 1593.9791870117188, "epoch": 0.8071979434447301, "grad_norm": 0.2347281128168106, "kl": 0.0017852783203125, "learning_rate": 1.0919113768029517e-07, "loss": 0.0171, "reward": -0.13019452383741736, "reward_std": 0.42053942382335663, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.24632840603590012, "rewards/format_reward": 0.12500000465661287, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 1487.3125305175781, "epoch": 0.8089117395029991, "grad_norm": 0.28390640020370483, "kl": 0.003276824951171875, "learning_rate": 1.0857018009286381e-07, "loss": 0.0471, "reward": 0.7808316666632891, "reward_std": 0.8974563926458359, "rewards/accuracy_reward": 0.08333333488553762, "rewards/cosine_scaled_reward": -0.22984285652637482, "rewards/format_reward": 0.3645833432674408, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 1613.0208740234375, "epoch": 0.8106255355612683, "grad_norm": 0.2611220180988312, "kl": 0.00237274169921875, "learning_rate": 1.0797073717209013e-07, "loss": 0.0024, "reward": 0.4201845917850733, "reward_std": 0.5787086635828018, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.21403678506612778, "rewards/format_reward": 0.1562500074505806, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 1424.4791717529297, "epoch": 0.8123393316195373, "grad_norm": 0.3259100615978241, "kl": 0.003170013427734375, "learning_rate": 1.0739283813397639e-07, "loss": 0.0046, "reward": 0.9525223551318049, "reward_std": 0.773897759616375, "rewards/accuracy_reward": 0.1041666679084301, "rewards/cosine_scaled_reward": -0.1674602646380663, "rewards/format_reward": 0.32291667722165585, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 1617.4583740234375, "epoch": 0.8140531276778064, "grad_norm": 0.27785417437553406, "kl": 0.002117156982421875, "learning_rate": 1.068365111445064e-07, "loss": 0.002, "reward": -0.034504614770412445, "reward_std": 0.6453158259391785, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.29863275587558746, "rewards/format_reward": 0.1458333395421505, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 1642.0208435058594, "epoch": 0.8157669237360754, "grad_norm": 0.24803964793682098, "kl": 0.0032901763916015625, "learning_rate": 1.063017833182728e-07, "loss": -0.0068, "reward": -0.033661461900919676, "reward_std": 0.5855204090476036, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.27475955337285995, "rewards/format_reward": 0.17708333767950535, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 1489.1875, "epoch": 0.8174807197943444, "grad_norm": 0.22895686328411102, "kl": 0.0014104843139648438, "learning_rate": 1.0578868071715544e-07, "loss": 0.0127, "reward": 0.5462241061031818, "reward_std": 0.38443609699606895, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.23026614636182785, "rewards/format_reward": 0.22916667349636555, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 1260.7708587646484, "epoch": 0.8191945158526135, "grad_norm": 0.36001700162887573, "kl": 0.0029582977294921875, "learning_rate": 1.0529722834905125e-07, "loss": 0.0319, "reward": 0.4286029487848282, "reward_std": 0.7996908128261566, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.20781639032065868, "rewards/format_reward": 0.3020833507180214, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 1482.6458740234375, "epoch": 0.8209083119108826, "grad_norm": 0.28881216049194336, "kl": 0.0024471282958984375, "learning_rate": 1.0482745016665526e-07, "loss": 0.0271, "reward": 0.5074643790721893, "reward_std": 0.696723461151123, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.22283191978931427, "rewards/format_reward": 0.2708333395421505, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 1449.2500305175781, "epoch": 0.8226221079691517, "grad_norm": 0.29771751165390015, "kl": 0.0028934478759765625, "learning_rate": 1.0437936906629334e-07, "loss": -0.049, "reward": 0.3999009942635894, "reward_std": 0.8356301560997963, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.2523646242916584, "rewards/format_reward": 0.2708333358168602, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 1517.0416870117188, "epoch": 0.8243359040274207, "grad_norm": 0.27192792296409607, "kl": 0.0016155242919921875, "learning_rate": 1.0395300688680625e-07, "loss": -0.0018, "reward": 0.2986976206302643, "reward_std": 0.7875053882598877, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.27779635041952133, "rewards/format_reward": 0.3229166716337204, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 1586.8958740234375, "epoch": 0.8260497000856898, "grad_norm": 0.23449064791202545, "kl": 0.00141143798828125, "learning_rate": 1.0354838440848501e-07, "loss": 0.01, "reward": 0.6149339154362679, "reward_std": 0.32066725939512253, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": -0.1879206132143736, "rewards/format_reward": 0.22916666697710752, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 1352.1458740234375, "epoch": 0.8277634961439588, "grad_norm": 0.25975510478019714, "kl": 0.002593994140625, "learning_rate": 1.0316552135205837e-07, "loss": 0.0456, "reward": 0.8296075742691755, "reward_std": 0.7724046930670738, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.14750460162758827, "rewards/format_reward": 0.322916679084301, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 1492.1458435058594, "epoch": 0.829477292202228, "grad_norm": 0.3252848982810974, "kl": 0.0024442672729492188, "learning_rate": 1.0280443637773163e-07, "loss": 0.0407, "reward": 0.32957739010453224, "reward_std": 0.9307611733675003, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.24090320616960526, "rewards/format_reward": 0.2916666716337204, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 1334.6875305175781, "epoch": 0.831191088260497, "grad_norm": 0.3913592994213104, "kl": 0.0066375732421875, "learning_rate": 1.0246514708427701e-07, "loss": -0.1766, "reward": 0.49342756532132626, "reward_std": 0.8096514493227005, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.22744118981063366, "rewards/format_reward": 0.3541666744276881, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 1405.3542175292969, "epoch": 0.8329048843187661, "grad_norm": 0.3229379951953888, "kl": 0.008695602416992188, "learning_rate": 1.0214767000817596e-07, "loss": 0.0611, "reward": 0.6591449100524187, "reward_std": 0.7455669045448303, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": -0.1503998525440693, "rewards/format_reward": 0.354166679084301, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 1569.8125, "epoch": 0.8346186803770351, "grad_norm": 0.3465595245361328, "kl": 0.0024700164794921875, "learning_rate": 1.0185202062281336e-07, "loss": -0.1095, "reward": -0.1328437142074108, "reward_std": 0.28097547590732574, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.25740570947527885, "rewards/format_reward": 0.12500000279396772, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 1341.8541870117188, "epoch": 0.8363324764353042, "grad_norm": 0.2883760631084442, "kl": 0.0044956207275390625, "learning_rate": 1.0157821333772304e-07, "loss": -0.0132, "reward": 0.8968268608077778, "reward_std": 1.0606432855129242, "rewards/accuracy_reward": 0.09375, "rewards/cosine_scaled_reward": -0.1450992152094841, "rewards/format_reward": 0.28125000558793545, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 1509.9375305175781, "epoch": 0.8380462724935732, "grad_norm": 0.3751949071884155, "kl": 0.0017719268798828125, "learning_rate": 1.013262614978859e-07, "loss": -0.0342, "reward": 0.4215831942856312, "reward_std": 0.8841647803783417, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.21074516884982586, "rewards/format_reward": 0.3020833348855376, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1493.6875, "epoch": 0.8397600685518424, "grad_norm": 0.268597811460495, "kl": 0.0024871826171875, "learning_rate": 1.0109617738307911e-07, "loss": 0.0308, "reward": 0.3271262003108859, "reward_std": 0.7303740531206131, "rewards/accuracy_reward": 0.041666666977107525, "rewards/cosine_scaled_reward": -0.2523518577218056, "rewards/format_reward": 0.26041666977107525, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 1588.1667175292969, "epoch": 0.8414738646101114, "grad_norm": 0.2774799168109894, "kl": 0.001842498779296875, "learning_rate": 1.0088797220727779e-07, "loss": -0.02, "reward": -0.06127945396292489, "reward_std": 0.4426092505455017, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2556958645582199, "rewards/format_reward": 0.1770833358168602, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 1670.5833740234375, "epoch": 0.8431876606683805, "grad_norm": 0.25282374024391174, "kl": 0.0024662017822265625, "learning_rate": 1.0070165611810855e-07, "loss": 0.0243, "reward": 0.015509873628616333, "reward_std": 0.5324227660894394, "rewards/accuracy_reward": 0.010416666977107525, "rewards/cosine_scaled_reward": -0.22523901239037514, "rewards/format_reward": 0.1145833358168602, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 1433.6875305175781, "epoch": 0.8449014567266495, "grad_norm": 0.2538332939147949, "kl": 0.0018558502197265625, "learning_rate": 1.005372381963547e-07, "loss": 0.0205, "reward": 0.5890572015196085, "reward_std": 0.7479087933897972, "rewards/accuracy_reward": 0.0520833358168602, "rewards/cosine_scaled_reward": -0.19769811630249023, "rewards/format_reward": 0.322916679084301, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 1579.3333740234375, "epoch": 0.8466152527849186, "grad_norm": 0.2509302496910095, "kl": 0.0020503997802734375, "learning_rate": 1.0039472645551372e-07, "loss": 0.0519, "reward": 0.2534595690667629, "reward_std": 1.029329240322113, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.2416459023952484, "rewards/format_reward": 0.1666666716337204, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 1281.8750305175781, "epoch": 0.8483290488431876, "grad_norm": 0.3524729311466217, "kl": 0.0039005279541015625, "learning_rate": 1.002741278414069e-07, "loss": -0.109, "reward": 0.5718953423202038, "reward_std": 0.9078854024410248, "rewards/accuracy_reward": 0.05208333395421505, "rewards/cosine_scaled_reward": -0.18542027287185192, "rewards/format_reward": 0.34375000558793545, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 1470.7083740234375, "epoch": 0.8500428449014568, "grad_norm": 0.32555145025253296, "kl": 0.0026264190673828125, "learning_rate": 1.0017544823184055e-07, "loss": 0.0229, "reward": 0.5937455967068672, "reward_std": 0.7896523922681808, "rewards/accuracy_reward": 0.10416666697710752, "rewards/cosine_scaled_reward": -0.11017215996980667, "rewards/format_reward": 0.2500000037252903, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 1557.5833435058594, "epoch": 0.8517566409597258, "grad_norm": 0.29465195536613464, "kl": 0.0020732879638671875, "learning_rate": 1.0009869243631952e-07, "loss": 0.0077, "reward": 0.5861381143331528, "reward_std": 1.3445016890764236, "rewards/accuracy_reward": 0.06250000093132257, "rewards/cosine_scaled_reward": -0.20600083097815514, "rewards/format_reward": 0.2916666753590107, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 1610.7917175292969, "epoch": 0.8534704370179949, "grad_norm": 0.2673383951187134, "kl": 0.001918792724609375, "learning_rate": 1.000438641958131e-07, "loss": 0.0054, "reward": -0.13415055978111923, "reward_std": 0.41844024509191513, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2875858470797539, "rewards/format_reward": 0.15625, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 1339.6458740234375, "epoch": 0.8551842330762639, "grad_norm": 0.29758375883102417, "kl": 0.004123687744140625, "learning_rate": 1.0001096618257236e-07, "loss": 0.0713, "reward": 0.968408640474081, "reward_std": 1.159729391336441, "rewards/accuracy_reward": 0.10416666977107525, "rewards/cosine_scaled_reward": -0.16551226750016212, "rewards/format_reward": 0.3333333395421505, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1332.2500305175781, "epoch": 0.856898029134533, "grad_norm": 0.28377655148506165, "kl": 0.00530242919921875, "learning_rate": 1e-07, "loss": 0.1009, "reward": 1.3940413501113653, "reward_std": 1.01119065284729, "rewards/accuracy_reward": 0.1770833395421505, "rewards/cosine_scaled_reward": -0.041636522859334946, "rewards/format_reward": 0.4270833507180214, "step": 500 }, { "epoch": 0.856898029134533, "step": 500, "total_flos": 0.0, "train_loss": 0.0032392389368615113, "train_runtime": 46372.6786, "train_samples_per_second": 0.518, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }