OpenRS-GRPO / trainer_state.json
Liuboss's picture
Model save
3fc3c02 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.856898029134533,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 2770.8472290039062,
"epoch": 0.001713796058269066,
"grad_norm": 0.15192405879497528,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.014,
"reward": -0.06689765583723783,
"reward_std": 0.505804143846035,
"rewards/cosine_scaled_reward": -0.03344883490353823,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2785.013916015625,
"epoch": 0.003427592116538132,
"grad_norm": 0.1657538264989853,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": -0.0211,
"reward": -0.4646243788301945,
"reward_std": 0.39301297068595886,
"rewards/cosine_scaled_reward": -0.23231217823922634,
"rewards/format_reward": 0.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2713.027801513672,
"epoch": 0.005141388174807198,
"grad_norm": 0.1747598648071289,
"kl": 3.5196542739868164e-05,
"learning_rate": 4e-08,
"loss": -0.0275,
"reward": -0.23865782655775547,
"reward_std": 0.4481763616204262,
"rewards/cosine_scaled_reward": -0.11932891746982932,
"rewards/format_reward": 0.0,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2938.5277709960938,
"epoch": 0.006855184233076264,
"grad_norm": 0.16107600927352905,
"kl": 3.7282705307006836e-05,
"learning_rate": 6e-08,
"loss": -0.0289,
"reward": 0.06913903169333935,
"reward_std": 0.6892540901899338,
"rewards/cosine_scaled_reward": 0.03456950932741165,
"rewards/format_reward": 0.0,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2532.7222290039062,
"epoch": 0.00856898029134533,
"grad_norm": 0.15964782238006592,
"kl": 2.065300941467285e-05,
"learning_rate": 8e-08,
"loss": -0.0052,
"reward": -0.15601756004616618,
"reward_std": 0.5161308571696281,
"rewards/cosine_scaled_reward": -0.07800877187401056,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 3131.25,
"epoch": 0.010282776349614395,
"grad_norm": 0.13910692930221558,
"kl": 4.1961669921875e-05,
"learning_rate": 1e-07,
"loss": 0.029,
"reward": -0.13883829297265038,
"reward_std": 0.5291023775935173,
"rewards/cosine_scaled_reward": -0.06941914733033627,
"rewards/format_reward": 0.0,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 2258.6944885253906,
"epoch": 0.011996572407883462,
"grad_norm": 0.21499329805374146,
"kl": 3.059208393096924e-05,
"learning_rate": 1.2e-07,
"loss": -0.0297,
"reward": -0.22816578298807144,
"reward_std": 0.5721099078655243,
"rewards/cosine_scaled_reward": -0.11408288218080997,
"rewards/format_reward": 0.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 3106.65283203125,
"epoch": 0.013710368466152529,
"grad_norm": 0.15807782113552094,
"kl": 3.281235694885254e-05,
"learning_rate": 1.4e-07,
"loss": 0.0518,
"reward": -0.1028524599969387,
"reward_std": 0.7277905195951462,
"rewards/cosine_scaled_reward": -0.051426228135824203,
"rewards/format_reward": 0.0,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2652.2777709960938,
"epoch": 0.015424164524421594,
"grad_norm": 0.14988838136196136,
"kl": 3.746151924133301e-05,
"learning_rate": 1.6e-07,
"loss": -0.0052,
"reward": -0.04764566984522389,
"reward_std": 0.6422684416174889,
"rewards/cosine_scaled_reward": -0.023822834249585867,
"rewards/format_reward": 0.0,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 2956.250030517578,
"epoch": 0.01713796058269066,
"grad_norm": 0.15577340126037598,
"kl": 3.62396240234375e-05,
"learning_rate": 1.8e-07,
"loss": 0.0369,
"reward": -0.09274669736623764,
"reward_std": 0.6059432476758957,
"rewards/cosine_scaled_reward": -0.046373344492167234,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2610.430633544922,
"epoch": 0.018851756640959727,
"grad_norm": 0.18031956255435944,
"kl": 2.753734588623047e-05,
"learning_rate": 2e-07,
"loss": 0.0126,
"reward": 0.17614622993642115,
"reward_std": 0.7455325201153755,
"rewards/cosine_scaled_reward": 0.08807311341661261,
"rewards/format_reward": 0.0,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2977.2638549804688,
"epoch": 0.02056555269922879,
"grad_norm": 0.15254004299640656,
"kl": 3.084540367126465e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": -0.0238,
"reward": -0.2835669822525233,
"reward_std": 0.6270563155412674,
"rewards/cosine_scaled_reward": -0.14178348786663264,
"rewards/format_reward": 0.0,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2601.7916870117188,
"epoch": 0.022279348757497857,
"grad_norm": 0.1897689402103424,
"kl": 4.309415817260742e-05,
"learning_rate": 2.4e-07,
"loss": -0.008,
"reward": -0.08701697085052729,
"reward_std": 0.6209904551506042,
"rewards/cosine_scaled_reward": -0.04350848635658622,
"rewards/format_reward": 0.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2891.3472290039062,
"epoch": 0.023993144815766924,
"grad_norm": 0.17451944947242737,
"kl": 3.2007694244384766e-05,
"learning_rate": 2.6e-07,
"loss": 0.0134,
"reward": -0.11856314726173878,
"reward_std": 0.5714613646268845,
"rewards/cosine_scaled_reward": -0.059281568974256516,
"rewards/format_reward": 0.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 3376.9444580078125,
"epoch": 0.02570694087403599,
"grad_norm": 0.19522128999233246,
"kl": 4.00543212890625e-05,
"learning_rate": 2.8e-07,
"loss": 0.0625,
"reward": -0.3375568427145481,
"reward_std": 0.5690607726573944,
"rewards/cosine_scaled_reward": -0.16877843253314495,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 2385.9861450195312,
"epoch": 0.027420736932305057,
"grad_norm": 0.17156164348125458,
"kl": 3.203749656677246e-05,
"learning_rate": 3e-07,
"loss": 0.0479,
"reward": 0.31096187606453896,
"reward_std": 0.719051368534565,
"rewards/cosine_scaled_reward": 0.15548093989491463,
"rewards/format_reward": 0.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 2834.4166870117188,
"epoch": 0.02913453299057412,
"grad_norm": 0.1916762739419937,
"kl": 3.910064697265625e-05,
"learning_rate": 3.2e-07,
"loss": 0.0288,
"reward": -0.1371638989658095,
"reward_std": 0.43335365504026413,
"rewards/cosine_scaled_reward": -0.06858194415690377,
"rewards/format_reward": 0.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 3107.9166870117188,
"epoch": 0.030848329048843187,
"grad_norm": 0.20290644466876984,
"kl": 3.56137752532959e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0182,
"reward": -0.2907893192023039,
"reward_std": 0.43716832995414734,
"rewards/cosine_scaled_reward": -0.14539465866982937,
"rewards/format_reward": 0.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 3065.611083984375,
"epoch": 0.032562125107112254,
"grad_norm": 0.1492234468460083,
"kl": 4.0084123611450195e-05,
"learning_rate": 3.6e-07,
"loss": 0.0216,
"reward": -0.19093798706308007,
"reward_std": 0.7698801159858704,
"rewards/cosine_scaled_reward": -0.09546899236738682,
"rewards/format_reward": 0.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 3355.7222900390625,
"epoch": 0.03427592116538132,
"grad_norm": 0.14321106672286987,
"kl": 3.36766242980957e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": -0.0048,
"reward": -0.2757381685078144,
"reward_std": 0.5536239072680473,
"rewards/cosine_scaled_reward": -0.1378690842539072,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 2938.125,
"epoch": 0.03598971722365039,
"grad_norm": 0.20512644946575165,
"kl": 4.1961669921875e-05,
"learning_rate": 4e-07,
"loss": 0.0577,
"reward": -0.1858626427128911,
"reward_std": 0.6686508804559708,
"rewards/cosine_scaled_reward": -0.09293132461607456,
"rewards/format_reward": 0.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 3192.2361450195312,
"epoch": 0.037703513281919454,
"grad_norm": 0.13245940208435059,
"kl": 3.49879264831543e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0372,
"reward": -0.186855623498559,
"reward_std": 0.5942067578434944,
"rewards/cosine_scaled_reward": -0.09342780988663435,
"rewards/format_reward": 0.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 3075.02783203125,
"epoch": 0.03941730934018852,
"grad_norm": 0.14223958551883698,
"kl": 2.8640031814575195e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": -0.0208,
"reward": -0.4465179964900017,
"reward_std": 0.36973506212234497,
"rewards/cosine_scaled_reward": -0.223259000107646,
"rewards/format_reward": 0.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 2707.6250610351562,
"epoch": 0.04113110539845758,
"grad_norm": 0.20090773701667786,
"kl": 2.9414892196655273e-05,
"learning_rate": 4.6e-07,
"loss": 0.0292,
"reward": 0.08563654706813395,
"reward_std": 0.4666801244020462,
"rewards/cosine_scaled_reward": 0.04281827830709517,
"rewards/format_reward": 0.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 2578.9443969726562,
"epoch": 0.04284490145672665,
"grad_norm": 0.19762183725833893,
"kl": 2.6911497116088867e-05,
"learning_rate": 4.8e-07,
"loss": 0.0547,
"reward": -0.15825002267956734,
"reward_std": 0.6721501722931862,
"rewards/cosine_scaled_reward": -0.07912501133978367,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 3199.1111450195312,
"epoch": 0.044558697514995714,
"grad_norm": 0.14947673678398132,
"kl": 3.1381845474243164e-05,
"learning_rate": 5e-07,
"loss": 0.0771,
"reward": -0.3339938232675195,
"reward_std": 0.5660227835178375,
"rewards/cosine_scaled_reward": -0.16699691163375974,
"rewards/format_reward": 0.0,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 3103.3193969726562,
"epoch": 0.04627249357326478,
"grad_norm": 0.12868770956993103,
"kl": 2.6017427444458008e-05,
"learning_rate": 5.2e-07,
"loss": 0.0118,
"reward": -0.2791058011353016,
"reward_std": 0.49328897148370743,
"rewards/cosine_scaled_reward": -0.13955289125442505,
"rewards/format_reward": 0.0,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2378.2222595214844,
"epoch": 0.04798628963153385,
"grad_norm": 0.2462579607963562,
"kl": 2.7805566787719727e-05,
"learning_rate": 5.4e-07,
"loss": 0.0596,
"reward": 0.03218653332442045,
"reward_std": 0.6807225868105888,
"rewards/cosine_scaled_reward": 0.016093265498057008,
"rewards/format_reward": 0.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2971.291748046875,
"epoch": 0.049700085689802914,
"grad_norm": 0.16591639816761017,
"kl": 3.515183925628662e-05,
"learning_rate": 5.6e-07,
"loss": 0.0141,
"reward": 0.011478596366941929,
"reward_std": 0.7397755682468414,
"rewards/cosine_scaled_reward": 0.005739298183470964,
"rewards/format_reward": 0.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 2913.3611450195312,
"epoch": 0.05141388174807198,
"grad_norm": 0.13886681199073792,
"kl": 3.2573938369750977e-05,
"learning_rate": 5.8e-07,
"loss": 0.0258,
"reward": 0.05036446265876293,
"reward_std": 0.6957473307847977,
"rewards/cosine_scaled_reward": 0.025182233192026615,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2665.041748046875,
"epoch": 0.05312767780634105,
"grad_norm": 0.16625739634037018,
"kl": 2.000480890274048e-05,
"learning_rate": 6e-07,
"loss": 0.0045,
"reward": -0.044122666819021106,
"reward_std": 0.4255269840359688,
"rewards/cosine_scaled_reward": -0.022061329917050898,
"rewards/format_reward": 0.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2951.8611450195312,
"epoch": 0.054841473864610114,
"grad_norm": 0.15594074130058289,
"kl": 1.9147992134094238e-05,
"learning_rate": 6.2e-07,
"loss": 0.0942,
"reward": -0.3072533793747425,
"reward_std": 0.4980456754565239,
"rewards/cosine_scaled_reward": -0.15362668968737125,
"rewards/format_reward": 0.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 2260.0833435058594,
"epoch": 0.056555269922879174,
"grad_norm": 0.21370142698287964,
"kl": 3.5665929317474365e-05,
"learning_rate": 6.4e-07,
"loss": 0.0063,
"reward": 0.06617816537618637,
"reward_std": 0.5614925771951675,
"rewards/cosine_scaled_reward": 0.033089087810367346,
"rewards/format_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 2807.013916015625,
"epoch": 0.05826906598114824,
"grad_norm": 0.20051412284374237,
"kl": 1.2192875146865845e-05,
"learning_rate": 6.6e-07,
"loss": 0.0328,
"reward": -0.17473484575748444,
"reward_std": 0.6600858569145203,
"rewards/cosine_scaled_reward": -0.08736742846667767,
"rewards/format_reward": 0.0,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 3120.7500610351562,
"epoch": 0.05998286203941731,
"grad_norm": 0.13361996412277222,
"kl": 3.407895565032959e-05,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0472,
"reward": -0.4979929216206074,
"reward_std": 0.39260104298591614,
"rewards/cosine_scaled_reward": -0.2489964533597231,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2625.0694885253906,
"epoch": 0.061696658097686374,
"grad_norm": 0.16467803716659546,
"kl": 2.7239322662353516e-05,
"learning_rate": 7e-07,
"loss": -0.0168,
"reward": -0.35937849269248545,
"reward_std": 0.45373768359422684,
"rewards/cosine_scaled_reward": -0.1796892363927327,
"rewards/format_reward": 0.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 3042.1806030273438,
"epoch": 0.06341045415595545,
"grad_norm": 0.15104345977306366,
"kl": 2.9146671295166016e-05,
"learning_rate": 7.2e-07,
"loss": 0.0068,
"reward": -0.37954360246658325,
"reward_std": 0.5432159453630447,
"rewards/cosine_scaled_reward": -0.18977180123329163,
"rewards/format_reward": 0.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 3193.2083740234375,
"epoch": 0.06512425021422451,
"grad_norm": 0.20619741082191467,
"kl": 1.7097219824790955e-05,
"learning_rate": 7.4e-07,
"loss": 0.0389,
"reward": -0.29821273358538747,
"reward_std": 0.5581861883401871,
"rewards/cosine_scaled_reward": -0.1491063602734357,
"rewards/format_reward": 0.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 3018.5834045410156,
"epoch": 0.06683804627249357,
"grad_norm": 0.12940338253974915,
"kl": 3.90857458114624e-05,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0162,
"reward": -0.25728118792176247,
"reward_std": 0.34478260576725006,
"rewards/cosine_scaled_reward": -0.12864059768617153,
"rewards/format_reward": 0.0,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 2860.7500610351562,
"epoch": 0.06855184233076264,
"grad_norm": 0.25654301047325134,
"kl": 0.0001112818717956543,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0545,
"reward": 0.13069207593798637,
"reward_std": 0.5447051227092743,
"rewards/cosine_scaled_reward": 0.06534605007618666,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2696.3611450195312,
"epoch": 0.0702656383890317,
"grad_norm": 0.19896458089351654,
"kl": 4.2378902435302734e-05,
"learning_rate": 8e-07,
"loss": 0.0826,
"reward": 0.2564197585452348,
"reward_std": 0.6877201497554779,
"rewards/cosine_scaled_reward": 0.12820987740997225,
"rewards/format_reward": 0.0,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2642.3333740234375,
"epoch": 0.07197943444730077,
"grad_norm": 0.1658892035484314,
"kl": 0.00020813941955566406,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0149,
"reward": -0.03526473790407181,
"reward_std": 0.6603178381919861,
"rewards/cosine_scaled_reward": -0.017632372677326202,
"rewards/format_reward": 0.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 2897.388916015625,
"epoch": 0.07369323050556983,
"grad_norm": 0.2002326250076294,
"kl": 6.079673767089844e-05,
"learning_rate": 8.399999999999999e-07,
"loss": 0.055,
"reward": 0.08917492628097534,
"reward_std": 0.4714968279004097,
"rewards/cosine_scaled_reward": 0.04458745941519737,
"rewards/format_reward": 0.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 2802.9583740234375,
"epoch": 0.07540702656383891,
"grad_norm": 0.14357756078243256,
"kl": 0.00016063451766967773,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0109,
"reward": -0.2601087912917137,
"reward_std": 0.5872670859098434,
"rewards/cosine_scaled_reward": -0.13005439937114716,
"rewards/format_reward": 0.0,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3034.8194580078125,
"epoch": 0.07712082262210797,
"grad_norm": 0.23196536302566528,
"kl": 0.00012412667274475098,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0428,
"reward": -0.2070726901292801,
"reward_std": 0.5877418145537376,
"rewards/cosine_scaled_reward": -0.10353635251522064,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 2306.2083435058594,
"epoch": 0.07883461868037704,
"grad_norm": 0.2409650981426239,
"kl": 0.0003217458724975586,
"learning_rate": 9e-07,
"loss": 0.0337,
"reward": -0.01094321720302105,
"reward_std": 0.6599317938089371,
"rewards/cosine_scaled_reward": -0.00547160767018795,
"rewards/format_reward": 0.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2936.1388549804688,
"epoch": 0.0805484147386461,
"grad_norm": 0.1777871698141098,
"kl": 0.0003833882510662079,
"learning_rate": 9.2e-07,
"loss": -0.0387,
"reward": -0.12989605404436588,
"reward_std": 0.6336122080683708,
"rewards/cosine_scaled_reward": -0.0649480305146426,
"rewards/format_reward": 0.0,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 2661.6806030273438,
"epoch": 0.08226221079691516,
"grad_norm": 0.3158990442752838,
"kl": 0.00035144388675689697,
"learning_rate": 9.399999999999999e-07,
"loss": 0.1047,
"reward": 0.12476684269495308,
"reward_std": 0.5184459760785103,
"rewards/cosine_scaled_reward": 0.06238342053256929,
"rewards/format_reward": 0.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2053.8611450195312,
"epoch": 0.08397600685518423,
"grad_norm": 0.19082558155059814,
"kl": 0.0006046295166015625,
"learning_rate": 9.6e-07,
"loss": -0.0144,
"reward": 0.012501850724220276,
"reward_std": 0.603157639503479,
"rewards/cosine_scaled_reward": 0.006250927224755287,
"rewards/format_reward": 0.0,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 2731.6527404785156,
"epoch": 0.0856898029134533,
"grad_norm": 0.22663110494613647,
"kl": 0.0009310245513916016,
"learning_rate": 9.8e-07,
"loss": 0.0409,
"reward": -0.30116934701800346,
"reward_std": 0.6284962445497513,
"rewards/cosine_scaled_reward": -0.1505846632644534,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2839.6944885253906,
"epoch": 0.08740359897172237,
"grad_norm": 0.17685562372207642,
"kl": 0.0002518892288208008,
"learning_rate": 1e-06,
"loss": 0.0272,
"reward": -0.16751686483621597,
"reward_std": 0.5093529745936394,
"rewards/cosine_scaled_reward": -0.08375842124223709,
"rewards/format_reward": 0.0,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3141.4583740234375,
"epoch": 0.08911739502999143,
"grad_norm": 0.14409120380878448,
"kl": 0.0003941059112548828,
"learning_rate": 9.999890338174275e-07,
"loss": -0.0079,
"reward": -0.19580290652811527,
"reward_std": 0.589723251760006,
"rewards/cosine_scaled_reward": -0.09790145605802536,
"rewards/format_reward": 0.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 3054.9445190429688,
"epoch": 0.0908311910882605,
"grad_norm": 0.13203154504299164,
"kl": 0.0002570152282714844,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0455,
"reward": -0.2164551168680191,
"reward_std": 0.6407450139522552,
"rewards/cosine_scaled_reward": -0.10822756588459015,
"rewards/format_reward": 0.0,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 3393.8055419921875,
"epoch": 0.09254498714652956,
"grad_norm": 0.11958733946084976,
"kl": 0.0005993843078613281,
"learning_rate": 9.999013075636804e-07,
"loss": -0.007,
"reward": -0.27613697946071625,
"reward_std": 0.5631539821624756,
"rewards/cosine_scaled_reward": -0.13806848879903555,
"rewards/format_reward": 0.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 3430.3055419921875,
"epoch": 0.09425878320479864,
"grad_norm": 0.13475047051906586,
"kl": 0.0003286600112915039,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0301,
"reward": -0.2911250814795494,
"reward_std": 0.5787934809923172,
"rewards/cosine_scaled_reward": -0.145562544465065,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 3075.1527709960938,
"epoch": 0.0959725792630677,
"grad_norm": 0.14396199584007263,
"kl": 0.0008380413055419922,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0481,
"reward": -0.058986596763134,
"reward_std": 0.5793360769748688,
"rewards/cosine_scaled_reward": -0.029493287205696106,
"rewards/format_reward": 0.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 3232.9722900390625,
"epoch": 0.09768637532133675,
"grad_norm": 0.14357316493988037,
"kl": 0.0003731250762939453,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0542,
"reward": -0.08436356298625469,
"reward_std": 0.4788799285888672,
"rewards/cosine_scaled_reward": -0.042181783355772495,
"rewards/format_reward": 0.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 3087.3194580078125,
"epoch": 0.09940017137960583,
"grad_norm": 0.15331892669200897,
"kl": 0.0012726783752441406,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0529,
"reward": -0.29565126448869705,
"reward_std": 0.5033575221896172,
"rewards/cosine_scaled_reward": -0.1478256327100098,
"rewards/format_reward": 0.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 3110.6111450195312,
"epoch": 0.10111396743787489,
"grad_norm": 0.14103592932224274,
"kl": 0.0015869140625,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0384,
"reward": 0.018456660211086273,
"reward_std": 0.8149007856845856,
"rewards/cosine_scaled_reward": 0.009228323586285114,
"rewards/format_reward": 0.0,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 3305.236083984375,
"epoch": 0.10282776349614396,
"grad_norm": 0.12172071635723114,
"kl": 0.00035071372985839844,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0086,
"reward": -0.27341870963573456,
"reward_std": 0.7006796821951866,
"rewards/cosine_scaled_reward": -0.13670935295522213,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 3224.0555419921875,
"epoch": 0.10454155955441302,
"grad_norm": 0.13248133659362793,
"kl": 0.0005254745483398438,
"learning_rate": 9.989038226169207e-07,
"loss": -0.0068,
"reward": -0.2998387850821018,
"reward_std": 0.3452136740088463,
"rewards/cosine_scaled_reward": -0.14991939440369606,
"rewards/format_reward": 0.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2643.5833740234375,
"epoch": 0.1062553556126821,
"grad_norm": 0.17902526259422302,
"kl": 0.0021648406982421875,
"learning_rate": 9.98673738502114e-07,
"loss": 0.057,
"reward": 0.017559568164870143,
"reward_std": 0.5955966338515282,
"rewards/cosine_scaled_reward": 0.008779789437539876,
"rewards/format_reward": 0.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 3496.375,
"epoch": 0.10796915167095116,
"grad_norm": 0.1432785838842392,
"kl": 0.00047206878662109375,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0277,
"reward": -0.17097678780555725,
"reward_std": 0.6070086807012558,
"rewards/cosine_scaled_reward": -0.08548840321600437,
"rewards/format_reward": 0.0,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 2792.486114501953,
"epoch": 0.10968294772922023,
"grad_norm": 0.16470499336719513,
"kl": 0.0011835098266601562,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0207,
"reward": -0.26402536034584045,
"reward_std": 0.43254173547029495,
"rewards/cosine_scaled_reward": -0.13201268389821053,
"rewards/format_reward": 0.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 3128.4861450195312,
"epoch": 0.11139674378748929,
"grad_norm": 0.1882910132408142,
"kl": 0.006333351135253906,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0385,
"reward": -0.0892822165042162,
"reward_std": 0.6130652017891407,
"rewards/cosine_scaled_reward": -0.04464110638946295,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 2806.75,
"epoch": 0.11311053984575835,
"grad_norm": 0.15443913638591766,
"kl": 0.0003552436828613281,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0038,
"reward": -0.04117146506905556,
"reward_std": 0.4872736781835556,
"rewards/cosine_scaled_reward": -0.02058573253452778,
"rewards/format_reward": 0.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 3150.8194580078125,
"epoch": 0.11482433590402742,
"grad_norm": 0.15191471576690674,
"kl": 0.0016102790832519531,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0316,
"reward": -0.23821864277124405,
"reward_std": 0.5326030105352402,
"rewards/cosine_scaled_reward": -0.11910932138562202,
"rewards/format_reward": 0.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 2845.6806030273438,
"epoch": 0.11653813196229648,
"grad_norm": 0.1388000249862671,
"kl": 0.0018000602722167969,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0376,
"reward": -0.17579936794936657,
"reward_std": 0.6001454517245293,
"rewards/cosine_scaled_reward": -0.08789968676865101,
"rewards/format_reward": 0.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 3050.7361450195312,
"epoch": 0.11825192802056556,
"grad_norm": 0.13662724196910858,
"kl": 0.0015287399291992188,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0787,
"reward": -0.09626813535578549,
"reward_std": 0.6232626661658287,
"rewards/cosine_scaled_reward": -0.04813406406901777,
"rewards/format_reward": 0.0,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2883.9722290039062,
"epoch": 0.11996572407883462,
"grad_norm": 0.18917521834373474,
"kl": 0.00302886962890625,
"learning_rate": 9.960469931131936e-07,
"loss": -0.0608,
"reward": 0.05035170167684555,
"reward_std": 0.4191203862428665,
"rewards/cosine_scaled_reward": 0.025175858289003372,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 3137.4583740234375,
"epoch": 0.12167952013710369,
"grad_norm": 0.15267273783683777,
"kl": 0.0017466545104980469,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0362,
"reward": -0.04426470585167408,
"reward_std": 0.6740965843200684,
"rewards/cosine_scaled_reward": -0.022132341749966145,
"rewards/format_reward": 0.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 2443.0138549804688,
"epoch": 0.12339331619537275,
"grad_norm": 0.16214598715305328,
"kl": 0.003936767578125,
"learning_rate": 9.951725498333448e-07,
"loss": -0.0396,
"reward": 0.09306424111127853,
"reward_std": 0.43733419477939606,
"rewards/cosine_scaled_reward": 0.04653212707489729,
"rewards/format_reward": 0.0,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 3163.513916015625,
"epoch": 0.12510711225364182,
"grad_norm": 0.23524802923202515,
"kl": 0.018090248107910156,
"learning_rate": 9.947027716509488e-07,
"loss": -0.0168,
"reward": -0.17970024980604649,
"reward_std": 0.4914797991514206,
"rewards/cosine_scaled_reward": -0.0898501230403781,
"rewards/format_reward": 0.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2410.25,
"epoch": 0.1268209083119109,
"grad_norm": 0.15706373751163483,
"kl": 0.0030879974365234375,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0191,
"reward": 0.2525464817881584,
"reward_std": 0.6606673151254654,
"rewards/cosine_scaled_reward": 0.12627324275672436,
"rewards/format_reward": 0.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 3146.8611450195312,
"epoch": 0.12853470437017994,
"grad_norm": 0.15255555510520935,
"kl": 0.0032701492309570312,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0281,
"reward": -0.07365414220839739,
"reward_std": 0.5634644776582718,
"rewards/cosine_scaled_reward": -0.036827060393989086,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 2159.249984741211,
"epoch": 0.13024850042844902,
"grad_norm": 0.39581403136253357,
"kl": 0.018310546875,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0072,
"reward": 0.14826004952192307,
"reward_std": 0.6063434556126595,
"rewards/cosine_scaled_reward": 0.07413001451641321,
"rewards/format_reward": 0.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 3143.9443969726562,
"epoch": 0.1319622964867181,
"grad_norm": 0.13312797248363495,
"kl": 0.00225830078125,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0387,
"reward": 0.15560828521847725,
"reward_std": 0.680296927690506,
"rewards/cosine_scaled_reward": 0.07780414074659348,
"rewards/format_reward": 0.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 3317.5416870117188,
"epoch": 0.13367609254498714,
"grad_norm": 0.13495096564292908,
"kl": 0.0019426345825195312,
"learning_rate": 9.9202926282791e-07,
"loss": -0.0019,
"reward": -0.4046759568154812,
"reward_std": 0.5655369237065315,
"rewards/cosine_scaled_reward": -0.20233797095716,
"rewards/format_reward": 0.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 2373.2361755371094,
"epoch": 0.1353898886032562,
"grad_norm": 0.26138797402381897,
"kl": 0.010517120361328125,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0351,
"reward": -0.17695464938879013,
"reward_std": 0.34004002809524536,
"rewards/cosine_scaled_reward": -0.08847732283174992,
"rewards/format_reward": 0.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 3025.5000610351562,
"epoch": 0.13710368466152528,
"grad_norm": 0.17277857661247253,
"kl": 0.0012784004211425781,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0488,
"reward": -0.08927152771502733,
"reward_std": 0.6381218209862709,
"rewards/cosine_scaled_reward": -0.04463577060960233,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 3080.2777709960938,
"epoch": 0.13881748071979436,
"grad_norm": 0.14923037588596344,
"kl": 0.0020084381103515625,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0073,
"reward": -0.27667392790317535,
"reward_std": 0.39360568672418594,
"rewards/cosine_scaled_reward": -0.13833696395158768,
"rewards/format_reward": 0.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 2893.77783203125,
"epoch": 0.1405312767780634,
"grad_norm": 0.3161645531654358,
"kl": 0.011153221130371094,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0838,
"reward": -0.08123429818078876,
"reward_std": 0.6654616445302963,
"rewards/cosine_scaled_reward": -0.040617153281345963,
"rewards/format_reward": 0.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 2858.736083984375,
"epoch": 0.14224507283633248,
"grad_norm": 0.1683678925037384,
"kl": 0.001474142074584961,
"learning_rate": 9.888172094375033e-07,
"loss": -0.0148,
"reward": -0.12576034758239985,
"reward_std": 0.6605924665927887,
"rewards/cosine_scaled_reward": -0.06288017379119992,
"rewards/format_reward": 0.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 2913.3194885253906,
"epoch": 0.14395886889460155,
"grad_norm": 0.22510592639446259,
"kl": 0.0032978057861328125,
"learning_rate": 9.881105062929221e-07,
"loss": 0.042,
"reward": -0.05945697799324989,
"reward_std": 0.5878739953041077,
"rewards/cosine_scaled_reward": -0.0297284796833992,
"rewards/format_reward": 0.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 2990.7916870117188,
"epoch": 0.1456726649528706,
"grad_norm": 0.14112693071365356,
"kl": 0.0014820098876953125,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0518,
"reward": -0.05626801133621484,
"reward_std": 0.5443409904837608,
"rewards/cosine_scaled_reward": -0.028134002874139696,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 2831.6805419921875,
"epoch": 0.14738646101113967,
"grad_norm": 0.17547817528247833,
"kl": 0.00238037109375,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0229,
"reward": -0.25049374252557755,
"reward_std": 0.6190591081976891,
"rewards/cosine_scaled_reward": -0.12524686381220818,
"rewards/format_reward": 0.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 3439.2639770507812,
"epoch": 0.14910025706940874,
"grad_norm": 0.12470373511314392,
"kl": 0.0005965232849121094,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0309,
"reward": -0.15761397371534258,
"reward_std": 0.568816527724266,
"rewards/cosine_scaled_reward": -0.07880698406370357,
"rewards/format_reward": 0.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 3291.013916015625,
"epoch": 0.15081405312767782,
"grad_norm": 0.17072485387325287,
"kl": 0.0011734962463378906,
"learning_rate": 9.850705248720068e-07,
"loss": -0.0003,
"reward": -0.31209783256053925,
"reward_std": 0.4534567594528198,
"rewards/cosine_scaled_reward": -0.15604891628026962,
"rewards/format_reward": 0.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2711.6111450195312,
"epoch": 0.15252784918594686,
"grad_norm": 0.17909394204616547,
"kl": 0.00319671630859375,
"learning_rate": 9.8425742251254e-07,
"loss": -0.0351,
"reward": -0.39153438061475754,
"reward_std": 0.44514787942171097,
"rewards/cosine_scaled_reward": -0.19576718658208847,
"rewards/format_reward": 0.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2884.3611755371094,
"epoch": 0.15424164524421594,
"grad_norm": 0.1545180082321167,
"kl": 0.0027666091918945312,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0231,
"reward": -0.12805988639593124,
"reward_std": 0.41310104727745056,
"rewards/cosine_scaled_reward": -0.06402994319796562,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 2961.8194580078125,
"epoch": 0.155955441302485,
"grad_norm": 0.17576223611831665,
"kl": 0.004119873046875,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0293,
"reward": -0.06583835743367672,
"reward_std": 0.6373212188482285,
"rewards/cosine_scaled_reward": -0.03291917638853192,
"rewards/format_reward": 0.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2929.5277709960938,
"epoch": 0.15766923736075408,
"grad_norm": 0.14930115640163422,
"kl": 0.0034623146057128906,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0296,
"reward": -0.18032184429466724,
"reward_std": 0.6196585968136787,
"rewards/cosine_scaled_reward": -0.09016093239188194,
"rewards/format_reward": 0.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2748.7361450195312,
"epoch": 0.15938303341902313,
"grad_norm": 0.1628389209508896,
"kl": 0.0011005401611328125,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0544,
"reward": -0.048349371179938316,
"reward_std": 0.5468417555093765,
"rewards/cosine_scaled_reward": -0.024174699559807777,
"rewards/format_reward": 0.0,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 2974.8194580078125,
"epoch": 0.1610968294772922,
"grad_norm": 0.17104412615299225,
"kl": 0.0025157928466796875,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0562,
"reward": -0.10820803185924888,
"reward_std": 0.5462353378534317,
"rewards/cosine_scaled_reward": -0.05410401395056397,
"rewards/format_reward": 0.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 2822.3055419921875,
"epoch": 0.16281062553556128,
"grad_norm": 0.22087068855762482,
"kl": 0.0032253265380859375,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0157,
"reward": -0.2787464428693056,
"reward_std": 0.5101591870188713,
"rewards/cosine_scaled_reward": -0.139373216079548,
"rewards/format_reward": 0.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3177.1389770507812,
"epoch": 0.16452442159383032,
"grad_norm": 0.13341942429542542,
"kl": 0.0016889572143554688,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0599,
"reward": 0.22422180697321892,
"reward_std": 0.6203102543950081,
"rewards/cosine_scaled_reward": 0.11211090348660946,
"rewards/format_reward": 0.0,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 3359.1666870117188,
"epoch": 0.1662382176520994,
"grad_norm": 0.17103053629398346,
"kl": 0.0048770904541015625,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0584,
"reward": -0.34769631922245026,
"reward_std": 0.5649063661694527,
"rewards/cosine_scaled_reward": -0.17384816892445087,
"rewards/format_reward": 0.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 2853.8472290039062,
"epoch": 0.16795201371036847,
"grad_norm": 0.16162103414535522,
"kl": 0.002391815185546875,
"learning_rate": 9.759921670520634e-07,
"loss": -0.0363,
"reward": 0.04994424246251583,
"reward_std": 0.4738911837339401,
"rewards/cosine_scaled_reward": 0.024972120765596628,
"rewards/format_reward": 0.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 3113.9722290039062,
"epoch": 0.16966580976863754,
"grad_norm": 0.17794044315814972,
"kl": 0.002719879150390625,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0017,
"reward": -0.16785867512226105,
"reward_std": 0.5008634850382805,
"rewards/cosine_scaled_reward": -0.08392933756113052,
"rewards/format_reward": 0.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 2779.9862060546875,
"epoch": 0.1713796058269066,
"grad_norm": 0.1735229194164276,
"kl": 0.005786895751953125,
"learning_rate": 9.739258537542835e-07,
"loss": -0.0595,
"reward": -0.15765622071921825,
"reward_std": 0.4426313266158104,
"rewards/cosine_scaled_reward": -0.07882811967283487,
"rewards/format_reward": 0.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 2904.5833129882812,
"epoch": 0.17309340188517566,
"grad_norm": 0.16130799055099487,
"kl": 0.0022287368774414062,
"learning_rate": 9.728616793536587e-07,
"loss": -0.027,
"reward": -0.2833556551486254,
"reward_std": 0.41574449837207794,
"rewards/cosine_scaled_reward": -0.14167783502489328,
"rewards/format_reward": 0.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 2956.9444580078125,
"epoch": 0.17480719794344474,
"grad_norm": 0.14904557168483734,
"kl": 0.0023751258850097656,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0249,
"reward": -0.005829242058098316,
"reward_std": 0.49208924546837807,
"rewards/cosine_scaled_reward": -0.002914619166404009,
"rewards/format_reward": 0.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 3127.75,
"epoch": 0.17652099400171378,
"grad_norm": 0.13523682951927185,
"kl": 0.0023593902587890625,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0048,
"reward": -0.16767939552664757,
"reward_std": 0.497691310942173,
"rewards/cosine_scaled_reward": -0.08383970521390438,
"rewards/format_reward": 0.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 3349.888916015625,
"epoch": 0.17823479005998286,
"grad_norm": 0.16127026081085205,
"kl": 0.002029895782470703,
"learning_rate": 9.695457105469804e-07,
"loss": -0.0079,
"reward": -0.4253583773970604,
"reward_std": 0.5213425680994987,
"rewards/cosine_scaled_reward": -0.2126791886985302,
"rewards/format_reward": 0.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2762.3056030273438,
"epoch": 0.17994858611825193,
"grad_norm": 0.22534409165382385,
"kl": 0.004019737243652344,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0786,
"reward": -0.13280940428376198,
"reward_std": 0.6939076110720634,
"rewards/cosine_scaled_reward": -0.06640470400452614,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 3027.013885498047,
"epoch": 0.181662382176521,
"grad_norm": 0.18191885948181152,
"kl": 0.001827239990234375,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0572,
"reward": -0.30150486156344414,
"reward_std": 0.5941706523299217,
"rewards/cosine_scaled_reward": -0.15075243171304464,
"rewards/format_reward": 0.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 3236.4166870117188,
"epoch": 0.18337617823479005,
"grad_norm": 0.12520797550678253,
"kl": 0.002315521240234375,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0039,
"reward": 0.061343319714069366,
"reward_std": 0.5028644949197769,
"rewards/cosine_scaled_reward": 0.030671661719679832,
"rewards/format_reward": 0.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 3337.3056030273438,
"epoch": 0.18508997429305912,
"grad_norm": 0.14343461394309998,
"kl": 0.0016498565673828125,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0438,
"reward": -0.17464184761047363,
"reward_std": 0.5610974803566933,
"rewards/cosine_scaled_reward": -0.08732092566788197,
"rewards/format_reward": 0.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 2781.4305419921875,
"epoch": 0.1868037703513282,
"grad_norm": 0.1800822913646698,
"kl": 0.0033397674560546875,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0242,
"reward": -0.26444700360298157,
"reward_std": 0.5241282097995281,
"rewards/cosine_scaled_reward": -0.13222350925207138,
"rewards/format_reward": 0.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 2989.3472290039062,
"epoch": 0.18851756640959727,
"grad_norm": 0.24952495098114014,
"kl": 0.00222015380859375,
"learning_rate": 9.623632283030077e-07,
"loss": 0.1294,
"reward": -0.038819944486021996,
"reward_std": 0.7193348854780197,
"rewards/cosine_scaled_reward": -0.019409974105656147,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2775.6111450195312,
"epoch": 0.19023136246786632,
"grad_norm": 0.16514870524406433,
"kl": 0.0029087066650390625,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0565,
"reward": -0.3495597681030631,
"reward_std": 0.3909125030040741,
"rewards/cosine_scaled_reward": -0.17477987939491868,
"rewards/format_reward": 0.0,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 2831.8055419921875,
"epoch": 0.1919451585261354,
"grad_norm": 0.13825589418411255,
"kl": 0.0023212432861328125,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0231,
"reward": -0.17934568971395493,
"reward_std": 0.5252480655908585,
"rewards/cosine_scaled_reward": -0.08967284485697746,
"rewards/format_reward": 0.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 2977.8333740234375,
"epoch": 0.19365895458440446,
"grad_norm": 0.15359072387218475,
"kl": 0.002410888671875,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0053,
"reward": 0.25896316685248166,
"reward_std": 0.705707773566246,
"rewards/cosine_scaled_reward": 0.12948158156359568,
"rewards/format_reward": 0.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 2234.2500610351562,
"epoch": 0.1953727506426735,
"grad_norm": 0.17650143802165985,
"kl": 0.002643585205078125,
"learning_rate": 9.571721736097088e-07,
"loss": -0.0481,
"reward": -0.20779240669799037,
"reward_std": 0.50680061429739,
"rewards/cosine_scaled_reward": -0.10389620521164034,
"rewards/format_reward": 0.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2372.65283203125,
"epoch": 0.19708654670094258,
"grad_norm": 0.21576084196567535,
"kl": 0.007110595703125,
"learning_rate": 9.55824636882301e-07,
"loss": 0.1072,
"reward": 0.03794890362769365,
"reward_std": 0.6275844648480415,
"rewards/cosine_scaled_reward": 0.018974455073475838,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 3263.27783203125,
"epoch": 0.19880034275921166,
"grad_norm": 0.18672628700733185,
"kl": 0.0029048919677734375,
"learning_rate": 9.54457320834625e-07,
"loss": -0.034,
"reward": -0.3033560863696039,
"reward_std": 0.5516846142709255,
"rewards/cosine_scaled_reward": -0.15167804807424545,
"rewards/format_reward": 0.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 3024.2500610351562,
"epoch": 0.20051413881748073,
"grad_norm": 0.1308911144733429,
"kl": 0.00705718994140625,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0178,
"reward": 0.19102132320404053,
"reward_std": 0.7014489844441414,
"rewards/cosine_scaled_reward": 0.09551066905260086,
"rewards/format_reward": 0.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 2574.013916015625,
"epoch": 0.20222793487574978,
"grad_norm": 0.325631320476532,
"kl": 0.013393402099609375,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0659,
"reward": -0.29521266371011734,
"reward_std": 0.5856474936008453,
"rewards/cosine_scaled_reward": -0.1476063383743167,
"rewards/format_reward": 0.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2724.3333740234375,
"epoch": 0.20394173093401885,
"grad_norm": 0.14827784895896912,
"kl": 0.0027828216552734375,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0141,
"reward": -0.03255775198340416,
"reward_std": 0.34701335430145264,
"rewards/cosine_scaled_reward": -0.01627887785434723,
"rewards/format_reward": 0.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 2813.1111450195312,
"epoch": 0.20565552699228792,
"grad_norm": 0.21779808402061462,
"kl": 0.0053081512451171875,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0158,
"reward": -0.19739244412630796,
"reward_std": 0.6424184814095497,
"rewards/cosine_scaled_reward": -0.0986962317256257,
"rewards/format_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 2874.0000610351562,
"epoch": 0.207369323050557,
"grad_norm": 0.2778118848800659,
"kl": 0.0032444000244140625,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0937,
"reward": -0.15650038793683052,
"reward_std": 0.5867400094866753,
"rewards/cosine_scaled_reward": -0.07825020421296358,
"rewards/format_reward": 0.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 3251.1112060546875,
"epoch": 0.20908311910882604,
"grad_norm": 0.12883791327476501,
"kl": 0.003650665283203125,
"learning_rate": 9.458418577899774e-07,
"loss": 0.02,
"reward": -0.30216934718191624,
"reward_std": 0.5233990028500557,
"rewards/cosine_scaled_reward": -0.1510846719611436,
"rewards/format_reward": 0.0,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 2774.3194580078125,
"epoch": 0.21079691516709512,
"grad_norm": 0.20982016623020172,
"kl": 0.002071380615234375,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0498,
"reward": 0.3517572022974491,
"reward_std": 0.7633289247751236,
"rewards/cosine_scaled_reward": 0.17587858624756336,
"rewards/format_reward": 0.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 3077.4166259765625,
"epoch": 0.2125107112253642,
"grad_norm": 0.14578428864479065,
"kl": 0.0024623870849609375,
"learning_rate": 9.428149347714143e-07,
"loss": 0.002,
"reward": -0.09189963340759277,
"reward_std": 0.4004024267196655,
"rewards/cosine_scaled_reward": -0.04594981297850609,
"rewards/format_reward": 0.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2898.5556030273438,
"epoch": 0.21422450728363324,
"grad_norm": 0.19108974933624268,
"kl": 0.0016632080078125,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0218,
"reward": 0.01400849362835288,
"reward_std": 0.5958191454410553,
"rewards/cosine_scaled_reward": 0.007004249142482877,
"rewards/format_reward": 0.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 3005.916748046875,
"epoch": 0.2159383033419023,
"grad_norm": 0.26980966329574585,
"kl": 0.004871368408203125,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0539,
"reward": -0.19987820833921432,
"reward_std": 0.5232749357819557,
"rewards/cosine_scaled_reward": -0.09993909671902657,
"rewards/format_reward": 0.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2932.5416870117188,
"epoch": 0.21765209940017138,
"grad_norm": 0.15654343366622925,
"kl": 0.0043792724609375,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0405,
"reward": -0.17467445600777864,
"reward_std": 0.5738040953874588,
"rewards/cosine_scaled_reward": -0.0873372326605022,
"rewards/format_reward": 0.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 3097.6666870117188,
"epoch": 0.21936589545844046,
"grad_norm": 0.16381874680519104,
"kl": 0.00551605224609375,
"learning_rate": 9.36531953618799e-07,
"loss": -0.0288,
"reward": -0.20874720811843872,
"reward_std": 0.5535652860999107,
"rewards/cosine_scaled_reward": -0.10437360778450966,
"rewards/format_reward": 0.0,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2531.861114501953,
"epoch": 0.2210796915167095,
"grad_norm": 0.26021480560302734,
"kl": 0.006000518798828125,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0459,
"reward": -0.044261377304792404,
"reward_std": 0.4739195331931114,
"rewards/cosine_scaled_reward": -0.022130683064460754,
"rewards/format_reward": 0.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 2091.138916015625,
"epoch": 0.22279348757497858,
"grad_norm": 0.22645580768585205,
"kl": 0.00482940673828125,
"learning_rate": 9.332771203643714e-07,
"loss": -0.0704,
"reward": 0.38943320140242577,
"reward_std": 0.7351026237010956,
"rewards/cosine_scaled_reward": 0.19471661932766438,
"rewards/format_reward": 0.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 3235.513916015625,
"epoch": 0.22450728363324765,
"grad_norm": 0.14915120601654053,
"kl": 0.003574371337890625,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0073,
"reward": -0.32377296313643456,
"reward_std": 0.48132046312093735,
"rewards/cosine_scaled_reward": -0.16188647784292698,
"rewards/format_reward": 0.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 3044.4166870117188,
"epoch": 0.2262210796915167,
"grad_norm": 0.16817504167556763,
"kl": 0.003704071044921875,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0174,
"reward": -0.18535634828731418,
"reward_std": 0.6574838161468506,
"rewards/cosine_scaled_reward": -0.0926781720481813,
"rewards/format_reward": 0.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 3003.4444580078125,
"epoch": 0.22793487574978577,
"grad_norm": 0.15358978509902954,
"kl": 0.0041980743408203125,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0337,
"reward": -0.05171632254496217,
"reward_std": 0.5909973978996277,
"rewards/cosine_scaled_reward": -0.025858158012852073,
"rewards/format_reward": 0.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 2156.4583740234375,
"epoch": 0.22964867180805484,
"grad_norm": 0.1683642566204071,
"kl": 0.00467681884765625,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0033,
"reward": 0.009663693606853485,
"reward_std": 0.4995303153991699,
"rewards/cosine_scaled_reward": 0.004831850528717041,
"rewards/format_reward": 0.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2540.2777709960938,
"epoch": 0.23136246786632392,
"grad_norm": 0.1953487992286682,
"kl": 0.00795745849609375,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0474,
"reward": -0.21851413743570447,
"reward_std": 0.5443524122238159,
"rewards/cosine_scaled_reward": -0.10925705661065876,
"rewards/format_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 2695.0694885253906,
"epoch": 0.23307626392459296,
"grad_norm": 0.1705743372440338,
"kl": 0.0045166015625,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0191,
"reward": 0.05242172256112099,
"reward_std": 0.5593772605061531,
"rewards/cosine_scaled_reward": 0.026210861280560493,
"rewards/format_reward": 0.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 3158.9444580078125,
"epoch": 0.23479005998286204,
"grad_norm": 0.17036336660385132,
"kl": 0.00504302978515625,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0254,
"reward": 0.028430916368961334,
"reward_std": 0.7066435366868973,
"rewards/cosine_scaled_reward": 0.014215447008609772,
"rewards/format_reward": 0.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 3004.0416259765625,
"epoch": 0.2365038560411311,
"grad_norm": 0.1331450194120407,
"kl": 0.003459930419921875,
"learning_rate": 9.195171441101668e-07,
"loss": -0.0176,
"reward": -0.014733657240867615,
"reward_std": 0.5561396405100822,
"rewards/cosine_scaled_reward": -0.007366828620433807,
"rewards/format_reward": 0.0,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 2905.4444580078125,
"epoch": 0.23821765209940018,
"grad_norm": 0.17066888511180878,
"kl": 0.00594329833984375,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0097,
"reward": -0.19389863312244415,
"reward_std": 0.47480132430791855,
"rewards/cosine_scaled_reward": -0.09694933146238327,
"rewards/format_reward": 0.0,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 2243.4722595214844,
"epoch": 0.23993144815766923,
"grad_norm": 0.2052508443593979,
"kl": 0.0069751739501953125,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0149,
"reward": -0.17606773134320974,
"reward_std": 0.4814153388142586,
"rewards/cosine_scaled_reward": -0.08803386008366942,
"rewards/format_reward": 0.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 3140.3472290039062,
"epoch": 0.2416452442159383,
"grad_norm": 0.2093152105808258,
"kl": 0.009227752685546875,
"learning_rate": 9.140576474687263e-07,
"loss": -0.0173,
"reward": -0.2940823882818222,
"reward_std": 0.46395206451416016,
"rewards/cosine_scaled_reward": -0.1470412015914917,
"rewards/format_reward": 0.0,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 3106.0833740234375,
"epoch": 0.24335904027420738,
"grad_norm": 0.15536610782146454,
"kl": 0.003704071044921875,
"learning_rate": 9.122022088101613e-07,
"loss": -0.0133,
"reward": -0.12113199383020401,
"reward_std": 0.5028039142489433,
"rewards/cosine_scaled_reward": -0.06056600622832775,
"rewards/format_reward": 0.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 3045.0,
"epoch": 0.24507283633247642,
"grad_norm": 0.1554841846227646,
"kl": 0.00397491455078125,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0112,
"reward": -0.24326159805059433,
"reward_std": 0.545206792652607,
"rewards/cosine_scaled_reward": -0.12163079530000687,
"rewards/format_reward": 0.0,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 3281.52783203125,
"epoch": 0.2467866323907455,
"grad_norm": 0.15741369128227234,
"kl": 0.004161834716796875,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0205,
"reward": -0.3316431827843189,
"reward_std": 0.5960408300161362,
"rewards/cosine_scaled_reward": -0.16582159511744976,
"rewards/format_reward": 0.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 2848.000030517578,
"epoch": 0.24850042844901457,
"grad_norm": 0.16731388866901398,
"kl": 0.004131317138671875,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0076,
"reward": -0.030747827142477036,
"reward_std": 0.532738171517849,
"rewards/cosine_scaled_reward": -0.015373910777270794,
"rewards/format_reward": 0.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 2467.6944580078125,
"epoch": 0.25021422450728364,
"grad_norm": 0.21354977786540985,
"kl": 0.004360198974609375,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0206,
"reward": 0.0201254915446043,
"reward_std": 0.8671004623174667,
"rewards/cosine_scaled_reward": 0.010062748566269875,
"rewards/format_reward": 0.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 2855.6666870117188,
"epoch": 0.2519280205655527,
"grad_norm": 0.144964799284935,
"kl": 0.004791259765625,
"learning_rate": 9.026620557966279e-07,
"loss": -0.0329,
"reward": -0.2643125932663679,
"reward_std": 0.5043439790606499,
"rewards/cosine_scaled_reward": -0.13215629663318396,
"rewards/format_reward": 0.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 3096.513916015625,
"epoch": 0.2536418166238218,
"grad_norm": 0.14218087494373322,
"kl": 0.003002166748046875,
"learning_rate": 9.007020842191634e-07,
"loss": -0.0089,
"reward": -0.221635602414608,
"reward_std": 0.4477159082889557,
"rewards/cosine_scaled_reward": -0.11081778630614281,
"rewards/format_reward": 0.0,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 2720.263916015625,
"epoch": 0.25535561268209084,
"grad_norm": 0.17918290197849274,
"kl": 0.004497528076171875,
"learning_rate": 8.987250199168808e-07,
"loss": -0.0669,
"reward": -0.07472209073603153,
"reward_std": 0.6641673818230629,
"rewards/cosine_scaled_reward": -0.03736104257404804,
"rewards/format_reward": 0.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 2809.666717529297,
"epoch": 0.2570694087403599,
"grad_norm": 0.13525010645389557,
"kl": 0.004337310791015625,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0411,
"reward": 0.32146409433335066,
"reward_std": 0.6463728100061417,
"rewards/cosine_scaled_reward": 0.16073204344138503,
"rewards/format_reward": 0.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 2688.6944580078125,
"epoch": 0.258783204798629,
"grad_norm": 0.29565665125846863,
"kl": 0.0060577392578125,
"learning_rate": 8.9471999940354e-07,
"loss": -0.0998,
"reward": -0.27624649833887815,
"reward_std": 0.46585455536842346,
"rewards/cosine_scaled_reward": -0.13812324171885848,
"rewards/format_reward": 0.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 2944.888916015625,
"epoch": 0.26049700085689803,
"grad_norm": 0.15996259450912476,
"kl": 0.005001068115234375,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0467,
"reward": -0.09553277865052223,
"reward_std": 0.5195184722542763,
"rewards/cosine_scaled_reward": -0.047766391187906265,
"rewards/format_reward": 0.0,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2880.5833740234375,
"epoch": 0.2622107969151671,
"grad_norm": 0.16603334248065948,
"kl": 0.003936767578125,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0105,
"reward": -0.19235826842486858,
"reward_std": 0.5736033394932747,
"rewards/cosine_scaled_reward": -0.09617912326939404,
"rewards/format_reward": 0.0,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 2859.0972900390625,
"epoch": 0.2639245929734362,
"grad_norm": 0.17567692697048187,
"kl": 0.004161834716796875,
"learning_rate": 8.88586709003076e-07,
"loss": -0.0056,
"reward": -0.19033684581518173,
"reward_std": 0.5773953720927238,
"rewards/cosine_scaled_reward": -0.09516842663288116,
"rewards/format_reward": 0.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 3215.1666870117188,
"epoch": 0.2656383890317052,
"grad_norm": 0.14003609120845795,
"kl": 0.004474639892578125,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0216,
"reward": -0.1411176547408104,
"reward_std": 0.6216752380132675,
"rewards/cosine_scaled_reward": -0.07055883854627609,
"rewards/format_reward": 0.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 2929.1250610351562,
"epoch": 0.26735218508997427,
"grad_norm": 0.14357882738113403,
"kl": 0.003513336181640625,
"learning_rate": 8.844151714648274e-07,
"loss": -0.0355,
"reward": -0.26859963312745094,
"reward_std": 0.501942828297615,
"rewards/cosine_scaled_reward": -0.13429982028901577,
"rewards/format_reward": 0.0,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2811.6666870117188,
"epoch": 0.26906598114824337,
"grad_norm": 0.18389619886875153,
"kl": 0.006900787353515625,
"learning_rate": 8.823049032816478e-07,
"loss": -0.049,
"reward": 0.005984093062579632,
"reward_std": 0.7341288924217224,
"rewards/cosine_scaled_reward": 0.0029920428059995174,
"rewards/format_reward": 0.0,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 2909.77783203125,
"epoch": 0.2707797772065124,
"grad_norm": 0.13957001268863678,
"kl": 0.0042877197265625,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0033,
"reward": -0.17342954874038696,
"reward_std": 0.4903194531798363,
"rewards/cosine_scaled_reward": -0.08671476691961288,
"rewards/format_reward": 0.0,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 3235.013916015625,
"epoch": 0.27249357326478146,
"grad_norm": 0.15003739297389984,
"kl": 0.005523681640625,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0068,
"reward": -0.3410843312740326,
"reward_std": 0.502905935049057,
"rewards/cosine_scaled_reward": -0.17054216749966145,
"rewards/format_reward": 0.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 3160.6805419921875,
"epoch": 0.27420736932305056,
"grad_norm": 0.1586807668209076,
"kl": 0.00443267822265625,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0141,
"reward": 0.04759278893470764,
"reward_std": 0.6465433575212955,
"rewards/cosine_scaled_reward": 0.02379640005528927,
"rewards/format_reward": 0.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 2961.3333129882812,
"epoch": 0.2759211653813196,
"grad_norm": 0.18396639823913574,
"kl": 0.0078125,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0282,
"reward": -0.32911188155412674,
"reward_std": 0.6032818555831909,
"rewards/cosine_scaled_reward": -0.16455595009028912,
"rewards/format_reward": 0.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2621.2222290039062,
"epoch": 0.2776349614395887,
"grad_norm": 0.15461236238479614,
"kl": 0.00507354736328125,
"learning_rate": 8.715127058347614e-07,
"loss": -0.0194,
"reward": -0.4356637103483081,
"reward_std": 0.36323027312755585,
"rewards/cosine_scaled_reward": -0.21783185191452503,
"rewards/format_reward": 0.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 3038.4027709960938,
"epoch": 0.27934875749785776,
"grad_norm": 0.12717723846435547,
"kl": 0.005706787109375,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0023,
"reward": -0.04007915942929685,
"reward_std": 0.6919823586940765,
"rewards/cosine_scaled_reward": -0.02003958181012422,
"rewards/format_reward": 0.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 2678.7361755371094,
"epoch": 0.2810625535561268,
"grad_norm": 0.19941791892051697,
"kl": 0.005207061767578125,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0441,
"reward": -0.040945328772068024,
"reward_std": 0.5933430567383766,
"rewards/cosine_scaled_reward": -0.020472656935453415,
"rewards/format_reward": 0.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 2895.0416870117188,
"epoch": 0.2827763496143959,
"grad_norm": 0.16098277270793915,
"kl": 0.0064697265625,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0293,
"reward": -0.09013996832072735,
"reward_std": 0.5875271111726761,
"rewards/cosine_scaled_reward": -0.04506997298449278,
"rewards/format_reward": 0.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 2754.263916015625,
"epoch": 0.28449014567266495,
"grad_norm": 0.15243615210056305,
"kl": 0.00676727294921875,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0191,
"reward": -0.0630449466407299,
"reward_std": 0.6104780063033104,
"rewards/cosine_scaled_reward": -0.0315224789083004,
"rewards/format_reward": 0.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 3024.9166870117188,
"epoch": 0.286203941730934,
"grad_norm": 0.14153960347175598,
"kl": 0.0053863525390625,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0428,
"reward": -0.1417745603248477,
"reward_std": 0.7242364957928658,
"rewards/cosine_scaled_reward": -0.07088728016242385,
"rewards/format_reward": 0.0,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 3074.4306030273438,
"epoch": 0.2879177377892031,
"grad_norm": 0.1459978222846985,
"kl": 0.0064067840576171875,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0112,
"reward": -0.01038459874689579,
"reward_std": 0.7124739363789558,
"rewards/cosine_scaled_reward": -0.0051923105493187904,
"rewards/format_reward": 0.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 3006.638916015625,
"epoch": 0.28963153384747214,
"grad_norm": 0.2151106894016266,
"kl": 0.00925445556640625,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0553,
"reward": -0.2934446856379509,
"reward_std": 0.5195991396903992,
"rewards/cosine_scaled_reward": -0.14672234281897545,
"rewards/format_reward": 0.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 2720.8194274902344,
"epoch": 0.2913453299057412,
"grad_norm": 0.16352801024913788,
"kl": 0.00519561767578125,
"learning_rate": 8.534360744126753e-07,
"loss": 0.061,
"reward": 0.10783382831141353,
"reward_std": 0.6230225935578346,
"rewards/cosine_scaled_reward": 0.053916911128908396,
"rewards/format_reward": 0.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 2681.388946533203,
"epoch": 0.2930591259640103,
"grad_norm": 0.17118766903877258,
"kl": 0.00641632080078125,
"learning_rate": 8.511087728614862e-07,
"loss": 0.026,
"reward": -0.0785403607878834,
"reward_std": 0.5736416950821877,
"rewards/cosine_scaled_reward": -0.039270187029615045,
"rewards/format_reward": 0.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 2987.2361450195312,
"epoch": 0.29477292202227934,
"grad_norm": 0.15370719134807587,
"kl": 0.0035552978515625,
"learning_rate": 8.487667956935087e-07,
"loss": -0.0033,
"reward": -0.02974682953208685,
"reward_std": 0.5253070890903473,
"rewards/cosine_scaled_reward": -0.014873407315462828,
"rewards/format_reward": 0.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 2586.4027709960938,
"epoch": 0.29648671808054844,
"grad_norm": 0.22191597521305084,
"kl": 0.0059356689453125,
"learning_rate": 8.464102570534061e-07,
"loss": -0.0092,
"reward": -0.2831332399509847,
"reward_std": 0.5445848181843758,
"rewards/cosine_scaled_reward": -0.14156663417816162,
"rewards/format_reward": 0.0,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 2655.2222900390625,
"epoch": 0.2982005141388175,
"grad_norm": 0.22858025133609772,
"kl": 0.01116943359375,
"learning_rate": 8.440392717955475e-07,
"loss": -0.0181,
"reward": -0.2866486459970474,
"reward_std": 0.5677091330289841,
"rewards/cosine_scaled_reward": -0.14332432113587856,
"rewards/format_reward": 0.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 2976.8194580078125,
"epoch": 0.29991431019708653,
"grad_norm": 0.15686574578285217,
"kl": 0.00734710693359375,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0308,
"reward": -0.3254437707364559,
"reward_std": 0.5169026479125023,
"rewards/cosine_scaled_reward": -0.16272189188748598,
"rewards/format_reward": 0.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 2500.736114501953,
"epoch": 0.30162810625535563,
"grad_norm": 0.2628232538700104,
"kl": 0.03589630126953125,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0534,
"reward": -0.1589430421590805,
"reward_std": 0.6641415655612946,
"rewards/cosine_scaled_reward": -0.07947152107954025,
"rewards/format_reward": 0.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 3302.7361450195312,
"epoch": 0.3033419023136247,
"grad_norm": 0.13509048521518707,
"kl": 0.003936767578125,
"learning_rate": 8.368407953869103e-07,
"loss": -0.0077,
"reward": -0.3392331041395664,
"reward_std": 0.44542837142944336,
"rewards/cosine_scaled_reward": -0.1696165525354445,
"rewards/format_reward": 0.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 2946.9306030273438,
"epoch": 0.3050556983718937,
"grad_norm": 0.14318227767944336,
"kl": 0.004425048828125,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0129,
"reward": 0.040801383554935455,
"reward_std": 0.47273271530866623,
"rewards/cosine_scaled_reward": 0.02040068805217743,
"rewards/format_reward": 0.0,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 3092.125,
"epoch": 0.3067694944301628,
"grad_norm": 0.15563301742076874,
"kl": 0.010219573974609375,
"learning_rate": 8.319717151140072e-07,
"loss": 0.045,
"reward": -0.1892098607495427,
"reward_std": 0.5936430767178535,
"rewards/cosine_scaled_reward": -0.09460492385551333,
"rewards/format_reward": 0.0,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 1971.5138854980469,
"epoch": 0.30848329048843187,
"grad_norm": 0.19795306026935577,
"kl": 0.00974273681640625,
"learning_rate": 8.295165011252396e-07,
"loss": -0.0138,
"reward": -0.11939475126564503,
"reward_std": 0.6153334528207779,
"rewards/cosine_scaled_reward": -0.05969736957922578,
"rewards/format_reward": 0.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 3067.1945190429688,
"epoch": 0.3101970865467009,
"grad_norm": 0.15797466039657593,
"kl": 0.005138397216796875,
"learning_rate": 8.270476638965461e-07,
"loss": -0.0212,
"reward": 0.10869292449206114,
"reward_std": 0.6324612945318222,
"rewards/cosine_scaled_reward": 0.054346468299627304,
"rewards/format_reward": 0.0,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 3268.486083984375,
"epoch": 0.31191088260497,
"grad_norm": 0.15513566136360168,
"kl": 0.006145477294921875,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0633,
"reward": -0.2609336208552122,
"reward_std": 0.49053191393613815,
"rewards/cosine_scaled_reward": -0.13046680949628353,
"rewards/format_reward": 0.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 2989.84716796875,
"epoch": 0.31362467866323906,
"grad_norm": 0.15209534764289856,
"kl": 0.00447845458984375,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0061,
"reward": -0.040327644906938076,
"reward_std": 0.717703215777874,
"rewards/cosine_scaled_reward": -0.02016383269801736,
"rewards/format_reward": 0.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 2890.4583740234375,
"epoch": 0.31533847472150817,
"grad_norm": 0.1359020322561264,
"kl": 0.006763458251953125,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0369,
"reward": -0.22703023999929428,
"reward_std": 0.6005472913384438,
"rewards/cosine_scaled_reward": -0.11351512093096972,
"rewards/format_reward": 0.0,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 2926.3194580078125,
"epoch": 0.3170522707797772,
"grad_norm": 0.13524238765239716,
"kl": 0.0067901611328125,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0495,
"reward": -0.17516471818089485,
"reward_std": 0.5499648228287697,
"rewards/cosine_scaled_reward": -0.08758235163986683,
"rewards/format_reward": 0.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2758.0833740234375,
"epoch": 0.31876606683804626,
"grad_norm": 0.19634363055229187,
"kl": 0.0046234130859375,
"learning_rate": 8.145033635316128e-07,
"loss": -0.0094,
"reward": -0.17140711098909378,
"reward_std": 0.5592127367854118,
"rewards/cosine_scaled_reward": -0.08570355176925659,
"rewards/format_reward": 0.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 2399.749969482422,
"epoch": 0.32047986289631536,
"grad_norm": 0.14529581367969513,
"kl": 0.00421905517578125,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0233,
"reward": 0.07888301834464073,
"reward_std": 0.7940803468227386,
"rewards/cosine_scaled_reward": 0.03944151382893324,
"rewards/format_reward": 0.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 3158.486083984375,
"epoch": 0.3221936589545844,
"grad_norm": 0.15482233464717865,
"kl": 0.0067596435546875,
"learning_rate": 8.093945422764069e-07,
"loss": -0.0061,
"reward": -0.22822286747395992,
"reward_std": 0.48042069375514984,
"rewards/cosine_scaled_reward": -0.1141114397905767,
"rewards/format_reward": 0.0,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 2875.52783203125,
"epoch": 0.32390745501285345,
"grad_norm": 0.16830354928970337,
"kl": 0.00731658935546875,
"learning_rate": 8.068211054579943e-07,
"loss": -0.0214,
"reward": -0.27129118889570236,
"reward_std": 0.44227684289216995,
"rewards/cosine_scaled_reward": -0.13564559258520603,
"rewards/format_reward": 0.0,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 3101.52783203125,
"epoch": 0.32562125107112255,
"grad_norm": 0.17314012348651886,
"kl": 0.006618499755859375,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0361,
"reward": -0.29743205150589347,
"reward_std": 0.6253781244158745,
"rewards/cosine_scaled_reward": -0.1487160255201161,
"rewards/format_reward": 0.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 3005.0694580078125,
"epoch": 0.3273350471293916,
"grad_norm": 0.13242636620998383,
"kl": 0.005157470703125,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0398,
"reward": -0.2783219777047634,
"reward_std": 0.5744869485497475,
"rewards/cosine_scaled_reward": -0.139160992577672,
"rewards/format_reward": 0.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 2682.2777709960938,
"epoch": 0.32904884318766064,
"grad_norm": 0.1771107167005539,
"kl": 0.00849151611328125,
"learning_rate": 7.990261971595048e-07,
"loss": -0.0275,
"reward": -0.16758478805422783,
"reward_std": 0.5308270826935768,
"rewards/cosine_scaled_reward": -0.08379239588975906,
"rewards/format_reward": 0.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 2224.2083587646484,
"epoch": 0.33076263924592975,
"grad_norm": 0.2606137990951538,
"kl": 0.0111236572265625,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0598,
"reward": -0.1425977125763893,
"reward_std": 0.6462048292160034,
"rewards/cosine_scaled_reward": -0.0712988581508398,
"rewards/format_reward": 0.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 2366.65283203125,
"epoch": 0.3324764353041988,
"grad_norm": 0.20748130977153778,
"kl": 0.00627899169921875,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0302,
"reward": 0.07216466031968594,
"reward_std": 0.5604969188570976,
"rewards/cosine_scaled_reward": 0.03608234319835901,
"rewards/format_reward": 0.0,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 3086.3889770507812,
"epoch": 0.3341902313624679,
"grad_norm": 0.16518257558345795,
"kl": 0.007572174072265625,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0403,
"reward": -0.2750488445162773,
"reward_std": 0.44911373406648636,
"rewards/cosine_scaled_reward": -0.13752441480755806,
"rewards/format_reward": 0.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2695.0416870117188,
"epoch": 0.33590402742073694,
"grad_norm": 0.14707054197788239,
"kl": 0.0079803466796875,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0299,
"reward": 0.3252771459519863,
"reward_std": 0.7292146235704422,
"rewards/cosine_scaled_reward": 0.162638571113348,
"rewards/format_reward": 0.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 2711.7083129882812,
"epoch": 0.337617823479006,
"grad_norm": 0.19674766063690186,
"kl": 0.00566864013671875,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0125,
"reward": 0.1904342882335186,
"reward_std": 0.6823486983776093,
"rewards/cosine_scaled_reward": 0.09521715994924307,
"rewards/format_reward": 0.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 2801.2361450195312,
"epoch": 0.3393316195372751,
"grad_norm": 0.17002622783184052,
"kl": 0.00652313232421875,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0551,
"reward": -0.1881256103515625,
"reward_std": 0.41709040850400925,
"rewards/cosine_scaled_reward": -0.0940628070384264,
"rewards/format_reward": 0.0,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 2945.0139770507812,
"epoch": 0.34104541559554413,
"grad_norm": 0.17246587574481964,
"kl": 0.006256103515625,
"learning_rate": 7.804192891917571e-07,
"loss": -0.0014,
"reward": -0.20545833744108677,
"reward_std": 0.5765868201851845,
"rewards/cosine_scaled_reward": -0.10272916965186596,
"rewards/format_reward": 0.0,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2718.9583435058594,
"epoch": 0.3427592116538132,
"grad_norm": 0.184196338057518,
"kl": 0.008544921875,
"learning_rate": 7.777151938545235e-07,
"loss": 0.016,
"reward": -0.036401793360710144,
"reward_std": 0.7076919972896576,
"rewards/cosine_scaled_reward": -0.01820090040564537,
"rewards/format_reward": 0.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 2624.166717529297,
"epoch": 0.3444730077120823,
"grad_norm": 0.2025025188922882,
"kl": 0.00604248046875,
"learning_rate": 7.75e-07,
"loss": -0.0684,
"reward": -0.23150286450982094,
"reward_std": 0.4834456667304039,
"rewards/cosine_scaled_reward": -0.11575142852962017,
"rewards/format_reward": 0.0,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 2488.5556030273438,
"epoch": 0.3461868037703513,
"grad_norm": 0.15225747227668762,
"kl": 0.005893707275390625,
"learning_rate": 7.72273839962904e-07,
"loss": -0.0317,
"reward": 0.06343521224334836,
"reward_std": 0.6216820403933525,
"rewards/cosine_scaled_reward": 0.03171759960241616,
"rewards/format_reward": 0.0,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 2638.0556030273438,
"epoch": 0.34790059982862037,
"grad_norm": 0.19878201186656952,
"kl": 0.00801849365234375,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0177,
"reward": 0.24296507984399796,
"reward_std": 0.7006724625825882,
"rewards/cosine_scaled_reward": 0.12148253805935383,
"rewards/format_reward": 0.0,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 2701.2499389648438,
"epoch": 0.3496143958868895,
"grad_norm": 0.16115106642246246,
"kl": 0.005344390869140625,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0175,
"reward": -0.01583041623234749,
"reward_std": 0.5048926845192909,
"rewards/cosine_scaled_reward": -0.007915209047496319,
"rewards/format_reward": 0.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2820.15283203125,
"epoch": 0.3513281919451585,
"grad_norm": 0.1620146483182907,
"kl": 0.00921630859375,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0106,
"reward": 0.10508427396416664,
"reward_std": 0.5011924579739571,
"rewards/cosine_scaled_reward": 0.05254213139414787,
"rewards/format_reward": 0.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 2766.77783203125,
"epoch": 0.35304198800342756,
"grad_norm": 0.17725811898708344,
"kl": 0.0068359375,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0292,
"reward": -0.025651058182120323,
"reward_std": 0.6831357106566429,
"rewards/cosine_scaled_reward": -0.012825531885027885,
"rewards/format_reward": 0.0,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2571.3333740234375,
"epoch": 0.35475578406169667,
"grad_norm": 0.2153560221195221,
"kl": 0.00772857666015625,
"learning_rate": 7.584832158039378e-07,
"loss": -0.0053,
"reward": -0.0772455558180809,
"reward_std": 0.5703203156590462,
"rewards/cosine_scaled_reward": -0.038622772321105,
"rewards/format_reward": 0.0,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 2722.9444580078125,
"epoch": 0.3564695801199657,
"grad_norm": 0.2059468924999237,
"kl": 0.00662994384765625,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0612,
"reward": -0.18379988404922187,
"reward_std": 0.6482012867927551,
"rewards/cosine_scaled_reward": -0.09189994307234883,
"rewards/format_reward": 0.0,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2714.9445190429688,
"epoch": 0.3581833761782348,
"grad_norm": 0.1764851063489914,
"kl": 0.00818634033203125,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0477,
"reward": -0.011997078021522611,
"reward_std": 0.6311939656734467,
"rewards/cosine_scaled_reward": -0.005998534747050144,
"rewards/format_reward": 0.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 2458.9583740234375,
"epoch": 0.35989717223650386,
"grad_norm": 0.23969826102256775,
"kl": 0.00959014892578125,
"learning_rate": 7.500858306332172e-07,
"loss": -0.0174,
"reward": -0.052909690886735916,
"reward_std": 0.6342033296823502,
"rewards/cosine_scaled_reward": -0.026454854756593704,
"rewards/format_reward": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 3275.9722290039062,
"epoch": 0.3616109682947729,
"grad_norm": 0.14406003057956696,
"kl": 0.0072784423828125,
"learning_rate": 7.472670160550848e-07,
"loss": -0.0075,
"reward": -0.4154173508286476,
"reward_std": 0.47341830283403397,
"rewards/cosine_scaled_reward": -0.2077086754143238,
"rewards/format_reward": 0.0,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 2626.1805419921875,
"epoch": 0.363324764353042,
"grad_norm": 0.1677497923374176,
"kl": 0.007781982421875,
"learning_rate": 7.444385869608921e-07,
"loss": -0.0218,
"reward": -0.05068176053464413,
"reward_std": 0.5218113884329796,
"rewards/cosine_scaled_reward": -0.025340883061289787,
"rewards/format_reward": 0.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 2894.8472290039062,
"epoch": 0.36503856041131105,
"grad_norm": 0.17942826449871063,
"kl": 0.00614166259765625,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0757,
"reward": -0.09456230141222477,
"reward_std": 0.6797711104154587,
"rewards/cosine_scaled_reward": -0.0472811465151608,
"rewards/format_reward": 0.0,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 2945.8750610351562,
"epoch": 0.3667523564695801,
"grad_norm": 0.1967238038778305,
"kl": 0.0100555419921875,
"learning_rate": 7.387534371007797e-07,
"loss": -0.0128,
"reward": -0.10412277281284332,
"reward_std": 0.7091450989246368,
"rewards/cosine_scaled_reward": -0.05206138640642166,
"rewards/format_reward": 0.0,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 2597.4444580078125,
"epoch": 0.3684661525278492,
"grad_norm": 0.19232463836669922,
"kl": 0.00603485107421875,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0557,
"reward": 0.07514850981533527,
"reward_std": 0.5688696801662445,
"rewards/cosine_scaled_reward": 0.03757425490766764,
"rewards/format_reward": 0.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 3071.6805419921875,
"epoch": 0.37017994858611825,
"grad_norm": 0.15334580838680267,
"kl": 0.00914764404296875,
"learning_rate": 7.330314893841101e-07,
"loss": -0.0092,
"reward": -0.3550204383209348,
"reward_std": 0.36161456257104874,
"rewards/cosine_scaled_reward": -0.1775102224200964,
"rewards/format_reward": 0.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 2696.5972595214844,
"epoch": 0.3718937446443873,
"grad_norm": 0.1864735186100006,
"kl": 0.005496978759765625,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0101,
"reward": -0.07679219171404839,
"reward_std": 0.6243979334831238,
"rewards/cosine_scaled_reward": -0.03839609259739518,
"rewards/format_reward": 0.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2563.0834045410156,
"epoch": 0.3736075407026564,
"grad_norm": 0.16931480169296265,
"kl": 0.005645751953125,
"learning_rate": 7.27273859315928e-07,
"loss": -0.006,
"reward": -0.06438015587627888,
"reward_std": 0.4739932492375374,
"rewards/cosine_scaled_reward": -0.032190063036978245,
"rewards/format_reward": 0.0,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 3087.9306030273438,
"epoch": 0.37532133676092544,
"grad_norm": 0.15486636757850647,
"kl": 0.00830841064453125,
"learning_rate": 7.243820139034464e-07,
"loss": 0.034,
"reward": -0.1913878731429577,
"reward_std": 0.7374170869588852,
"rewards/cosine_scaled_reward": -0.09569394029676914,
"rewards/format_reward": 0.0,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 3060.9445190429688,
"epoch": 0.37703513281919454,
"grad_norm": 0.1392551213502884,
"kl": 0.00780487060546875,
"learning_rate": 7.214816693576234e-07,
"loss": -0.0219,
"reward": -0.3524288460612297,
"reward_std": 0.4711146801710129,
"rewards/cosine_scaled_reward": -0.17621441558003426,
"rewards/format_reward": 0.0,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 3082.2083129882812,
"epoch": 0.3787489288774636,
"grad_norm": 0.15662223100662231,
"kl": 0.006549835205078125,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0127,
"reward": -0.051485654432326555,
"reward_std": 0.5929789990186691,
"rewards/cosine_scaled_reward": -0.025742830068338662,
"rewards/format_reward": 0.0,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 2778.666717529297,
"epoch": 0.38046272493573263,
"grad_norm": 0.17736981809139252,
"kl": 0.008647918701171875,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0084,
"reward": -0.13847951218485832,
"reward_std": 0.5384139195084572,
"rewards/cosine_scaled_reward": -0.06923975050449371,
"rewards/format_reward": 0.0,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 3187.9305419921875,
"epoch": 0.38217652099400173,
"grad_norm": 0.13862548768520355,
"kl": 0.00606536865234375,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0539,
"reward": -0.3446214310824871,
"reward_std": 0.46420831978321075,
"rewards/cosine_scaled_reward": -0.1723107136785984,
"rewards/format_reward": 0.0,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 3035.513916015625,
"epoch": 0.3838903170522708,
"grad_norm": 0.17018531262874603,
"kl": 0.0105438232421875,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0259,
"reward": -0.21396764740347862,
"reward_std": 0.5872293263673782,
"rewards/cosine_scaled_reward": -0.10698381997644901,
"rewards/format_reward": 0.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 2804.013916015625,
"epoch": 0.3856041131105398,
"grad_norm": 0.19500206410884857,
"kl": 0.00768280029296875,
"learning_rate": 7.068574212948169e-07,
"loss": 0.1055,
"reward": -0.22558368369936943,
"reward_std": 0.6132937371730804,
"rewards/cosine_scaled_reward": -0.11279183439910412,
"rewards/format_reward": 0.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 2667.0555725097656,
"epoch": 0.3873179091688089,
"grad_norm": 0.18770119547843933,
"kl": 0.007568359375,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0042,
"reward": -0.05162630486302078,
"reward_std": 0.6696203723549843,
"rewards/cosine_scaled_reward": -0.025813143118284643,
"rewards/format_reward": 0.0,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2906.6805419921875,
"epoch": 0.389031705227078,
"grad_norm": 0.16604122519493103,
"kl": 0.0076141357421875,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0146,
"reward": -0.1345351382624358,
"reward_std": 0.7545941472053528,
"rewards/cosine_scaled_reward": -0.0672675691312179,
"rewards/format_reward": 0.0,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 2913.791748046875,
"epoch": 0.390745501285347,
"grad_norm": 0.36757490038871765,
"kl": 0.006805419921875,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0796,
"reward": 0.061263229697942734,
"reward_std": 0.5674895793199539,
"rewards/cosine_scaled_reward": 0.030631612986326218,
"rewards/format_reward": 0.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 2708.4027709960938,
"epoch": 0.3924592973436161,
"grad_norm": 0.17164915800094604,
"kl": 0.0065765380859375,
"learning_rate": 6.950195628537299e-07,
"loss": 0.061,
"reward": -0.17019816813990474,
"reward_std": 0.5833596885204315,
"rewards/cosine_scaled_reward": -0.08509908034466207,
"rewards/format_reward": 0.0,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 2934.0555419921875,
"epoch": 0.39417309340188517,
"grad_norm": 0.16252191364765167,
"kl": 0.0125732421875,
"learning_rate": 6.920420666261961e-07,
"loss": -0.0251,
"reward": -0.27500685676932335,
"reward_std": 0.4450754225254059,
"rewards/cosine_scaled_reward": -0.13750343304127455,
"rewards/format_reward": 0.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 3062.4583740234375,
"epoch": 0.39588688946015427,
"grad_norm": 0.13106314837932587,
"kl": 0.0096435546875,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0074,
"reward": -0.11593299638479948,
"reward_std": 0.5865771174430847,
"rewards/cosine_scaled_reward": -0.057966490276157856,
"rewards/format_reward": 0.0,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2548.4444580078125,
"epoch": 0.3976006855184233,
"grad_norm": 0.15283794701099396,
"kl": 0.00823974609375,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0156,
"reward": 0.012726329267024994,
"reward_std": 0.6339813768863678,
"rewards/cosine_scaled_reward": 0.006363175809383392,
"rewards/format_reward": 0.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 1752.4861602783203,
"epoch": 0.39931448157669236,
"grad_norm": 0.17673321068286896,
"kl": 0.0053558349609375,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0344,
"reward": 0.3881940320134163,
"reward_std": 0.6750105991959572,
"rewards/cosine_scaled_reward": 0.19409702718257904,
"rewards/format_reward": 0.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 2785.6111450195312,
"epoch": 0.40102827763496146,
"grad_norm": 0.172316312789917,
"kl": 0.01160430908203125,
"learning_rate": 6.800643086250121e-07,
"loss": -0.0154,
"reward": -0.2950245440006256,
"reward_std": 0.6799461841583252,
"rewards/cosine_scaled_reward": -0.1475122720003128,
"rewards/format_reward": 0.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 3018.3056030273438,
"epoch": 0.4027420736932305,
"grad_norm": 0.15055446326732635,
"kl": 0.0095977783203125,
"learning_rate": 6.770536555792944e-07,
"loss": -0.0457,
"reward": -0.19169194623827934,
"reward_std": 0.4096248298883438,
"rewards/cosine_scaled_reward": -0.09584598150104284,
"rewards/format_reward": 0.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 3233.8889770507812,
"epoch": 0.40445586975149955,
"grad_norm": 0.14838387072086334,
"kl": 0.009735107421875,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0306,
"reward": -0.14736445620656013,
"reward_std": 0.6041549146175385,
"rewards/cosine_scaled_reward": -0.07368221180513501,
"rewards/format_reward": 0.0,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 2462.7361755371094,
"epoch": 0.40616966580976865,
"grad_norm": 0.21186563372612,
"kl": 0.0077667236328125,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0142,
"reward": -0.2296012807637453,
"reward_std": 0.5129070654511452,
"rewards/cosine_scaled_reward": -0.11480064131319523,
"rewards/format_reward": 0.0,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 3028.5277709960938,
"epoch": 0.4078834618680377,
"grad_norm": 0.15430369973182678,
"kl": 0.00980377197265625,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0326,
"reward": -0.04171431064605713,
"reward_std": 0.5160864554345608,
"rewards/cosine_scaled_reward": -0.020857159048318863,
"rewards/format_reward": 0.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 2878.597198486328,
"epoch": 0.40959725792630675,
"grad_norm": 0.1511092185974121,
"kl": 0.00661468505859375,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0053,
"reward": -0.08533445000648499,
"reward_std": 0.48660216480493546,
"rewards/cosine_scaled_reward": -0.04266723245382309,
"rewards/format_reward": 0.0,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 2145.986114501953,
"epoch": 0.41131105398457585,
"grad_norm": 0.19034837186336517,
"kl": 0.00772857666015625,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0412,
"reward": 0.22470230411272496,
"reward_std": 0.5070570334792137,
"rewards/cosine_scaled_reward": 0.11235115380259231,
"rewards/format_reward": 0.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2227.7361450195312,
"epoch": 0.4130248500428449,
"grad_norm": 0.1779133826494217,
"kl": 0.008626937866210938,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0419,
"reward": -0.0513172447681427,
"reward_std": 0.617318756878376,
"rewards/cosine_scaled_reward": -0.025658607482910156,
"rewards/format_reward": 0.0,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 3336.3333740234375,
"epoch": 0.414738646101114,
"grad_norm": 0.1791980117559433,
"kl": 0.00740814208984375,
"learning_rate": 6.558139508961654e-07,
"loss": -0.0064,
"reward": -0.14741731621325016,
"reward_std": 0.7067866027355194,
"rewards/cosine_scaled_reward": -0.07370865810662508,
"rewards/format_reward": 0.0,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 3000.888916015625,
"epoch": 0.41645244215938304,
"grad_norm": 0.146419495344162,
"kl": 0.006435394287109375,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0311,
"reward": -0.012151572853326797,
"reward_std": 0.7768204510211945,
"rewards/cosine_scaled_reward": -0.006075790151953697,
"rewards/format_reward": 0.0,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 2964.9444885253906,
"epoch": 0.4181662382176521,
"grad_norm": 0.1862625777721405,
"kl": 0.00849151611328125,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0372,
"reward": -0.16059484332799911,
"reward_std": 0.5683267489075661,
"rewards/cosine_scaled_reward": -0.08029741793870926,
"rewards/format_reward": 0.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 1833.4166717529297,
"epoch": 0.4198800342759212,
"grad_norm": 0.3224428594112396,
"kl": 0.0082550048828125,
"learning_rate": 6.466308972251785e-07,
"loss": -0.0459,
"reward": 0.06598322093486786,
"reward_std": 0.6559992954134941,
"rewards/cosine_scaled_reward": 0.032991619780659676,
"rewards/format_reward": 0.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 2580.0694580078125,
"epoch": 0.42159383033419023,
"grad_norm": 0.1514631062746048,
"kl": 0.01023101806640625,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0527,
"reward": -0.02805427461862564,
"reward_std": 0.6845656186342239,
"rewards/cosine_scaled_reward": -0.01402713917195797,
"rewards/format_reward": 0.0,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 3155.2500610351562,
"epoch": 0.4233076263924593,
"grad_norm": 0.1348743587732315,
"kl": 0.01012420654296875,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0538,
"reward": -0.11579635553061962,
"reward_std": 0.7224173843860626,
"rewards/cosine_scaled_reward": -0.057898176833987236,
"rewards/format_reward": 0.0,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 2719.7222290039062,
"epoch": 0.4250214224507284,
"grad_norm": 0.16808690130710602,
"kl": 0.010040283203125,
"learning_rate": 6.374054580489873e-07,
"loss": -0.0027,
"reward": -0.1423700600862503,
"reward_std": 0.41877883672714233,
"rewards/cosine_scaled_reward": -0.07118503004312515,
"rewards/format_reward": 0.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 3017.5416870117188,
"epoch": 0.4267352185089974,
"grad_norm": 0.13636116683483124,
"kl": 0.01111602783203125,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0335,
"reward": -0.16177499457262456,
"reward_std": 0.41518206894397736,
"rewards/cosine_scaled_reward": -0.08088749897433445,
"rewards/format_reward": 0.0,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 2664.6666870117188,
"epoch": 0.4284490145672665,
"grad_norm": 0.1738223433494568,
"kl": 0.00909423828125,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0581,
"reward": -0.14353771694004536,
"reward_std": 0.5664958357810974,
"rewards/cosine_scaled_reward": -0.07176885847002268,
"rewards/format_reward": 0.0,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 2695.2916870117188,
"epoch": 0.4301628106255356,
"grad_norm": 0.16924519836902618,
"kl": 0.009357452392578125,
"learning_rate": 6.281416799501187e-07,
"loss": -0.0019,
"reward": 0.09459428116679192,
"reward_std": 0.6146803349256516,
"rewards/cosine_scaled_reward": 0.04729713872075081,
"rewards/format_reward": 0.0,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 2714.5416259765625,
"epoch": 0.4318766066838046,
"grad_norm": 0.19001764059066772,
"kl": 0.0080413818359375,
"learning_rate": 6.25045936022246e-07,
"loss": -0.0049,
"reward": -0.00815525185316801,
"reward_std": 0.5676329433917999,
"rewards/cosine_scaled_reward": -0.00407763384282589,
"rewards/format_reward": 0.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 2847.4166870117188,
"epoch": 0.43359040274207367,
"grad_norm": 0.1463892161846161,
"kl": 0.01003265380859375,
"learning_rate": 6.219465344613258e-07,
"loss": -0.0337,
"reward": 0.009062569588422775,
"reward_std": 0.5907448679208755,
"rewards/cosine_scaled_reward": 0.0045312922447919846,
"rewards/format_reward": 0.0,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 2974.9166870117188,
"epoch": 0.43530419880034277,
"grad_norm": 0.15965710580348969,
"kl": 0.009674072265625,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0032,
"reward": 0.09970302879810333,
"reward_std": 0.4728682413697243,
"rewards/cosine_scaled_reward": 0.04985151067376137,
"rewards/format_reward": 0.0,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 2707.4027709960938,
"epoch": 0.4370179948586118,
"grad_norm": 0.1550796627998352,
"kl": 0.01021575927734375,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0007,
"reward": -0.08888162672519684,
"reward_std": 0.4977044016122818,
"rewards/cosine_scaled_reward": -0.04444081336259842,
"rewards/format_reward": 0.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 2744.763916015625,
"epoch": 0.4387317909168809,
"grad_norm": 0.21653364598751068,
"kl": 0.009979248046875,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0062,
"reward": -0.24449253268539906,
"reward_std": 0.4354872331023216,
"rewards/cosine_scaled_reward": -0.12224626448005438,
"rewards/format_reward": 0.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 2490.9722290039062,
"epoch": 0.44044558697514996,
"grad_norm": 0.1892397254705429,
"kl": 0.00962066650390625,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0269,
"reward": 0.1365387246478349,
"reward_std": 0.6730539947748184,
"rewards/cosine_scaled_reward": 0.06826936185825616,
"rewards/format_reward": 0.0,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 2796.7916870117188,
"epoch": 0.442159383033419,
"grad_norm": 0.16943146288394928,
"kl": 0.0094757080078125,
"learning_rate": 6.06399955103937e-07,
"loss": -0.0094,
"reward": -0.27603928185999393,
"reward_std": 0.517802283167839,
"rewards/cosine_scaled_reward": -0.13801964186131954,
"rewards/format_reward": 0.0,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 2211.4305419921875,
"epoch": 0.4438731790916881,
"grad_norm": 0.23119370639324188,
"kl": 0.0067596435546875,
"learning_rate": 6.032817857379256e-07,
"loss": 0.1106,
"reward": -0.10240336135029793,
"reward_std": 0.5084675773978233,
"rewards/cosine_scaled_reward": -0.05120168812572956,
"rewards/format_reward": 0.0,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 2785.3055419921875,
"epoch": 0.44558697514995715,
"grad_norm": 0.21458233892917633,
"kl": 0.00876617431640625,
"learning_rate": 6.001610194928464e-07,
"loss": -0.0013,
"reward": 0.051987094804644585,
"reward_std": 0.5341488644480705,
"rewards/cosine_scaled_reward": 0.02599355112761259,
"rewards/format_reward": 0.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 2539.2083740234375,
"epoch": 0.4473007712082262,
"grad_norm": 0.1954081803560257,
"kl": 0.01007843017578125,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0316,
"reward": 0.026572998613119125,
"reward_std": 0.42085136845707893,
"rewards/cosine_scaled_reward": 0.013286499306559563,
"rewards/format_reward": 0.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 2897.3472900390625,
"epoch": 0.4490145672664953,
"grad_norm": 0.1940917670726776,
"kl": 0.01140594482421875,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0318,
"reward": -0.05599740147590637,
"reward_std": 0.6964142769575119,
"rewards/cosine_scaled_reward": -0.027998706325888634,
"rewards/format_reward": 0.0,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 2977.875,
"epoch": 0.45072836332476435,
"grad_norm": 0.2107793092727661,
"kl": 0.011260986328125,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0458,
"reward": -0.443071685731411,
"reward_std": 0.4884059280157089,
"rewards/cosine_scaled_reward": -0.22153585404157639,
"rewards/format_reward": 0.0,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 2835.9166564941406,
"epoch": 0.4524421593830334,
"grad_norm": 0.1562654972076416,
"kl": 0.0097503662109375,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0527,
"reward": -0.31120575219392776,
"reward_std": 0.42043986171483994,
"rewards/cosine_scaled_reward": -0.15560288727283478,
"rewards/format_reward": 0.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 3025.416748046875,
"epoch": 0.4541559554413025,
"grad_norm": 0.35641464591026306,
"kl": 0.008880615234375,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0023,
"reward": 0.14537757262587547,
"reward_std": 0.4222983121871948,
"rewards/cosine_scaled_reward": 0.07268879748880863,
"rewards/format_reward": 0.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 2631.9445190429688,
"epoch": 0.45586975149957154,
"grad_norm": 0.1993650197982788,
"kl": 0.006988525390625,
"learning_rate": 5.813904131848564e-07,
"loss": 0.02,
"reward": -0.21425554435700178,
"reward_std": 0.6343535855412483,
"rewards/cosine_scaled_reward": -0.10712776239961386,
"rewards/format_reward": 0.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 2562.3611450195312,
"epoch": 0.45758354755784064,
"grad_norm": 0.22106732428073883,
"kl": 0.01442718505859375,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0396,
"reward": -0.292802631855011,
"reward_std": 0.3813341185450554,
"rewards/cosine_scaled_reward": -0.1464013159275055,
"rewards/format_reward": 0.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 2572.2083740234375,
"epoch": 0.4592973436161097,
"grad_norm": 0.1711571365594864,
"kl": 0.00759124755859375,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0197,
"reward": -0.2553995121270418,
"reward_std": 0.5235799252986908,
"rewards/cosine_scaled_reward": -0.1276997560635209,
"rewards/format_reward": 0.0,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 3053.2638549804688,
"epoch": 0.46101113967437873,
"grad_norm": 0.1386088728904724,
"kl": 0.00843048095703125,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0184,
"reward": -0.17865224927663803,
"reward_std": 0.5562375336885452,
"rewards/cosine_scaled_reward": -0.08932612743228674,
"rewards/format_reward": 0.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 2723.638916015625,
"epoch": 0.46272493573264784,
"grad_norm": 2.8520348072052,
"kl": 0.05461883544921875,
"learning_rate": 5.688440441781398e-07,
"loss": -0.0068,
"reward": 0.27612179331481457,
"reward_std": 0.7261447310447693,
"rewards/cosine_scaled_reward": 0.13806088734418154,
"rewards/format_reward": 0.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 2795.666748046875,
"epoch": 0.4644387317909169,
"grad_norm": 0.1723846048116684,
"kl": 0.01483154296875,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0373,
"reward": -0.03490264154970646,
"reward_std": 0.6204687505960464,
"rewards/cosine_scaled_reward": -0.017451307736337185,
"rewards/format_reward": 0.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2526.1944580078125,
"epoch": 0.4661525278491859,
"grad_norm": 0.2960320711135864,
"kl": 0.0132598876953125,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0815,
"reward": 0.11341174505650997,
"reward_std": 0.5083474740386009,
"rewards/cosine_scaled_reward": 0.05670587276108563,
"rewards/format_reward": 0.0,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 2640.1944580078125,
"epoch": 0.46786632390745503,
"grad_norm": 0.17620131373405457,
"kl": 0.009765625,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0112,
"reward": 0.11540575325489044,
"reward_std": 0.5552510917186737,
"rewards/cosine_scaled_reward": 0.05770287476480007,
"rewards/format_reward": 0.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 3208.8333129882812,
"epoch": 0.4695801199657241,
"grad_norm": 0.15742135047912598,
"kl": 0.01263427734375,
"learning_rate": 5.562829811526154e-07,
"loss": -0.0201,
"reward": -0.4686981365084648,
"reward_std": 0.3511890172958374,
"rewards/cosine_scaled_reward": -0.2343490682542324,
"rewards/format_reward": 0.0,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 2885.9306030273438,
"epoch": 0.4712939160239931,
"grad_norm": 0.18644562363624573,
"kl": 0.01094818115234375,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0224,
"reward": -0.2238161340355873,
"reward_std": 0.5779955387115479,
"rewards/cosine_scaled_reward": -0.11190806701779366,
"rewards/format_reward": 0.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 2641.7222290039062,
"epoch": 0.4730077120822622,
"grad_norm": 0.2060326635837555,
"kl": 0.009002685546875,
"learning_rate": 5.5e-07,
"loss": 0.0921,
"reward": -0.10621737875044346,
"reward_std": 0.572068989276886,
"rewards/cosine_scaled_reward": -0.053108690306544304,
"rewards/format_reward": 0.0,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 3040.3193969726562,
"epoch": 0.47472150814053127,
"grad_norm": 0.15230870246887207,
"kl": 0.009868621826171875,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0243,
"reward": -0.27920062592602335,
"reward_std": 0.4912775382399559,
"rewards/cosine_scaled_reward": -0.13960031296301167,
"rewards/format_reward": 0.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 2733.8472900390625,
"epoch": 0.47643530419880037,
"grad_norm": 0.16092219948768616,
"kl": 0.0078125,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0214,
"reward": 0.1801936998963356,
"reward_std": 0.7019116431474686,
"rewards/cosine_scaled_reward": 0.09009685181081295,
"rewards/format_reward": 0.0,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 3269.8194580078125,
"epoch": 0.4781491002570694,
"grad_norm": 0.13919058442115784,
"kl": 0.01202392578125,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0359,
"reward": -0.2203904101625085,
"reward_std": 0.5241215899586678,
"rewards/cosine_scaled_reward": -0.11019521998241544,
"rewards/format_reward": 0.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 2767.611083984375,
"epoch": 0.47986289631533846,
"grad_norm": 0.17986002564430237,
"kl": 0.01087188720703125,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0177,
"reward": 0.20984390750527382,
"reward_std": 0.6492117866873741,
"rewards/cosine_scaled_reward": 0.10492195282131433,
"rewards/format_reward": 0.0,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 2349.902801513672,
"epoch": 0.48157669237360756,
"grad_norm": 0.20755039155483246,
"kl": 0.0091094970703125,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0718,
"reward": 0.09011890506371856,
"reward_std": 0.755554661154747,
"rewards/cosine_scaled_reward": 0.04505945247365162,
"rewards/format_reward": 0.0,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 2690.3333740234375,
"epoch": 0.4832904884318766,
"grad_norm": 0.16312456130981445,
"kl": 0.0081787109375,
"learning_rate": 5.311559558218603e-07,
"loss": -0.0225,
"reward": -0.1038619177415967,
"reward_std": 0.6092793643474579,
"rewards/cosine_scaled_reward": -0.05193095514550805,
"rewards/format_reward": 0.0,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2992.4443969726562,
"epoch": 0.48500428449014565,
"grad_norm": 0.14668744802474976,
"kl": 0.009429931640625,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0094,
"reward": -0.18053901614621282,
"reward_std": 0.5393766239285469,
"rewards/cosine_scaled_reward": -0.09026950527913868,
"rewards/format_reward": 0.0,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2731.1666870117188,
"epoch": 0.48671808054841476,
"grad_norm": 0.18690791726112366,
"kl": 0.01348876953125,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0274,
"reward": 0.05301067978143692,
"reward_std": 0.8040451109409332,
"rewards/cosine_scaled_reward": 0.026505338959395885,
"rewards/format_reward": 0.0,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 3109.9722290039062,
"epoch": 0.4884318766066838,
"grad_norm": 0.13096371293067932,
"kl": 0.0122222900390625,
"learning_rate": 5.21744266211809e-07,
"loss": -0.0039,
"reward": -0.10303456708788872,
"reward_std": 0.6089868098497391,
"rewards/cosine_scaled_reward": -0.05151727236807346,
"rewards/format_reward": 0.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 2716.9166259765625,
"epoch": 0.49014567266495285,
"grad_norm": 0.21381936967372894,
"kl": 0.00853729248046875,
"learning_rate": 5.186095868151436e-07,
"loss": -0.0087,
"reward": -0.08554558828473091,
"reward_std": 0.6172359138727188,
"rewards/cosine_scaled_reward": -0.042772796005010605,
"rewards/format_reward": 0.0,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 3128.3195190429688,
"epoch": 0.49185946872322195,
"grad_norm": 0.20883357524871826,
"kl": 0.0149688720703125,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0885,
"reward": -0.16452566534280777,
"reward_std": 0.6313002184033394,
"rewards/cosine_scaled_reward": -0.08226283825933933,
"rewards/format_reward": 0.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 2783.2777709960938,
"epoch": 0.493573264781491,
"grad_norm": 0.18023322522640228,
"kl": 0.016357421875,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0778,
"reward": -0.29250151151791215,
"reward_std": 0.5800458639860153,
"rewards/cosine_scaled_reward": -0.14625075762160122,
"rewards/format_reward": 0.0,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 2974.1944274902344,
"epoch": 0.4952870608397601,
"grad_norm": 0.226176917552948,
"kl": 0.01458740234375,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0448,
"reward": -0.33544909581542015,
"reward_std": 0.5062796398997307,
"rewards/cosine_scaled_reward": -0.16772454418241978,
"rewards/format_reward": 0.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 2436.1944274902344,
"epoch": 0.49700085689802914,
"grad_norm": 0.1747155487537384,
"kl": 0.012481689453125,
"learning_rate": 5.060876951083828e-07,
"loss": -0.0449,
"reward": -0.14955687522888184,
"reward_std": 0.5533142015337944,
"rewards/cosine_scaled_reward": -0.07477843947708607,
"rewards/format_reward": 0.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 3039.3611450195312,
"epoch": 0.4987146529562982,
"grad_norm": 0.1754036843776703,
"kl": 0.01313018798828125,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0491,
"reward": -0.44222037494182587,
"reward_std": 0.49202967807650566,
"rewards/cosine_scaled_reward": -0.22111019119620323,
"rewards/format_reward": 0.0,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 2386.4722595214844,
"epoch": 0.5004284490145673,
"grad_norm": 0.23209446668624878,
"kl": 0.0133209228515625,
"learning_rate": 4.998389805071536e-07,
"loss": 0.1035,
"reward": 0.11830113036558032,
"reward_std": 0.7409112825989723,
"rewards/cosine_scaled_reward": 0.059150564251467586,
"rewards/format_reward": 0.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 2481.1666870117188,
"epoch": 0.5021422450728363,
"grad_norm": 0.17551322281360626,
"kl": 0.0098724365234375,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0432,
"reward": -0.1329963468015194,
"reward_std": 0.5577030703425407,
"rewards/cosine_scaled_reward": -0.06649817898869514,
"rewards/format_reward": 0.0,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 2711.5000610351562,
"epoch": 0.5038560411311054,
"grad_norm": 0.18221919238567352,
"kl": 0.01308441162109375,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0587,
"reward": -0.21110662072896957,
"reward_std": 0.5812349170446396,
"rewards/cosine_scaled_reward": -0.10555331036448479,
"rewards/format_reward": 0.0,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 2619.5972290039062,
"epoch": 0.5055698371893744,
"grad_norm": 0.18888363242149353,
"kl": 0.0113067626953125,
"learning_rate": 4.904846243842949e-07,
"loss": -0.0068,
"reward": 0.10603267699480057,
"reward_std": 0.6550966873764992,
"rewards/cosine_scaled_reward": 0.053016334772109985,
"rewards/format_reward": 0.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 2851.4583129882812,
"epoch": 0.5072836332476436,
"grad_norm": 0.15981672704219818,
"kl": 0.0122833251953125,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0399,
"reward": 0.07413195073604584,
"reward_std": 0.6663401573896408,
"rewards/cosine_scaled_reward": 0.03706597909331322,
"rewards/format_reward": 0.0,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 2682.5833435058594,
"epoch": 0.5089974293059126,
"grad_norm": 0.18823187053203583,
"kl": 0.0098419189453125,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0705,
"reward": -0.035793907940387726,
"reward_std": 0.5416731983423233,
"rewards/cosine_scaled_reward": -0.017896955832839012,
"rewards/format_reward": 0.0,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 2439.0556030273438,
"epoch": 0.5107112253641817,
"grad_norm": 0.17318564653396606,
"kl": 0.012298583984375,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0134,
"reward": 0.026430480182170868,
"reward_std": 0.5753844156861305,
"rewards/cosine_scaled_reward": 0.013215240091085434,
"rewards/format_reward": 0.0,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 2576.9027709960938,
"epoch": 0.5124250214224507,
"grad_norm": 0.21229924261569977,
"kl": 0.0172576904296875,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0552,
"reward": 0.3652267027646303,
"reward_std": 0.6922546178102493,
"rewards/cosine_scaled_reward": 0.18261335138231516,
"rewards/format_reward": 0.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 2756.9305419921875,
"epoch": 0.5141388174807198,
"grad_norm": 0.19316470623016357,
"kl": 0.011383056640625,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0299,
"reward": 0.22619394585490227,
"reward_std": 0.4907483011484146,
"rewards/cosine_scaled_reward": 0.11309697106480598,
"rewards/format_reward": 0.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 2826.9306640625,
"epoch": 0.5158526135389888,
"grad_norm": 0.233236625790596,
"kl": 0.01507568359375,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0851,
"reward": -0.13008400797843933,
"reward_std": 0.7507277429103851,
"rewards/cosine_scaled_reward": -0.06504200212657452,
"rewards/format_reward": 0.0,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 3145.986083984375,
"epoch": 0.517566409597258,
"grad_norm": 0.1533445417881012,
"kl": 0.0132293701171875,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0074,
"reward": -0.00015814602375030518,
"reward_std": 0.7809525281190872,
"rewards/cosine_scaled_reward": -7.90674239397049e-05,
"rewards/format_reward": 0.0,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 3029.40283203125,
"epoch": 0.519280205655527,
"grad_norm": 0.1652766764163971,
"kl": 0.0153656005859375,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0139,
"reward": -0.06143874488770962,
"reward_std": 0.7485700696706772,
"rewards/cosine_scaled_reward": -0.030719374306499958,
"rewards/format_reward": 0.0,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 2729.52783203125,
"epoch": 0.5209940017137961,
"grad_norm": 0.18553942441940308,
"kl": 0.01348876953125,
"learning_rate": 4.6259454195101267e-07,
"loss": -0.0083,
"reward": -0.042102924548089504,
"reward_std": 0.5168112218379974,
"rewards/cosine_scaled_reward": -0.02105145249515772,
"rewards/format_reward": 0.0,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 2928.9444580078125,
"epoch": 0.5227077977720651,
"grad_norm": 0.24608513712882996,
"kl": 0.01422119140625,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0859,
"reward": -0.11776435747742653,
"reward_std": 0.6116138771176338,
"rewards/cosine_scaled_reward": -0.058882176876068115,
"rewards/format_reward": 0.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 2568.7222290039062,
"epoch": 0.5244215938303342,
"grad_norm": 0.1760726422071457,
"kl": 0.0126800537109375,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0106,
"reward": -0.2896502474322915,
"reward_std": 0.542039155960083,
"rewards/cosine_scaled_reward": -0.1448251255787909,
"rewards/format_reward": 0.0,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 2816.7916259765625,
"epoch": 0.5261353898886033,
"grad_norm": 0.20767201483249664,
"kl": 0.015869140625,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0605,
"reward": -0.07595526240766048,
"reward_std": 0.7446087747812271,
"rewards/cosine_scaled_reward": -0.03797762934118509,
"rewards/format_reward": 0.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 2619.0,
"epoch": 0.5278491859468724,
"grad_norm": 0.18840792775154114,
"kl": 0.01406097412109375,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0317,
"reward": -0.13000392355024815,
"reward_std": 0.5407935008406639,
"rewards/cosine_scaled_reward": -0.0650019682943821,
"rewards/format_reward": 0.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 2706.4583740234375,
"epoch": 0.5295629820051414,
"grad_norm": 0.20385313034057617,
"kl": 0.017333984375,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0402,
"reward": 0.09998160088434815,
"reward_std": 0.6437982618808746,
"rewards/cosine_scaled_reward": 0.049990794621407986,
"rewards/format_reward": 0.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 3076.0694580078125,
"epoch": 0.5312767780634104,
"grad_norm": 0.15351690351963043,
"kl": 0.0145416259765625,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0112,
"reward": -0.1288044311950216,
"reward_std": 0.5119795873761177,
"rewards/cosine_scaled_reward": -0.0644022131091333,
"rewards/format_reward": 0.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 3078.3333129882812,
"epoch": 0.5329905741216795,
"grad_norm": 0.21041399240493774,
"kl": 0.0153045654296875,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0569,
"reward": -0.24592324905097485,
"reward_std": 0.5915715545415878,
"rewards/cosine_scaled_reward": -0.12296162731945515,
"rewards/format_reward": 0.0,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 2794.888916015625,
"epoch": 0.5347043701799485,
"grad_norm": 0.3006104528903961,
"kl": 0.0116729736328125,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.1717,
"reward": 0.2339099831879139,
"reward_std": 0.6782252490520477,
"rewards/cosine_scaled_reward": 0.1169549860060215,
"rewards/format_reward": 0.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 2454.65283203125,
"epoch": 0.5364181662382177,
"grad_norm": 0.213435098528862,
"kl": 0.0183868408203125,
"learning_rate": 4.350494089288943e-07,
"loss": -0.0051,
"reward": -0.29112886637449265,
"reward_std": 0.48665956407785416,
"rewards/cosine_scaled_reward": -0.14556444063782692,
"rewards/format_reward": 0.0,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 2845.3194580078125,
"epoch": 0.5381319622964867,
"grad_norm": 0.23916800320148468,
"kl": 0.0161285400390625,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0824,
"reward": -0.16251583769917488,
"reward_std": 0.4937269687652588,
"rewards/cosine_scaled_reward": -0.08125792350620031,
"rewards/format_reward": 0.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 2697.9583740234375,
"epoch": 0.5398457583547558,
"grad_norm": 0.19882318377494812,
"kl": 0.018157958984375,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0178,
"reward": -0.25365344155579805,
"reward_std": 0.5236896127462387,
"rewards/cosine_scaled_reward": -0.12682672249502502,
"rewards/format_reward": 0.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 2544.9861450195312,
"epoch": 0.5415595544130248,
"grad_norm": 0.20584873855113983,
"kl": 0.014862060546875,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0389,
"reward": -0.09484067000448704,
"reward_std": 0.6149067878723145,
"rewards/cosine_scaled_reward": -0.04742033500224352,
"rewards/format_reward": 0.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 2950.8333740234375,
"epoch": 0.5432733504712939,
"grad_norm": 0.15868861973285675,
"kl": 0.018310546875,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0378,
"reward": -0.39894504845142365,
"reward_std": 0.4898769110441208,
"rewards/cosine_scaled_reward": -0.19947252236306667,
"rewards/format_reward": 0.0,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 2152.3195190429688,
"epoch": 0.5449871465295629,
"grad_norm": 0.1994917094707489,
"kl": 0.0172882080078125,
"learning_rate": 4.1993569137498776e-07,
"loss": -0.0091,
"reward": 0.24264823482371867,
"reward_std": 0.6610805988311768,
"rewards/cosine_scaled_reward": 0.12132412963546813,
"rewards/format_reward": 0.0,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 2402.5556030273438,
"epoch": 0.5467009425878321,
"grad_norm": 0.2102198302745819,
"kl": 0.01351165771484375,
"learning_rate": 4.1693137748017915e-07,
"loss": -0.0681,
"reward": 0.05987721309065819,
"reward_std": 0.5766515731811523,
"rewards/cosine_scaled_reward": 0.029938601423054934,
"rewards/format_reward": 0.0,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 2677.4027709960938,
"epoch": 0.5484147386461011,
"grad_norm": 0.2358679324388504,
"kl": 0.01690673828125,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0956,
"reward": -0.05587568995542824,
"reward_std": 0.6320854872465134,
"rewards/cosine_scaled_reward": -0.02793784497771412,
"rewards/format_reward": 0.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 3042.4722900390625,
"epoch": 0.5501285347043702,
"grad_norm": 0.18476322293281555,
"kl": 0.017547607421875,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0512,
"reward": -0.2119649334345013,
"reward_std": 0.585174448788166,
"rewards/cosine_scaled_reward": -0.1059824712574482,
"rewards/format_reward": 0.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 2080.375030517578,
"epoch": 0.5518423307626392,
"grad_norm": 0.18924832344055176,
"kl": 0.0111083984375,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0098,
"reward": 0.3428979776799679,
"reward_std": 0.7396816238760948,
"rewards/cosine_scaled_reward": 0.1714489795267582,
"rewards/format_reward": 0.0,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 2770.7916870117188,
"epoch": 0.5535561268209083,
"grad_norm": 0.17449912428855896,
"kl": 0.0141143798828125,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0149,
"reward": -0.15011528879404068,
"reward_std": 0.5199657753109932,
"rewards/cosine_scaled_reward": -0.07505764067173004,
"rewards/format_reward": 0.0,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 2524.4305725097656,
"epoch": 0.5552699228791774,
"grad_norm": 0.25161027908325195,
"kl": 0.01303863525390625,
"learning_rate": 4.020100089676376e-07,
"loss": 0.1119,
"reward": 0.2225971333682537,
"reward_std": 0.7053848057985306,
"rewards/cosine_scaled_reward": 0.11129856202751398,
"rewards/format_reward": 0.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2823.5694580078125,
"epoch": 0.5569837189374465,
"grad_norm": 0.17407697439193726,
"kl": 0.016265869140625,
"learning_rate": 3.9904679361238526e-07,
"loss": -0.0328,
"reward": -0.11739783291704953,
"reward_std": 0.6684166565537453,
"rewards/cosine_scaled_reward": -0.058698914712294936,
"rewards/format_reward": 0.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 2481.6944580078125,
"epoch": 0.5586975149957155,
"grad_norm": 0.16408374905586243,
"kl": 0.01628875732421875,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0145,
"reward": 0.05000840872526169,
"reward_std": 0.4738306663930416,
"rewards/cosine_scaled_reward": 0.025004200637340546,
"rewards/format_reward": 0.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 2850.8611450195312,
"epoch": 0.5604113110539846,
"grad_norm": 0.1830449402332306,
"kl": 0.0183563232421875,
"learning_rate": 3.931425787051832e-07,
"loss": 0.054,
"reward": -0.26191626861691475,
"reward_std": 0.4200581759214401,
"rewards/cosine_scaled_reward": -0.1309581445530057,
"rewards/format_reward": 0.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 2681.875030517578,
"epoch": 0.5621251071122536,
"grad_norm": 0.3444949984550476,
"kl": 0.031097412109375,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0002,
"reward": 0.058326710015535355,
"reward_std": 0.5914809927344322,
"rewards/cosine_scaled_reward": 0.029163353145122528,
"rewards/format_reward": 0.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 2444.1805419921875,
"epoch": 0.5638389031705227,
"grad_norm": 0.20234812796115875,
"kl": 0.0186004638671875,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0297,
"reward": 0.015948079526424408,
"reward_std": 0.5476803705096245,
"rewards/cosine_scaled_reward": 0.00797403953038156,
"rewards/format_reward": 0.0,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 2751.6666870117188,
"epoch": 0.5655526992287918,
"grad_norm": 0.20875848829746246,
"kl": 0.0170745849609375,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0404,
"reward": -0.1900151213631034,
"reward_std": 0.552287369966507,
"rewards/cosine_scaled_reward": -0.09500756207853556,
"rewards/format_reward": 0.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 2700.7222900390625,
"epoch": 0.5672664952870609,
"grad_norm": 0.17264467477798462,
"kl": 0.0172119140625,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0526,
"reward": 0.03160311561077833,
"reward_std": 0.5627969726920128,
"rewards/cosine_scaled_reward": 0.015801557805389166,
"rewards/format_reward": 0.0,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 2822.8472900390625,
"epoch": 0.5689802913453299,
"grad_norm": 0.27976417541503906,
"kl": 0.023895263671875,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0355,
"reward": 0.02845914661884308,
"reward_std": 0.5001804158091545,
"rewards/cosine_scaled_reward": 0.014229563996195793,
"rewards/format_reward": 0.0,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 2853.3333740234375,
"epoch": 0.570694087403599,
"grad_norm": 0.1514306664466858,
"kl": 0.017669677734375,
"learning_rate": 3.7561798609655373e-07,
"loss": -0.0082,
"reward": -0.13629086455330253,
"reward_std": 0.4956332743167877,
"rewards/cosine_scaled_reward": -0.06814542971551418,
"rewards/format_reward": 0.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 3072.9166870117188,
"epoch": 0.572407883461868,
"grad_norm": 0.14293867349624634,
"kl": 0.0254974365234375,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0174,
"reward": 0.02665301039814949,
"reward_std": 0.6765051260590553,
"rewards/cosine_scaled_reward": 0.013326505199074745,
"rewards/format_reward": 0.0,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 2824.861114501953,
"epoch": 0.5741216795201372,
"grad_norm": 0.19958122074604034,
"kl": 0.0171356201171875,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0929,
"reward": -0.056068588979542255,
"reward_std": 0.8257120847702026,
"rewards/cosine_scaled_reward": -0.028034291230142117,
"rewards/format_reward": 0.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 2580.0,
"epoch": 0.5758354755784062,
"grad_norm": 0.229178324341774,
"kl": 0.019195556640625,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.006,
"reward": -0.291859433054924,
"reward_std": 0.4463714547455311,
"rewards/cosine_scaled_reward": -0.1459297128021717,
"rewards/format_reward": 0.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 2450.138916015625,
"epoch": 0.5775492716366752,
"grad_norm": 0.27258360385894775,
"kl": 0.0173187255859375,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0321,
"reward": 0.07944206055253744,
"reward_std": 0.6395395249128342,
"rewards/cosine_scaled_reward": 0.03972102585248649,
"rewards/format_reward": 0.0,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2720.1805725097656,
"epoch": 0.5792630676949443,
"grad_norm": 0.20289485156536102,
"kl": 0.019683837890625,
"learning_rate": 3.612465628992203e-07,
"loss": 0.069,
"reward": 0.48021042346954346,
"reward_std": 0.7420852333307266,
"rewards/cosine_scaled_reward": 0.24010521546006203,
"rewards/format_reward": 0.0,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 3105.0833740234375,
"epoch": 0.5809768637532133,
"grad_norm": 0.18909570574760437,
"kl": 0.022308349609375,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0378,
"reward": -0.22961215861141682,
"reward_std": 0.5897372663021088,
"rewards/cosine_scaled_reward": -0.11480608023703098,
"rewards/format_reward": 0.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 2911.2083129882812,
"epoch": 0.5826906598114824,
"grad_norm": 0.20473147928714752,
"kl": 0.022918701171875,
"learning_rate": 3.555614130391079e-07,
"loss": -0.0498,
"reward": -0.1040644682943821,
"reward_std": 0.57014200091362,
"rewards/cosine_scaled_reward": -0.052032231353223324,
"rewards/format_reward": 0.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 2422.777801513672,
"epoch": 0.5844044558697515,
"grad_norm": 0.1749623566865921,
"kl": 0.019775390625,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0118,
"reward": -0.429408997297287,
"reward_std": 0.39798443764448166,
"rewards/cosine_scaled_reward": -0.2147044911980629,
"rewards/format_reward": 0.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 2946.736083984375,
"epoch": 0.5861182519280206,
"grad_norm": 0.20565366744995117,
"kl": 0.01824951171875,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0572,
"reward": -0.09714518021792173,
"reward_std": 0.6395711675286293,
"rewards/cosine_scaled_reward": -0.048572588711977005,
"rewards/format_reward": 0.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 2240.361114501953,
"epoch": 0.5878320479862896,
"grad_norm": 0.19630080461502075,
"kl": 0.013671875,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0286,
"reward": 0.09563972940668464,
"reward_std": 0.5933751873672009,
"rewards/cosine_scaled_reward": 0.047819861210882664,
"rewards/format_reward": 0.0,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 2915.2916870117188,
"epoch": 0.5895458440445587,
"grad_norm": 0.20998388528823853,
"kl": 0.0269317626953125,
"learning_rate": 3.4430593282358777e-07,
"loss": -0.0348,
"reward": -0.3282645223662257,
"reward_std": 0.49101946130394936,
"rewards/cosine_scaled_reward": -0.16413226234726608,
"rewards/format_reward": 0.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 2668.0277709960938,
"epoch": 0.5912596401028277,
"grad_norm": 0.25542527437210083,
"kl": 0.020050048828125,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0754,
"reward": 0.21342255361378193,
"reward_std": 0.653385765850544,
"rewards/cosine_scaled_reward": 0.10671127680689096,
"rewards/format_reward": 0.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 2691.2638549804688,
"epoch": 0.5929734361610969,
"grad_norm": 0.21436557173728943,
"kl": 0.0188446044921875,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0297,
"reward": -0.08409620448946953,
"reward_std": 0.6964321285486221,
"rewards/cosine_scaled_reward": -0.04204810503870249,
"rewards/format_reward": 0.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 2423.2083740234375,
"epoch": 0.5946872322193659,
"grad_norm": 0.2174253612756729,
"kl": 0.0170745849609375,
"learning_rate": 3.359691059183761e-07,
"loss": -0.0145,
"reward": 0.05711523536592722,
"reward_std": 0.6910872906446457,
"rewards/cosine_scaled_reward": 0.02855762024410069,
"rewards/format_reward": 0.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 2334.3194580078125,
"epoch": 0.596401028277635,
"grad_norm": 0.20871306955814362,
"kl": 0.019622802734375,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0377,
"reward": -0.29262126237154007,
"reward_std": 0.5664101913571358,
"rewards/cosine_scaled_reward": -0.14631063491106033,
"rewards/format_reward": 0.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 2684.4861450195312,
"epoch": 0.598114824335904,
"grad_norm": 0.2084410935640335,
"kl": 0.0155181884765625,
"learning_rate": 3.3046315338757026e-07,
"loss": -0.0696,
"reward": 0.2747867554426193,
"reward_std": 0.6360199972987175,
"rewards/cosine_scaled_reward": 0.13739337399601936,
"rewards/format_reward": 0.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 2624.9444580078125,
"epoch": 0.5998286203941731,
"grad_norm": 0.27559694647789,
"kl": 0.01519775390625,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0439,
"reward": 0.16777711734175682,
"reward_std": 0.6573140621185303,
"rewards/cosine_scaled_reward": 0.08388857543468475,
"rewards/format_reward": 0.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 2644.0416564941406,
"epoch": 0.6015424164524421,
"grad_norm": 0.21829356253147125,
"kl": 0.020263671875,
"learning_rate": 3.250000000000001e-07,
"loss": 0.019,
"reward": 0.04395672678947449,
"reward_std": 0.5275484099984169,
"rewards/cosine_scaled_reward": 0.02197836432605982,
"rewards/format_reward": 0.0,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 2948.3056030273438,
"epoch": 0.6032562125107113,
"grad_norm": 0.15744946897029877,
"kl": 0.0189056396484375,
"learning_rate": 3.222848061454764e-07,
"loss": -0.0085,
"reward": -0.41702286154031754,
"reward_std": 0.5593557730317116,
"rewards/cosine_scaled_reward": -0.20851144194602966,
"rewards/format_reward": 0.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 2635.7222595214844,
"epoch": 0.6049700085689803,
"grad_norm": 0.22034288942813873,
"kl": 0.021209716796875,
"learning_rate": 3.195807108082429e-07,
"loss": -0.0335,
"reward": -0.30768171697854996,
"reward_std": 0.5821868106722832,
"rewards/cosine_scaled_reward": -0.15384084545075893,
"rewards/format_reward": 0.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 2137.3055725097656,
"epoch": 0.6066838046272494,
"grad_norm": 0.276947557926178,
"kl": 0.015472412109375,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0844,
"reward": 0.3251216746866703,
"reward_std": 0.716858297586441,
"rewards/cosine_scaled_reward": 0.16256084106862545,
"rewards/format_reward": 0.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 2492.5555725097656,
"epoch": 0.6083976006855184,
"grad_norm": 0.2037208080291748,
"kl": 0.0183258056640625,
"learning_rate": 3.142063423134644e-07,
"loss": -0.0014,
"reward": -0.21882931515574455,
"reward_std": 0.47944844514131546,
"rewards/cosine_scaled_reward": -0.10941465757787228,
"rewards/format_reward": 0.0,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 2614.3472290039062,
"epoch": 0.6101113967437874,
"grad_norm": 0.19817198812961578,
"kl": 0.0220947265625,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0141,
"reward": -0.4298449754714966,
"reward_std": 0.520567923784256,
"rewards/cosine_scaled_reward": -0.2149224765598774,
"rewards/format_reward": 0.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 2584.777801513672,
"epoch": 0.6118251928020566,
"grad_norm": 0.18728262186050415,
"kl": 0.021331787109375,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.04,
"reward": 0.04458676278591156,
"reward_std": 0.49945997446775436,
"rewards/cosine_scaled_reward": 0.02229338139295578,
"rewards/format_reward": 0.0,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 2934.513916015625,
"epoch": 0.6135389888603257,
"grad_norm": 0.17515863478183746,
"kl": 0.017791748046875,
"learning_rate": 3.062313053727671e-07,
"loss": -0.0046,
"reward": -0.0155550935305655,
"reward_std": 0.607760101556778,
"rewards/cosine_scaled_reward": -0.007777547696605325,
"rewards/format_reward": 0.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 2598.2638549804688,
"epoch": 0.6152527849185947,
"grad_norm": 0.20000198483467102,
"kl": 0.0205535888671875,
"learning_rate": 3.0359654942835247e-07,
"loss": -0.008,
"reward": -0.21508236415684223,
"reward_std": 0.4807446375489235,
"rewards/cosine_scaled_reward": -0.10754118673503399,
"rewards/format_reward": 0.0,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 2585.3333435058594,
"epoch": 0.6169665809768637,
"grad_norm": 0.1761714369058609,
"kl": 0.01947021484375,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0011,
"reward": -0.027444179635494947,
"reward_std": 0.6417821869254112,
"rewards/cosine_scaled_reward": -0.013722071889787912,
"rewards/format_reward": 0.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 2367.3611450195312,
"epoch": 0.6186803770351328,
"grad_norm": 0.1938982903957367,
"kl": 0.01788330078125,
"learning_rate": 2.9836319343816397e-07,
"loss": -0.023,
"reward": 0.0992561224848032,
"reward_std": 0.7357365190982819,
"rewards/cosine_scaled_reward": 0.04962805658578873,
"rewards/format_reward": 0.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 3139.541748046875,
"epoch": 0.6203941730934018,
"grad_norm": 0.17356501519680023,
"kl": 0.024200439453125,
"learning_rate": 2.9576484845877793e-07,
"loss": -0.0258,
"reward": -0.128750279545784,
"reward_std": 0.5727476924657822,
"rewards/cosine_scaled_reward": -0.06437514536082745,
"rewards/format_reward": 0.0,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 2882.4306030273438,
"epoch": 0.622107969151671,
"grad_norm": 0.19220975041389465,
"kl": 0.0243377685546875,
"learning_rate": 2.931788945420058e-07,
"loss": -0.0247,
"reward": -0.019596407189965248,
"reward_std": 0.6233709305524826,
"rewards/cosine_scaled_reward": -0.009798200335353613,
"rewards/format_reward": 0.0,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 2840.0555419921875,
"epoch": 0.62382176520994,
"grad_norm": 0.237908735871315,
"kl": 0.02325439453125,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0684,
"reward": -0.17538912501186132,
"reward_std": 0.7643003761768341,
"rewards/cosine_scaled_reward": -0.08769455272704363,
"rewards/format_reward": 0.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 2791.75,
"epoch": 0.6255355612682091,
"grad_norm": 0.1972544640302658,
"kl": 0.022613525390625,
"learning_rate": 2.8804466342921987e-07,
"loss": -0.0356,
"reward": -0.19943542033433914,
"reward_std": 0.6234779357910156,
"rewards/cosine_scaled_reward": -0.09971771761775017,
"rewards/format_reward": 0.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 2936.7916564941406,
"epoch": 0.6272493573264781,
"grad_norm": 0.1693785935640335,
"kl": 0.022491455078125,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0289,
"reward": -0.07167929410934448,
"reward_std": 0.41813354194164276,
"rewards/cosine_scaled_reward": -0.035839639604091644,
"rewards/format_reward": 0.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 2579.8056030273438,
"epoch": 0.6289631533847472,
"grad_norm": 0.18452903628349304,
"kl": 0.02313232421875,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0131,
"reward": 0.13851050520315766,
"reward_std": 0.6860260739922523,
"rewards/cosine_scaled_reward": 0.06925524887628853,
"rewards/format_reward": 0.0,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 3069.2361450195312,
"epoch": 0.6306769494430163,
"grad_norm": 0.207699254155159,
"kl": 0.026519775390625,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0636,
"reward": -0.25442312750965357,
"reward_std": 0.5900055021047592,
"rewards/cosine_scaled_reward": -0.12721156049519777,
"rewards/format_reward": 0.0,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 2244.763916015625,
"epoch": 0.6323907455012854,
"grad_norm": 0.17845271527767181,
"kl": 0.0156707763671875,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0488,
"reward": 0.17914995457977057,
"reward_std": 0.7317003160715103,
"rewards/cosine_scaled_reward": 0.08957497263327241,
"rewards/format_reward": 0.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 2134.1944274902344,
"epoch": 0.6341045415595544,
"grad_norm": 0.2277487814426422,
"kl": 0.01385498046875,
"learning_rate": 2.7543467624442956e-07,
"loss": -0.0127,
"reward": 0.11734075238928199,
"reward_std": 0.5018965676426888,
"rewards/cosine_scaled_reward": 0.05867037340067327,
"rewards/format_reward": 0.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 2490.500030517578,
"epoch": 0.6358183376178235,
"grad_norm": 0.21075375378131866,
"kl": 0.02069091796875,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0493,
"reward": -0.03656116779893637,
"reward_std": 0.4987756237387657,
"rewards/cosine_scaled_reward": -0.018280583899468184,
"rewards/format_reward": 0.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 2664.625030517578,
"epoch": 0.6375321336760925,
"grad_norm": 0.22036224603652954,
"kl": 0.01885986328125,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0736,
"reward": -0.017365715699270368,
"reward_std": 0.7068077325820923,
"rewards/cosine_scaled_reward": -0.008682856685481966,
"rewards/format_reward": 0.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 2659.3889770507812,
"epoch": 0.6392459297343616,
"grad_norm": 0.2022118866443634,
"kl": 0.0198822021484375,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.011,
"reward": -0.049437786685302854,
"reward_std": 0.5779630020260811,
"rewards/cosine_scaled_reward": -0.024718896602280438,
"rewards/format_reward": 0.0,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 2616.4306030273438,
"epoch": 0.6409597257926307,
"grad_norm": 0.1780145913362503,
"kl": 0.0235595703125,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0089,
"reward": -0.017803641967475414,
"reward_std": 0.6717728674411774,
"rewards/cosine_scaled_reward": -0.00890181539580226,
"rewards/format_reward": 0.0,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 2676.3472290039062,
"epoch": 0.6426735218508998,
"grad_norm": 0.15247489511966705,
"kl": 0.026458740234375,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0205,
"reward": -0.31310519203543663,
"reward_std": 0.5878890082240105,
"rewards/cosine_scaled_reward": -0.15655260160565376,
"rewards/format_reward": 0.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 2356.2083129882812,
"epoch": 0.6443873179091688,
"grad_norm": 0.2001314014196396,
"kl": 0.0235595703125,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0174,
"reward": -0.2070534396916628,
"reward_std": 0.4216439947485924,
"rewards/cosine_scaled_reward": -0.1035267198458314,
"rewards/format_reward": 0.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 2577.5695190429688,
"epoch": 0.6461011139674379,
"grad_norm": 0.19885659217834473,
"kl": 0.0179290771484375,
"learning_rate": 2.583460445215911e-07,
"loss": -0.0114,
"reward": -0.2356225922703743,
"reward_std": 0.4705282226204872,
"rewards/cosine_scaled_reward": -0.1178113017231226,
"rewards/format_reward": 0.0,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 2827.4305419921875,
"epoch": 0.6478149100257069,
"grad_norm": 0.16866172850131989,
"kl": 0.022796630859375,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0359,
"reward": -0.2195772840641439,
"reward_std": 0.7464367002248764,
"rewards/cosine_scaled_reward": -0.1097886401694268,
"rewards/format_reward": 0.0,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 2001.8472595214844,
"epoch": 0.6495287060839761,
"grad_norm": 0.27339431643486023,
"kl": 0.025421142578125,
"learning_rate": 2.5358974294659373e-07,
"loss": -0.0481,
"reward": -0.053384889382869005,
"reward_std": 0.7801851779222488,
"rewards/cosine_scaled_reward": -0.026692438637837768,
"rewards/format_reward": 0.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 2380.8611450195312,
"epoch": 0.6512425021422451,
"grad_norm": 0.49418047070503235,
"kl": 0.028839111328125,
"learning_rate": 2.512332043064913e-07,
"loss": 0.1507,
"reward": -0.04335943330079317,
"reward_std": 0.7678016275167465,
"rewards/cosine_scaled_reward": -0.021679717116057873,
"rewards/format_reward": 0.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 2910.1806640625,
"epoch": 0.6529562982005142,
"grad_norm": 0.19250288605690002,
"kl": 0.022003173828125,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0447,
"reward": -0.1130654625594616,
"reward_std": 0.5473960787057877,
"rewards/cosine_scaled_reward": -0.05653274059295654,
"rewards/format_reward": 0.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 2682.9861450195312,
"epoch": 0.6546700942587832,
"grad_norm": 0.1798926293849945,
"kl": 0.019439697265625,
"learning_rate": 2.465639255873246e-07,
"loss": -0.0224,
"reward": -0.07310536503791809,
"reward_std": 0.6817247718572617,
"rewards/cosine_scaled_reward": -0.036552680656313896,
"rewards/format_reward": 0.0,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 2607.763916015625,
"epoch": 0.6563838903170522,
"grad_norm": 0.24983283877372742,
"kl": 0.026153564453125,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0197,
"reward": -0.24107037298381329,
"reward_std": 0.6102746799588203,
"rewards/cosine_scaled_reward": -0.12053518556058407,
"rewards/format_reward": 0.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 2681.277801513672,
"epoch": 0.6580976863753213,
"grad_norm": 0.21532803773880005,
"kl": 0.0153350830078125,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0375,
"reward": -0.2287786863744259,
"reward_std": 0.5439959019422531,
"rewards/cosine_scaled_reward": -0.11438935063779354,
"rewards/format_reward": 0.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 2749.4305725097656,
"epoch": 0.6598114824335904,
"grad_norm": 0.23645354807376862,
"kl": 0.02471923828125,
"learning_rate": 2.3967120531894857e-07,
"loss": -0.0225,
"reward": -0.1737481877207756,
"reward_std": 0.5551631152629852,
"rewards/cosine_scaled_reward": -0.0868740938603878,
"rewards/format_reward": 0.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 3019.7361450195312,
"epoch": 0.6615252784918595,
"grad_norm": 0.18375760316848755,
"kl": 0.019195556640625,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0429,
"reward": -0.34039000049233437,
"reward_std": 0.5544994547963142,
"rewards/cosine_scaled_reward": -0.17019500210881233,
"rewards/format_reward": 0.0,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 2555.625030517578,
"epoch": 0.6632390745501285,
"grad_norm": 0.2520519196987152,
"kl": 0.0201873779296875,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0754,
"reward": 0.06691954471170902,
"reward_std": 0.4953342378139496,
"rewards/cosine_scaled_reward": 0.03345977142453194,
"rewards/format_reward": 0.0,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 2198.75,
"epoch": 0.6649528706083976,
"grad_norm": 0.21169999241828918,
"kl": 0.0220489501953125,
"learning_rate": 2.3291460551638237e-07,
"loss": -0.0328,
"reward": 0.10132637619972229,
"reward_std": 0.6322794482111931,
"rewards/cosine_scaled_reward": 0.050663191825151443,
"rewards/format_reward": 0.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 2786.27783203125,
"epoch": 0.6666666666666666,
"grad_norm": 0.18405954539775848,
"kl": 0.0251617431640625,
"learning_rate": 2.306931685585657e-07,
"loss": -0.0196,
"reward": 0.03023771196603775,
"reward_std": 0.46946871280670166,
"rewards/cosine_scaled_reward": 0.015118852257728577,
"rewards/format_reward": 0.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 2521.5693969726562,
"epoch": 0.6683804627249358,
"grad_norm": 0.19272808730602264,
"kl": 0.0200347900390625,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0461,
"reward": 0.00521535862935707,
"reward_std": 0.616911455988884,
"rewards/cosine_scaled_reward": 0.0026076845824718475,
"rewards/format_reward": 0.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 2864.041748046875,
"epoch": 0.6700942587832048,
"grad_norm": 0.2623915672302246,
"kl": 0.0235595703125,
"learning_rate": 2.2629708984760706e-07,
"loss": -0.002,
"reward": -0.1861814223229885,
"reward_std": 0.5339604392647743,
"rewards/cosine_scaled_reward": -0.0930907130241394,
"rewards/format_reward": 0.0,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 2380.9583740234375,
"epoch": 0.6718080548414739,
"grad_norm": 0.25610801577568054,
"kl": 0.02032470703125,
"learning_rate": 2.2412266235313973e-07,
"loss": -0.0448,
"reward": -0.07657308876514435,
"reward_std": 0.6799488365650177,
"rewards/cosine_scaled_reward": -0.038286540657281876,
"rewards/format_reward": 0.0,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 2998.5139770507812,
"epoch": 0.6735218508997429,
"grad_norm": 0.19235925376415253,
"kl": 0.0222015380859375,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0569,
"reward": -0.001154482364654541,
"reward_std": 0.5102438926696777,
"rewards/cosine_scaled_reward": -0.0005772355943918228,
"rewards/format_reward": 0.0,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 2115.1806030273438,
"epoch": 0.675235646958012,
"grad_norm": 0.2744181752204895,
"kl": 0.027099609375,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0221,
"reward": 0.058095297776162624,
"reward_std": 0.718009740114212,
"rewards/cosine_scaled_reward": 0.029047648888081312,
"rewards/format_reward": 0.0,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 2774.1806030273438,
"epoch": 0.676949443016281,
"grad_norm": 0.19175058603286743,
"kl": 0.025482177734375,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0352,
"reward": -0.136960469186306,
"reward_std": 0.511358916759491,
"rewards/cosine_scaled_reward": -0.0684802271425724,
"rewards/format_reward": 0.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 3057.9584350585938,
"epoch": 0.6786632390745502,
"grad_norm": 0.16223175823688507,
"kl": 0.019256591796875,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0288,
"reward": -0.04862111946567893,
"reward_std": 0.5186164565384388,
"rewards/cosine_scaled_reward": -0.024310562410391867,
"rewards/format_reward": 0.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 2582.2916870117188,
"epoch": 0.6803770351328192,
"grad_norm": 0.287038654088974,
"kl": 0.0208740234375,
"learning_rate": 2.134908592756607e-07,
"loss": -0.0666,
"reward": -0.17554645985364914,
"reward_std": 0.5096240639686584,
"rewards/cosine_scaled_reward": -0.08777323365211487,
"rewards/format_reward": 0.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 3108.4444580078125,
"epoch": 0.6820908311910883,
"grad_norm": 0.1679101139307022,
"kl": 0.0231781005859375,
"learning_rate": 2.1141329099692406e-07,
"loss": -0.0035,
"reward": 0.038632214069366455,
"reward_std": 0.7707736194133759,
"rewards/cosine_scaled_reward": 0.019316108897328377,
"rewards/format_reward": 0.0,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 1893.5972442626953,
"epoch": 0.6838046272493573,
"grad_norm": 0.2708974778652191,
"kl": 0.0231170654296875,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0065,
"reward": 0.1442592293024063,
"reward_std": 0.5131981894373894,
"rewards/cosine_scaled_reward": 0.07212962210178375,
"rewards/format_reward": 0.0,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 2922.7916870117188,
"epoch": 0.6855184233076264,
"grad_norm": 0.3239152133464813,
"kl": 0.023956298828125,
"learning_rate": 2.0730776160846853e-07,
"loss": -0.0809,
"reward": -0.12957404926419258,
"reward_std": 0.5665386915206909,
"rewards/cosine_scaled_reward": -0.06478701997548342,
"rewards/format_reward": 0.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 2859.4444580078125,
"epoch": 0.6872322193658955,
"grad_norm": 0.19043125212192535,
"kl": 0.0223541259765625,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0588,
"reward": -0.32605881802737713,
"reward_std": 0.5183117464184761,
"rewards/cosine_scaled_reward": -0.16302942391484976,
"rewards/format_reward": 0.0,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 2781.5556030273438,
"epoch": 0.6889460154241646,
"grad_norm": 0.26216545701026917,
"kl": 0.026580810546875,
"learning_rate": 2.032690407508949e-07,
"loss": -0.0263,
"reward": -0.4961502104997635,
"reward_std": 0.3931718245148659,
"rewards/cosine_scaled_reward": -0.24807510524988174,
"rewards/format_reward": 0.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 2736.1111450195312,
"epoch": 0.6906598114824336,
"grad_norm": 0.21833109855651855,
"kl": 0.02435302734375,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0585,
"reward": 0.3037844013888389,
"reward_std": 0.5833063200116158,
"rewards/cosine_scaled_reward": 0.1518922229297459,
"rewards/format_reward": 0.0,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 2712.861114501953,
"epoch": 0.6923736075407027,
"grad_norm": 0.2146695852279663,
"kl": 0.025238037109375,
"learning_rate": 1.9929791578083655e-07,
"loss": -0.041,
"reward": -0.21084421500563622,
"reward_std": 0.4842342808842659,
"rewards/cosine_scaled_reward": -0.10542210191488266,
"rewards/format_reward": 0.0,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 2936.3333129882812,
"epoch": 0.6940874035989717,
"grad_norm": 0.18586868047714233,
"kl": 0.024658203125,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.005,
"reward": 0.050316065549850464,
"reward_std": 0.5316065326333046,
"rewards/cosine_scaled_reward": 0.02515802625566721,
"rewards/format_reward": 0.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 2209.0972595214844,
"epoch": 0.6958011996572407,
"grad_norm": 0.2406679093837738,
"kl": 0.02728271484375,
"learning_rate": 1.9539516087697517e-07,
"loss": -0.0131,
"reward": -0.021612104028463364,
"reward_std": 0.5742413327097893,
"rewards/cosine_scaled_reward": -0.010806052014231682,
"rewards/format_reward": 0.0,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 2988.3056640625,
"epoch": 0.6975149957155099,
"grad_norm": 0.23395532369613647,
"kl": 0.027374267578125,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0598,
"reward": -0.08433661237359047,
"reward_std": 0.5562912449240685,
"rewards/cosine_scaled_reward": -0.04216831736266613,
"rewards/format_reward": 0.0,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 2319.6111755371094,
"epoch": 0.699228791773779,
"grad_norm": 0.2508476972579956,
"kl": 0.0188751220703125,
"learning_rate": 1.915615368891117e-07,
"loss": -0.0462,
"reward": 0.5069457921199501,
"reward_std": 0.5437265560030937,
"rewards/cosine_scaled_reward": 0.25347290316130966,
"rewards/format_reward": 0.0,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 2740.7777709960938,
"epoch": 0.700942587832048,
"grad_norm": 0.18038439750671387,
"kl": 0.030517578125,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0239,
"reward": 0.10421705152839422,
"reward_std": 0.6194805726408958,
"rewards/cosine_scaled_reward": 0.052108526695519686,
"rewards/format_reward": 0.0,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 2735.8472595214844,
"epoch": 0.702656383890317,
"grad_norm": 0.2515905201435089,
"kl": 0.02587890625,
"learning_rate": 1.8779779118983867e-07,
"loss": -0.0311,
"reward": -0.1710510030388832,
"reward_std": 0.5620269253849983,
"rewards/cosine_scaled_reward": -0.0855255089700222,
"rewards/format_reward": 0.0,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 2493.388916015625,
"epoch": 0.7043701799485861,
"grad_norm": 0.21452462673187256,
"kl": 0.0193328857421875,
"learning_rate": 1.8594235253127372e-07,
"loss": -0.0157,
"reward": -0.25840797275304794,
"reward_std": 0.4374122992157936,
"rewards/cosine_scaled_reward": -0.12920398078858852,
"rewards/format_reward": 0.0,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 2339.2083740234375,
"epoch": 0.7060839760068551,
"grad_norm": 0.2920970320701599,
"kl": 0.02203369140625,
"learning_rate": 1.8410465752883758e-07,
"loss": -0.0518,
"reward": -0.25962352380156517,
"reward_std": 0.5908957123756409,
"rewards/cosine_scaled_reward": -0.129811754450202,
"rewards/format_reward": 0.0,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 2770.0694580078125,
"epoch": 0.7077977720651243,
"grad_norm": 0.2641778290271759,
"kl": 0.026947021484375,
"learning_rate": 1.822847957491922e-07,
"loss": 0.061,
"reward": -0.15783867985010147,
"reward_std": 0.5947980135679245,
"rewards/cosine_scaled_reward": -0.07891935110092163,
"rewards/format_reward": 0.0,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 2950.1945190429688,
"epoch": 0.7095115681233933,
"grad_norm": 0.2011335790157318,
"kl": 0.024566650390625,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0014,
"reward": -0.0009787320159375668,
"reward_std": 0.7296510636806488,
"rewards/cosine_scaled_reward": -0.0004893671721220016,
"rewards/format_reward": 0.0,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 2707.9305419921875,
"epoch": 0.7112253641816624,
"grad_norm": 0.3163622319698334,
"kl": 0.027496337890625,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0567,
"reward": -0.3990987651050091,
"reward_std": 0.43145136535167694,
"rewards/cosine_scaled_reward": -0.1995493769645691,
"rewards/format_reward": 0.0,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 2968.0833129882812,
"epoch": 0.7129391602399314,
"grad_norm": 0.23538915812969208,
"kl": 0.02618408203125,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0018,
"reward": -0.08291278406977654,
"reward_std": 0.4231496602296829,
"rewards/cosine_scaled_reward": -0.041456387378275394,
"rewards/format_reward": 0.0,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 2606.3333129882812,
"epoch": 0.7146529562982005,
"grad_norm": 0.3015352785587311,
"kl": 0.0229949951171875,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0752,
"reward": -0.3149372674524784,
"reward_std": 0.6667703241109848,
"rewards/cosine_scaled_reward": -0.1574686411768198,
"rewards/format_reward": 0.0,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 2380.4445190429688,
"epoch": 0.7163667523564696,
"grad_norm": 0.22723092138767242,
"kl": 0.0203704833984375,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0512,
"reward": -0.34560693614184856,
"reward_std": 0.4205815941095352,
"rewards/cosine_scaled_reward": -0.17280346807092428,
"rewards/format_reward": 0.0,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 2430.125,
"epoch": 0.7180805484147387,
"grad_norm": 0.23899339139461517,
"kl": 0.02789306640625,
"learning_rate": 1.7174502842694212e-07,
"loss": -0.0302,
"reward": -0.18839553371071815,
"reward_std": 0.4583168476819992,
"rewards/cosine_scaled_reward": -0.09419775661081076,
"rewards/format_reward": 0.0,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 2552.4304809570312,
"epoch": 0.7197943444730077,
"grad_norm": 0.2333478480577469,
"kl": 0.0173492431640625,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0486,
"reward": -0.1561539713293314,
"reward_std": 0.6325561329722404,
"rewards/cosine_scaled_reward": -0.07807699032127857,
"rewards/format_reward": 0.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 2746.0694580078125,
"epoch": 0.7215081405312768,
"grad_norm": 0.2854664921760559,
"kl": 0.0223388671875,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0759,
"reward": -0.08271846733987331,
"reward_std": 0.6506856456398964,
"rewards/cosine_scaled_reward": -0.04135924857109785,
"rewards/format_reward": 0.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 2300.3055725097656,
"epoch": 0.7232219365895458,
"grad_norm": 0.17767125368118286,
"kl": 0.0181427001953125,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0242,
"reward": -0.16465576738119125,
"reward_std": 0.5095989629626274,
"rewards/cosine_scaled_reward": -0.08232788741588593,
"rewards/format_reward": 0.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 2779.0694580078125,
"epoch": 0.7249357326478149,
"grad_norm": 0.1646861433982849,
"kl": 0.019927978515625,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0035,
"reward": -0.05413434375077486,
"reward_std": 0.7594424337148666,
"rewards/cosine_scaled_reward": -0.02706717373803258,
"rewards/format_reward": 0.0,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 2956.7083129882812,
"epoch": 0.726649528706084,
"grad_norm": 0.22844961285591125,
"kl": 0.03106689453125,
"learning_rate": 1.6346804638120098e-07,
"loss": -0.023,
"reward": -0.16962197236716747,
"reward_std": 0.6577330157160759,
"rewards/cosine_scaled_reward": -0.08481098245829344,
"rewards/format_reward": 0.0,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 2825.4444580078125,
"epoch": 0.7283633247643531,
"grad_norm": 0.21431787312030792,
"kl": 0.03460693359375,
"learning_rate": 1.6186884885673413e-07,
"loss": -0.0182,
"reward": -0.06549269519746304,
"reward_std": 0.6411803439259529,
"rewards/cosine_scaled_reward": -0.032746341079473495,
"rewards/format_reward": 0.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 2121.5556030273438,
"epoch": 0.7300771208226221,
"grad_norm": 0.28984084725379944,
"kl": 0.0258026123046875,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.0539,
"reward": 0.15816697012633085,
"reward_std": 0.5270659551024437,
"rewards/cosine_scaled_reward": 0.07908349251374602,
"rewards/format_reward": 0.0,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 2789.486083984375,
"epoch": 0.7317909168808912,
"grad_norm": 0.20834913849830627,
"kl": 0.022003173828125,
"learning_rate": 1.5872728172265146e-07,
"loss": -0.0337,
"reward": -0.370651263743639,
"reward_std": 0.526657946407795,
"rewards/cosine_scaled_reward": -0.18532563000917435,
"rewards/format_reward": 0.0,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 2442.777801513672,
"epoch": 0.7335047129391602,
"grad_norm": 0.2500321567058563,
"kl": 0.021209716796875,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0647,
"reward": 0.2879646308720112,
"reward_std": 0.6987240761518478,
"rewards/cosine_scaled_reward": 0.1439823191612959,
"rewards/format_reward": 0.0,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 2859.0694580078125,
"epoch": 0.7352185089974294,
"grad_norm": 0.17108042538166046,
"kl": 0.02716064453125,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0365,
"reward": -0.21791245974600315,
"reward_std": 0.5681828185915947,
"rewards/cosine_scaled_reward": -0.10895622940734029,
"rewards/format_reward": 0.0,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 2675.52783203125,
"epoch": 0.7369323050556984,
"grad_norm": 0.18789908289909363,
"kl": 0.022308349609375,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0154,
"reward": -0.023968554101884365,
"reward_std": 0.5900578051805496,
"rewards/cosine_scaled_reward": -0.01198427053168416,
"rewards/format_reward": 0.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 2358.041717529297,
"epoch": 0.7386461011139674,
"grad_norm": 0.24318993091583252,
"kl": 0.022705078125,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0687,
"reward": 0.029904491268098354,
"reward_std": 0.7376819550991058,
"rewards/cosine_scaled_reward": 0.01495224516838789,
"rewards/format_reward": 0.0,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 3008.9306030273438,
"epoch": 0.7403598971722365,
"grad_norm": 0.17112420499324799,
"kl": 0.0255126953125,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0164,
"reward": -0.09738675877451897,
"reward_std": 0.39827052876353264,
"rewards/cosine_scaled_reward": -0.04869337775744498,
"rewards/format_reward": 0.0,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 2719.166717529297,
"epoch": 0.7420736932305055,
"grad_norm": 0.17819786071777344,
"kl": 0.024810791015625,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0207,
"reward": 0.0039961859583854675,
"reward_std": 0.4406754970550537,
"rewards/cosine_scaled_reward": 0.0019980808719992638,
"rewards/format_reward": 0.0,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 2033.6805419921875,
"epoch": 0.7437874892887746,
"grad_norm": 0.25923460721969604,
"kl": 0.0193939208984375,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0555,
"reward": -0.2742752702906728,
"reward_std": 0.617987684905529,
"rewards/cosine_scaled_reward": -0.1371376351453364,
"rewards/format_reward": 0.0,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 2954.9027709960938,
"epoch": 0.7455012853470437,
"grad_norm": 0.21946591138839722,
"kl": 0.021759033203125,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0302,
"reward": 0.07878507301211357,
"reward_std": 0.5823550596833229,
"rewards/cosine_scaled_reward": 0.03939253278076649,
"rewards/format_reward": 0.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 2508.8056030273438,
"epoch": 0.7472150814053128,
"grad_norm": 0.18389303982257843,
"kl": 0.022979736328125,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0511,
"reward": 0.11886966414749622,
"reward_std": 0.6237533167004585,
"rewards/cosine_scaled_reward": 0.05943482369184494,
"rewards/format_reward": 0.0,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 2480.375030517578,
"epoch": 0.7489288774635818,
"grad_norm": 0.18969161808490753,
"kl": 0.022491455078125,
"learning_rate": 1.4417536311769885e-07,
"loss": -0.0202,
"reward": -0.3488190211355686,
"reward_std": 0.6528129577636719,
"rewards/cosine_scaled_reward": -0.17440950870513916,
"rewards/format_reward": 0.0,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 2536.4444580078125,
"epoch": 0.7506426735218509,
"grad_norm": 0.2745197117328644,
"kl": 0.0180511474609375,
"learning_rate": 1.4282782639029128e-07,
"loss": -0.0504,
"reward": 0.2845611646771431,
"reward_std": 0.4479832947254181,
"rewards/cosine_scaled_reward": 0.14228056371212006,
"rewards/format_reward": 0.0,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 3101.6528930664062,
"epoch": 0.7523564695801199,
"grad_norm": 0.16732795536518097,
"kl": 0.023773193359375,
"learning_rate": 1.4150013466019114e-07,
"loss": -0.0111,
"reward": -0.2682619922561571,
"reward_std": 0.6106480062007904,
"rewards/cosine_scaled_reward": -0.134130991587881,
"rewards/format_reward": 0.0,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 2572.5555725097656,
"epoch": 0.7540702656383891,
"grad_norm": 0.19039277732372284,
"kl": 0.023834228515625,
"learning_rate": 1.4019235263722034e-07,
"loss": -0.0026,
"reward": 0.057762331794947386,
"reward_std": 0.5597369149327278,
"rewards/cosine_scaled_reward": 0.02888116310350597,
"rewards/format_reward": 0.0,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 2865.7916259765625,
"epoch": 0.7557840616966581,
"grad_norm": 0.19560997188091278,
"kl": 0.024139404296875,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0056,
"reward": -0.09995577030349523,
"reward_std": 0.6689890846610069,
"rewards/cosine_scaled_reward": -0.049977882008533925,
"rewards/format_reward": 0.0,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 2008.4305419921875,
"epoch": 0.7574978577549272,
"grad_norm": 0.2906733751296997,
"kl": 0.02508544921875,
"learning_rate": 1.3763677169699217e-07,
"loss": -0.0189,
"reward": 0.1475011482834816,
"reward_std": 0.6993541121482849,
"rewards/cosine_scaled_reward": 0.07375057972967625,
"rewards/format_reward": 0.0,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 2263.736114501953,
"epoch": 0.7592116538131962,
"grad_norm": 0.27822345495224,
"kl": 0.027435302734375,
"learning_rate": 1.3638909733514452e-07,
"loss": -0.0396,
"reward": 0.08332556113600731,
"reward_std": 0.692223846912384,
"rewards/cosine_scaled_reward": 0.04166277777403593,
"rewards/format_reward": 0.0,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 2766.52783203125,
"epoch": 0.7609254498714653,
"grad_norm": 0.20186901092529297,
"kl": 0.02777099609375,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0635,
"reward": 0.03594814520329237,
"reward_std": 0.6744156032800674,
"rewards/cosine_scaled_reward": 0.017974070739001036,
"rewards/format_reward": 0.0,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 3045.90283203125,
"epoch": 0.7626392459297343,
"grad_norm": 0.24945306777954102,
"kl": 0.0214080810546875,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0559,
"reward": -0.03293860936537385,
"reward_std": 0.6841256394982338,
"rewards/cosine_scaled_reward": -0.016469309804961085,
"rewards/format_reward": 0.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 2726.013885498047,
"epoch": 0.7643530419880035,
"grad_norm": 0.267711341381073,
"kl": 0.02545166015625,
"learning_rate": 1.3276726544494571e-07,
"loss": -0.0273,
"reward": 0.17773457616567612,
"reward_std": 0.47991518676280975,
"rewards/cosine_scaled_reward": 0.08886728808283806,
"rewards/format_reward": 0.0,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 2652.3750610351562,
"epoch": 0.7660668380462725,
"grad_norm": 0.1955571472644806,
"kl": 0.024688720703125,
"learning_rate": 1.316005813502869e-07,
"loss": -0.0013,
"reward": -0.21300538629293442,
"reward_std": 0.5716921538114548,
"rewards/cosine_scaled_reward": -0.10650269035249949,
"rewards/format_reward": 0.0,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 2869.0972290039062,
"epoch": 0.7677806341045416,
"grad_norm": 0.18771569430828094,
"kl": 0.020751953125,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0449,
"reward": 0.015052955597639084,
"reward_std": 0.6415582820773125,
"rewards/cosine_scaled_reward": 0.007526477798819542,
"rewards/format_reward": 0.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 2951.72216796875,
"epoch": 0.7694944301628106,
"grad_norm": 0.186003640294075,
"kl": 0.0264739990234375,
"learning_rate": 1.2932844562179352e-07,
"loss": -0.0117,
"reward": -0.1732272356748581,
"reward_std": 0.6033661440014839,
"rewards/cosine_scaled_reward": -0.0866136197000742,
"rewards/format_reward": 0.0,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 2393.916717529297,
"epoch": 0.7712082262210797,
"grad_norm": 0.18100591003894806,
"kl": 0.018280029296875,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0174,
"reward": -0.17222392931580544,
"reward_std": 0.4759965166449547,
"rewards/cosine_scaled_reward": -0.08611196093261242,
"rewards/format_reward": 0.0,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 3247.5555419921875,
"epoch": 0.7729220222793488,
"grad_norm": 0.16927795112133026,
"kl": 0.026092529296875,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0101,
"reward": -0.1602705717086792,
"reward_std": 0.5965098738670349,
"rewards/cosine_scaled_reward": -0.0801352858543396,
"rewards/format_reward": 0.0,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 2699.2222900390625,
"epoch": 0.7746358183376179,
"grad_norm": 0.16316962242126465,
"kl": 0.0219879150390625,
"learning_rate": 1.260741462457165e-07,
"loss": 0.055,
"reward": -0.06079525873064995,
"reward_std": 0.6986799910664558,
"rewards/cosine_scaled_reward": -0.030397622846066952,
"rewards/format_reward": 0.0,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 2595.3472290039062,
"epoch": 0.7763496143958869,
"grad_norm": 0.5946676135063171,
"kl": 0.029022216796875,
"learning_rate": 1.2503063339313356e-07,
"loss": -0.0538,
"reward": -0.26486414577811956,
"reward_std": 0.415864534676075,
"rewards/cosine_scaled_reward": -0.13243207102641463,
"rewards/format_reward": 0.0,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 2008.9305725097656,
"epoch": 0.778063410454156,
"grad_norm": 0.19199617207050323,
"kl": 0.0153961181640625,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0063,
"reward": 0.13748213648796082,
"reward_std": 0.6150016859173775,
"rewards/cosine_scaled_reward": 0.06874106079339981,
"rewards/format_reward": 0.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 2725.0000610351562,
"epoch": 0.779777206512425,
"grad_norm": 0.1928061991930008,
"kl": 0.02520751953125,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0235,
"reward": -0.0030081644654273987,
"reward_std": 0.7155122309923172,
"rewards/cosine_scaled_reward": -0.0015040775761008263,
"rewards/format_reward": 0.0,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 2560.888916015625,
"epoch": 0.781491002570694,
"grad_norm": 0.2278081625699997,
"kl": 0.0233154296875,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0422,
"reward": -0.040069979906547815,
"reward_std": 0.6579814180731773,
"rewards/cosine_scaled_reward": -0.02003499452257529,
"rewards/format_reward": 0.0,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 2620.638916015625,
"epoch": 0.7832047986289632,
"grad_norm": 0.2580767571926117,
"kl": 0.0252532958984375,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.033,
"reward": 0.23012623190879822,
"reward_std": 0.6976396143436432,
"rewards/cosine_scaled_reward": 0.11506311595439911,
"rewards/format_reward": 0.0,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 2410.513946533203,
"epoch": 0.7849185946872322,
"grad_norm": 0.2296840250492096,
"kl": 0.0242919921875,
"learning_rate": 1.2012473704494537e-07,
"loss": -0.0221,
"reward": -0.00019283778965473175,
"reward_std": 0.6016373038291931,
"rewards/cosine_scaled_reward": -9.64207574725151e-05,
"rewards/format_reward": 0.0,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 2265.4583435058594,
"epoch": 0.7866323907455013,
"grad_norm": 0.23107987642288208,
"kl": 0.0201568603515625,
"learning_rate": 1.1920622611056974e-07,
"loss": -0.0309,
"reward": 0.18072006362490356,
"reward_std": 0.5644106566905975,
"rewards/cosine_scaled_reward": 0.09036003064829856,
"rewards/format_reward": 0.0,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 2700.541748046875,
"epoch": 0.7883461868037703,
"grad_norm": 0.21530865132808685,
"kl": 0.024566650390625,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0267,
"reward": 0.030735374661162496,
"reward_std": 0.7207788527011871,
"rewards/cosine_scaled_reward": 0.01536769128870219,
"rewards/format_reward": 0.0,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 2300.8333435058594,
"epoch": 0.7900599828620394,
"grad_norm": 0.31069549918174744,
"kl": 0.023162841796875,
"learning_rate": 1.1743223682775649e-07,
"loss": -0.0563,
"reward": 0.09233328700065613,
"reward_std": 0.7090381979942322,
"rewards/cosine_scaled_reward": 0.046166639775037766,
"rewards/format_reward": 0.0,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 2608.8056030273438,
"epoch": 0.7917737789203085,
"grad_norm": 0.20271840691566467,
"kl": 0.0250244140625,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0362,
"reward": -0.07103721424937248,
"reward_std": 0.7956888303160667,
"rewards/cosine_scaled_reward": -0.03551860898733139,
"rewards/format_reward": 0.0,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 2780.4444580078125,
"epoch": 0.7934875749785776,
"grad_norm": 0.2081584334373474,
"kl": 0.0209808349609375,
"learning_rate": 1.1574257748745986e-07,
"loss": -0.0471,
"reward": 0.021411696448922157,
"reward_std": 0.48744403570890427,
"rewards/cosine_scaled_reward": 0.010705851949751377,
"rewards/format_reward": 0.0,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 2673.4861450195312,
"epoch": 0.7952013710368466,
"grad_norm": 0.19560140371322632,
"kl": 0.0238037109375,
"learning_rate": 1.1492947512799328e-07,
"loss": -0.0409,
"reward": 0.18470758572220802,
"reward_std": 0.5649774596095085,
"rewards/cosine_scaled_reward": 0.09235379751771688,
"rewards/format_reward": 0.0,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 3105.27783203125,
"epoch": 0.7969151670951157,
"grad_norm": 0.20849952101707458,
"kl": 0.030609130859375,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.041,
"reward": -0.284846730530262,
"reward_std": 0.5418054684996605,
"rewards/cosine_scaled_reward": -0.14242336247116327,
"rewards/format_reward": 0.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 2965.8611450195312,
"epoch": 0.7986289631533847,
"grad_norm": 0.17242176830768585,
"kl": 0.025604248046875,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0034,
"reward": -0.3226154297590256,
"reward_std": 0.5333989933133125,
"rewards/cosine_scaled_reward": -0.16130771208554506,
"rewards/format_reward": 0.0,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 3079.6806030273438,
"epoch": 0.8003427592116538,
"grad_norm": 0.20415925979614258,
"kl": 0.0255126953125,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0969,
"reward": -0.24770671501755714,
"reward_std": 0.5701889246702194,
"rewards/cosine_scaled_reward": -0.12385335750877857,
"rewards/format_reward": 0.0,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 2234.986083984375,
"epoch": 0.8020565552699229,
"grad_norm": 0.3467549979686737,
"kl": 0.024200439453125,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0855,
"reward": -0.009742069989442825,
"reward_std": 0.591868631541729,
"rewards/cosine_scaled_reward": -0.004871031269431114,
"rewards/format_reward": 0.0,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 2717.0833435058594,
"epoch": 0.803770351328192,
"grad_norm": 0.20191779732704163,
"kl": 0.0237579345703125,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0622,
"reward": 0.058277749456465244,
"reward_std": 0.7684449702501297,
"rewards/cosine_scaled_reward": 0.029138876125216484,
"rewards/format_reward": 0.0,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 2607.1388549804688,
"epoch": 0.805484147386461,
"grad_norm": 0.2015339732170105,
"kl": 0.0255126953125,
"learning_rate": 1.1049747474962444e-07,
"loss": -0.0145,
"reward": -0.2184823751449585,
"reward_std": 0.5644990280270576,
"rewards/cosine_scaled_reward": -0.10924118757247925,
"rewards/format_reward": 0.0,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 3260.3333740234375,
"epoch": 0.8071979434447301,
"grad_norm": 0.2259937822818756,
"kl": 0.02581787109375,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0565,
"reward": -0.22493689320981503,
"reward_std": 0.5675350055098534,
"rewards/cosine_scaled_reward": -0.11246845219284296,
"rewards/format_reward": 0.0,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 2854.1806030273438,
"epoch": 0.8089117395029991,
"grad_norm": 0.21495820581912994,
"kl": 0.028594970703125,
"learning_rate": 1.0919113768029517e-07,
"loss": -0.0313,
"reward": 0.14072632044553757,
"reward_std": 0.6395101621747017,
"rewards/cosine_scaled_reward": 0.07036316394805908,
"rewards/format_reward": 0.0,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 2890.4444580078125,
"epoch": 0.8106255355612683,
"grad_norm": 0.18685470521450043,
"kl": 0.023468017578125,
"learning_rate": 1.0857018009286381e-07,
"loss": -0.0026,
"reward": -0.1240294948220253,
"reward_std": 0.48969001322984695,
"rewards/cosine_scaled_reward": -0.06201474368572235,
"rewards/format_reward": 0.0,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 2470.0972900390625,
"epoch": 0.8123393316195373,
"grad_norm": 0.2818465232849121,
"kl": 0.029144287109375,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0035,
"reward": 0.24050107831135392,
"reward_std": 0.5852163806557655,
"rewards/cosine_scaled_reward": 0.12025054381228983,
"rewards/format_reward": 0.0,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 2933.541748046875,
"epoch": 0.8140531276778064,
"grad_norm": 0.19675259292125702,
"kl": 0.027496337890625,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0321,
"reward": -0.22623535431921482,
"reward_std": 0.6677599251270294,
"rewards/cosine_scaled_reward": -0.1131176782073453,
"rewards/format_reward": 0.0,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 2794.9862060546875,
"epoch": 0.8157669237360754,
"grad_norm": 0.24406589567661285,
"kl": 0.02947998046875,
"learning_rate": 1.068365111445064e-07,
"loss": -0.0179,
"reward": -0.3521595522761345,
"reward_std": 0.5985631048679352,
"rewards/cosine_scaled_reward": -0.17607977613806725,
"rewards/format_reward": 0.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 2567.4722290039062,
"epoch": 0.8174807197943444,
"grad_norm": 0.17105937004089355,
"kl": 0.016357421875,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0385,
"reward": -0.2558911629021168,
"reward_std": 0.4246537983417511,
"rewards/cosine_scaled_reward": -0.12794558703899384,
"rewards/format_reward": 0.0,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 2355.6806030273438,
"epoch": 0.8191945158526135,
"grad_norm": 0.24741511046886444,
"kl": 0.02423095703125,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0854,
"reward": -0.1565770129673183,
"reward_std": 0.6820876449346542,
"rewards/cosine_scaled_reward": -0.07828850811347365,
"rewards/format_reward": 0.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 2624.0833435058594,
"epoch": 0.8209083119108826,
"grad_norm": 0.20646637678146362,
"kl": 0.0313720703125,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0146,
"reward": 0.1560894399881363,
"reward_std": 0.5770560130476952,
"rewards/cosine_scaled_reward": 0.0780447069555521,
"rewards/format_reward": 0.0,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 2327.7222290039062,
"epoch": 0.8226221079691517,
"grad_norm": 0.23056431114673615,
"kl": 0.01910400390625,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0635,
"reward": 0.149917745962739,
"reward_std": 0.7103602811694145,
"rewards/cosine_scaled_reward": 0.07495887111872435,
"rewards/format_reward": 0.0,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 2461.5972900390625,
"epoch": 0.8243359040274207,
"grad_norm": 0.24706712365150452,
"kl": 0.03350830078125,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0029,
"reward": -0.1800133902579546,
"reward_std": 0.49367547780275345,
"rewards/cosine_scaled_reward": -0.09000669163651764,
"rewards/format_reward": 0.0,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 2864.0694580078125,
"epoch": 0.8260497000856898,
"grad_norm": 0.25076547265052795,
"kl": 0.021514892578125,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0445,
"reward": -0.14674655348062515,
"reward_std": 0.5242787301540375,
"rewards/cosine_scaled_reward": -0.07337328046560287,
"rewards/format_reward": 0.0,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 2596.874969482422,
"epoch": 0.8277634961439588,
"grad_norm": 0.2056618481874466,
"kl": 0.02398681640625,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0449,
"reward": -0.12835523579269648,
"reward_std": 0.5452019795775414,
"rewards/cosine_scaled_reward": -0.06417762162163854,
"rewards/format_reward": 0.0,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 2179.500030517578,
"epoch": 0.829477292202228,
"grad_norm": 0.17514649033546448,
"kl": 0.0175018310546875,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.058,
"reward": -0.154528075363487,
"reward_std": 0.5336438938975334,
"rewards/cosine_scaled_reward": -0.07726403628475964,
"rewards/format_reward": 0.0,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 2221.6666259765625,
"epoch": 0.831191088260497,
"grad_norm": 0.2613643705844879,
"kl": 0.028350830078125,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0496,
"reward": -0.2054775208234787,
"reward_std": 0.5721682235598564,
"rewards/cosine_scaled_reward": -0.1027387659996748,
"rewards/format_reward": 0.0,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 2448.3193969726562,
"epoch": 0.8329048843187661,
"grad_norm": 0.2419876903295517,
"kl": 0.028411865234375,
"learning_rate": 1.0246514708427701e-07,
"loss": -0.0397,
"reward": -0.06807173043489456,
"reward_std": 0.507116761058569,
"rewards/cosine_scaled_reward": -0.03403585962951183,
"rewards/format_reward": 0.0,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 2735.7084045410156,
"epoch": 0.8346186803770351,
"grad_norm": 0.2406454235315323,
"kl": 0.0244140625,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0244,
"reward": -0.2896123267710209,
"reward_std": 0.5326507315039635,
"rewards/cosine_scaled_reward": -0.1448061689734459,
"rewards/format_reward": 0.0,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 2310.27783203125,
"epoch": 0.8363324764353042,
"grad_norm": 0.252458781003952,
"kl": 0.021209716796875,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0239,
"reward": 0.10779337584972382,
"reward_std": 0.6982715576887131,
"rewards/cosine_scaled_reward": 0.053896697354502976,
"rewards/format_reward": 0.0,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 2743.541748046875,
"epoch": 0.8380462724935732,
"grad_norm": 0.17434372007846832,
"kl": 0.024749755859375,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0124,
"reward": -0.16761679388582706,
"reward_std": 0.6089917570352554,
"rewards/cosine_scaled_reward": -0.08380839880555868,
"rewards/format_reward": 0.0,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 2486.236114501953,
"epoch": 0.8397600685518424,
"grad_norm": 0.2090701311826706,
"kl": 0.027984619140625,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0036,
"reward": -0.05014536017552018,
"reward_std": 0.5763295590877533,
"rewards/cosine_scaled_reward": -0.025072677060961723,
"rewards/format_reward": 0.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 2848.888916015625,
"epoch": 0.8414738646101114,
"grad_norm": 0.22741778194904327,
"kl": 0.029052734375,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0629,
"reward": -0.16439465060830116,
"reward_std": 0.6782207787036896,
"rewards/cosine_scaled_reward": -0.08219731226563454,
"rewards/format_reward": 0.0,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 3189.6250610351562,
"epoch": 0.8431876606683805,
"grad_norm": 0.17802605032920837,
"kl": 0.028839111328125,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0435,
"reward": -0.35712628811597824,
"reward_std": 0.6088190823793411,
"rewards/cosine_scaled_reward": -0.17856314033269882,
"rewards/format_reward": 0.0,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 2387.888885498047,
"epoch": 0.8449014567266495,
"grad_norm": 0.2299438714981079,
"kl": 0.025177001953125,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0576,
"reward": -0.17006561160087585,
"reward_std": 0.3991905003786087,
"rewards/cosine_scaled_reward": -0.08503280207514763,
"rewards/format_reward": 0.0,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 2941.8611450195312,
"epoch": 0.8466152527849186,
"grad_norm": 0.18643692135810852,
"kl": 0.0261383056640625,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0474,
"reward": -0.04825907852500677,
"reward_std": 0.6627323552966118,
"rewards/cosine_scaled_reward": -0.02412955043837428,
"rewards/format_reward": 0.0,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 2563.375030517578,
"epoch": 0.8483290488431876,
"grad_norm": 0.32022979855537415,
"kl": 0.03094482421875,
"learning_rate": 1.0039472645551372e-07,
"loss": -0.0218,
"reward": 0.03584544826298952,
"reward_std": 0.5564405769109726,
"rewards/cosine_scaled_reward": 0.017922731814906,
"rewards/format_reward": 0.0,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 2572.77783203125,
"epoch": 0.8500428449014568,
"grad_norm": 0.23663800954818726,
"kl": 0.0213623046875,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0756,
"reward": 0.13385188579559326,
"reward_std": 0.6874089986085892,
"rewards/cosine_scaled_reward": 0.06692593172192574,
"rewards/format_reward": 0.0,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 2480.5694274902344,
"epoch": 0.8517566409597258,
"grad_norm": 0.2888961732387543,
"kl": 0.0250244140625,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.1087,
"reward": 0.2151249535381794,
"reward_std": 0.7869587689638138,
"rewards/cosine_scaled_reward": 0.10756248049438,
"rewards/format_reward": 0.0,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 3000.1666259765625,
"epoch": 0.8534704370179949,
"grad_norm": 0.16567862033843994,
"kl": 0.0272216796875,
"learning_rate": 1.0009869243631952e-07,
"loss": -0.0254,
"reward": -0.33249833807349205,
"reward_std": 0.48314109444618225,
"rewards/cosine_scaled_reward": -0.16624917834997177,
"rewards/format_reward": 0.0,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 2385.5416870117188,
"epoch": 0.8551842330762639,
"grad_norm": 0.20905697345733643,
"kl": 0.027862548828125,
"learning_rate": 1.000438641958131e-07,
"loss": 0.017,
"reward": 0.046394890174269676,
"reward_std": 0.649334043264389,
"rewards/cosine_scaled_reward": 0.02319744322448969,
"rewards/format_reward": 0.0,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 1828.2777862548828,
"epoch": 0.856898029134533,
"grad_norm": 0.25425171852111816,
"kl": 0.0164031982421875,
"learning_rate": 1.0001096618257236e-07,
"loss": -0.0015,
"reward": 0.1327105201780796,
"reward_std": 0.5622994378209114,
"rewards/cosine_scaled_reward": 0.06635526567697525,
"rewards/format_reward": 0.0,
"step": 500
},
{
"epoch": 0.856898029134533,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.01978990149567835,
"train_runtime": 91059.0796,
"train_samples_per_second": 0.395,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}