{ "best_metric": 0.23313412070274353, "best_model_checkpoint": "models/qwen2.5-3b-dpo-coarse/checkpoint-10000", "epoch": 1.0, "eval_steps": 5000, "global_step": 13291, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.523888345496953e-05, "grad_norm": 2.0655642984991194, "learning_rate": 3.7593984962406016e-10, "logits/chosen": -2.03125, "logits/rejected": -1.53125, "logps/chosen": -121.0, "logps/rejected": -146.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0007523888345496953, "grad_norm": 1.9538445459115994, "learning_rate": 3.759398496240601e-09, "logits/chosen": -2.390625, "logits/rejected": -2.09375, "logps/chosen": -149.0, "logps/rejected": -147.0, "loss": 0.692, "rewards/accuracies": 0.2986111044883728, "rewards/chosen": 0.0013580322265625, "rewards/margins": 0.00238037109375, "rewards/rejected": -0.0010223388671875, "step": 10 }, { "epoch": 0.0015047776690993906, "grad_norm": 1.7482239733475604, "learning_rate": 7.518796992481202e-09, "logits/chosen": -2.390625, "logits/rejected": -2.0, "logps/chosen": -178.0, "logps/rejected": -159.0, "loss": 0.6927, "rewards/accuracies": 0.375, "rewards/chosen": 0.00087738037109375, "rewards/margins": 0.0001277923583984375, "rewards/rejected": 0.000751495361328125, "step": 20 }, { "epoch": 0.0022571665036490857, "grad_norm": 1.6937945346625471, "learning_rate": 1.1278195488721804e-08, "logits/chosen": -2.40625, "logits/rejected": -2.0, "logps/chosen": -173.0, "logps/rejected": -171.0, "loss": 0.6927, "rewards/accuracies": 0.40625, "rewards/chosen": 0.00151824951171875, "rewards/margins": 0.0020599365234375, "rewards/rejected": -0.000545501708984375, "step": 30 }, { "epoch": 0.003009555338198781, "grad_norm": 1.8569983758522628, "learning_rate": 1.5037593984962404e-08, "logits/chosen": -2.34375, "logits/rejected": -2.078125, "logps/chosen": -161.0, "logps/rejected": -161.0, "loss": 0.6927, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.000705718994140625, "rewards/margins": -0.000499725341796875, "rewards/rejected": 0.0012054443359375, "step": 40 }, { "epoch": 0.0037619441727484763, "grad_norm": 1.955956889927452, "learning_rate": 1.8796992481203004e-08, "logits/chosen": -2.375, "logits/rejected": -2.03125, "logps/chosen": -154.0, "logps/rejected": -155.0, "loss": 0.6927, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.00151824951171875, "rewards/margins": 0.00048828125, "rewards/rejected": -0.0019989013671875, "step": 50 }, { "epoch": 0.004514333007298171, "grad_norm": 1.8154990127537094, "learning_rate": 2.2556390977443608e-08, "logits/chosen": -2.40625, "logits/rejected": -2.015625, "logps/chosen": -161.0, "logps/rejected": -147.0, "loss": 0.6924, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.000499725341796875, "rewards/margins": -0.000720977783203125, "rewards/rejected": 0.00021839141845703125, "step": 60 }, { "epoch": 0.005266721841847867, "grad_norm": 1.987488235998793, "learning_rate": 2.6315789473684208e-08, "logits/chosen": -2.40625, "logits/rejected": -1.890625, "logps/chosen": -169.0, "logps/rejected": -171.0, "loss": 0.6926, "rewards/accuracies": 0.375, "rewards/chosen": 0.00022125244140625, "rewards/margins": 0.00021839141845703125, "rewards/rejected": -1.5273690223693848e-06, "step": 70 }, { "epoch": 0.006019110676397562, "grad_norm": 1.7715247772447917, "learning_rate": 3.007518796992481e-08, "logits/chosen": -2.390625, "logits/rejected": -2.03125, "logps/chosen": -157.0, "logps/rejected": -148.0, "loss": 0.6928, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 0.00032806396484375, "rewards/margins": 0.0004253387451171875, "rewards/rejected": -9.059906005859375e-05, "step": 80 }, { "epoch": 0.006771499510947258, "grad_norm": 1.5425426980374812, "learning_rate": 3.383458646616541e-08, "logits/chosen": -2.453125, "logits/rejected": -2.15625, "logps/chosen": -171.0, "logps/rejected": -151.0, "loss": 0.6926, "rewards/accuracies": 0.375, "rewards/chosen": -0.000644683837890625, "rewards/margins": 7.82012939453125e-05, "rewards/rejected": -0.000720977783203125, "step": 90 }, { "epoch": 0.0075238883454969525, "grad_norm": 1.8781269521393504, "learning_rate": 3.759398496240601e-08, "logits/chosen": -2.375, "logits/rejected": -2.015625, "logps/chosen": -163.0, "logps/rejected": -147.0, "loss": 0.6922, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.00089263916015625, "rewards/margins": -0.00010251998901367188, "rewards/rejected": -0.000789642333984375, "step": 100 }, { "epoch": 0.008276277180046648, "grad_norm": 1.8425671870697207, "learning_rate": 4.1353383458646615e-08, "logits/chosen": -2.21875, "logits/rejected": -1.953125, "logps/chosen": -159.0, "logps/rejected": -154.0, "loss": 0.6927, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 0.00018787384033203125, "rewards/margins": -0.000438690185546875, "rewards/rejected": 0.0006256103515625, "step": 110 }, { "epoch": 0.009028666014596343, "grad_norm": 1.8232396339646215, "learning_rate": 4.5112781954887216e-08, "logits/chosen": -2.359375, "logits/rejected": -1.953125, "logps/chosen": -158.0, "logps/rejected": -139.0, "loss": 0.6923, "rewards/accuracies": 0.3125, "rewards/chosen": 0.00189208984375, "rewards/margins": -0.0004062652587890625, "rewards/rejected": 0.0023040771484375, "step": 120 }, { "epoch": 0.009781054849146039, "grad_norm": 2.0676564160851814, "learning_rate": 4.8872180451127816e-08, "logits/chosen": -2.40625, "logits/rejected": -2.015625, "logps/chosen": -137.0, "logps/rejected": -147.0, "loss": 0.692, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 0.000797271728515625, "rewards/margins": -3.0994415283203125e-05, "rewards/rejected": 0.000827789306640625, "step": 130 }, { "epoch": 0.010533443683695734, "grad_norm": 1.9395451342559986, "learning_rate": 5.2631578947368416e-08, "logits/chosen": -2.3125, "logits/rejected": -2.046875, "logps/chosen": -159.0, "logps/rejected": -147.0, "loss": 0.692, "rewards/accuracies": 0.375, "rewards/chosen": 0.001190185546875, "rewards/margins": -1.5974044799804688e-05, "rewards/rejected": 0.0012054443359375, "step": 140 }, { "epoch": 0.01128583251824543, "grad_norm": 1.9021298256381785, "learning_rate": 5.6390977443609016e-08, "logits/chosen": -2.375, "logits/rejected": -1.9140625, "logps/chosen": -175.0, "logps/rejected": -148.0, "loss": 0.6921, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.00177001953125, "rewards/margins": 0.00213623046875, "rewards/rejected": -0.000370025634765625, "step": 150 }, { "epoch": 0.012038221352795125, "grad_norm": 1.9054050326686525, "learning_rate": 6.015037593984962e-08, "logits/chosen": -2.40625, "logits/rejected": -2.0625, "logps/chosen": -160.0, "logps/rejected": -145.0, "loss": 0.6917, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0025482177734375, "rewards/margins": 0.0022430419921875, "rewards/rejected": 0.0003108978271484375, "step": 160 }, { "epoch": 0.01279061018734482, "grad_norm": 1.926853611550257, "learning_rate": 6.390977443609022e-08, "logits/chosen": -2.296875, "logits/rejected": -1.859375, "logps/chosen": -165.0, "logps/rejected": -151.0, "loss": 0.6912, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0035552978515625, "rewards/margins": 0.003875732421875, "rewards/rejected": -0.0003299713134765625, "step": 170 }, { "epoch": 0.013542999021894516, "grad_norm": 1.9234881642608486, "learning_rate": 6.766917293233082e-08, "logits/chosen": -2.359375, "logits/rejected": -2.03125, "logps/chosen": -145.0, "logps/rejected": -154.0, "loss": 0.6915, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.00628662109375, "rewards/margins": 0.00213623046875, "rewards/rejected": 0.004150390625, "step": 180 }, { "epoch": 0.01429538785644421, "grad_norm": 1.9784096184868571, "learning_rate": 7.142857142857142e-08, "logits/chosen": -2.5, "logits/rejected": -2.109375, "logps/chosen": -163.0, "logps/rejected": -142.0, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": 0.006256103515625, "rewards/margins": 0.0028839111328125, "rewards/rejected": 0.0033721923828125, "step": 190 }, { "epoch": 0.015047776690993905, "grad_norm": 1.631724548633527, "learning_rate": 7.518796992481202e-08, "logits/chosen": -2.46875, "logits/rejected": -2.09375, "logps/chosen": -162.0, "logps/rejected": -157.0, "loss": 0.691, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00616455078125, "rewards/margins": 0.0031280517578125, "rewards/rejected": 0.003021240234375, "step": 200 }, { "epoch": 0.0158001655255436, "grad_norm": 1.7460592334442049, "learning_rate": 7.894736842105262e-08, "logits/chosen": -2.421875, "logits/rejected": -2.09375, "logps/chosen": -158.0, "logps/rejected": -153.0, "loss": 0.6906, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00665283203125, "rewards/margins": 0.002532958984375, "rewards/rejected": 0.004119873046875, "step": 210 }, { "epoch": 0.016552554360093296, "grad_norm": 1.864652158743688, "learning_rate": 8.270676691729323e-08, "logits/chosen": -2.46875, "logits/rejected": -2.140625, "logps/chosen": -169.0, "logps/rejected": -157.0, "loss": 0.6898, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.010498046875, "rewards/margins": 0.00445556640625, "rewards/rejected": 0.00604248046875, "step": 220 }, { "epoch": 0.017304943194642992, "grad_norm": 1.8807523922343017, "learning_rate": 8.646616541353382e-08, "logits/chosen": -2.40625, "logits/rejected": -2.03125, "logps/chosen": -150.0, "logps/rejected": -166.0, "loss": 0.6892, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.01275634765625, "rewards/margins": 0.007049560546875, "rewards/rejected": 0.005706787109375, "step": 230 }, { "epoch": 0.018057332029192685, "grad_norm": 1.8543547888579985, "learning_rate": 9.022556390977443e-08, "logits/chosen": -2.453125, "logits/rejected": -2.125, "logps/chosen": -160.0, "logps/rejected": -153.0, "loss": 0.6882, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.016357421875, "rewards/margins": 0.0087890625, "rewards/rejected": 0.007537841796875, "step": 240 }, { "epoch": 0.01880972086374238, "grad_norm": 2.002695069307674, "learning_rate": 9.398496240601502e-08, "logits/chosen": -2.359375, "logits/rejected": -1.953125, "logps/chosen": -138.0, "logps/rejected": -145.0, "loss": 0.6882, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.017822265625, "rewards/margins": 0.01043701171875, "rewards/rejected": 0.00738525390625, "step": 250 }, { "epoch": 0.019562109698292078, "grad_norm": 1.694938582256644, "learning_rate": 9.774436090225563e-08, "logits/chosen": -2.390625, "logits/rejected": -2.0625, "logps/chosen": -143.0, "logps/rejected": -154.0, "loss": 0.6874, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0152587890625, "rewards/margins": 0.00787353515625, "rewards/rejected": 0.007354736328125, "step": 260 }, { "epoch": 0.02031449853284177, "grad_norm": 1.6267392659283901, "learning_rate": 1.0150375939849622e-07, "logits/chosen": -2.484375, "logits/rejected": -2.140625, "logps/chosen": -173.0, "logps/rejected": -155.0, "loss": 0.6872, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0145263671875, "rewards/margins": 0.0120849609375, "rewards/rejected": 0.0024261474609375, "step": 270 }, { "epoch": 0.021066887367391467, "grad_norm": 1.8009833530928583, "learning_rate": 1.0526315789473683e-07, "logits/chosen": -2.359375, "logits/rejected": -2.25, "logps/chosen": -169.0, "logps/rejected": -151.0, "loss": 0.6859, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.01953125, "rewards/margins": 0.0123291015625, "rewards/rejected": 0.00714111328125, "step": 280 }, { "epoch": 0.021819276201941164, "grad_norm": 1.751273107274032, "learning_rate": 1.0902255639097744e-07, "logits/chosen": -2.40625, "logits/rejected": -2.0625, "logps/chosen": -151.0, "logps/rejected": -156.0, "loss": 0.686, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.01953125, "rewards/margins": 0.01397705078125, "rewards/rejected": 0.005462646484375, "step": 290 }, { "epoch": 0.02257166503649086, "grad_norm": 1.8215393575640535, "learning_rate": 1.1278195488721803e-07, "logits/chosen": -2.5, "logits/rejected": -2.0625, "logps/chosen": -148.0, "logps/rejected": -142.0, "loss": 0.6845, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.02099609375, "rewards/margins": 0.0157470703125, "rewards/rejected": 0.005279541015625, "step": 300 }, { "epoch": 0.023324053871040553, "grad_norm": 1.7387394745988194, "learning_rate": 1.1654135338345864e-07, "logits/chosen": -2.328125, "logits/rejected": -2.09375, "logps/chosen": -152.0, "logps/rejected": -127.0, "loss": 0.6844, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.017333984375, "rewards/margins": 0.0174560546875, "rewards/rejected": -0.00016117095947265625, "step": 310 }, { "epoch": 0.02407644270559025, "grad_norm": 1.7127796622071032, "learning_rate": 1.2030075187969923e-07, "logits/chosen": -2.59375, "logits/rejected": -2.09375, "logps/chosen": -158.0, "logps/rejected": -154.0, "loss": 0.6838, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0218505859375, "rewards/margins": 0.016357421875, "rewards/rejected": 0.005523681640625, "step": 320 }, { "epoch": 0.024828831540139946, "grad_norm": 1.8326169180769538, "learning_rate": 1.2406015037593983e-07, "logits/chosen": -2.546875, "logits/rejected": -2.015625, "logps/chosen": -159.0, "logps/rejected": -174.0, "loss": 0.6806, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0296630859375, "rewards/margins": 0.030029296875, "rewards/rejected": -0.0003986358642578125, "step": 330 }, { "epoch": 0.02558122037468964, "grad_norm": 1.7066967555396413, "learning_rate": 1.2781954887218045e-07, "logits/chosen": -2.546875, "logits/rejected": -2.125, "logps/chosen": -165.0, "logps/rejected": -162.0, "loss": 0.6797, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0230712890625, "rewards/margins": 0.0279541015625, "rewards/rejected": -0.0048828125, "step": 340 }, { "epoch": 0.026333609209239335, "grad_norm": 1.735716628750598, "learning_rate": 1.3157894736842104e-07, "logits/chosen": -2.5625, "logits/rejected": -2.03125, "logps/chosen": -170.0, "logps/rejected": -168.0, "loss": 0.6781, "rewards/accuracies": 0.71875, "rewards/chosen": 0.029541015625, "rewards/margins": 0.03857421875, "rewards/rejected": -0.00909423828125, "step": 350 }, { "epoch": 0.02708599804378903, "grad_norm": 2.113635572426321, "learning_rate": 1.3533834586466163e-07, "logits/chosen": -2.40625, "logits/rejected": -1.984375, "logps/chosen": -165.0, "logps/rejected": -153.0, "loss": 0.6773, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.0218505859375, "rewards/margins": 0.02783203125, "rewards/rejected": -0.005889892578125, "step": 360 }, { "epoch": 0.027838386878338724, "grad_norm": 1.7305039899173027, "learning_rate": 1.3909774436090225e-07, "logits/chosen": -2.5, "logits/rejected": -2.078125, "logps/chosen": -147.0, "logps/rejected": -144.0, "loss": 0.6743, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0341796875, "rewards/margins": 0.039794921875, "rewards/rejected": -0.005645751953125, "step": 370 }, { "epoch": 0.02859077571288842, "grad_norm": 1.7424950093564118, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -2.40625, "logits/rejected": -2.109375, "logps/chosen": -151.0, "logps/rejected": -144.0, "loss": 0.6739, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.0274658203125, "rewards/margins": 0.050048828125, "rewards/rejected": -0.0225830078125, "step": 380 }, { "epoch": 0.029343164547438117, "grad_norm": 1.8220095077196594, "learning_rate": 1.4661654135338344e-07, "logits/chosen": -2.5625, "logits/rejected": -2.265625, "logps/chosen": -177.0, "logps/rejected": -157.0, "loss": 0.6736, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.016357421875, "rewards/margins": 0.04541015625, "rewards/rejected": -0.0291748046875, "step": 390 }, { "epoch": 0.03009555338198781, "grad_norm": 1.9063850709586252, "learning_rate": 1.5037593984962403e-07, "logits/chosen": -2.40625, "logits/rejected": -2.109375, "logps/chosen": -157.0, "logps/rejected": -151.0, "loss": 0.6705, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.01025390625, "rewards/margins": 0.0380859375, "rewards/rejected": -0.02783203125, "step": 400 }, { "epoch": 0.030847942216537506, "grad_norm": 1.6795215046369691, "learning_rate": 1.5413533834586465e-07, "logits/chosen": -2.484375, "logits/rejected": -2.125, "logps/chosen": -162.0, "logps/rejected": -154.0, "loss": 0.6681, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.01104736328125, "rewards/margins": 0.056396484375, "rewards/rejected": -0.045166015625, "step": 410 }, { "epoch": 0.0316003310510872, "grad_norm": 2.091341380928474, "learning_rate": 1.5789473684210525e-07, "logits/chosen": -2.4375, "logits/rejected": -2.09375, "logps/chosen": -153.0, "logps/rejected": -152.0, "loss": 0.6638, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.00799560546875, "rewards/margins": 0.056396484375, "rewards/rejected": -0.064453125, "step": 420 }, { "epoch": 0.0323527198856369, "grad_norm": 2.083815342302955, "learning_rate": 1.6165413533834584e-07, "logits/chosen": -2.515625, "logits/rejected": -2.15625, "logps/chosen": -171.0, "logps/rejected": -159.0, "loss": 0.6609, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.004791259765625, "rewards/margins": 0.07470703125, "rewards/rejected": -0.0791015625, "step": 430 }, { "epoch": 0.03310510872018659, "grad_norm": 2.1697428767427125, "learning_rate": 1.6541353383458646e-07, "logits/chosen": -2.625, "logits/rejected": -2.28125, "logps/chosen": -197.0, "logps/rejected": -177.0, "loss": 0.6559, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0189208984375, "rewards/margins": 0.0888671875, "rewards/rejected": -0.10791015625, "step": 440 }, { "epoch": 0.033857497554736285, "grad_norm": 2.1298018090255297, "learning_rate": 1.6917293233082705e-07, "logits/chosen": -2.5, "logits/rejected": -2.3125, "logps/chosen": -179.0, "logps/rejected": -157.0, "loss": 0.6531, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.0400390625, "rewards/margins": 0.07080078125, "rewards/rejected": -0.11083984375, "step": 450 }, { "epoch": 0.034609886389285985, "grad_norm": 2.1836244309606507, "learning_rate": 1.7293233082706765e-07, "logits/chosen": -2.578125, "logits/rejected": -2.28125, "logps/chosen": -167.0, "logps/rejected": -182.0, "loss": 0.6425, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.0615234375, "rewards/margins": 0.11376953125, "rewards/rejected": -0.17578125, "step": 460 }, { "epoch": 0.03536227522383568, "grad_norm": 2.4213655673685968, "learning_rate": 1.7669172932330824e-07, "logits/chosen": -2.609375, "logits/rejected": -2.21875, "logps/chosen": -172.0, "logps/rejected": -192.0, "loss": 0.6384, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.0830078125, "rewards/margins": 0.130859375, "rewards/rejected": -0.2138671875, "step": 470 }, { "epoch": 0.03611466405838537, "grad_norm": 1.944004930995184, "learning_rate": 1.8045112781954886e-07, "logits/chosen": -2.65625, "logits/rejected": -2.421875, "logps/chosen": -176.0, "logps/rejected": -174.0, "loss": 0.6312, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.12451171875, "rewards/margins": 0.1494140625, "rewards/rejected": -0.2734375, "step": 480 }, { "epoch": 0.03686705289293507, "grad_norm": 2.191837059576488, "learning_rate": 1.8421052631578946e-07, "logits/chosen": -2.5625, "logits/rejected": -2.3125, "logps/chosen": -188.0, "logps/rejected": -201.0, "loss": 0.63, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1689453125, "rewards/margins": 0.1669921875, "rewards/rejected": -0.3359375, "step": 490 }, { "epoch": 0.03761944172748476, "grad_norm": 2.2196701470400395, "learning_rate": 1.8796992481203005e-07, "logits/chosen": -2.53125, "logits/rejected": -2.265625, "logps/chosen": -187.0, "logps/rejected": -185.0, "loss": 0.6251, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2314453125, "rewards/margins": 0.138671875, "rewards/rejected": -0.369140625, "step": 500 }, { "epoch": 0.038371830562034456, "grad_norm": 2.3001313944863644, "learning_rate": 1.9172932330827067e-07, "logits/chosen": -2.59375, "logits/rejected": -2.34375, "logps/chosen": -174.0, "logps/rejected": -182.0, "loss": 0.6189, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.255859375, "rewards/margins": 0.2001953125, "rewards/rejected": -0.45703125, "step": 510 }, { "epoch": 0.039124219396584156, "grad_norm": 2.1503105403846376, "learning_rate": 1.9548872180451126e-07, "logits/chosen": -2.625, "logits/rejected": -2.21875, "logps/chosen": -185.0, "logps/rejected": -190.0, "loss": 0.6124, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.30859375, "rewards/margins": 0.1826171875, "rewards/rejected": -0.4921875, "step": 520 }, { "epoch": 0.03987660823113385, "grad_norm": 2.330665502460398, "learning_rate": 1.9924812030075186e-07, "logits/chosen": -2.71875, "logits/rejected": -2.296875, "logps/chosen": -185.0, "logps/rejected": -208.0, "loss": 0.6012, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.333984375, "rewards/margins": 0.25390625, "rewards/rejected": -0.58984375, "step": 530 }, { "epoch": 0.04062899706568354, "grad_norm": 3.5479816270273523, "learning_rate": 2.0300751879699245e-07, "logits/chosen": -2.640625, "logits/rejected": -2.234375, "logps/chosen": -197.0, "logps/rejected": -206.0, "loss": 0.5986, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.416015625, "rewards/margins": 0.2353515625, "rewards/rejected": -0.65234375, "step": 540 }, { "epoch": 0.04138138590023324, "grad_norm": 2.227973578985842, "learning_rate": 2.0676691729323307e-07, "logits/chosen": -2.734375, "logits/rejected": -2.40625, "logps/chosen": -194.0, "logps/rejected": -204.0, "loss": 0.6006, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.439453125, "rewards/margins": 0.2275390625, "rewards/rejected": -0.66796875, "step": 550 }, { "epoch": 0.042133774734782935, "grad_norm": 2.3382178358022494, "learning_rate": 2.1052631578947366e-07, "logits/chosen": -2.84375, "logits/rejected": -2.546875, "logps/chosen": -203.0, "logps/rejected": -231.0, "loss": 0.5962, "rewards/accuracies": 0.75, "rewards/chosen": -0.451171875, "rewards/margins": 0.25390625, "rewards/rejected": -0.703125, "step": 560 }, { "epoch": 0.042886163569332635, "grad_norm": 2.3431494680035843, "learning_rate": 2.1428571428571426e-07, "logits/chosen": -2.8125, "logits/rejected": -2.453125, "logps/chosen": -216.0, "logps/rejected": -224.0, "loss": 0.5836, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.51171875, "rewards/margins": 0.27734375, "rewards/rejected": -0.7890625, "step": 570 }, { "epoch": 0.04363855240388233, "grad_norm": 2.5522490577618884, "learning_rate": 2.1804511278195488e-07, "logits/chosen": -2.875, "logits/rejected": -2.546875, "logps/chosen": -223.0, "logps/rejected": -230.0, "loss": 0.5791, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.546875, "rewards/margins": 0.306640625, "rewards/rejected": -0.85546875, "step": 580 }, { "epoch": 0.04439094123843202, "grad_norm": 2.8512898992256646, "learning_rate": 2.2180451127819547e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -217.0, "logps/rejected": -237.0, "loss": 0.5741, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.62890625, "rewards/margins": 0.36328125, "rewards/rejected": -0.9921875, "step": 590 }, { "epoch": 0.04514333007298172, "grad_norm": 4.011523561431844, "learning_rate": 2.2556390977443606e-07, "logits/chosen": -2.875, "logits/rejected": -2.734375, "logps/chosen": -245.0, "logps/rejected": -266.0, "loss": 0.5703, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.703125, "rewards/margins": 0.369140625, "rewards/rejected": -1.0703125, "step": 600 }, { "epoch": 0.04589571890753141, "grad_norm": 3.41561545490801, "learning_rate": 2.2932330827067666e-07, "logits/chosen": -2.984375, "logits/rejected": -2.671875, "logps/chosen": -208.0, "logps/rejected": -260.0, "loss": 0.548, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6953125, "rewards/margins": 0.484375, "rewards/rejected": -1.1796875, "step": 610 }, { "epoch": 0.046648107742081106, "grad_norm": 4.4201440287646205, "learning_rate": 2.3308270676691728e-07, "logits/chosen": -3.1875, "logits/rejected": -2.765625, "logps/chosen": -224.0, "logps/rejected": -278.0, "loss": 0.5474, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.78515625, "rewards/margins": 0.478515625, "rewards/rejected": -1.265625, "step": 620 }, { "epoch": 0.047400496576630806, "grad_norm": 4.695382679355146, "learning_rate": 2.3684210526315787e-07, "logits/chosen": -3.03125, "logits/rejected": -2.703125, "logps/chosen": -249.0, "logps/rejected": -278.0, "loss": 0.5397, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.80078125, "rewards/margins": 0.458984375, "rewards/rejected": -1.2578125, "step": 630 }, { "epoch": 0.0481528854111805, "grad_norm": 4.146268880636602, "learning_rate": 2.4060150375939847e-07, "logits/chosen": -3.03125, "logits/rejected": -2.671875, "logps/chosen": -234.0, "logps/rejected": -290.0, "loss": 0.5382, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.76171875, "rewards/margins": 0.59765625, "rewards/rejected": -1.359375, "step": 640 }, { "epoch": 0.04890527424573019, "grad_norm": 5.032383387570057, "learning_rate": 2.443609022556391e-07, "logits/chosen": -3.140625, "logits/rejected": -2.734375, "logps/chosen": -237.0, "logps/rejected": -300.0, "loss": 0.548, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.85546875, "rewards/margins": 0.55078125, "rewards/rejected": -1.40625, "step": 650 }, { "epoch": 0.04965766308027989, "grad_norm": 3.68940996433093, "learning_rate": 2.4812030075187965e-07, "logits/chosen": -3.140625, "logits/rejected": -2.84375, "logps/chosen": -228.0, "logps/rejected": -280.0, "loss": 0.5468, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.83984375, "rewards/margins": 0.4921875, "rewards/rejected": -1.328125, "step": 660 }, { "epoch": 0.050410051914829584, "grad_norm": 3.876718495059869, "learning_rate": 2.518796992481203e-07, "logits/chosen": -3.046875, "logits/rejected": -2.8125, "logps/chosen": -258.0, "logps/rejected": -294.0, "loss": 0.5255, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8828125, "rewards/margins": 0.5, "rewards/rejected": -1.3828125, "step": 670 }, { "epoch": 0.05116244074937928, "grad_norm": 4.75321383421562, "learning_rate": 2.556390977443609e-07, "logits/chosen": -3.21875, "logits/rejected": -2.9375, "logps/chosen": -260.0, "logps/rejected": -320.0, "loss": 0.5196, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.875, "rewards/margins": 0.7421875, "rewards/rejected": -1.6171875, "step": 680 }, { "epoch": 0.05191482958392898, "grad_norm": 7.793668826360735, "learning_rate": 2.593984962406015e-07, "logits/chosen": -3.1875, "logits/rejected": -3.03125, "logps/chosen": -266.0, "logps/rejected": -318.0, "loss": 0.5229, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9609375, "rewards/margins": 0.63671875, "rewards/rejected": -1.6015625, "step": 690 }, { "epoch": 0.05266721841847867, "grad_norm": 4.6452917151128785, "learning_rate": 2.631578947368421e-07, "logits/chosen": -3.109375, "logits/rejected": -2.875, "logps/chosen": -266.0, "logps/rejected": -324.0, "loss": 0.5161, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.96875, "rewards/margins": 0.69921875, "rewards/rejected": -1.6640625, "step": 700 }, { "epoch": 0.05341960725302836, "grad_norm": 7.056235535697858, "learning_rate": 2.669172932330827e-07, "logits/chosen": -3.203125, "logits/rejected": -2.828125, "logps/chosen": -276.0, "logps/rejected": -348.0, "loss": 0.5158, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.99609375, "rewards/margins": 0.69921875, "rewards/rejected": -1.6953125, "step": 710 }, { "epoch": 0.05417199608757806, "grad_norm": 7.272525055737868, "learning_rate": 2.7067669172932327e-07, "logits/chosen": -3.109375, "logits/rejected": -2.90625, "logps/chosen": -274.0, "logps/rejected": -348.0, "loss": 0.4925, "rewards/accuracies": 0.78125, "rewards/chosen": -1.046875, "rewards/margins": 0.8046875, "rewards/rejected": -1.8515625, "step": 720 }, { "epoch": 0.054924384922127756, "grad_norm": 8.479125069828605, "learning_rate": 2.744360902255639e-07, "logits/chosen": -3.03125, "logits/rejected": -2.9375, "logps/chosen": -274.0, "logps/rejected": -326.0, "loss": 0.5078, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1640625, "rewards/margins": 0.7890625, "rewards/rejected": -1.953125, "step": 730 }, { "epoch": 0.05567677375667745, "grad_norm": 7.1392220553921, "learning_rate": 2.781954887218045e-07, "logits/chosen": -3.046875, "logits/rejected": -2.828125, "logps/chosen": -264.0, "logps/rejected": -320.0, "loss": 0.5066, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.015625, "rewards/margins": 0.7109375, "rewards/rejected": -1.7265625, "step": 740 }, { "epoch": 0.05642916259122715, "grad_norm": 8.584081832995, "learning_rate": 2.8195488721804513e-07, "logits/chosen": -3.109375, "logits/rejected": -2.78125, "logps/chosen": -282.0, "logps/rejected": -352.0, "loss": 0.4837, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.15625, "rewards/margins": 0.82421875, "rewards/rejected": -1.984375, "step": 750 }, { "epoch": 0.05718155142577684, "grad_norm": 7.646861159193735, "learning_rate": 2.857142857142857e-07, "logits/chosen": -2.984375, "logits/rejected": -2.734375, "logps/chosen": -286.0, "logps/rejected": -340.0, "loss": 0.4889, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2109375, "rewards/margins": 0.72265625, "rewards/rejected": -1.9375, "step": 760 }, { "epoch": 0.057933940260326534, "grad_norm": 6.103871181309778, "learning_rate": 2.894736842105263e-07, "logits/chosen": -3.015625, "logits/rejected": -2.84375, "logps/chosen": -300.0, "logps/rejected": -368.0, "loss": 0.4781, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.21875, "rewards/margins": 0.94140625, "rewards/rejected": -2.15625, "step": 770 }, { "epoch": 0.058686329094876234, "grad_norm": 7.815641392476333, "learning_rate": 2.932330827067669e-07, "logits/chosen": -2.953125, "logits/rejected": -2.765625, "logps/chosen": -272.0, "logps/rejected": -358.0, "loss": 0.4766, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.2109375, "rewards/margins": 0.91796875, "rewards/rejected": -2.125, "step": 780 }, { "epoch": 0.05943871792942593, "grad_norm": 13.713331235769063, "learning_rate": 2.969924812030075e-07, "logits/chosen": -2.90625, "logits/rejected": -2.75, "logps/chosen": -304.0, "logps/rejected": -356.0, "loss": 0.4681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2890625, "rewards/margins": 0.80078125, "rewards/rejected": -2.09375, "step": 790 }, { "epoch": 0.06019110676397562, "grad_norm": 6.54533749931702, "learning_rate": 3.0075187969924807e-07, "logits/chosen": -3.0, "logits/rejected": -2.671875, "logps/chosen": -282.0, "logps/rejected": -358.0, "loss": 0.4805, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1640625, "rewards/margins": 0.78515625, "rewards/rejected": -1.9453125, "step": 800 }, { "epoch": 0.06094349559852532, "grad_norm": 7.465470155476823, "learning_rate": 3.0451127819548874e-07, "logits/chosen": -2.859375, "logits/rejected": -2.625, "logps/chosen": -312.0, "logps/rejected": -394.0, "loss": 0.4693, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3984375, "rewards/margins": 0.9140625, "rewards/rejected": -2.3125, "step": 810 }, { "epoch": 0.06169588443307501, "grad_norm": 9.535546928152705, "learning_rate": 3.082706766917293e-07, "logits/chosen": -2.875, "logits/rejected": -2.671875, "logps/chosen": -300.0, "logps/rejected": -396.0, "loss": 0.4545, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3125, "rewards/margins": 1.0390625, "rewards/rejected": -2.359375, "step": 820 }, { "epoch": 0.062448273267624706, "grad_norm": 9.078214464965928, "learning_rate": 3.1203007518796993e-07, "logits/chosen": -2.796875, "logits/rejected": -2.609375, "logps/chosen": -288.0, "logps/rejected": -394.0, "loss": 0.4864, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2890625, "rewards/margins": 0.828125, "rewards/rejected": -2.109375, "step": 830 }, { "epoch": 0.0632006621021744, "grad_norm": 8.079147706507966, "learning_rate": 3.157894736842105e-07, "logits/chosen": -2.875, "logits/rejected": -2.546875, "logps/chosen": -292.0, "logps/rejected": -404.0, "loss": 0.4627, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3125, "rewards/margins": 1.171875, "rewards/rejected": -2.484375, "step": 840 }, { "epoch": 0.0639530509367241, "grad_norm": 6.623406076716393, "learning_rate": 3.195488721804511e-07, "logits/chosen": -2.90625, "logits/rejected": -2.640625, "logps/chosen": -294.0, "logps/rejected": -384.0, "loss": 0.463, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3125, "rewards/margins": 1.0078125, "rewards/rejected": -2.3125, "step": 850 }, { "epoch": 0.0647054397712738, "grad_norm": 10.955204890181086, "learning_rate": 3.233082706766917e-07, "logits/chosen": -2.78125, "logits/rejected": -2.71875, "logps/chosen": -316.0, "logps/rejected": -414.0, "loss": 0.4563, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.40625, "rewards/margins": 1.1171875, "rewards/rejected": -2.53125, "step": 860 }, { "epoch": 0.06545782860582348, "grad_norm": 7.434705610127428, "learning_rate": 3.270676691729323e-07, "logits/chosen": -2.90625, "logits/rejected": -2.625, "logps/chosen": -300.0, "logps/rejected": -388.0, "loss": 0.4604, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.359375, "rewards/margins": 1.0625, "rewards/rejected": -2.421875, "step": 870 }, { "epoch": 0.06621021744037318, "grad_norm": 8.609722009537244, "learning_rate": 3.308270676691729e-07, "logits/chosen": -2.9375, "logits/rejected": -2.671875, "logps/chosen": -334.0, "logps/rejected": -454.0, "loss": 0.4257, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.765625, "rewards/margins": 1.1640625, "rewards/rejected": -2.9375, "step": 880 }, { "epoch": 0.06696260627492288, "grad_norm": 11.999385437912036, "learning_rate": 3.3458646616541354e-07, "logits/chosen": -2.765625, "logits/rejected": -2.65625, "logps/chosen": -316.0, "logps/rejected": -426.0, "loss": 0.4347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5625, "rewards/margins": 1.1953125, "rewards/rejected": -2.75, "step": 890 }, { "epoch": 0.06771499510947257, "grad_norm": 9.118722433060201, "learning_rate": 3.383458646616541e-07, "logits/chosen": -2.84375, "logits/rejected": -2.671875, "logps/chosen": -294.0, "logps/rejected": -400.0, "loss": 0.4424, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5234375, "rewards/margins": 1.1015625, "rewards/rejected": -2.625, "step": 900 }, { "epoch": 0.06846738394402227, "grad_norm": 8.90229000330093, "learning_rate": 3.4210526315789473e-07, "logits/chosen": -2.96875, "logits/rejected": -2.625, "logps/chosen": -314.0, "logps/rejected": -450.0, "loss": 0.4658, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.671875, "rewards/margins": 1.1796875, "rewards/rejected": -2.84375, "step": 910 }, { "epoch": 0.06921977277857197, "grad_norm": 12.740993275502067, "learning_rate": 3.458646616541353e-07, "logits/chosen": -2.796875, "logits/rejected": -2.671875, "logps/chosen": -346.0, "logps/rejected": -430.0, "loss": 0.4558, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.703125, "rewards/margins": 1.125, "rewards/rejected": -2.828125, "step": 920 }, { "epoch": 0.06997216161312166, "grad_norm": 9.335082174627217, "learning_rate": 3.496240601503759e-07, "logits/chosen": -2.890625, "logits/rejected": -2.703125, "logps/chosen": -332.0, "logps/rejected": -458.0, "loss": 0.4054, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7578125, "rewards/margins": 1.40625, "rewards/rejected": -3.171875, "step": 930 }, { "epoch": 0.07072455044767136, "grad_norm": 8.239119204332837, "learning_rate": 3.533834586466165e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5625, "logps/chosen": -312.0, "logps/rejected": -446.0, "loss": 0.4412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5703125, "rewards/margins": 1.3515625, "rewards/rejected": -2.90625, "step": 940 }, { "epoch": 0.07147693928222106, "grad_norm": 7.440153278177778, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -2.765625, "logits/rejected": -2.578125, "logps/chosen": -294.0, "logps/rejected": -434.0, "loss": 0.4392, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.4921875, "rewards/margins": 1.265625, "rewards/rejected": -2.75, "step": 950 }, { "epoch": 0.07222932811677074, "grad_norm": 9.61612083498033, "learning_rate": 3.609022556390977e-07, "logits/chosen": -2.875, "logits/rejected": -2.5, "logps/chosen": -320.0, "logps/rejected": -434.0, "loss": 0.4324, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5390625, "rewards/margins": 1.2578125, "rewards/rejected": -2.796875, "step": 960 }, { "epoch": 0.07298171695132044, "grad_norm": 10.276517044122064, "learning_rate": 3.6466165413533834e-07, "logits/chosen": -2.765625, "logits/rejected": -2.5625, "logps/chosen": -356.0, "logps/rejected": -448.0, "loss": 0.4271, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6875, "rewards/margins": 1.265625, "rewards/rejected": -2.953125, "step": 970 }, { "epoch": 0.07373410578587014, "grad_norm": 7.955238750998195, "learning_rate": 3.684210526315789e-07, "logits/chosen": -2.796875, "logits/rejected": -2.578125, "logps/chosen": -350.0, "logps/rejected": -476.0, "loss": 0.4415, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8515625, "rewards/margins": 1.3046875, "rewards/rejected": -3.15625, "step": 980 }, { "epoch": 0.07448649462041983, "grad_norm": 9.228975986129836, "learning_rate": 3.7218045112781953e-07, "logits/chosen": -2.734375, "logits/rejected": -2.5, "logps/chosen": -320.0, "logps/rejected": -442.0, "loss": 0.4177, "rewards/accuracies": 0.8125, "rewards/chosen": -1.71875, "rewards/margins": 1.265625, "rewards/rejected": -2.984375, "step": 990 }, { "epoch": 0.07523888345496953, "grad_norm": 7.6300102127807765, "learning_rate": 3.759398496240601e-07, "logits/chosen": -2.5625, "logits/rejected": -2.390625, "logps/chosen": -318.0, "logps/rejected": -430.0, "loss": 0.4256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.546875, "rewards/margins": 1.125, "rewards/rejected": -2.671875, "step": 1000 }, { "epoch": 0.07599127228951923, "grad_norm": 11.203017463036868, "learning_rate": 3.796992481203007e-07, "logits/chosen": -2.703125, "logits/rejected": -2.40625, "logps/chosen": -322.0, "logps/rejected": -468.0, "loss": 0.4073, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.5859375, "rewards/margins": 1.6484375, "rewards/rejected": -3.234375, "step": 1010 }, { "epoch": 0.07674366112406891, "grad_norm": 10.44396520900687, "learning_rate": 3.8345864661654134e-07, "logits/chosen": -2.703125, "logits/rejected": -2.5, "logps/chosen": -334.0, "logps/rejected": -470.0, "loss": 0.4036, "rewards/accuracies": 0.8125, "rewards/chosen": -1.71875, "rewards/margins": 1.3828125, "rewards/rejected": -3.09375, "step": 1020 }, { "epoch": 0.07749604995861861, "grad_norm": 10.352095048437782, "learning_rate": 3.8721804511278196e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -302.0, "logps/rejected": -436.0, "loss": 0.4089, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.484375, "rewards/margins": 1.46875, "rewards/rejected": -2.953125, "step": 1030 }, { "epoch": 0.07824843879316831, "grad_norm": 9.027432013217565, "learning_rate": 3.909774436090225e-07, "logits/chosen": -2.71875, "logits/rejected": -2.546875, "logps/chosen": -332.0, "logps/rejected": -444.0, "loss": 0.422, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7109375, "rewards/margins": 1.1640625, "rewards/rejected": -2.875, "step": 1040 }, { "epoch": 0.079000827627718, "grad_norm": 8.493987186289845, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -2.734375, "logits/rejected": -2.546875, "logps/chosen": -340.0, "logps/rejected": -480.0, "loss": 0.4011, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6484375, "rewards/margins": 1.4609375, "rewards/rejected": -3.109375, "step": 1050 }, { "epoch": 0.0797532164622677, "grad_norm": 13.673005824406173, "learning_rate": 3.984962406015037e-07, "logits/chosen": -2.671875, "logits/rejected": -2.5, "logps/chosen": -334.0, "logps/rejected": -456.0, "loss": 0.3971, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6484375, "rewards/margins": 1.3828125, "rewards/rejected": -3.03125, "step": 1060 }, { "epoch": 0.0805056052968174, "grad_norm": 11.280443761201498, "learning_rate": 4.0225563909774433e-07, "logits/chosen": -2.703125, "logits/rejected": -2.5, "logps/chosen": -344.0, "logps/rejected": -462.0, "loss": 0.4124, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7109375, "rewards/margins": 1.3359375, "rewards/rejected": -3.046875, "step": 1070 }, { "epoch": 0.08125799413136708, "grad_norm": 11.852849318261235, "learning_rate": 4.060150375939849e-07, "logits/chosen": -2.6875, "logits/rejected": -2.5, "logps/chosen": -388.0, "logps/rejected": -544.0, "loss": 0.4084, "rewards/accuracies": 0.84375, "rewards/chosen": -2.078125, "rewards/margins": 1.625, "rewards/rejected": -3.703125, "step": 1080 }, { "epoch": 0.08201038296591678, "grad_norm": 8.668455285022874, "learning_rate": 4.0977443609022557e-07, "logits/chosen": -2.75, "logits/rejected": -2.59375, "logps/chosen": -366.0, "logps/rejected": -516.0, "loss": 0.4122, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.875, "rewards/margins": 1.609375, "rewards/rejected": -3.484375, "step": 1090 }, { "epoch": 0.08276277180046648, "grad_norm": 16.385621268125355, "learning_rate": 4.1353383458646614e-07, "logits/chosen": -2.53125, "logits/rejected": -2.25, "logps/chosen": -340.0, "logps/rejected": -454.0, "loss": 0.3919, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6640625, "rewards/margins": 1.40625, "rewards/rejected": -3.078125, "step": 1100 }, { "epoch": 0.08351516063501617, "grad_norm": 11.29836657890373, "learning_rate": 4.1729323308270676e-07, "logits/chosen": -2.78125, "logits/rejected": -2.578125, "logps/chosen": -368.0, "logps/rejected": -560.0, "loss": 0.3525, "rewards/accuracies": 0.875, "rewards/chosen": -2.046875, "rewards/margins": 2.1875, "rewards/rejected": -4.21875, "step": 1110 }, { "epoch": 0.08426754946956587, "grad_norm": 14.191366008515217, "learning_rate": 4.2105263157894733e-07, "logits/chosen": -2.9375, "logits/rejected": -2.6875, "logps/chosen": -346.0, "logps/rejected": -468.0, "loss": 0.4123, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8203125, "rewards/margins": 1.3515625, "rewards/rejected": -3.171875, "step": 1120 }, { "epoch": 0.08501993830411557, "grad_norm": 12.571352956931529, "learning_rate": 4.2481203007518795e-07, "logits/chosen": -2.671875, "logits/rejected": -2.46875, "logps/chosen": -372.0, "logps/rejected": -532.0, "loss": 0.3899, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0625, "rewards/margins": 1.6953125, "rewards/rejected": -3.75, "step": 1130 }, { "epoch": 0.08577232713866527, "grad_norm": 9.584329220428629, "learning_rate": 4.285714285714285e-07, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -396.0, "logps/rejected": -544.0, "loss": 0.3737, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.40625, "rewards/margins": 1.421875, "rewards/rejected": -3.828125, "step": 1140 }, { "epoch": 0.08652471597321496, "grad_norm": 9.655313020328894, "learning_rate": 4.3233082706766913e-07, "logits/chosen": -2.875, "logits/rejected": -2.5625, "logps/chosen": -358.0, "logps/rejected": -520.0, "loss": 0.3823, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0625, "rewards/margins": 1.6484375, "rewards/rejected": -3.703125, "step": 1150 }, { "epoch": 0.08727710480776465, "grad_norm": 8.344975957766751, "learning_rate": 4.3609022556390975e-07, "logits/chosen": -2.8125, "logits/rejected": -2.65625, "logps/chosen": -424.0, "logps/rejected": -544.0, "loss": 0.4058, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.484375, "rewards/margins": 1.34375, "rewards/rejected": -3.828125, "step": 1160 }, { "epoch": 0.08802949364231435, "grad_norm": 17.90436214681741, "learning_rate": 4.398496240601504e-07, "logits/chosen": -2.625, "logits/rejected": -2.46875, "logps/chosen": -386.0, "logps/rejected": -536.0, "loss": 0.3911, "rewards/accuracies": 0.8125, "rewards/chosen": -2.359375, "rewards/margins": 1.625, "rewards/rejected": -3.984375, "step": 1170 }, { "epoch": 0.08878188247686404, "grad_norm": 10.09502529582343, "learning_rate": 4.4360902255639094e-07, "logits/chosen": -2.59375, "logits/rejected": -2.421875, "logps/chosen": -358.0, "logps/rejected": -468.0, "loss": 0.3816, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0, "rewards/margins": 1.203125, "rewards/rejected": -3.203125, "step": 1180 }, { "epoch": 0.08953427131141374, "grad_norm": 12.723840138089093, "learning_rate": 4.4736842105263156e-07, "logits/chosen": -2.796875, "logits/rejected": -2.671875, "logps/chosen": -386.0, "logps/rejected": -532.0, "loss": 0.3603, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.15625, "rewards/margins": 1.7734375, "rewards/rejected": -3.9375, "step": 1190 }, { "epoch": 0.09028666014596344, "grad_norm": 13.550815678408945, "learning_rate": 4.5112781954887213e-07, "logits/chosen": -2.921875, "logits/rejected": -2.75, "logps/chosen": -348.0, "logps/rejected": -510.0, "loss": 0.3987, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8203125, "rewards/margins": 1.5390625, "rewards/rejected": -3.359375, "step": 1200 }, { "epoch": 0.09103904898051313, "grad_norm": 9.008638592876, "learning_rate": 4.5488721804511275e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -382.0, "logps/rejected": -576.0, "loss": 0.3829, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.1875, "rewards/margins": 1.953125, "rewards/rejected": -4.15625, "step": 1210 }, { "epoch": 0.09179143781506283, "grad_norm": 35.08614510436194, "learning_rate": 4.586466165413533e-07, "logits/chosen": -2.71875, "logits/rejected": -2.484375, "logps/chosen": -330.0, "logps/rejected": -482.0, "loss": 0.3832, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7734375, "rewards/margins": 1.6640625, "rewards/rejected": -3.4375, "step": 1220 }, { "epoch": 0.09254382664961253, "grad_norm": 11.415330910582554, "learning_rate": 4.62406015037594e-07, "logits/chosen": -2.75, "logits/rejected": -2.59375, "logps/chosen": -388.0, "logps/rejected": -572.0, "loss": 0.4032, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.15625, "rewards/margins": 1.875, "rewards/rejected": -4.03125, "step": 1230 }, { "epoch": 0.09329621548416221, "grad_norm": 9.726227634850884, "learning_rate": 4.6616541353383456e-07, "logits/chosen": -2.546875, "logits/rejected": -2.390625, "logps/chosen": -360.0, "logps/rejected": -490.0, "loss": 0.382, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0, "rewards/margins": 1.4296875, "rewards/rejected": -3.4375, "step": 1240 }, { "epoch": 0.09404860431871191, "grad_norm": 10.938204411889682, "learning_rate": 4.699248120300752e-07, "logits/chosen": -2.703125, "logits/rejected": -2.671875, "logps/chosen": -402.0, "logps/rejected": -524.0, "loss": 0.3515, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.140625, "rewards/margins": 1.5703125, "rewards/rejected": -3.71875, "step": 1250 }, { "epoch": 0.09480099315326161, "grad_norm": 11.359648859339549, "learning_rate": 4.7368421052631574e-07, "logits/chosen": -2.921875, "logits/rejected": -2.703125, "logps/chosen": -388.0, "logps/rejected": -564.0, "loss": 0.3956, "rewards/accuracies": 0.84375, "rewards/chosen": -2.265625, "rewards/margins": 1.890625, "rewards/rejected": -4.15625, "step": 1260 }, { "epoch": 0.0955533819878113, "grad_norm": 9.557735064723317, "learning_rate": 4.774436090225564e-07, "logits/chosen": -2.921875, "logits/rejected": -2.796875, "logps/chosen": -386.0, "logps/rejected": -548.0, "loss": 0.3486, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.28125, "rewards/margins": 1.6796875, "rewards/rejected": -3.953125, "step": 1270 }, { "epoch": 0.096305770822361, "grad_norm": 12.586703006940098, "learning_rate": 4.812030075187969e-07, "logits/chosen": -2.90625, "logits/rejected": -2.90625, "logps/chosen": -348.0, "logps/rejected": -516.0, "loss": 0.3639, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.84375, "rewards/margins": 1.78125, "rewards/rejected": -3.609375, "step": 1280 }, { "epoch": 0.0970581596569107, "grad_norm": 11.590852438383495, "learning_rate": 4.849624060150376e-07, "logits/chosen": -2.90625, "logits/rejected": -2.765625, "logps/chosen": -396.0, "logps/rejected": -556.0, "loss": 0.363, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.328125, "rewards/margins": 1.8515625, "rewards/rejected": -4.1875, "step": 1290 }, { "epoch": 0.09781054849146038, "grad_norm": 9.666020281314843, "learning_rate": 4.887218045112782e-07, "logits/chosen": -2.890625, "logits/rejected": -2.78125, "logps/chosen": -412.0, "logps/rejected": -580.0, "loss": 0.3747, "rewards/accuracies": 0.8125, "rewards/chosen": -2.296875, "rewards/margins": 1.921875, "rewards/rejected": -4.21875, "step": 1300 }, { "epoch": 0.09856293732601008, "grad_norm": 17.089677017414804, "learning_rate": 4.924812030075187e-07, "logits/chosen": -2.890625, "logits/rejected": -2.6875, "logps/chosen": -432.0, "logps/rejected": -596.0, "loss": 0.3642, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.546875, "rewards/margins": 1.9453125, "rewards/rejected": -4.5, "step": 1310 }, { "epoch": 0.09931532616055978, "grad_norm": 14.284746492926356, "learning_rate": 4.962406015037593e-07, "logits/chosen": -2.921875, "logits/rejected": -2.75, "logps/chosen": -394.0, "logps/rejected": -536.0, "loss": 0.356, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.34375, "rewards/margins": 1.6484375, "rewards/rejected": -3.984375, "step": 1320 }, { "epoch": 0.10006771499510947, "grad_norm": 10.0309975081672, "learning_rate": 5e-07, "logits/chosen": -2.921875, "logits/rejected": -2.6875, "logps/chosen": -432.0, "logps/rejected": -656.0, "loss": 0.3597, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.78125, "rewards/margins": 2.21875, "rewards/rejected": -5.0, "step": 1330 }, { "epoch": 0.10082010382965917, "grad_norm": 12.11681402357444, "learning_rate": 4.999991376679495e-07, "logits/chosen": -2.625, "logits/rejected": -2.53125, "logps/chosen": -434.0, "logps/rejected": -616.0, "loss": 0.3456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6875, "rewards/margins": 1.921875, "rewards/rejected": -4.59375, "step": 1340 }, { "epoch": 0.10157249266420887, "grad_norm": 9.241869652249655, "learning_rate": 4.999965506777466e-07, "logits/chosen": -2.578125, "logits/rejected": -2.484375, "logps/chosen": -450.0, "logps/rejected": -696.0, "loss": 0.3523, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.875, "rewards/margins": 2.375, "rewards/rejected": -5.25, "step": 1350 }, { "epoch": 0.10232488149875855, "grad_norm": 10.700012765427152, "learning_rate": 4.999922390472384e-07, "logits/chosen": -2.703125, "logits/rejected": -2.6875, "logps/chosen": -418.0, "logps/rejected": -572.0, "loss": 0.3516, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.484375, "rewards/margins": 1.703125, "rewards/rejected": -4.1875, "step": 1360 }, { "epoch": 0.10307727033330825, "grad_norm": 10.778973183136214, "learning_rate": 4.999862028061692e-07, "logits/chosen": -2.765625, "logits/rejected": -2.640625, "logps/chosen": -432.0, "logps/rejected": -584.0, "loss": 0.3608, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.703125, "rewards/margins": 1.609375, "rewards/rejected": -4.3125, "step": 1370 }, { "epoch": 0.10382965916785795, "grad_norm": 10.942848875930892, "learning_rate": 4.99978441996181e-07, "logits/chosen": -2.765625, "logits/rejected": -2.609375, "logps/chosen": -424.0, "logps/rejected": -604.0, "loss": 0.3596, "rewards/accuracies": 0.875, "rewards/chosen": -2.515625, "rewards/margins": 1.953125, "rewards/rejected": -4.46875, "step": 1380 }, { "epoch": 0.10458204800240764, "grad_norm": 13.27627553772671, "learning_rate": 4.999689566708128e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5625, "logps/chosen": -428.0, "logps/rejected": -584.0, "loss": 0.3523, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.609375, "rewards/margins": 1.7421875, "rewards/rejected": -4.34375, "step": 1390 }, { "epoch": 0.10533443683695734, "grad_norm": 9.279591495260535, "learning_rate": 4.999577468955008e-07, "logits/chosen": -2.765625, "logits/rejected": -2.609375, "logps/chosen": -456.0, "logps/rejected": -644.0, "loss": 0.3669, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.890625, "rewards/margins": 2.046875, "rewards/rejected": -4.9375, "step": 1400 }, { "epoch": 0.10608682567150704, "grad_norm": 15.338794234291706, "learning_rate": 4.999448127475773e-07, "logits/chosen": -2.625, "logits/rejected": -2.5625, "logps/chosen": -420.0, "logps/rejected": -656.0, "loss": 0.3322, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.609375, "rewards/margins": 2.53125, "rewards/rejected": -5.125, "step": 1410 }, { "epoch": 0.10683921450605673, "grad_norm": 11.07525013808053, "learning_rate": 4.999301543162706e-07, "logits/chosen": -2.8125, "logits/rejected": -2.6875, "logps/chosen": -444.0, "logps/rejected": -664.0, "loss": 0.3323, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.8125, "rewards/margins": 2.078125, "rewards/rejected": -4.875, "step": 1420 }, { "epoch": 0.10759160334060643, "grad_norm": 14.47067636759482, "learning_rate": 4.999137717027041e-07, "logits/chosen": -2.859375, "logits/rejected": -2.765625, "logps/chosen": -452.0, "logps/rejected": -656.0, "loss": 0.3365, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.046875, "rewards/margins": 2.03125, "rewards/rejected": -5.0625, "step": 1430 }, { "epoch": 0.10834399217515613, "grad_norm": 48.895026311812195, "learning_rate": 4.998956650198959e-07, "logits/chosen": -2.84375, "logits/rejected": -2.765625, "logps/chosen": -420.0, "logps/rejected": -604.0, "loss": 0.3544, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.703125, "rewards/margins": 1.8828125, "rewards/rejected": -4.5625, "step": 1440 }, { "epoch": 0.10909638100970581, "grad_norm": 12.33907739866595, "learning_rate": 4.998758343927576e-07, "logits/chosen": -2.796875, "logits/rejected": -2.6875, "logps/chosen": -464.0, "logps/rejected": -628.0, "loss": 0.3275, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.9375, "rewards/margins": 1.765625, "rewards/rejected": -4.6875, "step": 1450 }, { "epoch": 0.10984876984425551, "grad_norm": 10.624130772065378, "learning_rate": 4.998542799580941e-07, "logits/chosen": -2.90625, "logits/rejected": -2.75, "logps/chosen": -440.0, "logps/rejected": -628.0, "loss": 0.3492, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.734375, "rewards/margins": 2.125, "rewards/rejected": -4.875, "step": 1460 }, { "epoch": 0.11060115867880521, "grad_norm": 15.790847710499458, "learning_rate": 4.998310018646021e-07, "logits/chosen": -2.734375, "logits/rejected": -2.640625, "logps/chosen": -448.0, "logps/rejected": -592.0, "loss": 0.3503, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.828125, "rewards/margins": 1.65625, "rewards/rejected": -4.46875, "step": 1470 }, { "epoch": 0.1113535475133549, "grad_norm": 13.668416666518036, "learning_rate": 4.998060002728689e-07, "logits/chosen": -2.796875, "logits/rejected": -2.671875, "logps/chosen": -504.0, "logps/rejected": -692.0, "loss": 0.339, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.25, "rewards/margins": 2.125, "rewards/rejected": -5.375, "step": 1480 }, { "epoch": 0.1121059363479046, "grad_norm": 13.367505264457792, "learning_rate": 4.99779275355372e-07, "logits/chosen": -2.921875, "logits/rejected": -2.765625, "logps/chosen": -442.0, "logps/rejected": -628.0, "loss": 0.3514, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.640625, "rewards/margins": 1.8203125, "rewards/rejected": -4.46875, "step": 1490 }, { "epoch": 0.1128583251824543, "grad_norm": 9.043259055103826, "learning_rate": 4.997508272964775e-07, "logits/chosen": -2.828125, "logits/rejected": -2.59375, "logps/chosen": -480.0, "logps/rejected": -676.0, "loss": 0.3178, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.09375, "rewards/margins": 2.046875, "rewards/rejected": -5.125, "step": 1500 }, { "epoch": 0.11361071401700398, "grad_norm": 11.016220353679143, "learning_rate": 4.997206562924387e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -528.0, "logps/rejected": -720.0, "loss": 0.3126, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.5, "rewards/margins": 2.171875, "rewards/rejected": -5.6875, "step": 1510 }, { "epoch": 0.11436310285155368, "grad_norm": 9.506781512625706, "learning_rate": 4.99688762551395e-07, "logits/chosen": -2.703125, "logits/rejected": -2.53125, "logps/chosen": -508.0, "logps/rejected": -676.0, "loss": 0.3293, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5, "rewards/margins": 1.921875, "rewards/rejected": -5.40625, "step": 1520 }, { "epoch": 0.11511549168610338, "grad_norm": 9.87913772768346, "learning_rate": 4.996551462933705e-07, "logits/chosen": -2.703125, "logits/rejected": -2.609375, "logps/chosen": -472.0, "logps/rejected": -664.0, "loss": 0.3387, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.03125, "rewards/margins": 2.109375, "rewards/rejected": -5.15625, "step": 1530 }, { "epoch": 0.11586788052065307, "grad_norm": 9.143562630664047, "learning_rate": 4.996198077502719e-07, "logits/chosen": -2.6875, "logits/rejected": -2.546875, "logps/chosen": -452.0, "logps/rejected": -644.0, "loss": 0.323, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.96875, "rewards/margins": 2.078125, "rewards/rejected": -5.03125, "step": 1540 }, { "epoch": 0.11662026935520277, "grad_norm": 10.38817675345434, "learning_rate": 4.99582747165888e-07, "logits/chosen": -2.640625, "logits/rejected": -2.53125, "logps/chosen": -480.0, "logps/rejected": -712.0, "loss": 0.319, "rewards/accuracies": 0.875, "rewards/chosen": -3.1875, "rewards/margins": 2.0625, "rewards/rejected": -5.25, "step": 1550 }, { "epoch": 0.11737265818975247, "grad_norm": 11.492302524993693, "learning_rate": 4.995439647958869e-07, "logits/chosen": -2.6875, "logits/rejected": -2.5625, "logps/chosen": -470.0, "logps/rejected": -664.0, "loss": 0.3247, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0, "rewards/margins": 2.046875, "rewards/rejected": -5.0625, "step": 1560 }, { "epoch": 0.11812504702430215, "grad_norm": 11.709867997960567, "learning_rate": 4.995034609078148e-07, "logits/chosen": -2.890625, "logits/rejected": -2.640625, "logps/chosen": -446.0, "logps/rejected": -644.0, "loss": 0.3327, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.890625, "rewards/margins": 2.0, "rewards/rejected": -4.90625, "step": 1570 }, { "epoch": 0.11887743585885185, "grad_norm": 12.146034299644466, "learning_rate": 4.994612357810942e-07, "logits/chosen": -2.953125, "logits/rejected": -2.75, "logps/chosen": -544.0, "logps/rejected": -716.0, "loss": 0.3127, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5, "rewards/margins": 2.09375, "rewards/rejected": -5.625, "step": 1580 }, { "epoch": 0.11962982469340155, "grad_norm": 37.73933841606365, "learning_rate": 4.994172897070217e-07, "logits/chosen": -3.015625, "logits/rejected": -2.78125, "logps/chosen": -486.0, "logps/rejected": -680.0, "loss": 0.359, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.171875, "rewards/margins": 2.046875, "rewards/rejected": -5.21875, "step": 1590 }, { "epoch": 0.12038221352795124, "grad_norm": 8.414319765873188, "learning_rate": 4.99371622988766e-07, "logits/chosen": -2.96875, "logits/rejected": -2.84375, "logps/chosen": -448.0, "logps/rejected": -644.0, "loss": 0.3201, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.984375, "rewards/margins": 1.921875, "rewards/rejected": -4.90625, "step": 1600 }, { "epoch": 0.12113460236250094, "grad_norm": 9.915553480092923, "learning_rate": 4.993242359413664e-07, "logits/chosen": -2.9375, "logits/rejected": -2.65625, "logps/chosen": -478.0, "logps/rejected": -712.0, "loss": 0.3145, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.203125, "rewards/margins": 2.296875, "rewards/rejected": -5.5, "step": 1610 }, { "epoch": 0.12188699119705064, "grad_norm": 12.120046031370594, "learning_rate": 4.992751288917297e-07, "logits/chosen": -2.9375, "logits/rejected": -2.796875, "logps/chosen": -448.0, "logps/rejected": -688.0, "loss": 0.3246, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.875, "rewards/margins": 2.515625, "rewards/rejected": -5.40625, "step": 1620 }, { "epoch": 0.12263938003160033, "grad_norm": 10.565988140666803, "learning_rate": 4.992243021786285e-07, "logits/chosen": -2.703125, "logits/rejected": -2.578125, "logps/chosen": -498.0, "logps/rejected": -692.0, "loss": 0.3196, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.421875, "rewards/margins": 1.9453125, "rewards/rejected": -5.375, "step": 1630 }, { "epoch": 0.12339176886615003, "grad_norm": 9.873827794682715, "learning_rate": 4.99171756152699e-07, "logits/chosen": -2.75, "logits/rejected": -2.609375, "logps/chosen": -470.0, "logps/rejected": -688.0, "loss": 0.3253, "rewards/accuracies": 0.875, "rewards/chosen": -2.8125, "rewards/margins": 2.421875, "rewards/rejected": -5.21875, "step": 1640 }, { "epoch": 0.12414415770069973, "grad_norm": 11.240991564793916, "learning_rate": 4.991174911764381e-07, "logits/chosen": -2.9375, "logits/rejected": -2.75, "logps/chosen": -502.0, "logps/rejected": -700.0, "loss": 0.3067, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.359375, "rewards/margins": 1.9609375, "rewards/rejected": -5.3125, "step": 1650 }, { "epoch": 0.12489654653524941, "grad_norm": 10.750766844675857, "learning_rate": 4.990615076242011e-07, "logits/chosen": -2.96875, "logits/rejected": -2.765625, "logps/chosen": -464.0, "logps/rejected": -708.0, "loss": 0.3395, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.75, "rewards/margins": 2.484375, "rewards/rejected": -5.25, "step": 1660 }, { "epoch": 0.12564893536979913, "grad_norm": 20.008574618115496, "learning_rate": 4.990038058821995e-07, "logits/chosen": -2.71875, "logits/rejected": -2.671875, "logps/chosen": -524.0, "logps/rejected": -704.0, "loss": 0.3142, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.625, "rewards/margins": 1.890625, "rewards/rejected": -5.53125, "step": 1670 }, { "epoch": 0.1264013242043488, "grad_norm": 12.148202383381813, "learning_rate": 4.989443863484976e-07, "logits/chosen": -2.828125, "logits/rejected": -2.625, "logps/chosen": -468.0, "logps/rejected": -644.0, "loss": 0.3193, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.078125, "rewards/margins": 1.9453125, "rewards/rejected": -5.03125, "step": 1680 }, { "epoch": 0.1271537130388985, "grad_norm": 15.287954256111135, "learning_rate": 4.988832494330106e-07, "logits/chosen": -2.8125, "logits/rejected": -2.640625, "logps/chosen": -482.0, "logps/rejected": -700.0, "loss": 0.3325, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.15625, "rewards/margins": 2.015625, "rewards/rejected": -5.1875, "step": 1690 }, { "epoch": 0.1279061018734482, "grad_norm": 11.463559327588364, "learning_rate": 4.988203955575006e-07, "logits/chosen": -2.90625, "logits/rejected": -2.703125, "logps/chosen": -434.0, "logps/rejected": -656.0, "loss": 0.2976, "rewards/accuracies": 0.875, "rewards/chosen": -2.640625, "rewards/margins": 2.234375, "rewards/rejected": -4.875, "step": 1700 }, { "epoch": 0.1286584907079979, "grad_norm": 12.056679797446584, "learning_rate": 4.987558251555755e-07, "logits/chosen": -2.625, "logits/rejected": -2.5625, "logps/chosen": -456.0, "logps/rejected": -656.0, "loss": 0.317, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.921875, "rewards/margins": 2.078125, "rewards/rejected": -5.0, "step": 1710 }, { "epoch": 0.1294108795425476, "grad_norm": 9.484624994057103, "learning_rate": 4.98689538672684e-07, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -454.0, "logps/rejected": -668.0, "loss": 0.3096, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.859375, "rewards/margins": 2.40625, "rewards/rejected": -5.25, "step": 1720 }, { "epoch": 0.1301632683770973, "grad_norm": 14.197927584727962, "learning_rate": 4.986215365661137e-07, "logits/chosen": -2.859375, "logits/rejected": -2.6875, "logps/chosen": -464.0, "logps/rejected": -708.0, "loss": 0.3089, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.015625, "rewards/margins": 2.40625, "rewards/rejected": -5.40625, "step": 1730 }, { "epoch": 0.13091565721164697, "grad_norm": 8.583390608989294, "learning_rate": 4.985518193049879e-07, "logits/chosen": -2.734375, "logits/rejected": -2.640625, "logps/chosen": -464.0, "logps/rejected": -652.0, "loss": 0.3193, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.765625, "rewards/margins": 2.15625, "rewards/rejected": -4.90625, "step": 1740 }, { "epoch": 0.13166804604619667, "grad_norm": 15.994335121139475, "learning_rate": 4.984803873702619e-07, "logits/chosen": -2.71875, "logits/rejected": -2.609375, "logps/chosen": -446.0, "logps/rejected": -664.0, "loss": 0.3372, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.78125, "rewards/margins": 2.25, "rewards/rejected": -5.03125, "step": 1750 }, { "epoch": 0.13242043488074637, "grad_norm": 21.914213241176697, "learning_rate": 4.984072412547202e-07, "logits/chosen": -2.8125, "logits/rejected": -2.671875, "logps/chosen": -432.0, "logps/rejected": -640.0, "loss": 0.3436, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.828125, "rewards/margins": 2.078125, "rewards/rejected": -4.90625, "step": 1760 }, { "epoch": 0.13317282371529607, "grad_norm": 11.643305728816616, "learning_rate": 4.983323814629727e-07, "logits/chosen": -3.046875, "logits/rejected": -2.9375, "logps/chosen": -420.0, "logps/rejected": -612.0, "loss": 0.3335, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.5625, "rewards/margins": 1.9921875, "rewards/rejected": -4.5625, "step": 1770 }, { "epoch": 0.13392521254984577, "grad_norm": 9.9183144070942, "learning_rate": 4.982558085114515e-07, "logits/chosen": -3.015625, "logits/rejected": -2.9375, "logps/chosen": -460.0, "logps/rejected": -644.0, "loss": 0.349, "rewards/accuracies": 0.875, "rewards/chosen": -3.015625, "rewards/margins": 1.96875, "rewards/rejected": -5.0, "step": 1780 }, { "epoch": 0.13467760138439547, "grad_norm": 12.622299854386242, "learning_rate": 4.981775229284068e-07, "logits/chosen": -2.765625, "logits/rejected": -2.671875, "logps/chosen": -470.0, "logps/rejected": -648.0, "loss": 0.32, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.015625, "rewards/margins": 1.9140625, "rewards/rejected": -4.9375, "step": 1790 }, { "epoch": 0.13542999021894514, "grad_norm": 13.726359537170527, "learning_rate": 4.98097525253904e-07, "logits/chosen": -2.640625, "logits/rejected": -2.484375, "logps/chosen": -458.0, "logps/rejected": -652.0, "loss": 0.3353, "rewards/accuracies": 0.84375, "rewards/chosen": -2.90625, "rewards/margins": 2.015625, "rewards/rejected": -4.9375, "step": 1800 }, { "epoch": 0.13618237905349484, "grad_norm": 8.533957196814246, "learning_rate": 4.980158160398198e-07, "logits/chosen": -2.75, "logits/rejected": -2.578125, "logps/chosen": -442.0, "logps/rejected": -664.0, "loss": 0.287, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.84375, "rewards/margins": 2.390625, "rewards/rejected": -5.21875, "step": 1810 }, { "epoch": 0.13693476788804454, "grad_norm": 9.66875724969042, "learning_rate": 4.979323958498378e-07, "logits/chosen": -2.9375, "logits/rejected": -2.8125, "logps/chosen": -466.0, "logps/rejected": -680.0, "loss": 0.3176, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.953125, "rewards/margins": 2.578125, "rewards/rejected": -5.53125, "step": 1820 }, { "epoch": 0.13768715672259424, "grad_norm": 11.673654825211232, "learning_rate": 4.978472652594453e-07, "logits/chosen": -2.921875, "logits/rejected": -2.765625, "logps/chosen": -504.0, "logps/rejected": -760.0, "loss": 0.3245, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.53125, "rewards/margins": 2.578125, "rewards/rejected": -6.125, "step": 1830 }, { "epoch": 0.13843954555714394, "grad_norm": 9.459008565137387, "learning_rate": 4.977604248559289e-07, "logits/chosen": -2.921875, "logits/rejected": -2.75, "logps/chosen": -444.0, "logps/rejected": -668.0, "loss": 0.3099, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.9375, "rewards/margins": 2.265625, "rewards/rejected": -5.1875, "step": 1840 }, { "epoch": 0.13919193439169364, "grad_norm": 10.074612458953805, "learning_rate": 4.976718752383709e-07, "logits/chosen": -2.859375, "logits/rejected": -2.671875, "logps/chosen": -504.0, "logps/rejected": -736.0, "loss": 0.3083, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.46875, "rewards/margins": 2.328125, "rewards/rejected": -5.78125, "step": 1850 }, { "epoch": 0.1399443232262433, "grad_norm": 9.111393234918845, "learning_rate": 4.975816170176445e-07, "logits/chosen": -2.65625, "logits/rejected": -2.5625, "logps/chosen": -498.0, "logps/rejected": -728.0, "loss": 0.289, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.25, "rewards/margins": 2.453125, "rewards/rejected": -5.71875, "step": 1860 }, { "epoch": 0.140696712060793, "grad_norm": 11.023262933345384, "learning_rate": 4.974896508164101e-07, "logits/chosen": -2.859375, "logits/rejected": -2.65625, "logps/chosen": -500.0, "logps/rejected": -724.0, "loss": 0.3058, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.453125, "rewards/margins": 2.34375, "rewards/rejected": -5.78125, "step": 1870 }, { "epoch": 0.1414491008953427, "grad_norm": 9.465827793956825, "learning_rate": 4.973959772691112e-07, "logits/chosen": -2.875, "logits/rejected": -2.71875, "logps/chosen": -482.0, "logps/rejected": -776.0, "loss": 0.3245, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.265625, "rewards/margins": 2.8125, "rewards/rejected": -6.0625, "step": 1880 }, { "epoch": 0.1422014897298924, "grad_norm": 9.001035578417973, "learning_rate": 4.973005970219692e-07, "logits/chosen": -2.859375, "logits/rejected": -2.6875, "logps/chosen": -504.0, "logps/rejected": -696.0, "loss": 0.294, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.3125, "rewards/margins": 2.21875, "rewards/rejected": -5.53125, "step": 1890 }, { "epoch": 0.1429538785644421, "grad_norm": 10.221159879257584, "learning_rate": 4.972035107329796e-07, "logits/chosen": -2.734375, "logits/rejected": -2.484375, "logps/chosen": -524.0, "logps/rejected": -748.0, "loss": 0.3002, "rewards/accuracies": 0.875, "rewards/chosen": -3.515625, "rewards/margins": 2.46875, "rewards/rejected": -5.96875, "step": 1900 }, { "epoch": 0.1437062673989918, "grad_norm": 9.557554250860324, "learning_rate": 4.971047190719076e-07, "logits/chosen": -2.796875, "logits/rejected": -2.484375, "logps/chosen": -420.0, "logps/rejected": -688.0, "loss": 0.2758, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.78125, "rewards/margins": 2.328125, "rewards/rejected": -5.09375, "step": 1910 }, { "epoch": 0.14445865623354148, "grad_norm": 24.124984900386067, "learning_rate": 4.970042227202828e-07, "logits/chosen": -2.875, "logits/rejected": -2.640625, "logps/chosen": -544.0, "logps/rejected": -828.0, "loss": 0.2653, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.9375, "rewards/margins": 2.8125, "rewards/rejected": -6.75, "step": 1920 }, { "epoch": 0.14521104506809118, "grad_norm": 9.428412327971445, "learning_rate": 4.969020223713948e-07, "logits/chosen": -2.828125, "logits/rejected": -2.578125, "logps/chosen": -476.0, "logps/rejected": -728.0, "loss": 0.3094, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.15625, "rewards/margins": 2.609375, "rewards/rejected": -5.78125, "step": 1930 }, { "epoch": 0.14596343390264088, "grad_norm": 9.244732989145508, "learning_rate": 4.967981187302889e-07, "logits/chosen": -2.84375, "logits/rejected": -2.703125, "logps/chosen": -502.0, "logps/rejected": -760.0, "loss": 0.2917, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.34375, "rewards/margins": 2.640625, "rewards/rejected": -5.96875, "step": 1940 }, { "epoch": 0.14671582273719058, "grad_norm": 10.135613942587367, "learning_rate": 4.966925125137607e-07, "logits/chosen": -2.703125, "logits/rejected": -2.609375, "logps/chosen": -498.0, "logps/rejected": -740.0, "loss": 0.2774, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.390625, "rewards/margins": 2.515625, "rewards/rejected": -5.90625, "step": 1950 }, { "epoch": 0.14746821157174028, "grad_norm": 15.320900000215662, "learning_rate": 4.965852044503512e-07, "logits/chosen": -2.71875, "logits/rejected": -2.625, "logps/chosen": -470.0, "logps/rejected": -732.0, "loss": 0.31, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.046875, "rewards/margins": 2.453125, "rewards/rejected": -5.5, "step": 1960 }, { "epoch": 0.14822060040628998, "grad_norm": 9.157953298675203, "learning_rate": 4.964761952803416e-07, "logits/chosen": -2.6875, "logits/rejected": -2.515625, "logps/chosen": -450.0, "logps/rejected": -660.0, "loss": 0.3058, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.96875, "rewards/margins": 2.09375, "rewards/rejected": -5.0625, "step": 1970 }, { "epoch": 0.14897298924083965, "grad_norm": 9.500153891031406, "learning_rate": 4.963654857557488e-07, "logits/chosen": -2.625, "logits/rejected": -2.46875, "logps/chosen": -468.0, "logps/rejected": -652.0, "loss": 0.3027, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.03125, "rewards/margins": 2.078125, "rewards/rejected": -5.09375, "step": 1980 }, { "epoch": 0.14972537807538935, "grad_norm": 7.779021609791956, "learning_rate": 4.962530766403199e-07, "logits/chosen": -2.71875, "logits/rejected": -2.546875, "logps/chosen": -476.0, "logps/rejected": -712.0, "loss": 0.2961, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.21875, "rewards/margins": 2.390625, "rewards/rejected": -5.625, "step": 1990 }, { "epoch": 0.15047776690993905, "grad_norm": 10.341517042412354, "learning_rate": 4.961389687095267e-07, "logits/chosen": -2.734375, "logits/rejected": -2.671875, "logps/chosen": -484.0, "logps/rejected": -696.0, "loss": 0.2755, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.203125, "rewards/margins": 2.25, "rewards/rejected": -5.4375, "step": 2000 }, { "epoch": 0.15123015574448875, "grad_norm": 10.775662545927513, "learning_rate": 4.960231627505606e-07, "logits/chosen": -2.65625, "logits/rejected": -2.53125, "logps/chosen": -490.0, "logps/rejected": -716.0, "loss": 0.2881, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.15625, "rewards/margins": 2.46875, "rewards/rejected": -5.625, "step": 2010 }, { "epoch": 0.15198254457903845, "grad_norm": 10.755771111022861, "learning_rate": 4.959056595623271e-07, "logits/chosen": -2.53125, "logits/rejected": -2.40625, "logps/chosen": -500.0, "logps/rejected": -708.0, "loss": 0.2946, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.234375, "rewards/margins": 2.265625, "rewards/rejected": -5.5, "step": 2020 }, { "epoch": 0.15273493341358815, "grad_norm": 8.940161113038451, "learning_rate": 4.957864599554404e-07, "logits/chosen": -2.421875, "logits/rejected": -2.1875, "logps/chosen": -440.0, "logps/rejected": -680.0, "loss": 0.3193, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.796875, "rewards/margins": 2.390625, "rewards/rejected": -5.1875, "step": 2030 }, { "epoch": 0.15348732224813783, "grad_norm": 10.0217606927789, "learning_rate": 4.956655647522176e-07, "logits/chosen": -2.5, "logits/rejected": -2.40625, "logps/chosen": -498.0, "logps/rejected": -748.0, "loss": 0.2761, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.1875, "rewards/margins": 2.765625, "rewards/rejected": -5.96875, "step": 2040 }, { "epoch": 0.15423971108268753, "grad_norm": 8.538140666623187, "learning_rate": 4.95542974786673e-07, "logits/chosen": -2.640625, "logits/rejected": -2.4375, "logps/chosen": -516.0, "logps/rejected": -752.0, "loss": 0.2786, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5, "rewards/margins": 2.515625, "rewards/rejected": -6.03125, "step": 2050 }, { "epoch": 0.15499209991723722, "grad_norm": 11.092066592354108, "learning_rate": 4.954186909045129e-07, "logits/chosen": -2.75, "logits/rejected": -2.5, "logps/chosen": -476.0, "logps/rejected": -692.0, "loss": 0.2978, "rewards/accuracies": 0.90625, "rewards/chosen": -3.15625, "rewards/margins": 2.296875, "rewards/rejected": -5.4375, "step": 2060 }, { "epoch": 0.15574448875178692, "grad_norm": 8.810015279238543, "learning_rate": 4.95292713963129e-07, "logits/chosen": -2.671875, "logits/rejected": -2.5625, "logps/chosen": -452.0, "logps/rejected": -684.0, "loss": 0.3037, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.984375, "rewards/margins": 2.40625, "rewards/rejected": -5.40625, "step": 2070 }, { "epoch": 0.15649687758633662, "grad_norm": 10.304083443176951, "learning_rate": 4.951650448315929e-07, "logits/chosen": -2.640625, "logits/rejected": -2.53125, "logps/chosen": -500.0, "logps/rejected": -720.0, "loss": 0.2742, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.359375, "rewards/margins": 2.34375, "rewards/rejected": -5.6875, "step": 2080 }, { "epoch": 0.15724926642088632, "grad_norm": 11.038842624924929, "learning_rate": 4.950356843906501e-07, "logits/chosen": -2.6875, "logits/rejected": -2.578125, "logps/chosen": -482.0, "logps/rejected": -736.0, "loss": 0.3043, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.265625, "rewards/margins": 2.5625, "rewards/rejected": -5.8125, "step": 2090 }, { "epoch": 0.158001655255436, "grad_norm": 16.086119459510908, "learning_rate": 4.949046335327138e-07, "logits/chosen": -2.5625, "logits/rejected": -2.40625, "logps/chosen": -466.0, "logps/rejected": -684.0, "loss": 0.2645, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.953125, "rewards/margins": 2.3125, "rewards/rejected": -5.25, "step": 2100 }, { "epoch": 0.1587540440899857, "grad_norm": 10.440581591351142, "learning_rate": 4.94771893161859e-07, "logits/chosen": -2.671875, "logits/rejected": -2.40625, "logps/chosen": -490.0, "logps/rejected": -752.0, "loss": 0.3073, "rewards/accuracies": 0.875, "rewards/chosen": -3.25, "rewards/margins": 2.734375, "rewards/rejected": -6.0, "step": 2110 }, { "epoch": 0.1595064329245354, "grad_norm": 8.118566889874467, "learning_rate": 4.946374641938157e-07, "logits/chosen": -2.640625, "logits/rejected": -2.5625, "logps/chosen": -478.0, "logps/rejected": -684.0, "loss": 0.3307, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.203125, "rewards/margins": 2.1875, "rewards/rejected": -5.375, "step": 2120 }, { "epoch": 0.1602588217590851, "grad_norm": 12.613630152927428, "learning_rate": 4.945013475559632e-07, "logits/chosen": -2.65625, "logits/rejected": -2.484375, "logps/chosen": -516.0, "logps/rejected": -728.0, "loss": 0.2839, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.359375, "rewards/margins": 2.296875, "rewards/rejected": -5.65625, "step": 2130 }, { "epoch": 0.1610112105936348, "grad_norm": 10.910342487887005, "learning_rate": 4.943635441873235e-07, "logits/chosen": -2.71875, "logits/rejected": -2.546875, "logps/chosen": -452.0, "logps/rejected": -684.0, "loss": 0.2975, "rewards/accuracies": 0.875, "rewards/chosen": -2.90625, "rewards/margins": 2.359375, "rewards/rejected": -5.28125, "step": 2140 }, { "epoch": 0.1617635994281845, "grad_norm": 7.615912452628361, "learning_rate": 4.942240550385547e-07, "logits/chosen": -2.78125, "logits/rejected": -2.609375, "logps/chosen": -432.0, "logps/rejected": -684.0, "loss": 0.2741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.71875, "rewards/margins": 2.5625, "rewards/rejected": -5.28125, "step": 2150 }, { "epoch": 0.16251598826273417, "grad_norm": 12.776860043962529, "learning_rate": 4.940828810719444e-07, "logits/chosen": -2.765625, "logits/rejected": -2.640625, "logps/chosen": -484.0, "logps/rejected": -740.0, "loss": 0.2956, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.21875, "rewards/margins": 2.6875, "rewards/rejected": -5.90625, "step": 2160 }, { "epoch": 0.16326837709728387, "grad_norm": 7.715958635368018, "learning_rate": 4.939400232614033e-07, "logits/chosen": -2.625, "logits/rejected": -2.46875, "logps/chosen": -440.0, "logps/rejected": -676.0, "loss": 0.3187, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.78125, "rewards/margins": 2.390625, "rewards/rejected": -5.15625, "step": 2170 }, { "epoch": 0.16402076593183357, "grad_norm": 10.909720452071825, "learning_rate": 4.937954825924585e-07, "logits/chosen": -2.5625, "logits/rejected": -2.46875, "logps/chosen": -458.0, "logps/rejected": -656.0, "loss": 0.2798, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.015625, "rewards/margins": 2.140625, "rewards/rejected": -5.15625, "step": 2180 }, { "epoch": 0.16477315476638327, "grad_norm": 11.056113501762782, "learning_rate": 4.936492600622464e-07, "logits/chosen": -2.59375, "logits/rejected": -2.40625, "logps/chosen": -552.0, "logps/rejected": -756.0, "loss": 0.3351, "rewards/accuracies": 0.875, "rewards/chosen": -3.6875, "rewards/margins": 2.328125, "rewards/rejected": -6.03125, "step": 2190 }, { "epoch": 0.16552554360093297, "grad_norm": 9.181288831576246, "learning_rate": 4.935013566795058e-07, "logits/chosen": -2.75, "logits/rejected": -2.5625, "logps/chosen": -494.0, "logps/rejected": -728.0, "loss": 0.2665, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.421875, "rewards/margins": 2.3125, "rewards/rejected": -5.71875, "step": 2200 }, { "epoch": 0.16627793243548267, "grad_norm": 15.421194070464846, "learning_rate": 4.933517734645714e-07, "logits/chosen": -2.84375, "logits/rejected": -2.671875, "logps/chosen": -482.0, "logps/rejected": -816.0, "loss": 0.2725, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.390625, "rewards/margins": 3.1875, "rewards/rejected": -6.59375, "step": 2210 }, { "epoch": 0.16703032127003234, "grad_norm": 7.732850963579106, "learning_rate": 4.932005114493665e-07, "logits/chosen": -2.828125, "logits/rejected": -2.640625, "logps/chosen": -532.0, "logps/rejected": -808.0, "loss": 0.2775, "rewards/accuracies": 0.84375, "rewards/chosen": -3.703125, "rewards/margins": 2.8125, "rewards/rejected": -6.5, "step": 2220 }, { "epoch": 0.16778271010458204, "grad_norm": 8.73231439791769, "learning_rate": 4.930475716773956e-07, "logits/chosen": -2.640625, "logits/rejected": -2.546875, "logps/chosen": -502.0, "logps/rejected": -752.0, "loss": 0.2924, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.390625, "rewards/margins": 2.53125, "rewards/rejected": -5.9375, "step": 2230 }, { "epoch": 0.16853509893913174, "grad_norm": 9.123008878161743, "learning_rate": 4.928929552037378e-07, "logits/chosen": -2.5625, "logits/rejected": -2.34375, "logps/chosen": -492.0, "logps/rejected": -712.0, "loss": 0.2833, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.328125, "rewards/margins": 2.40625, "rewards/rejected": -5.71875, "step": 2240 }, { "epoch": 0.16928748777368144, "grad_norm": 12.07063346888074, "learning_rate": 4.927366630950389e-07, "logits/chosen": -2.609375, "logits/rejected": -2.484375, "logps/chosen": -468.0, "logps/rejected": -700.0, "loss": 0.2693, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.09375, "rewards/margins": 2.46875, "rewards/rejected": -5.5625, "step": 2250 }, { "epoch": 0.17003987660823114, "grad_norm": 9.58671682334943, "learning_rate": 4.925786964295046e-07, "logits/chosen": -2.53125, "logits/rejected": -2.328125, "logps/chosen": -568.0, "logps/rejected": -808.0, "loss": 0.3029, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.03125, "rewards/margins": 2.578125, "rewards/rejected": -6.59375, "step": 2260 }, { "epoch": 0.17079226544278084, "grad_norm": 9.336994205772083, "learning_rate": 4.924190562968926e-07, "logits/chosen": -2.640625, "logits/rejected": -2.453125, "logps/chosen": -512.0, "logps/rejected": -716.0, "loss": 0.2713, "rewards/accuracies": 0.875, "rewards/chosen": -3.4375, "rewards/margins": 2.296875, "rewards/rejected": -5.75, "step": 2270 }, { "epoch": 0.17154465427733054, "grad_norm": 11.646865344527923, "learning_rate": 4.922577437985052e-07, "logits/chosen": -2.78125, "logits/rejected": -2.625, "logps/chosen": -498.0, "logps/rejected": -760.0, "loss": 0.2462, "rewards/accuracies": 0.90625, "rewards/chosen": -3.421875, "rewards/margins": 2.609375, "rewards/rejected": -6.03125, "step": 2280 }, { "epoch": 0.1722970431118802, "grad_norm": 12.116174831993108, "learning_rate": 4.920947600471821e-07, "logits/chosen": -2.8125, "logits/rejected": -2.59375, "logps/chosen": -496.0, "logps/rejected": -764.0, "loss": 0.2828, "rewards/accuracies": 0.84375, "rewards/chosen": -3.375, "rewards/margins": 2.734375, "rewards/rejected": -6.09375, "step": 2290 }, { "epoch": 0.1730494319464299, "grad_norm": 9.542246025337626, "learning_rate": 4.919301061672919e-07, "logits/chosen": -2.75, "logits/rejected": -2.53125, "logps/chosen": -488.0, "logps/rejected": -736.0, "loss": 0.2701, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3125, "rewards/margins": 2.53125, "rewards/rejected": -5.84375, "step": 2300 }, { "epoch": 0.1738018207809796, "grad_norm": 10.606436086492515, "learning_rate": 4.917637832947256e-07, "logits/chosen": -2.609375, "logits/rejected": -2.4375, "logps/chosen": -572.0, "logps/rejected": -804.0, "loss": 0.2781, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.9375, "rewards/margins": 2.5, "rewards/rejected": -6.4375, "step": 2310 }, { "epoch": 0.1745542096155293, "grad_norm": 10.611161178350386, "learning_rate": 4.915957925768871e-07, "logits/chosen": -2.828125, "logits/rejected": -2.609375, "logps/chosen": -520.0, "logps/rejected": -748.0, "loss": 0.2981, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.734375, "rewards/margins": 2.265625, "rewards/rejected": -6.0, "step": 2320 }, { "epoch": 0.175306598450079, "grad_norm": 12.829898396212654, "learning_rate": 4.914261351726868e-07, "logits/chosen": -2.8125, "logits/rejected": -2.625, "logps/chosen": -560.0, "logps/rejected": -784.0, "loss": 0.2684, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.78125, "rewards/margins": 2.421875, "rewards/rejected": -6.1875, "step": 2330 }, { "epoch": 0.1760589872846287, "grad_norm": 10.222444110704744, "learning_rate": 4.91254812252533e-07, "logits/chosen": -2.75, "logits/rejected": -2.4375, "logps/chosen": -510.0, "logps/rejected": -904.0, "loss": 0.2601, "rewards/accuracies": 0.90625, "rewards/chosen": -3.5625, "rewards/margins": 3.734375, "rewards/rejected": -7.3125, "step": 2340 }, { "epoch": 0.17681137611917838, "grad_norm": 9.955713067057467, "learning_rate": 4.910818249983235e-07, "logits/chosen": -2.796875, "logits/rejected": -2.65625, "logps/chosen": -548.0, "logps/rejected": -828.0, "loss": 0.283, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.921875, "rewards/margins": 2.734375, "rewards/rejected": -6.65625, "step": 2350 }, { "epoch": 0.17756376495372808, "grad_norm": 9.899985953459382, "learning_rate": 4.909071746034379e-07, "logits/chosen": -2.59375, "logits/rejected": -2.40625, "logps/chosen": -588.0, "logps/rejected": -804.0, "loss": 0.2834, "rewards/accuracies": 0.875, "rewards/chosen": -4.15625, "rewards/margins": 2.3125, "rewards/rejected": -6.46875, "step": 2360 }, { "epoch": 0.17831615378827778, "grad_norm": 9.734416734362911, "learning_rate": 4.907308622727293e-07, "logits/chosen": -2.625, "logits/rejected": -2.375, "logps/chosen": -494.0, "logps/rejected": -748.0, "loss": 0.2868, "rewards/accuracies": 0.875, "rewards/chosen": -3.4375, "rewards/margins": 2.625, "rewards/rejected": -6.0625, "step": 2370 }, { "epoch": 0.17906854262282748, "grad_norm": 8.463086487753493, "learning_rate": 4.90552889222516e-07, "logits/chosen": -2.609375, "logits/rejected": -2.4375, "logps/chosen": -540.0, "logps/rejected": -824.0, "loss": 0.2605, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.671875, "rewards/margins": 2.75, "rewards/rejected": -6.4375, "step": 2380 }, { "epoch": 0.17982093145737718, "grad_norm": 13.146535023342294, "learning_rate": 4.903732566805727e-07, "logits/chosen": -2.671875, "logits/rejected": -2.5625, "logps/chosen": -560.0, "logps/rejected": -844.0, "loss": 0.2833, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.890625, "rewards/margins": 2.9375, "rewards/rejected": -6.84375, "step": 2390 }, { "epoch": 0.18057332029192688, "grad_norm": 9.711136432827589, "learning_rate": 4.901919658861228e-07, "logits/chosen": -2.703125, "logits/rejected": -2.453125, "logps/chosen": -496.0, "logps/rejected": -680.0, "loss": 0.3008, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.296875, "rewards/margins": 2.078125, "rewards/rejected": -5.375, "step": 2400 }, { "epoch": 0.18132570912647655, "grad_norm": 12.52993235918333, "learning_rate": 4.900090180898292e-07, "logits/chosen": -2.609375, "logits/rejected": -2.4375, "logps/chosen": -524.0, "logps/rejected": -792.0, "loss": 0.2695, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5625, "rewards/margins": 2.78125, "rewards/rejected": -6.34375, "step": 2410 }, { "epoch": 0.18207809796102625, "grad_norm": 7.896403216710635, "learning_rate": 4.898244145537857e-07, "logits/chosen": -2.6875, "logits/rejected": -2.375, "logps/chosen": -510.0, "logps/rejected": -756.0, "loss": 0.2626, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.515625, "rewards/margins": 2.484375, "rewards/rejected": -6.0, "step": 2420 }, { "epoch": 0.18283048679557595, "grad_norm": 8.144329513203688, "learning_rate": 4.896381565515087e-07, "logits/chosen": -2.625, "logits/rejected": -2.5, "logps/chosen": -500.0, "logps/rejected": -740.0, "loss": 0.2379, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.390625, "rewards/margins": 2.65625, "rewards/rejected": -6.03125, "step": 2430 }, { "epoch": 0.18358287563012565, "grad_norm": 9.329496514618313, "learning_rate": 4.894502453679284e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -482.0, "logps/rejected": -764.0, "loss": 0.2785, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.203125, "rewards/margins": 2.90625, "rewards/rejected": -6.125, "step": 2440 }, { "epoch": 0.18433526446467535, "grad_norm": 12.10939641210324, "learning_rate": 4.892606822993793e-07, "logits/chosen": -2.671875, "logits/rejected": -2.578125, "logps/chosen": -436.0, "logps/rejected": -708.0, "loss": 0.299, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.8125, "rewards/margins": 2.640625, "rewards/rejected": -5.4375, "step": 2450 }, { "epoch": 0.18508765329922505, "grad_norm": 9.857872781155606, "learning_rate": 4.890694686535918e-07, "logits/chosen": -2.546875, "logits/rejected": -2.375, "logps/chosen": -510.0, "logps/rejected": -784.0, "loss": 0.2701, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.28125, "rewards/margins": 2.9375, "rewards/rejected": -6.21875, "step": 2460 }, { "epoch": 0.18584004213377472, "grad_norm": 17.101449726474733, "learning_rate": 4.888766057496833e-07, "logits/chosen": -2.46875, "logits/rejected": -2.3125, "logps/chosen": -472.0, "logps/rejected": -728.0, "loss": 0.2916, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.203125, "rewards/margins": 2.484375, "rewards/rejected": -5.71875, "step": 2470 }, { "epoch": 0.18659243096832442, "grad_norm": 8.347297641682713, "learning_rate": 4.886820949181486e-07, "logits/chosen": -2.59375, "logits/rejected": -2.5, "logps/chosen": -438.0, "logps/rejected": -736.0, "loss": 0.2462, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.90625, "rewards/margins": 2.96875, "rewards/rejected": -5.875, "step": 2480 }, { "epoch": 0.18734481980287412, "grad_norm": 9.678661384787706, "learning_rate": 4.884859375008512e-07, "logits/chosen": -2.828125, "logits/rejected": -2.515625, "logps/chosen": -486.0, "logps/rejected": -776.0, "loss": 0.2875, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.0625, "rewards/margins": 2.9375, "rewards/rejected": -6.0, "step": 2490 }, { "epoch": 0.18809720863742382, "grad_norm": 11.145929450555997, "learning_rate": 4.882881348510136e-07, "logits/chosen": -2.859375, "logits/rejected": -2.578125, "logps/chosen": -556.0, "logps/rejected": -856.0, "loss": 0.2611, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.84375, "rewards/margins": 3.171875, "rewards/rejected": -7.03125, "step": 2500 }, { "epoch": 0.18884959747197352, "grad_norm": 7.1935463269752065, "learning_rate": 4.880886883332083e-07, "logits/chosen": -2.640625, "logits/rejected": -2.296875, "logps/chosen": -516.0, "logps/rejected": -848.0, "loss": 0.2421, "rewards/accuracies": 0.9375, "rewards/chosen": -3.4375, "rewards/margins": 3.375, "rewards/rejected": -6.8125, "step": 2510 }, { "epoch": 0.18960198630652322, "grad_norm": 10.65990633403162, "learning_rate": 4.878875993233486e-07, "logits/chosen": -2.5, "logits/rejected": -2.34375, "logps/chosen": -516.0, "logps/rejected": -776.0, "loss": 0.2631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.703125, "rewards/margins": 2.515625, "rewards/rejected": -6.21875, "step": 2520 }, { "epoch": 0.1903543751410729, "grad_norm": 10.814709273380368, "learning_rate": 4.876848692086782e-07, "logits/chosen": -2.671875, "logits/rejected": -2.375, "logps/chosen": -528.0, "logps/rejected": -796.0, "loss": 0.2662, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.515625, "rewards/margins": 2.71875, "rewards/rejected": -6.25, "step": 2530 }, { "epoch": 0.1911067639756226, "grad_norm": 8.274113340101351, "learning_rate": 4.874804993877625e-07, "logits/chosen": -2.53125, "logits/rejected": -2.328125, "logps/chosen": -524.0, "logps/rejected": -768.0, "loss": 0.2679, "rewards/accuracies": 0.875, "rewards/chosen": -3.71875, "rewards/margins": 2.5, "rewards/rejected": -6.21875, "step": 2540 }, { "epoch": 0.1918591528101723, "grad_norm": 7.8585668700799465, "learning_rate": 4.872744912704788e-07, "logits/chosen": -2.6875, "logits/rejected": -2.421875, "logps/chosen": -480.0, "logps/rejected": -768.0, "loss": 0.2575, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.21875, "rewards/margins": 3.078125, "rewards/rejected": -6.28125, "step": 2550 }, { "epoch": 0.192611541644722, "grad_norm": 9.359705035781545, "learning_rate": 4.870668462780062e-07, "logits/chosen": -2.6875, "logits/rejected": -2.4375, "logps/chosen": -492.0, "logps/rejected": -748.0, "loss": 0.2643, "rewards/accuracies": 0.875, "rewards/chosen": -3.34375, "rewards/margins": 2.671875, "rewards/rejected": -6.03125, "step": 2560 }, { "epoch": 0.1933639304792717, "grad_norm": 10.260726382149574, "learning_rate": 4.868575658428163e-07, "logits/chosen": -2.703125, "logits/rejected": -2.5, "logps/chosen": -474.0, "logps/rejected": -704.0, "loss": 0.2763, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.1875, "rewards/margins": 2.453125, "rewards/rejected": -5.625, "step": 2570 }, { "epoch": 0.1941163193138214, "grad_norm": 14.078164696272687, "learning_rate": 4.866466514086628e-07, "logits/chosen": -2.640625, "logits/rejected": -2.53125, "logps/chosen": -480.0, "logps/rejected": -752.0, "loss": 0.2753, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.03125, "rewards/margins": 2.984375, "rewards/rejected": -6.0, "step": 2580 }, { "epoch": 0.19486870814837107, "grad_norm": 8.453767523010452, "learning_rate": 4.864341044305719e-07, "logits/chosen": -2.5625, "logits/rejected": -2.453125, "logps/chosen": -524.0, "logps/rejected": -720.0, "loss": 0.2626, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.421875, "rewards/margins": 2.375, "rewards/rejected": -5.78125, "step": 2590 }, { "epoch": 0.19562109698292077, "grad_norm": 10.18587974692239, "learning_rate": 4.862199263748323e-07, "logits/chosen": -2.515625, "logits/rejected": -2.296875, "logps/chosen": -536.0, "logps/rejected": -800.0, "loss": 0.2657, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.78125, "rewards/margins": 2.84375, "rewards/rejected": -6.625, "step": 2600 }, { "epoch": 0.19637348581747047, "grad_norm": 11.079570317722963, "learning_rate": 4.860041187189846e-07, "logits/chosen": -2.328125, "logits/rejected": -2.171875, "logps/chosen": -504.0, "logps/rejected": -748.0, "loss": 0.2942, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.46875, "rewards/margins": 2.46875, "rewards/rejected": -5.9375, "step": 2610 }, { "epoch": 0.19712587465202017, "grad_norm": 7.9462869418389745, "learning_rate": 4.857866829518118e-07, "logits/chosen": -2.484375, "logits/rejected": -2.3125, "logps/chosen": -494.0, "logps/rejected": -788.0, "loss": 0.2422, "rewards/accuracies": 0.90625, "rewards/chosen": -3.4375, "rewards/margins": 2.734375, "rewards/rejected": -6.15625, "step": 2620 }, { "epoch": 0.19787826348656987, "grad_norm": 10.695129742443964, "learning_rate": 4.855676205733287e-07, "logits/chosen": -2.84375, "logits/rejected": -2.59375, "logps/chosen": -552.0, "logps/rejected": -832.0, "loss": 0.2931, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.828125, "rewards/margins": 2.890625, "rewards/rejected": -6.71875, "step": 2630 }, { "epoch": 0.19863065232111957, "grad_norm": 13.909715185144693, "learning_rate": 4.853469330947712e-07, "logits/chosen": -2.78125, "logits/rejected": -2.640625, "logps/chosen": -516.0, "logps/rejected": -772.0, "loss": 0.2719, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.328125, "rewards/margins": 2.84375, "rewards/rejected": -6.1875, "step": 2640 }, { "epoch": 0.19938304115566924, "grad_norm": 7.977293712578769, "learning_rate": 4.851246220385862e-07, "logits/chosen": -2.53125, "logits/rejected": -2.421875, "logps/chosen": -548.0, "logps/rejected": -776.0, "loss": 0.2662, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.796875, "rewards/margins": 2.546875, "rewards/rejected": -6.34375, "step": 2650 }, { "epoch": 0.20013542999021894, "grad_norm": 7.690656776892857, "learning_rate": 4.849006889384217e-07, "logits/chosen": -2.5625, "logits/rejected": -2.25, "logps/chosen": -572.0, "logps/rejected": -768.0, "loss": 0.2768, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.9375, "rewards/margins": 2.21875, "rewards/rejected": -6.15625, "step": 2660 }, { "epoch": 0.20088781882476864, "grad_norm": 8.564652490755712, "learning_rate": 4.84675135339115e-07, "logits/chosen": -2.515625, "logits/rejected": -2.3125, "logps/chosen": -494.0, "logps/rejected": -768.0, "loss": 0.2813, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.390625, "rewards/margins": 2.84375, "rewards/rejected": -6.21875, "step": 2670 }, { "epoch": 0.20164020765931834, "grad_norm": 6.442833431737132, "learning_rate": 4.84447962796683e-07, "logits/chosen": -2.578125, "logits/rejected": -2.421875, "logps/chosen": -580.0, "logps/rejected": -844.0, "loss": 0.2707, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.0, "rewards/margins": 2.796875, "rewards/rejected": -6.78125, "step": 2680 }, { "epoch": 0.20239259649386804, "grad_norm": 11.556397527239499, "learning_rate": 4.842191728783107e-07, "logits/chosen": -2.65625, "logits/rejected": -2.453125, "logps/chosen": -520.0, "logps/rejected": -756.0, "loss": 0.2938, "rewards/accuracies": 0.84375, "rewards/chosen": -3.578125, "rewards/margins": 2.390625, "rewards/rejected": -5.96875, "step": 2690 }, { "epoch": 0.20314498532841774, "grad_norm": 10.766048526372968, "learning_rate": 4.839887671623414e-07, "logits/chosen": -2.640625, "logits/rejected": -2.46875, "logps/chosen": -478.0, "logps/rejected": -768.0, "loss": 0.2471, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.21875, "rewards/margins": 2.9375, "rewards/rejected": -6.15625, "step": 2700 }, { "epoch": 0.2038973741629674, "grad_norm": 13.5244223811314, "learning_rate": 4.837567472382651e-07, "logits/chosen": -2.5, "logits/rejected": -2.34375, "logps/chosen": -536.0, "logps/rejected": -812.0, "loss": 0.2909, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.828125, "rewards/margins": 2.796875, "rewards/rejected": -6.625, "step": 2710 }, { "epoch": 0.2046497629975171, "grad_norm": 9.738583048600992, "learning_rate": 4.835231147067072e-07, "logits/chosen": -2.484375, "logits/rejected": -2.359375, "logps/chosen": -516.0, "logps/rejected": -764.0, "loss": 0.2766, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.609375, "rewards/margins": 2.453125, "rewards/rejected": -6.0625, "step": 2720 }, { "epoch": 0.2054021518320668, "grad_norm": 10.261504032860017, "learning_rate": 4.832878711794185e-07, "logits/chosen": -2.703125, "logits/rejected": -2.390625, "logps/chosen": -516.0, "logps/rejected": -860.0, "loss": 0.2551, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.578125, "rewards/margins": 3.375, "rewards/rejected": -6.9375, "step": 2730 }, { "epoch": 0.2061545406666165, "grad_norm": 6.5383786738976415, "learning_rate": 4.830510182792632e-07, "logits/chosen": -2.703125, "logits/rejected": -2.453125, "logps/chosen": -544.0, "logps/rejected": -824.0, "loss": 0.2445, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.8125, "rewards/margins": 2.90625, "rewards/rejected": -6.71875, "step": 2740 }, { "epoch": 0.2069069295011662, "grad_norm": 10.447397489817225, "learning_rate": 4.82812557640208e-07, "logits/chosen": -2.6875, "logits/rejected": -2.515625, "logps/chosen": -552.0, "logps/rejected": -784.0, "loss": 0.2563, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.671875, "rewards/margins": 2.828125, "rewards/rejected": -6.5, "step": 2750 }, { "epoch": 0.2076593183357159, "grad_norm": 9.348554817009415, "learning_rate": 4.82572490907311e-07, "logits/chosen": -2.625, "logits/rejected": -2.46875, "logps/chosen": -552.0, "logps/rejected": -808.0, "loss": 0.2576, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.765625, "rewards/margins": 2.90625, "rewards/rejected": -6.6875, "step": 2760 }, { "epoch": 0.20841170717026558, "grad_norm": 8.70381283312396, "learning_rate": 4.8233081973671e-07, "logits/chosen": -2.46875, "logits/rejected": -2.25, "logps/chosen": -532.0, "logps/rejected": -800.0, "loss": 0.2484, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.8125, "rewards/margins": 2.640625, "rewards/rejected": -6.4375, "step": 2770 }, { "epoch": 0.20916409600481528, "grad_norm": 7.4097108054434955, "learning_rate": 4.820875457956115e-07, "logits/chosen": -2.625, "logits/rejected": -2.515625, "logps/chosen": -548.0, "logps/rejected": -828.0, "loss": 0.2774, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.875, "rewards/margins": 2.875, "rewards/rejected": -6.75, "step": 2780 }, { "epoch": 0.20991648483936498, "grad_norm": 8.929321008061232, "learning_rate": 4.818426707622788e-07, "logits/chosen": -2.609375, "logits/rejected": -2.390625, "logps/chosen": -494.0, "logps/rejected": -732.0, "loss": 0.257, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.265625, "rewards/margins": 2.546875, "rewards/rejected": -5.8125, "step": 2790 }, { "epoch": 0.21066887367391468, "grad_norm": 8.981128088011406, "learning_rate": 4.815961963260207e-07, "logits/chosen": -2.640625, "logits/rejected": -2.46875, "logps/chosen": -490.0, "logps/rejected": -764.0, "loss": 0.2419, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.21875, "rewards/margins": 2.953125, "rewards/rejected": -6.1875, "step": 2800 }, { "epoch": 0.21142126250846438, "grad_norm": 10.05509938002628, "learning_rate": 4.813481241871794e-07, "logits/chosen": -2.640625, "logits/rejected": -2.421875, "logps/chosen": -484.0, "logps/rejected": -780.0, "loss": 0.2477, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.234375, "rewards/margins": 2.9375, "rewards/rejected": -6.15625, "step": 2810 }, { "epoch": 0.21217365134301408, "grad_norm": 11.72370585695034, "learning_rate": 4.810984560571195e-07, "logits/chosen": -2.65625, "logits/rejected": -2.328125, "logps/chosen": -472.0, "logps/rejected": -772.0, "loss": 0.2559, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.21875, "rewards/margins": 3.234375, "rewards/rejected": -6.4375, "step": 2820 }, { "epoch": 0.21292604017756375, "grad_norm": 9.647642410986215, "learning_rate": 4.808471936582156e-07, "logits/chosen": -2.65625, "logits/rejected": -2.40625, "logps/chosen": -472.0, "logps/rejected": -764.0, "loss": 0.2374, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.140625, "rewards/margins": 3.03125, "rewards/rejected": -6.15625, "step": 2830 }, { "epoch": 0.21367842901211345, "grad_norm": 11.420752468431145, "learning_rate": 4.805943387238409e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -464.0, "logps/rejected": -736.0, "loss": 0.252, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.0, "rewards/margins": 2.921875, "rewards/rejected": -5.90625, "step": 2840 }, { "epoch": 0.21443081784666315, "grad_norm": 9.785252828322875, "learning_rate": 4.803398929983543e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -476.0, "logps/rejected": -760.0, "loss": 0.2856, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.21875, "rewards/margins": 2.90625, "rewards/rejected": -6.125, "step": 2850 }, { "epoch": 0.21518320668121285, "grad_norm": 11.971762302414026, "learning_rate": 4.800838582370898e-07, "logits/chosen": -2.546875, "logits/rejected": -2.34375, "logps/chosen": -464.0, "logps/rejected": -684.0, "loss": 0.2718, "rewards/accuracies": 0.8125, "rewards/chosen": -3.09375, "rewards/margins": 2.234375, "rewards/rejected": -5.3125, "step": 2860 }, { "epoch": 0.21593559551576255, "grad_norm": 8.477706468059246, "learning_rate": 4.79826236206343e-07, "logits/chosen": -2.5, "logits/rejected": -2.171875, "logps/chosen": -488.0, "logps/rejected": -724.0, "loss": 0.2513, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.25, "rewards/margins": 2.390625, "rewards/rejected": -5.625, "step": 2870 }, { "epoch": 0.21668798435031225, "grad_norm": 11.869962119536272, "learning_rate": 4.795670286833599e-07, "logits/chosen": -2.5, "logits/rejected": -2.328125, "logps/chosen": -512.0, "logps/rejected": -784.0, "loss": 0.2611, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.34375, "rewards/margins": 2.859375, "rewards/rejected": -6.1875, "step": 2880 }, { "epoch": 0.21744037318486192, "grad_norm": 8.487625004734987, "learning_rate": 4.79306237456324e-07, "logits/chosen": -2.515625, "logits/rejected": -2.375, "logps/chosen": -512.0, "logps/rejected": -764.0, "loss": 0.265, "rewards/accuracies": 0.90625, "rewards/chosen": -3.5625, "rewards/margins": 2.515625, "rewards/rejected": -6.0625, "step": 2890 }, { "epoch": 0.21819276201941162, "grad_norm": 11.615793395248453, "learning_rate": 4.790438643243447e-07, "logits/chosen": -2.703125, "logits/rejected": -2.40625, "logps/chosen": -500.0, "logps/rejected": -752.0, "loss": 0.2533, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.140625, "rewards/margins": 2.828125, "rewards/rejected": -5.96875, "step": 2900 }, { "epoch": 0.21894515085396132, "grad_norm": 10.318914547631199, "learning_rate": 4.787799110974436e-07, "logits/chosen": -2.90625, "logits/rejected": -2.65625, "logps/chosen": -516.0, "logps/rejected": -816.0, "loss": 0.2414, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.5625, "rewards/margins": 3.21875, "rewards/rejected": -6.78125, "step": 2910 }, { "epoch": 0.21969753968851102, "grad_norm": 12.904513672408855, "learning_rate": 4.785143795965437e-07, "logits/chosen": -2.90625, "logits/rejected": -2.71875, "logps/chosen": -528.0, "logps/rejected": -784.0, "loss": 0.2577, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.640625, "rewards/margins": 2.640625, "rewards/rejected": -6.28125, "step": 2920 }, { "epoch": 0.22044992852306072, "grad_norm": 8.687685973219471, "learning_rate": 4.782472716534554e-07, "logits/chosen": -2.90625, "logits/rejected": -2.6875, "logps/chosen": -576.0, "logps/rejected": -860.0, "loss": 0.25, "rewards/accuracies": 0.875, "rewards/chosen": -3.96875, "rewards/margins": 2.875, "rewards/rejected": -6.84375, "step": 2930 }, { "epoch": 0.22120231735761042, "grad_norm": 8.425580278247608, "learning_rate": 4.779785891108647e-07, "logits/chosen": -2.828125, "logits/rejected": -2.5625, "logps/chosen": -540.0, "logps/rejected": -776.0, "loss": 0.2682, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.734375, "rewards/margins": 2.546875, "rewards/rejected": -6.28125, "step": 2940 }, { "epoch": 0.2219547061921601, "grad_norm": 8.92649200504024, "learning_rate": 4.777083338223202e-07, "logits/chosen": -2.734375, "logits/rejected": -2.515625, "logps/chosen": -572.0, "logps/rejected": -840.0, "loss": 0.2475, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.25, "rewards/margins": 2.671875, "rewards/rejected": -6.90625, "step": 2950 }, { "epoch": 0.2227070950267098, "grad_norm": 9.22059724524348, "learning_rate": 4.774365076522202e-07, "logits/chosen": -2.875, "logits/rejected": -2.625, "logps/chosen": -564.0, "logps/rejected": -804.0, "loss": 0.2469, "rewards/accuracies": 0.875, "rewards/chosen": -3.765625, "rewards/margins": 2.765625, "rewards/rejected": -6.53125, "step": 2960 }, { "epoch": 0.2234594838612595, "grad_norm": 9.454530660849164, "learning_rate": 4.771631124758e-07, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -520.0, "logps/rejected": -804.0, "loss": 0.2499, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.6875, "rewards/margins": 2.765625, "rewards/rejected": -6.46875, "step": 2970 }, { "epoch": 0.2242118726958092, "grad_norm": 9.480975232151959, "learning_rate": 4.76888150179119e-07, "logits/chosen": -2.984375, "logits/rejected": -2.671875, "logps/chosen": -512.0, "logps/rejected": -828.0, "loss": 0.2496, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.484375, "rewards/margins": 3.21875, "rewards/rejected": -6.71875, "step": 2980 }, { "epoch": 0.2249642615303589, "grad_norm": 7.916418722194274, "learning_rate": 4.7661162265904773e-07, "logits/chosen": -3.109375, "logits/rejected": -2.8125, "logps/chosen": -564.0, "logps/rejected": -840.0, "loss": 0.2273, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.90625, "rewards/margins": 2.984375, "rewards/rejected": -6.875, "step": 2990 }, { "epoch": 0.2257166503649086, "grad_norm": 8.229870586292975, "learning_rate": 4.7633353182325443e-07, "logits/chosen": -2.890625, "logits/rejected": -2.578125, "logps/chosen": -536.0, "logps/rejected": -880.0, "loss": 0.2373, "rewards/accuracies": 0.90625, "rewards/chosen": -3.84375, "rewards/margins": 3.28125, "rewards/rejected": -7.125, "step": 3000 }, { "epoch": 0.22646903919945827, "grad_norm": 10.226118184963164, "learning_rate": 4.760538795901923e-07, "logits/chosen": -2.71875, "logits/rejected": -2.609375, "logps/chosen": -548.0, "logps/rejected": -848.0, "loss": 0.2677, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.828125, "rewards/margins": 3.203125, "rewards/rejected": -7.03125, "step": 3010 }, { "epoch": 0.22722142803400797, "grad_norm": 8.49693960463905, "learning_rate": 4.757726678890859e-07, "logits/chosen": -2.734375, "logits/rejected": -2.515625, "logps/chosen": -506.0, "logps/rejected": -800.0, "loss": 0.254, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.40625, "rewards/margins": 3.140625, "rewards/rejected": -6.5625, "step": 3020 }, { "epoch": 0.22797381686855767, "grad_norm": 9.5155953716937, "learning_rate": 4.754898986599182e-07, "logits/chosen": -2.65625, "logits/rejected": -2.46875, "logps/chosen": -484.0, "logps/rejected": -716.0, "loss": 0.2568, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.234375, "rewards/margins": 2.46875, "rewards/rejected": -5.71875, "step": 3030 }, { "epoch": 0.22872620570310737, "grad_norm": 9.050451730524532, "learning_rate": 4.75205573853417e-07, "logits/chosen": -2.5625, "logits/rejected": -2.328125, "logps/chosen": -564.0, "logps/rejected": -832.0, "loss": 0.2664, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.03125, "rewards/margins": 2.890625, "rewards/rejected": -6.9375, "step": 3040 }, { "epoch": 0.22947859453765707, "grad_norm": 11.230060400251109, "learning_rate": 4.749196954310414e-07, "logits/chosen": -2.6875, "logits/rejected": -2.515625, "logps/chosen": -580.0, "logps/rejected": -880.0, "loss": 0.2462, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.09375, "rewards/margins": 3.171875, "rewards/rejected": -7.25, "step": 3050 }, { "epoch": 0.23023098337220677, "grad_norm": 8.780783249418834, "learning_rate": 4.746322653649683e-07, "logits/chosen": -2.84375, "logits/rejected": -2.59375, "logps/chosen": -564.0, "logps/rejected": -856.0, "loss": 0.252, "rewards/accuracies": 0.84375, "rewards/chosen": -4.09375, "rewards/margins": 2.890625, "rewards/rejected": -6.96875, "step": 3060 }, { "epoch": 0.23098337220675647, "grad_norm": 9.12154565614575, "learning_rate": 4.7434328563807913e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -528.0, "logps/rejected": -872.0, "loss": 0.2577, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.734375, "rewards/margins": 3.296875, "rewards/rejected": -7.03125, "step": 3070 }, { "epoch": 0.23173576104130614, "grad_norm": 14.056073227705715, "learning_rate": 4.7405275824394574e-07, "logits/chosen": -2.734375, "logits/rejected": -2.578125, "logps/chosen": -568.0, "logps/rejected": -796.0, "loss": 0.2442, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.984375, "rewards/margins": 2.484375, "rewards/rejected": -6.46875, "step": 3080 }, { "epoch": 0.23248814987585584, "grad_norm": 9.397850408648406, "learning_rate": 4.737606851868167e-07, "logits/chosen": -2.765625, "logits/rejected": -2.578125, "logps/chosen": -536.0, "logps/rejected": -812.0, "loss": 0.2361, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.546875, "rewards/margins": 3.1875, "rewards/rejected": -6.75, "step": 3090 }, { "epoch": 0.23324053871040554, "grad_norm": 8.73361335302379, "learning_rate": 4.734670684816037e-07, "logits/chosen": -2.4375, "logits/rejected": -2.3125, "logps/chosen": -510.0, "logps/rejected": -820.0, "loss": 0.2494, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.203125, "rewards/margins": 3.15625, "rewards/rejected": -6.34375, "step": 3100 }, { "epoch": 0.23399292754495524, "grad_norm": 10.750693728458632, "learning_rate": 4.7317191015386744e-07, "logits/chosen": -2.65625, "logits/rejected": -2.53125, "logps/chosen": -510.0, "logps/rejected": -808.0, "loss": 0.2487, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.671875, "rewards/margins": 2.984375, "rewards/rejected": -6.625, "step": 3110 }, { "epoch": 0.23474531637950494, "grad_norm": 7.196445794814489, "learning_rate": 4.7287521223980395e-07, "logits/chosen": -2.578125, "logits/rejected": -2.46875, "logps/chosen": -506.0, "logps/rejected": -756.0, "loss": 0.269, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.28125, "rewards/margins": 2.6875, "rewards/rejected": -6.0, "step": 3120 }, { "epoch": 0.23549770521405464, "grad_norm": 8.778673968279726, "learning_rate": 4.725769767862301e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -544.0, "logps/rejected": -784.0, "loss": 0.2344, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.796875, "rewards/margins": 2.578125, "rewards/rejected": -6.375, "step": 3130 }, { "epoch": 0.2362500940486043, "grad_norm": 9.278296782454238, "learning_rate": 4.7227720585056986e-07, "logits/chosen": -2.859375, "logits/rejected": -2.625, "logps/chosen": -588.0, "logps/rejected": -876.0, "loss": 0.2251, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.1875, "rewards/margins": 3.28125, "rewards/rejected": -7.46875, "step": 3140 }, { "epoch": 0.237002482883154, "grad_norm": 11.778051631603764, "learning_rate": 4.7197590150083986e-07, "logits/chosen": -2.765625, "logits/rejected": -2.484375, "logps/chosen": -524.0, "logps/rejected": -848.0, "loss": 0.2454, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.515625, "rewards/margins": 3.328125, "rewards/rejected": -6.84375, "step": 3150 }, { "epoch": 0.2377548717177037, "grad_norm": 10.909061617481207, "learning_rate": 4.716730658156354e-07, "logits/chosen": -2.734375, "logits/rejected": -2.5, "logps/chosen": -544.0, "logps/rejected": -832.0, "loss": 0.2329, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.859375, "rewards/margins": 2.9375, "rewards/rejected": -6.8125, "step": 3160 }, { "epoch": 0.2385072605522534, "grad_norm": 9.24366980179484, "learning_rate": 4.7136870088411564e-07, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -552.0, "logps/rejected": -860.0, "loss": 0.2298, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.921875, "rewards/margins": 3.265625, "rewards/rejected": -7.1875, "step": 3170 }, { "epoch": 0.2392596493868031, "grad_norm": 10.949962350431266, "learning_rate": 4.710628088059898e-07, "logits/chosen": -2.859375, "logits/rejected": -2.515625, "logps/chosen": -540.0, "logps/rejected": -828.0, "loss": 0.2271, "rewards/accuracies": 0.90625, "rewards/chosen": -3.8125, "rewards/margins": 2.84375, "rewards/rejected": -6.65625, "step": 3180 }, { "epoch": 0.2400120382213528, "grad_norm": 11.993941852243937, "learning_rate": 4.707553916915022e-07, "logits/chosen": -2.75, "logits/rejected": -2.5625, "logps/chosen": -536.0, "logps/rejected": -816.0, "loss": 0.2549, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.875, "rewards/margins": 2.890625, "rewards/rejected": -6.78125, "step": 3190 }, { "epoch": 0.24076442705590248, "grad_norm": 10.660695498977127, "learning_rate": 4.704464516614178e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -552.0, "logps/rejected": -864.0, "loss": 0.2583, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.921875, "rewards/margins": 3.125, "rewards/rejected": -7.03125, "step": 3200 }, { "epoch": 0.24151681589045218, "grad_norm": 8.141720298029627, "learning_rate": 4.7013599084700787e-07, "logits/chosen": -2.671875, "logits/rejected": -2.5, "logps/chosen": -568.0, "logps/rejected": -816.0, "loss": 0.2528, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.03125, "rewards/margins": 2.703125, "rewards/rejected": -6.75, "step": 3210 }, { "epoch": 0.24226920472500188, "grad_norm": 12.508230218250487, "learning_rate": 4.698240113900348e-07, "logits/chosen": -2.71875, "logits/rejected": -2.5, "logps/chosen": -572.0, "logps/rejected": -912.0, "loss": 0.2322, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.21875, "rewards/margins": 3.40625, "rewards/rejected": -7.625, "step": 3220 }, { "epoch": 0.24302159355955158, "grad_norm": 8.835584611496607, "learning_rate": 4.6951051544273765e-07, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -532.0, "logps/rejected": -908.0, "loss": 0.2373, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.6875, "rewards/margins": 3.625, "rewards/rejected": -7.3125, "step": 3230 }, { "epoch": 0.24377398239410128, "grad_norm": 8.181209169447907, "learning_rate": 4.6919550516781723e-07, "logits/chosen": -2.640625, "logits/rejected": -2.40625, "logps/chosen": -532.0, "logps/rejected": -844.0, "loss": 0.2355, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.765625, "rewards/margins": 3.15625, "rewards/rejected": -6.90625, "step": 3240 }, { "epoch": 0.24452637122865098, "grad_norm": 9.207106143916054, "learning_rate": 4.688789827384213e-07, "logits/chosen": -2.6875, "logits/rejected": -2.546875, "logps/chosen": -580.0, "logps/rejected": -848.0, "loss": 0.2627, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.25, "rewards/margins": 2.703125, "rewards/rejected": -6.9375, "step": 3250 }, { "epoch": 0.24527876006320065, "grad_norm": 12.041595328496067, "learning_rate": 4.6856095033812927e-07, "logits/chosen": -2.796875, "logits/rejected": -2.46875, "logps/chosen": -520.0, "logps/rejected": -816.0, "loss": 0.2378, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.546875, "rewards/margins": 3.046875, "rewards/rejected": -6.59375, "step": 3260 }, { "epoch": 0.24603114889775035, "grad_norm": 8.162952877767816, "learning_rate": 4.682414101609374e-07, "logits/chosen": -2.734375, "logits/rejected": -2.578125, "logps/chosen": -580.0, "logps/rejected": -884.0, "loss": 0.2397, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.25, "rewards/margins": 3.015625, "rewards/rejected": -7.28125, "step": 3270 }, { "epoch": 0.24678353773230005, "grad_norm": 7.661645701725578, "learning_rate": 4.679203644112436e-07, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -592.0, "logps/rejected": -892.0, "loss": 0.2426, "rewards/accuracies": 0.90625, "rewards/chosen": -4.3125, "rewards/margins": 2.984375, "rewards/rejected": -7.3125, "step": 3280 }, { "epoch": 0.24753592656684975, "grad_norm": 9.39559412819907, "learning_rate": 4.6759781530383214e-07, "logits/chosen": -2.578125, "logits/rejected": -2.390625, "logps/chosen": -576.0, "logps/rejected": -868.0, "loss": 0.2572, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.25, "rewards/margins": 3.015625, "rewards/rejected": -7.25, "step": 3290 }, { "epoch": 0.24828831540139945, "grad_norm": 8.664281653841188, "learning_rate": 4.672737650638586e-07, "logits/chosen": -2.828125, "logits/rejected": -2.609375, "logps/chosen": -604.0, "logps/rejected": -856.0, "loss": 0.2428, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.375, "rewards/margins": 2.828125, "rewards/rejected": -7.1875, "step": 3300 }, { "epoch": 0.24904070423594915, "grad_norm": 9.341469048911085, "learning_rate": 4.669482159268341e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -588.0, "logps/rejected": -912.0, "loss": 0.2357, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.375, "rewards/margins": 3.3125, "rewards/rejected": -7.6875, "step": 3310 }, { "epoch": 0.24979309307049882, "grad_norm": 8.73158517562442, "learning_rate": 4.666211701386103e-07, "logits/chosen": -2.53125, "logits/rejected": -2.390625, "logps/chosen": -612.0, "logps/rejected": -872.0, "loss": 0.2537, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.53125, "rewards/margins": 2.859375, "rewards/rejected": -7.40625, "step": 3320 }, { "epoch": 0.25054548190504855, "grad_norm": 9.193942328899496, "learning_rate": 4.662926299553638e-07, "logits/chosen": -2.75, "logits/rejected": -2.53125, "logps/chosen": -624.0, "logps/rejected": -912.0, "loss": 0.2391, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.59375, "rewards/margins": 3.03125, "rewards/rejected": -7.625, "step": 3330 }, { "epoch": 0.25129787073959825, "grad_norm": 9.383374978565644, "learning_rate": 4.6596259764358037e-07, "logits/chosen": -2.71875, "logits/rejected": -2.46875, "logps/chosen": -624.0, "logps/rejected": -980.0, "loss": 0.2206, "rewards/accuracies": 0.90625, "rewards/chosen": -4.59375, "rewards/margins": 3.546875, "rewards/rejected": -8.125, "step": 3340 }, { "epoch": 0.2520502595741479, "grad_norm": 8.09815946238127, "learning_rate": 4.6563107548003967e-07, "logits/chosen": -2.890625, "logits/rejected": -2.578125, "logps/chosen": -600.0, "logps/rejected": -936.0, "loss": 0.2575, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.4375, "rewards/margins": 3.4375, "rewards/rejected": -7.875, "step": 3350 }, { "epoch": 0.2528026484086976, "grad_norm": 9.28524948295606, "learning_rate": 4.6529806575179895e-07, "logits/chosen": -2.828125, "logits/rejected": -2.46875, "logps/chosen": -560.0, "logps/rejected": -812.0, "loss": 0.2532, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.09375, "rewards/margins": 2.609375, "rewards/rejected": -6.71875, "step": 3360 }, { "epoch": 0.2535550372432473, "grad_norm": 8.372693128057215, "learning_rate": 4.6496357075617816e-07, "logits/chosen": -2.703125, "logits/rejected": -2.5, "logps/chosen": -624.0, "logps/rejected": -920.0, "loss": 0.2312, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.4375, "rewards/margins": 3.125, "rewards/rejected": -7.5625, "step": 3370 }, { "epoch": 0.254307426077797, "grad_norm": 8.973796350635121, "learning_rate": 4.646275928007431e-07, "logits/chosen": -2.734375, "logits/rejected": -2.59375, "logps/chosen": -576.0, "logps/rejected": -836.0, "loss": 0.2511, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -4.21875, "rewards/margins": 2.671875, "rewards/rejected": -6.875, "step": 3380 }, { "epoch": 0.2550598149123467, "grad_norm": 15.043288873372585, "learning_rate": 4.642901342032905e-07, "logits/chosen": -2.734375, "logits/rejected": -2.484375, "logps/chosen": -580.0, "logps/rejected": -868.0, "loss": 0.2644, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.25, "rewards/margins": 2.984375, "rewards/rejected": -7.21875, "step": 3390 }, { "epoch": 0.2558122037468964, "grad_norm": 8.60829166534939, "learning_rate": 4.639511972918311e-07, "logits/chosen": -2.796875, "logits/rejected": -2.46875, "logps/chosen": -624.0, "logps/rejected": -932.0, "loss": 0.2515, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.78125, "rewards/margins": 3.109375, "rewards/rejected": -7.875, "step": 3400 }, { "epoch": 0.2565645925814461, "grad_norm": 9.286255016844617, "learning_rate": 4.636107844045742e-07, "logits/chosen": -2.625, "logits/rejected": -2.40625, "logps/chosen": -612.0, "logps/rejected": -916.0, "loss": 0.2498, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.59375, "rewards/margins": 2.953125, "rewards/rejected": -7.53125, "step": 3410 }, { "epoch": 0.2573169814159958, "grad_norm": 7.683783123714642, "learning_rate": 4.632688978899114e-07, "logits/chosen": -2.796875, "logits/rejected": -2.484375, "logps/chosen": -588.0, "logps/rejected": -884.0, "loss": 0.2387, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.25, "rewards/margins": 2.9375, "rewards/rejected": -7.1875, "step": 3420 }, { "epoch": 0.2580693702505455, "grad_norm": 10.260751299912629, "learning_rate": 4.629255401064004e-07, "logits/chosen": -2.828125, "logits/rejected": -2.6875, "logps/chosen": -592.0, "logps/rejected": -888.0, "loss": 0.2323, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.34375, "rewards/margins": 3.09375, "rewards/rejected": -7.4375, "step": 3430 }, { "epoch": 0.2588217590850952, "grad_norm": 7.726491373550179, "learning_rate": 4.625807134227483e-07, "logits/chosen": -2.8125, "logits/rejected": -2.6875, "logps/chosen": -528.0, "logps/rejected": -804.0, "loss": 0.2666, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.6875, "rewards/margins": 2.90625, "rewards/rejected": -6.59375, "step": 3440 }, { "epoch": 0.2595741479196449, "grad_norm": 9.175160897527569, "learning_rate": 4.622344202177961e-07, "logits/chosen": -2.71875, "logits/rejected": -2.46875, "logps/chosen": -528.0, "logps/rejected": -816.0, "loss": 0.2426, "rewards/accuracies": 0.9375, "rewards/chosen": -3.59375, "rewards/margins": 2.8125, "rewards/rejected": -6.40625, "step": 3450 }, { "epoch": 0.2603265367541946, "grad_norm": 10.620774123065072, "learning_rate": 4.6188666288050163e-07, "logits/chosen": -2.65625, "logits/rejected": -2.421875, "logps/chosen": -588.0, "logps/rejected": -888.0, "loss": 0.2542, "rewards/accuracies": 0.9375, "rewards/chosen": -4.21875, "rewards/margins": 3.15625, "rewards/rejected": -7.375, "step": 3460 }, { "epoch": 0.26107892558874424, "grad_norm": 8.48922670158365, "learning_rate": 4.615374438099232e-07, "logits/chosen": -2.46875, "logits/rejected": -2.25, "logps/chosen": -592.0, "logps/rejected": -832.0, "loss": 0.2499, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.34375, "rewards/margins": 2.46875, "rewards/rejected": -6.8125, "step": 3470 }, { "epoch": 0.26183131442329394, "grad_norm": 9.380206108680339, "learning_rate": 4.611867654152033e-07, "logits/chosen": -2.578125, "logits/rejected": -2.359375, "logps/chosen": -620.0, "logps/rejected": -896.0, "loss": 0.2377, "rewards/accuracies": 0.875, "rewards/chosen": -4.625, "rewards/margins": 2.796875, "rewards/rejected": -7.4375, "step": 3480 }, { "epoch": 0.26258370325784364, "grad_norm": 8.630455909850124, "learning_rate": 4.608346301155516e-07, "logits/chosen": -2.71875, "logits/rejected": -2.453125, "logps/chosen": -548.0, "logps/rejected": -856.0, "loss": 0.2446, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.734375, "rewards/margins": 3.1875, "rewards/rejected": -6.9375, "step": 3490 }, { "epoch": 0.26333609209239334, "grad_norm": 8.010936942529533, "learning_rate": 4.604810403402285e-07, "logits/chosen": -2.65625, "logits/rejected": -2.4375, "logps/chosen": -596.0, "logps/rejected": -852.0, "loss": 0.2511, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.4375, "rewards/margins": 2.765625, "rewards/rejected": -7.1875, "step": 3500 }, { "epoch": 0.26408848092694304, "grad_norm": 8.594226664704044, "learning_rate": 4.601259985285284e-07, "logits/chosen": -2.625, "logits/rejected": -2.5, "logps/chosen": -640.0, "logps/rejected": -916.0, "loss": 0.2225, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.6875, "rewards/margins": 2.984375, "rewards/rejected": -7.65625, "step": 3510 }, { "epoch": 0.26484086976149274, "grad_norm": 7.3686969444976, "learning_rate": 4.5976950712976286e-07, "logits/chosen": -2.65625, "logits/rejected": -2.484375, "logps/chosen": -608.0, "logps/rejected": -916.0, "loss": 0.2473, "rewards/accuracies": 0.875, "rewards/chosen": -4.5, "rewards/margins": 3.109375, "rewards/rejected": -7.625, "step": 3520 }, { "epoch": 0.26559325859604244, "grad_norm": 9.006304280429053, "learning_rate": 4.5941156860324345e-07, "logits/chosen": -2.578125, "logits/rejected": -2.359375, "logps/chosen": -628.0, "logps/rejected": -952.0, "loss": 0.2532, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.5, "rewards/margins": 3.546875, "rewards/rejected": -8.0, "step": 3530 }, { "epoch": 0.26634564743059214, "grad_norm": 8.731170425293595, "learning_rate": 4.590521854182651e-07, "logits/chosen": -2.640625, "logits/rejected": -2.484375, "logps/chosen": -640.0, "logps/rejected": -944.0, "loss": 0.2361, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.5, "rewards/margins": 3.3125, "rewards/rejected": -7.8125, "step": 3540 }, { "epoch": 0.26709803626514184, "grad_norm": 10.35512022707896, "learning_rate": 4.5869136005408893e-07, "logits/chosen": -2.6875, "logits/rejected": -2.53125, "logps/chosen": -652.0, "logps/rejected": -944.0, "loss": 0.255, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5, "rewards/margins": 3.359375, "rewards/rejected": -7.875, "step": 3550 }, { "epoch": 0.26785042509969154, "grad_norm": 8.593676807068588, "learning_rate": 4.5832909499992514e-07, "logits/chosen": -2.75, "logits/rejected": -2.53125, "logps/chosen": -584.0, "logps/rejected": -876.0, "loss": 0.2537, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.28125, "rewards/margins": 3.0, "rewards/rejected": -7.3125, "step": 3560 }, { "epoch": 0.26860281393424124, "grad_norm": 7.229452609142653, "learning_rate": 4.579653927549159e-07, "logits/chosen": -2.765625, "logits/rejected": -2.578125, "logps/chosen": -648.0, "logps/rejected": -944.0, "loss": 0.2514, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.875, "rewards/margins": 3.171875, "rewards/rejected": -8.0625, "step": 3570 }, { "epoch": 0.26935520276879094, "grad_norm": 9.484733316900101, "learning_rate": 4.57600255828118e-07, "logits/chosen": -2.8125, "logits/rejected": -2.578125, "logps/chosen": -604.0, "logps/rejected": -940.0, "loss": 0.2488, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.46875, "rewards/margins": 3.28125, "rewards/rejected": -7.71875, "step": 3580 }, { "epoch": 0.2701075916033406, "grad_norm": 7.616886332053109, "learning_rate": 4.572336867384856e-07, "logits/chosen": -2.6875, "logits/rejected": -2.546875, "logps/chosen": -604.0, "logps/rejected": -872.0, "loss": 0.2532, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.3125, "rewards/margins": 2.90625, "rewards/rejected": -7.21875, "step": 3590 }, { "epoch": 0.2708599804378903, "grad_norm": 9.424387046076045, "learning_rate": 4.5686568801485306e-07, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -580.0, "logps/rejected": -864.0, "loss": 0.219, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.28125, "rewards/margins": 2.96875, "rewards/rejected": -7.25, "step": 3600 }, { "epoch": 0.27161236927244, "grad_norm": 12.547475129287186, "learning_rate": 4.5649626219591685e-07, "logits/chosen": -2.875, "logits/rejected": -2.578125, "logps/chosen": -620.0, "logps/rejected": -932.0, "loss": 0.2504, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.59375, "rewards/margins": 3.1875, "rewards/rejected": -7.78125, "step": 3610 }, { "epoch": 0.2723647581069897, "grad_norm": 10.17739824717397, "learning_rate": 4.5612541183021905e-07, "logits/chosen": -2.703125, "logits/rejected": -2.40625, "logps/chosen": -620.0, "logps/rejected": -972.0, "loss": 0.2157, "rewards/accuracies": 0.90625, "rewards/chosen": -4.6875, "rewards/margins": 3.578125, "rewards/rejected": -8.3125, "step": 3620 }, { "epoch": 0.2731171469415394, "grad_norm": 12.055309990415822, "learning_rate": 4.5575313947612875e-07, "logits/chosen": -2.609375, "logits/rejected": -2.328125, "logps/chosen": -628.0, "logps/rejected": -968.0, "loss": 0.2546, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.75, "rewards/margins": 3.390625, "rewards/rejected": -8.125, "step": 3630 }, { "epoch": 0.2738695357760891, "grad_norm": 9.381452162285955, "learning_rate": 4.553794477018251e-07, "logits/chosen": -2.5625, "logits/rejected": -2.34375, "logps/chosen": -648.0, "logps/rejected": -940.0, "loss": 0.2405, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 3.0625, "rewards/rejected": -7.84375, "step": 3640 }, { "epoch": 0.2746219246106388, "grad_norm": 9.717179832719665, "learning_rate": 4.550043390852791e-07, "logits/chosen": -2.65625, "logits/rejected": -2.4375, "logps/chosen": -608.0, "logps/rejected": -956.0, "loss": 0.2198, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.5, "rewards/margins": 3.46875, "rewards/rejected": -7.96875, "step": 3650 }, { "epoch": 0.2753743134451885, "grad_norm": 9.596874231784692, "learning_rate": 4.546278162142364e-07, "logits/chosen": -2.828125, "logits/rejected": -2.65625, "logps/chosen": -580.0, "logps/rejected": -888.0, "loss": 0.2173, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.21875, "rewards/margins": 3.34375, "rewards/rejected": -7.5625, "step": 3660 }, { "epoch": 0.2761267022797382, "grad_norm": 7.719511977487069, "learning_rate": 4.542498816861988e-07, "logits/chosen": -2.890625, "logits/rejected": -2.59375, "logps/chosen": -596.0, "logps/rejected": -900.0, "loss": 0.2265, "rewards/accuracies": 0.90625, "rewards/chosen": -4.375, "rewards/margins": 3.234375, "rewards/rejected": -7.625, "step": 3670 }, { "epoch": 0.2768790911142879, "grad_norm": 11.043176918667355, "learning_rate": 4.538705381084067e-07, "logits/chosen": -2.640625, "logits/rejected": -2.46875, "logps/chosen": -620.0, "logps/rejected": -900.0, "loss": 0.2592, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.6875, "rewards/margins": 3.0, "rewards/rejected": -7.6875, "step": 3680 }, { "epoch": 0.2776314799488376, "grad_norm": 11.332378533299988, "learning_rate": 4.5348978809782123e-07, "logits/chosen": -2.609375, "logits/rejected": -2.375, "logps/chosen": -608.0, "logps/rejected": -864.0, "loss": 0.2477, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.40625, "rewards/margins": 2.75, "rewards/rejected": -7.15625, "step": 3690 }, { "epoch": 0.2783838687833873, "grad_norm": 9.797098424450917, "learning_rate": 4.531076342811059e-07, "logits/chosen": -2.71875, "logits/rejected": -2.46875, "logps/chosen": -580.0, "logps/rejected": -880.0, "loss": 0.2161, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.0625, "rewards/margins": 3.140625, "rewards/rejected": -7.1875, "step": 3700 }, { "epoch": 0.2791362576179369, "grad_norm": 12.677047667665043, "learning_rate": 4.5272407929460846e-07, "logits/chosen": -2.671875, "logits/rejected": -2.53125, "logps/chosen": -624.0, "logps/rejected": -896.0, "loss": 0.2478, "rewards/accuracies": 0.875, "rewards/chosen": -4.46875, "rewards/margins": 3.09375, "rewards/rejected": -7.5625, "step": 3710 }, { "epoch": 0.2798886464524866, "grad_norm": 7.7470550829514995, "learning_rate": 4.5233912578434297e-07, "logits/chosen": -2.625, "logits/rejected": -2.390625, "logps/chosen": -664.0, "logps/rejected": -976.0, "loss": 0.2311, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.9375, "rewards/margins": 3.0625, "rewards/rejected": -8.0, "step": 3720 }, { "epoch": 0.2806410352870363, "grad_norm": 9.903990733335885, "learning_rate": 4.5195277640597165e-07, "logits/chosen": -2.59375, "logits/rejected": -2.40625, "logps/chosen": -612.0, "logps/rejected": -908.0, "loss": 0.2546, "rewards/accuracies": 0.84375, "rewards/chosen": -4.4375, "rewards/margins": 3.046875, "rewards/rejected": -7.46875, "step": 3730 }, { "epoch": 0.281393424121586, "grad_norm": 14.923608246368913, "learning_rate": 4.5156503382478583e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -608.0, "logps/rejected": -876.0, "loss": 0.2426, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.3125, "rewards/margins": 3.0, "rewards/rejected": -7.3125, "step": 3740 }, { "epoch": 0.2821458129561357, "grad_norm": 10.290171086499644, "learning_rate": 4.511759007156886e-07, "logits/chosen": -2.765625, "logits/rejected": -2.5, "logps/chosen": -644.0, "logps/rejected": -940.0, "loss": 0.2252, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.84375, "rewards/margins": 3.0625, "rewards/rejected": -7.90625, "step": 3750 }, { "epoch": 0.2828982017906854, "grad_norm": 9.12773732569798, "learning_rate": 4.507853797631753e-07, "logits/chosen": -2.6875, "logits/rejected": -2.390625, "logps/chosen": -628.0, "logps/rejected": -908.0, "loss": 0.2214, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.625, "rewards/margins": 2.90625, "rewards/rejected": -7.53125, "step": 3760 }, { "epoch": 0.2836505906252351, "grad_norm": 8.54967309370874, "learning_rate": 4.503934736613161e-07, "logits/chosen": -2.640625, "logits/rejected": -2.484375, "logps/chosen": -632.0, "logps/rejected": -936.0, "loss": 0.2154, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.59375, "rewards/margins": 3.296875, "rewards/rejected": -7.90625, "step": 3770 }, { "epoch": 0.2844029794597848, "grad_norm": 8.658423249282029, "learning_rate": 4.500001851137363e-07, "logits/chosen": -2.59375, "logits/rejected": -2.3125, "logps/chosen": -612.0, "logps/rejected": -952.0, "loss": 0.2313, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.625, "rewards/margins": 3.546875, "rewards/rejected": -8.1875, "step": 3780 }, { "epoch": 0.2851553682943345, "grad_norm": 9.32583793890318, "learning_rate": 4.496055168335986e-07, "logits/chosen": -2.5, "logits/rejected": -2.265625, "logps/chosen": -600.0, "logps/rejected": -908.0, "loss": 0.2481, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.4375, "rewards/margins": 2.875, "rewards/rejected": -7.34375, "step": 3790 }, { "epoch": 0.2859077571288842, "grad_norm": 12.386535228494425, "learning_rate": 4.4920947154358384e-07, "logits/chosen": -2.578125, "logits/rejected": -2.34375, "logps/chosen": -636.0, "logps/rejected": -964.0, "loss": 0.2265, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.59375, "rewards/margins": 3.34375, "rewards/rejected": -7.9375, "step": 3800 }, { "epoch": 0.2866601459634339, "grad_norm": 7.1901796846029296, "learning_rate": 4.4881205197587236e-07, "logits/chosen": -2.609375, "logits/rejected": -2.3125, "logps/chosen": -592.0, "logps/rejected": -884.0, "loss": 0.2304, "rewards/accuracies": 0.90625, "rewards/chosen": -4.21875, "rewards/margins": 3.1875, "rewards/rejected": -7.375, "step": 3810 }, { "epoch": 0.2874125347979836, "grad_norm": 8.02806199551109, "learning_rate": 4.484132608721252e-07, "logits/chosen": -2.5625, "logits/rejected": -2.390625, "logps/chosen": -516.0, "logps/rejected": -844.0, "loss": 0.2235, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.71875, "rewards/margins": 3.1875, "rewards/rejected": -6.90625, "step": 3820 }, { "epoch": 0.2881649236325333, "grad_norm": 10.603106664851868, "learning_rate": 4.480131009834651e-07, "logits/chosen": -2.671875, "logits/rejected": -2.390625, "logps/chosen": -564.0, "logps/rejected": -888.0, "loss": 0.2457, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.09375, "rewards/margins": 3.3125, "rewards/rejected": -7.40625, "step": 3830 }, { "epoch": 0.28891731246708297, "grad_norm": 6.45000355010823, "learning_rate": 4.476115750704578e-07, "logits/chosen": -2.65625, "logits/rejected": -2.46875, "logps/chosen": -564.0, "logps/rejected": -860.0, "loss": 0.2465, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.953125, "rewards/margins": 3.234375, "rewards/rejected": -7.1875, "step": 3840 }, { "epoch": 0.28966970130163266, "grad_norm": 8.929005375270732, "learning_rate": 4.472086859030926e-07, "logits/chosen": -2.65625, "logits/rejected": -2.546875, "logps/chosen": -544.0, "logps/rejected": -876.0, "loss": 0.2149, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.0, "rewards/margins": 3.203125, "rewards/rejected": -7.21875, "step": 3850 }, { "epoch": 0.29042209013618236, "grad_norm": 8.024874951607805, "learning_rate": 4.468044362607633e-07, "logits/chosen": -2.765625, "logits/rejected": -2.40625, "logps/chosen": -588.0, "logps/rejected": -944.0, "loss": 0.2153, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.21875, "rewards/margins": 3.484375, "rewards/rejected": -7.71875, "step": 3860 }, { "epoch": 0.29117447897073206, "grad_norm": 9.5901959856907, "learning_rate": 4.4639882893224924e-07, "logits/chosen": -2.65625, "logits/rejected": -2.375, "logps/chosen": -608.0, "logps/rejected": -900.0, "loss": 0.2361, "rewards/accuracies": 0.90625, "rewards/chosen": -4.375, "rewards/margins": 3.1875, "rewards/rejected": -7.5625, "step": 3870 }, { "epoch": 0.29192686780528176, "grad_norm": 9.101424498045278, "learning_rate": 4.4599186671569623e-07, "logits/chosen": -2.546875, "logits/rejected": -2.359375, "logps/chosen": -620.0, "logps/rejected": -888.0, "loss": 0.2144, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.5625, "rewards/margins": 2.765625, "rewards/rejected": -7.3125, "step": 3880 }, { "epoch": 0.29267925663983146, "grad_norm": 8.652186114303964, "learning_rate": 4.4558355241859655e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5625, "logps/chosen": -580.0, "logps/rejected": -904.0, "loss": 0.2362, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.375, "rewards/margins": 3.296875, "rewards/rejected": -7.65625, "step": 3890 }, { "epoch": 0.29343164547438116, "grad_norm": 11.241719704363485, "learning_rate": 4.451738888577705e-07, "logits/chosen": -2.734375, "logits/rejected": -2.578125, "logps/chosen": -596.0, "logps/rejected": -892.0, "loss": 0.2483, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.34375, "rewards/margins": 3.046875, "rewards/rejected": -7.40625, "step": 3900 }, { "epoch": 0.29418403430893086, "grad_norm": 7.151893594323524, "learning_rate": 4.44762878859346e-07, "logits/chosen": -2.6875, "logits/rejected": -2.625, "logps/chosen": -636.0, "logps/rejected": -860.0, "loss": 0.2198, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.625, "rewards/margins": 2.484375, "rewards/rejected": -7.125, "step": 3910 }, { "epoch": 0.29493642314348056, "grad_norm": 6.694485106549714, "learning_rate": 4.443505252587399e-07, "logits/chosen": -2.84375, "logits/rejected": -2.71875, "logps/chosen": -676.0, "logps/rejected": -1024.0, "loss": 0.2236, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.125, "rewards/margins": 3.71875, "rewards/rejected": -8.8125, "step": 3920 }, { "epoch": 0.29568881197803026, "grad_norm": 9.938005153509135, "learning_rate": 4.43936830900638e-07, "logits/chosen": -2.71875, "logits/rejected": -2.53125, "logps/chosen": -564.0, "logps/rejected": -936.0, "loss": 0.2042, "rewards/accuracies": 0.90625, "rewards/chosen": -4.3125, "rewards/margins": 3.609375, "rewards/rejected": -7.90625, "step": 3930 }, { "epoch": 0.29644120081257996, "grad_norm": 16.682092053114292, "learning_rate": 4.4352179863897556e-07, "logits/chosen": -2.765625, "logits/rejected": -2.65625, "logps/chosen": -672.0, "logps/rejected": -972.0, "loss": 0.241, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.125, "rewards/margins": 3.171875, "rewards/rejected": -8.3125, "step": 3940 }, { "epoch": 0.29719358964712966, "grad_norm": 7.512938135048488, "learning_rate": 4.4310543133691746e-07, "logits/chosen": -2.5625, "logits/rejected": -2.484375, "logps/chosen": -684.0, "logps/rejected": -992.0, "loss": 0.2316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.96875, "rewards/margins": 3.3125, "rewards/rejected": -8.3125, "step": 3950 }, { "epoch": 0.2979459784816793, "grad_norm": 12.509486795634867, "learning_rate": 4.426877318668387e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -584.0, "logps/rejected": -904.0, "loss": 0.2155, "rewards/accuracies": 0.90625, "rewards/chosen": -4.34375, "rewards/margins": 3.265625, "rewards/rejected": -7.59375, "step": 3960 }, { "epoch": 0.298698367316229, "grad_norm": 7.987377416432433, "learning_rate": 4.422687031103045e-07, "logits/chosen": -2.578125, "logits/rejected": -2.390625, "logps/chosen": -636.0, "logps/rejected": -944.0, "loss": 0.262, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.5625, "rewards/margins": 3.4375, "rewards/rejected": -8.0, "step": 3970 }, { "epoch": 0.2994507561507787, "grad_norm": 8.497924735104682, "learning_rate": 4.4184834795805016e-07, "logits/chosen": -2.609375, "logits/rejected": -2.34375, "logps/chosen": -552.0, "logps/rejected": -892.0, "loss": 0.2233, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.0625, "rewards/margins": 3.421875, "rewards/rejected": -7.5, "step": 3980 }, { "epoch": 0.3002031449853284, "grad_norm": 12.602114991629081, "learning_rate": 4.414266693099615e-07, "logits/chosen": -2.71875, "logits/rejected": -2.453125, "logps/chosen": -624.0, "logps/rejected": -916.0, "loss": 0.2254, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.40625, "rewards/margins": 3.171875, "rewards/rejected": -7.59375, "step": 3990 }, { "epoch": 0.3009555338198781, "grad_norm": 8.14636861622685, "learning_rate": 4.4100367007505455e-07, "logits/chosen": -2.921875, "logits/rejected": -2.71875, "logps/chosen": -600.0, "logps/rejected": -944.0, "loss": 0.2228, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.4375, "rewards/margins": 3.4375, "rewards/rejected": -7.875, "step": 4000 }, { "epoch": 0.3017079226544278, "grad_norm": 8.71715153360338, "learning_rate": 4.405793531714558e-07, "logits/chosen": -2.71875, "logits/rejected": -2.59375, "logps/chosen": -584.0, "logps/rejected": -920.0, "loss": 0.233, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.25, "rewards/margins": 3.453125, "rewards/rejected": -7.71875, "step": 4010 }, { "epoch": 0.3024603114889775, "grad_norm": 7.675931534560127, "learning_rate": 4.401537215263816e-07, "logits/chosen": -2.625, "logits/rejected": -2.4375, "logps/chosen": -672.0, "logps/rejected": -960.0, "loss": 0.2029, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.96875, "rewards/margins": 3.078125, "rewards/rejected": -8.0625, "step": 4020 }, { "epoch": 0.3032127003235272, "grad_norm": 10.17405003846414, "learning_rate": 4.3972677807611866e-07, "logits/chosen": -2.65625, "logits/rejected": -2.46875, "logps/chosen": -624.0, "logps/rejected": -996.0, "loss": 0.2365, "rewards/accuracies": 0.90625, "rewards/chosen": -4.5625, "rewards/margins": 3.59375, "rewards/rejected": -8.1875, "step": 4030 }, { "epoch": 0.3039650891580769, "grad_norm": 7.09818779533062, "learning_rate": 4.39298525766003e-07, "logits/chosen": -2.75, "logits/rejected": -2.46875, "logps/chosen": -596.0, "logps/rejected": -908.0, "loss": 0.211, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.28125, "rewards/margins": 3.359375, "rewards/rejected": -7.65625, "step": 4040 }, { "epoch": 0.3047174779926266, "grad_norm": 9.918785165797798, "learning_rate": 4.388689675504002e-07, "logits/chosen": -2.65625, "logits/rejected": -2.421875, "logps/chosen": -656.0, "logps/rejected": -972.0, "loss": 0.2303, "rewards/accuracies": 0.90625, "rewards/chosen": -4.71875, "rewards/margins": 3.328125, "rewards/rejected": -8.0625, "step": 4050 }, { "epoch": 0.3054698668271763, "grad_norm": 8.835187248857086, "learning_rate": 4.3843810639268475e-07, "logits/chosen": -2.75, "logits/rejected": -2.5, "logps/chosen": -596.0, "logps/rejected": -908.0, "loss": 0.2353, "rewards/accuracies": 0.90625, "rewards/chosen": -4.25, "rewards/margins": 3.421875, "rewards/rejected": -7.65625, "step": 4060 }, { "epoch": 0.306222255661726, "grad_norm": 8.367596510251957, "learning_rate": 4.3800594526521983e-07, "logits/chosen": -2.640625, "logits/rejected": -2.40625, "logps/chosen": -532.0, "logps/rejected": -840.0, "loss": 0.2074, "rewards/accuracies": 0.90625, "rewards/chosen": -3.859375, "rewards/margins": 3.109375, "rewards/rejected": -6.96875, "step": 4070 }, { "epoch": 0.30697464449627565, "grad_norm": 9.414826271664367, "learning_rate": 4.375724871493365e-07, "logits/chosen": -2.75, "logits/rejected": -2.453125, "logps/chosen": -556.0, "logps/rejected": -860.0, "loss": 0.2373, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.953125, "rewards/margins": 3.21875, "rewards/rejected": -7.15625, "step": 4080 }, { "epoch": 0.30772703333082535, "grad_norm": 8.162028844684041, "learning_rate": 4.371377350353134e-07, "logits/chosen": -2.65625, "logits/rejected": -2.40625, "logps/chosen": -536.0, "logps/rejected": -848.0, "loss": 0.229, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.828125, "rewards/margins": 3.21875, "rewards/rejected": -7.0625, "step": 4090 }, { "epoch": 0.30847942216537505, "grad_norm": 7.801467771167628, "learning_rate": 4.3670169192235586e-07, "logits/chosen": -2.703125, "logits/rejected": -2.546875, "logps/chosen": -540.0, "logps/rejected": -824.0, "loss": 0.2394, "rewards/accuracies": 0.875, "rewards/chosen": -3.859375, "rewards/margins": 2.9375, "rewards/rejected": -6.8125, "step": 4100 }, { "epoch": 0.30923181099992475, "grad_norm": 10.277704427965585, "learning_rate": 4.362643608185757e-07, "logits/chosen": -2.765625, "logits/rejected": -2.46875, "logps/chosen": -628.0, "logps/rejected": -932.0, "loss": 0.2344, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.46875, "rewards/margins": 3.140625, "rewards/rejected": -7.625, "step": 4110 }, { "epoch": 0.30998419983447445, "grad_norm": 13.33467825685964, "learning_rate": 4.358257447409698e-07, "logits/chosen": -2.6875, "logits/rejected": -2.4375, "logps/chosen": -588.0, "logps/rejected": -928.0, "loss": 0.2165, "rewards/accuracies": 0.90625, "rewards/chosen": -4.3125, "rewards/margins": 3.40625, "rewards/rejected": -7.71875, "step": 4120 }, { "epoch": 0.31073658866902415, "grad_norm": 8.16160167838467, "learning_rate": 4.353858467153998e-07, "logits/chosen": -2.671875, "logits/rejected": -2.515625, "logps/chosen": -572.0, "logps/rejected": -916.0, "loss": 0.2297, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.15625, "rewards/margins": 3.5625, "rewards/rejected": -7.71875, "step": 4130 }, { "epoch": 0.31148897750357385, "grad_norm": 7.879945152353835, "learning_rate": 4.349446697765711e-07, "logits/chosen": -2.71875, "logits/rejected": -2.53125, "logps/chosen": -588.0, "logps/rejected": -924.0, "loss": 0.227, "rewards/accuracies": 0.90625, "rewards/chosen": -4.25, "rewards/margins": 3.5625, "rewards/rejected": -7.8125, "step": 4140 }, { "epoch": 0.31224136633812355, "grad_norm": 8.616338325133528, "learning_rate": 4.345022169680117e-07, "logits/chosen": -2.65625, "logits/rejected": -2.375, "logps/chosen": -564.0, "logps/rejected": -900.0, "loss": 0.2254, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.1875, "rewards/margins": 3.375, "rewards/rejected": -7.53125, "step": 4150 }, { "epoch": 0.31299375517267325, "grad_norm": 9.359562218439928, "learning_rate": 4.3405849134205164e-07, "logits/chosen": -2.65625, "logits/rejected": -2.421875, "logps/chosen": -608.0, "logps/rejected": -904.0, "loss": 0.2257, "rewards/accuracies": 0.875, "rewards/chosen": -4.5, "rewards/margins": 2.9375, "rewards/rejected": -7.4375, "step": 4160 }, { "epoch": 0.31374614400722295, "grad_norm": 8.969523285372976, "learning_rate": 4.3361349595980136e-07, "logits/chosen": -2.65625, "logits/rejected": -2.375, "logps/chosen": -696.0, "logps/rejected": -1072.0, "loss": 0.2182, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.25, "rewards/margins": 3.78125, "rewards/rejected": -9.0625, "step": 4170 }, { "epoch": 0.31449853284177265, "grad_norm": 10.138740882360356, "learning_rate": 4.331672338911313e-07, "logits/chosen": -2.5625, "logits/rejected": -2.28125, "logps/chosen": -628.0, "logps/rejected": -936.0, "loss": 0.2201, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.6875, "rewards/margins": 3.171875, "rewards/rejected": -7.875, "step": 4180 }, { "epoch": 0.31525092167632235, "grad_norm": 9.223656893908142, "learning_rate": 4.3271970821465e-07, "logits/chosen": -2.625, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -988.0, "loss": 0.2257, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0, "rewards/margins": 3.1875, "rewards/rejected": -8.1875, "step": 4190 }, { "epoch": 0.316003310510872, "grad_norm": 11.12549439598949, "learning_rate": 4.3227092201768346e-07, "logits/chosen": -2.734375, "logits/rejected": -2.484375, "logps/chosen": -644.0, "logps/rejected": -940.0, "loss": 0.2265, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.90625, "rewards/margins": 3.078125, "rewards/rejected": -8.0, "step": 4200 }, { "epoch": 0.3167556993454217, "grad_norm": 9.207865303317924, "learning_rate": 4.3182087839625326e-07, "logits/chosen": -2.65625, "logits/rejected": -2.484375, "logps/chosen": -584.0, "logps/rejected": -896.0, "loss": 0.2244, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.25, "rewards/margins": 3.140625, "rewards/rejected": -7.40625, "step": 4210 }, { "epoch": 0.3175080881799714, "grad_norm": 8.426670829083685, "learning_rate": 4.313695804550559e-07, "logits/chosen": -2.71875, "logits/rejected": -2.484375, "logps/chosen": -628.0, "logps/rejected": -948.0, "loss": 0.2203, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.6875, "rewards/margins": 3.078125, "rewards/rejected": -7.75, "step": 4220 }, { "epoch": 0.3182604770145211, "grad_norm": 9.764940312491921, "learning_rate": 4.309170313074408e-07, "logits/chosen": -2.71875, "logits/rejected": -2.40625, "logps/chosen": -640.0, "logps/rejected": -964.0, "loss": 0.2608, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.8125, "rewards/margins": 3.046875, "rewards/rejected": -7.875, "step": 4230 }, { "epoch": 0.3190128658490708, "grad_norm": 7.952069517630966, "learning_rate": 4.304632340753889e-07, "logits/chosen": -2.625, "logits/rejected": -2.328125, "logps/chosen": -536.0, "logps/rejected": -848.0, "loss": 0.2279, "rewards/accuracies": 0.90625, "rewards/chosen": -3.796875, "rewards/margins": 3.1875, "rewards/rejected": -7.0, "step": 4240 }, { "epoch": 0.3197652546836205, "grad_norm": 8.83343001948812, "learning_rate": 4.3000819188949145e-07, "logits/chosen": -2.59375, "logits/rejected": -2.390625, "logps/chosen": -608.0, "logps/rejected": -876.0, "loss": 0.2347, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5625, "rewards/margins": 2.84375, "rewards/rejected": -7.40625, "step": 4250 }, { "epoch": 0.3205176435181702, "grad_norm": 8.597957828748827, "learning_rate": 4.2955190788892827e-07, "logits/chosen": -2.609375, "logits/rejected": -2.375, "logps/chosen": -644.0, "logps/rejected": -932.0, "loss": 0.2496, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.8125, "rewards/margins": 3.03125, "rewards/rejected": -7.84375, "step": 4260 }, { "epoch": 0.3212700323527199, "grad_norm": 7.6696314669829775, "learning_rate": 4.2909438522144566e-07, "logits/chosen": -2.5, "logits/rejected": -2.34375, "logps/chosen": -584.0, "logps/rejected": -944.0, "loss": 0.2319, "rewards/accuracies": 0.90625, "rewards/chosen": -4.28125, "rewards/margins": 3.546875, "rewards/rejected": -7.84375, "step": 4270 }, { "epoch": 0.3220224211872696, "grad_norm": 10.007443872180758, "learning_rate": 4.2863562704333545e-07, "logits/chosen": -2.65625, "logits/rejected": -2.34375, "logps/chosen": -588.0, "logps/rejected": -892.0, "loss": 0.2052, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.25, "rewards/margins": 3.15625, "rewards/rejected": -7.40625, "step": 4280 }, { "epoch": 0.3227748100218193, "grad_norm": 11.118831248083465, "learning_rate": 4.2817563651941263e-07, "logits/chosen": -2.625, "logits/rejected": -2.40625, "logps/chosen": -628.0, "logps/rejected": -932.0, "loss": 0.2277, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.71875, "rewards/margins": 3.15625, "rewards/rejected": -7.875, "step": 4290 }, { "epoch": 0.323527198856369, "grad_norm": 10.345024663416677, "learning_rate": 4.277144168229939e-07, "logits/chosen": -2.609375, "logits/rejected": -2.28125, "logps/chosen": -628.0, "logps/rejected": -940.0, "loss": 0.2453, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.71875, "rewards/margins": 3.0625, "rewards/rejected": -7.78125, "step": 4300 }, { "epoch": 0.3242795876909187, "grad_norm": 7.051363685111973, "learning_rate": 4.272519711358753e-07, "logits/chosen": -2.6875, "logits/rejected": -2.375, "logps/chosen": -584.0, "logps/rejected": -896.0, "loss": 0.2047, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.25, "rewards/margins": 3.234375, "rewards/rejected": -7.5, "step": 4310 }, { "epoch": 0.32503197652546834, "grad_norm": 9.476688840579149, "learning_rate": 4.2678830264831077e-07, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -632.0, "logps/rejected": -952.0, "loss": 0.2251, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.59375, "rewards/margins": 3.28125, "rewards/rejected": -7.875, "step": 4320 }, { "epoch": 0.32578436536001804, "grad_norm": 7.95698757416428, "learning_rate": 4.2632341455899e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5625, "logps/chosen": -628.0, "logps/rejected": -932.0, "loss": 0.2135, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.78125, "rewards/margins": 3.109375, "rewards/rejected": -7.875, "step": 4330 }, { "epoch": 0.32653675419456774, "grad_norm": 6.006840590992643, "learning_rate": 4.25857310075016e-07, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -596.0, "logps/rejected": -904.0, "loss": 0.2293, "rewards/accuracies": 0.90625, "rewards/chosen": -4.34375, "rewards/margins": 3.1875, "rewards/rejected": -7.53125, "step": 4340 }, { "epoch": 0.32728914302911744, "grad_norm": 10.148748285674634, "learning_rate": 4.2538999241188374e-07, "logits/chosen": -2.671875, "logits/rejected": -2.359375, "logps/chosen": -628.0, "logps/rejected": -932.0, "loss": 0.1906, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.71875, "rewards/margins": 3.328125, "rewards/rejected": -8.0, "step": 4350 }, { "epoch": 0.32804153186366714, "grad_norm": 11.129120168982654, "learning_rate": 4.2492146479345693e-07, "logits/chosen": -2.640625, "logits/rejected": -2.453125, "logps/chosen": -684.0, "logps/rejected": -1032.0, "loss": 0.22, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.28125, "rewards/margins": 3.5, "rewards/rejected": -8.8125, "step": 4360 }, { "epoch": 0.32879392069821683, "grad_norm": 9.910453441221708, "learning_rate": 4.244517304519467e-07, "logits/chosen": -2.671875, "logits/rejected": -2.3125, "logps/chosen": -644.0, "logps/rejected": -1008.0, "loss": 0.2234, "rewards/accuracies": 0.90625, "rewards/chosen": -4.96875, "rewards/margins": 3.46875, "rewards/rejected": -8.4375, "step": 4370 }, { "epoch": 0.32954630953276653, "grad_norm": 11.052178846558816, "learning_rate": 4.2398079262788893e-07, "logits/chosen": -2.671875, "logits/rejected": -2.46875, "logps/chosen": -648.0, "logps/rejected": -956.0, "loss": 0.222, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.9375, "rewards/margins": 3.125, "rewards/rejected": -8.0625, "step": 4380 }, { "epoch": 0.33029869836731623, "grad_norm": 9.642152534654413, "learning_rate": 4.2350865457012184e-07, "logits/chosen": -2.765625, "logits/rejected": -2.453125, "logps/chosen": -648.0, "logps/rejected": -980.0, "loss": 0.2332, "rewards/accuracies": 0.90625, "rewards/chosen": -4.875, "rewards/margins": 3.453125, "rewards/rejected": -8.3125, "step": 4390 }, { "epoch": 0.33105108720186593, "grad_norm": 9.60330425948754, "learning_rate": 4.2303531953576366e-07, "logits/chosen": -2.921875, "logits/rejected": -2.59375, "logps/chosen": -624.0, "logps/rejected": -960.0, "loss": 0.2225, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.59375, "rewards/margins": 3.421875, "rewards/rejected": -8.0, "step": 4400 }, { "epoch": 0.33180347603641563, "grad_norm": 7.6600645560452785, "learning_rate": 4.2256079079019015e-07, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -564.0, "logps/rejected": -880.0, "loss": 0.2416, "rewards/accuracies": 0.90625, "rewards/chosen": -4.15625, "rewards/margins": 3.171875, "rewards/rejected": -7.34375, "step": 4410 }, { "epoch": 0.33255586487096533, "grad_norm": 7.850742770509769, "learning_rate": 4.220850716070121e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -620.0, "logps/rejected": -912.0, "loss": 0.2238, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.5625, "rewards/margins": 3.171875, "rewards/rejected": -7.75, "step": 4420 }, { "epoch": 0.33330825370551503, "grad_norm": 7.909302118744176, "learning_rate": 4.2160816526805267e-07, "logits/chosen": -2.703125, "logits/rejected": -2.4375, "logps/chosen": -664.0, "logps/rejected": -1016.0, "loss": 0.2501, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.09375, "rewards/margins": 3.515625, "rewards/rejected": -8.625, "step": 4430 }, { "epoch": 0.3340606425400647, "grad_norm": 8.94738188436282, "learning_rate": 4.211300750633248e-07, "logits/chosen": -2.53125, "logits/rejected": -2.375, "logps/chosen": -672.0, "logps/rejected": -988.0, "loss": 0.2314, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.125, "rewards/margins": 3.1875, "rewards/rejected": -8.3125, "step": 4440 }, { "epoch": 0.3348130313746144, "grad_norm": 8.676073044947644, "learning_rate": 4.2065080429100865e-07, "logits/chosen": -2.65625, "logits/rejected": -2.359375, "logps/chosen": -628.0, "logps/rejected": -924.0, "loss": 0.2225, "rewards/accuracies": 0.90625, "rewards/chosen": -4.625, "rewards/margins": 3.0, "rewards/rejected": -7.625, "step": 4450 }, { "epoch": 0.3355654202091641, "grad_norm": 9.806473078557882, "learning_rate": 4.2017035625742846e-07, "logits/chosen": -2.671875, "logits/rejected": -2.375, "logps/chosen": -632.0, "logps/rejected": -964.0, "loss": 0.2098, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.78125, "rewards/margins": 3.265625, "rewards/rejected": -8.0625, "step": 4460 }, { "epoch": 0.3363178090437138, "grad_norm": 7.522444333584003, "learning_rate": 4.196887342770302e-07, "logits/chosen": -2.78125, "logits/rejected": -2.4375, "logps/chosen": -668.0, "logps/rejected": -1012.0, "loss": 0.2095, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.9375, "rewards/margins": 3.703125, "rewards/rejected": -8.625, "step": 4470 }, { "epoch": 0.3370701978782635, "grad_norm": 8.728501659564188, "learning_rate": 4.1920594167235845e-07, "logits/chosen": -2.703125, "logits/rejected": -2.484375, "logps/chosen": -648.0, "logps/rejected": -960.0, "loss": 0.234, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.78125, "rewards/margins": 3.34375, "rewards/rejected": -8.125, "step": 4480 }, { "epoch": 0.3378225867128132, "grad_norm": 9.465075398652212, "learning_rate": 4.187219817740336e-07, "logits/chosen": -2.703125, "logits/rejected": -2.390625, "logps/chosen": -612.0, "logps/rejected": -936.0, "loss": 0.2285, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.46875, "rewards/margins": 3.15625, "rewards/rejected": -7.625, "step": 4490 }, { "epoch": 0.3385749755473629, "grad_norm": 7.109319264397478, "learning_rate": 4.182368579207285e-07, "logits/chosen": -2.5625, "logits/rejected": -2.3125, "logps/chosen": -616.0, "logps/rejected": -936.0, "loss": 0.2165, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.4375, "rewards/margins": 3.34375, "rewards/rejected": -7.78125, "step": 4500 }, { "epoch": 0.3393273643819126, "grad_norm": 10.507873292947329, "learning_rate": 4.177505734591461e-07, "logits/chosen": -2.703125, "logits/rejected": -2.46875, "logps/chosen": -608.0, "logps/rejected": -932.0, "loss": 0.2096, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.40625, "rewards/margins": 3.34375, "rewards/rejected": -7.75, "step": 4510 }, { "epoch": 0.3400797532164623, "grad_norm": 7.3959641126703435, "learning_rate": 4.172631317439956e-07, "logits/chosen": -2.703125, "logits/rejected": -2.421875, "logps/chosen": -616.0, "logps/rejected": -992.0, "loss": 0.2317, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.46875, "rewards/margins": 3.6875, "rewards/rejected": -8.125, "step": 4520 }, { "epoch": 0.340832142051012, "grad_norm": 8.87302820276429, "learning_rate": 4.167745361379702e-07, "logits/chosen": -2.59375, "logits/rejected": -2.359375, "logps/chosen": -600.0, "logps/rejected": -928.0, "loss": 0.2207, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.40625, "rewards/margins": 3.40625, "rewards/rejected": -7.8125, "step": 4530 }, { "epoch": 0.3415845308855617, "grad_norm": 9.758571350415783, "learning_rate": 4.162847900117229e-07, "logits/chosen": -2.5625, "logits/rejected": -2.203125, "logps/chosen": -564.0, "logps/rejected": -896.0, "loss": 0.2177, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.03125, "rewards/margins": 3.203125, "rewards/rejected": -7.25, "step": 4540 }, { "epoch": 0.3423369197201114, "grad_norm": 8.896439051201618, "learning_rate": 4.1579389674384394e-07, "logits/chosen": -2.546875, "logits/rejected": -2.234375, "logps/chosen": -632.0, "logps/rejected": -908.0, "loss": 0.2145, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.71875, "rewards/margins": 2.96875, "rewards/rejected": -7.6875, "step": 4550 }, { "epoch": 0.3430893085546611, "grad_norm": 7.711661820543145, "learning_rate": 4.153018597208374e-07, "logits/chosen": -2.640625, "logits/rejected": -2.3125, "logps/chosen": -628.0, "logps/rejected": -980.0, "loss": 0.2203, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.75, "rewards/margins": 3.578125, "rewards/rejected": -8.3125, "step": 4560 }, { "epoch": 0.3438416973892107, "grad_norm": 8.21240547755215, "learning_rate": 4.1480868233709765e-07, "logits/chosen": -2.609375, "logits/rejected": -2.359375, "logps/chosen": -628.0, "logps/rejected": -984.0, "loss": 0.219, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.4375, "rewards/margins": 3.828125, "rewards/rejected": -8.25, "step": 4570 }, { "epoch": 0.3445940862237604, "grad_norm": 11.90114190092608, "learning_rate": 4.1431436799488606e-07, "logits/chosen": -2.65625, "logits/rejected": -2.359375, "logps/chosen": -644.0, "logps/rejected": -948.0, "loss": 0.2127, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.78125, "rewards/margins": 3.171875, "rewards/rejected": -7.96875, "step": 4580 }, { "epoch": 0.3453464750583101, "grad_norm": 6.8696936160082265, "learning_rate": 4.1381892010430717e-07, "logits/chosen": -2.65625, "logits/rejected": -2.296875, "logps/chosen": -656.0, "logps/rejected": -960.0, "loss": 0.2498, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.0, "rewards/margins": 3.078125, "rewards/rejected": -8.0625, "step": 4590 }, { "epoch": 0.3460988638928598, "grad_norm": 10.171026913882173, "learning_rate": 4.13322342083286e-07, "logits/chosen": -2.53125, "logits/rejected": -2.296875, "logps/chosen": -688.0, "logps/rejected": -988.0, "loss": 0.2332, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.9375, "rewards/margins": 3.3125, "rewards/rejected": -8.25, "step": 4600 }, { "epoch": 0.3468512527274095, "grad_norm": 8.63453504033253, "learning_rate": 4.1282463735754356e-07, "logits/chosen": -2.65625, "logits/rejected": -2.453125, "logps/chosen": -628.0, "logps/rejected": -932.0, "loss": 0.2289, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.65625, "rewards/margins": 3.046875, "rewards/rejected": -7.71875, "step": 4610 }, { "epoch": 0.3476036415619592, "grad_norm": 9.580783720183787, "learning_rate": 4.123258093605739e-07, "logits/chosen": -2.671875, "logits/rejected": -2.375, "logps/chosen": -648.0, "logps/rejected": -972.0, "loss": 0.2302, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.71875, "rewards/margins": 3.46875, "rewards/rejected": -8.1875, "step": 4620 }, { "epoch": 0.3483560303965089, "grad_norm": 7.120063150875206, "learning_rate": 4.118258615336199e-07, "logits/chosen": -2.65625, "logits/rejected": -2.4375, "logps/chosen": -624.0, "logps/rejected": -920.0, "loss": 0.2366, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.5625, "rewards/margins": 3.015625, "rewards/rejected": -7.5625, "step": 4630 }, { "epoch": 0.3491084192310586, "grad_norm": 5.827687137113672, "learning_rate": 4.1132479732564964e-07, "logits/chosen": -2.546875, "logits/rejected": -2.28125, "logps/chosen": -620.0, "logps/rejected": -968.0, "loss": 0.2027, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.53125, "rewards/margins": 3.34375, "rewards/rejected": -7.875, "step": 4640 }, { "epoch": 0.3498608080656083, "grad_norm": 10.67088810537178, "learning_rate": 4.1082262019333315e-07, "logits/chosen": -2.65625, "logits/rejected": -2.3125, "logps/chosen": -620.0, "logps/rejected": -1012.0, "loss": 0.2132, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.59375, "rewards/margins": 3.78125, "rewards/rejected": -8.375, "step": 4650 }, { "epoch": 0.350613196900158, "grad_norm": 10.094322081691317, "learning_rate": 4.103193336010179e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -684.0, "logps/rejected": -988.0, "loss": 0.215, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.03125, "rewards/margins": 3.5, "rewards/rejected": -8.5, "step": 4660 }, { "epoch": 0.3513655857347077, "grad_norm": 8.374810404276547, "learning_rate": 4.098149410207051e-07, "logits/chosen": -2.625, "logits/rejected": -2.359375, "logps/chosen": -648.0, "logps/rejected": -1012.0, "loss": 0.2365, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.96875, "rewards/margins": 3.515625, "rewards/rejected": -8.5, "step": 4670 }, { "epoch": 0.3521179745692574, "grad_norm": 8.392131172676132, "learning_rate": 4.09309445932026e-07, "logits/chosen": -2.578125, "logits/rejected": -2.359375, "logps/chosen": -672.0, "logps/rejected": -992.0, "loss": 0.226, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.21875, "rewards/margins": 3.1875, "rewards/rejected": -8.375, "step": 4680 }, { "epoch": 0.35287036340380706, "grad_norm": 9.468666629268613, "learning_rate": 4.088028518222173e-07, "logits/chosen": -2.59375, "logits/rejected": -2.28125, "logps/chosen": -620.0, "logps/rejected": -940.0, "loss": 0.2211, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.59375, "rewards/margins": 3.40625, "rewards/rejected": -8.0, "step": 4690 }, { "epoch": 0.35362275223835676, "grad_norm": 9.71697535829911, "learning_rate": 4.0829516218609785e-07, "logits/chosen": -2.65625, "logits/rejected": -2.40625, "logps/chosen": -664.0, "logps/rejected": -1032.0, "loss": 0.2035, "rewards/accuracies": 0.90625, "rewards/chosen": -4.875, "rewards/margins": 3.859375, "rewards/rejected": -8.75, "step": 4700 }, { "epoch": 0.35437514107290646, "grad_norm": 8.282815293843191, "learning_rate": 4.077863805260439e-07, "logits/chosen": -2.765625, "logits/rejected": -2.625, "logps/chosen": -640.0, "logps/rejected": -936.0, "loss": 0.2176, "rewards/accuracies": 0.90625, "rewards/chosen": -4.5625, "rewards/margins": 3.34375, "rewards/rejected": -7.875, "step": 4710 }, { "epoch": 0.35512752990745616, "grad_norm": 7.901474744472959, "learning_rate": 4.0727651035196545e-07, "logits/chosen": -2.671875, "logits/rejected": -2.3125, "logps/chosen": -600.0, "logps/rejected": -1000.0, "loss": 0.2287, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.40625, "rewards/margins": 3.765625, "rewards/rejected": -8.1875, "step": 4720 }, { "epoch": 0.35587991874200586, "grad_norm": 10.371105232687958, "learning_rate": 4.0676555518128157e-07, "logits/chosen": -2.671875, "logits/rejected": -2.359375, "logps/chosen": -668.0, "logps/rejected": -964.0, "loss": 0.2551, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -5.0625, "rewards/margins": 3.078125, "rewards/rejected": -8.125, "step": 4730 }, { "epoch": 0.35663230757655556, "grad_norm": 7.284751062931884, "learning_rate": 4.062535185388964e-07, "logits/chosen": -2.609375, "logits/rejected": -2.375, "logps/chosen": -592.0, "logps/rejected": -884.0, "loss": 0.2446, "rewards/accuracies": 0.90625, "rewards/chosen": -4.25, "rewards/margins": 3.109375, "rewards/rejected": -7.375, "step": 4740 }, { "epoch": 0.35738469641110526, "grad_norm": 7.798212343706078, "learning_rate": 4.0574040395717493e-07, "logits/chosen": -2.65625, "logits/rejected": -2.46875, "logps/chosen": -588.0, "logps/rejected": -916.0, "loss": 0.199, "rewards/accuracies": 0.9375, "rewards/chosen": -4.15625, "rewards/margins": 3.328125, "rewards/rejected": -7.46875, "step": 4750 }, { "epoch": 0.35813708524565496, "grad_norm": 7.738938259994802, "learning_rate": 4.0522621497591813e-07, "logits/chosen": -2.796875, "logits/rejected": -2.421875, "logps/chosen": -608.0, "logps/rejected": -936.0, "loss": 0.2369, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.5625, "rewards/margins": 3.390625, "rewards/rejected": -7.9375, "step": 4760 }, { "epoch": 0.35888947408020466, "grad_norm": 8.30773676851323, "learning_rate": 4.0471095514233933e-07, "logits/chosen": -2.671875, "logits/rejected": -2.34375, "logps/chosen": -600.0, "logps/rejected": -940.0, "loss": 0.2093, "rewards/accuracies": 0.90625, "rewards/chosen": -4.4375, "rewards/margins": 3.328125, "rewards/rejected": -7.78125, "step": 4770 }, { "epoch": 0.35964186291475436, "grad_norm": 8.691676803479753, "learning_rate": 4.041946280110389e-07, "logits/chosen": -2.671875, "logits/rejected": -2.421875, "logps/chosen": -576.0, "logps/rejected": -896.0, "loss": 0.2141, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.21875, "rewards/margins": 3.3125, "rewards/rejected": -7.53125, "step": 4780 }, { "epoch": 0.36039425174930406, "grad_norm": 8.607549823897239, "learning_rate": 4.0367723714398047e-07, "logits/chosen": -2.71875, "logits/rejected": -2.46875, "logps/chosen": -656.0, "logps/rejected": -1024.0, "loss": 0.1978, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 3.796875, "rewards/rejected": -8.625, "step": 4790 }, { "epoch": 0.36114664058385376, "grad_norm": 8.771643431392596, "learning_rate": 4.031587861104657e-07, "logits/chosen": -2.765625, "logits/rejected": -2.515625, "logps/chosen": -676.0, "logps/rejected": -996.0, "loss": 0.2288, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.15625, "rewards/margins": 3.265625, "rewards/rejected": -8.4375, "step": 4800 }, { "epoch": 0.3618990294184034, "grad_norm": 7.347767311708758, "learning_rate": 4.0263927848711026e-07, "logits/chosen": -2.734375, "logits/rejected": -2.359375, "logps/chosen": -656.0, "logps/rejected": -956.0, "loss": 0.2333, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 3.234375, "rewards/rejected": -8.125, "step": 4810 }, { "epoch": 0.3626514182529531, "grad_norm": 8.949761500130862, "learning_rate": 4.021187178578187e-07, "logits/chosen": -2.5625, "logits/rejected": -2.34375, "logps/chosen": -644.0, "logps/rejected": -972.0, "loss": 0.2175, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 3.296875, "rewards/rejected": -8.1875, "step": 4820 }, { "epoch": 0.3634038070875028, "grad_norm": 10.64662225641983, "learning_rate": 4.0159710781375994e-07, "logits/chosen": -2.6875, "logits/rejected": -2.40625, "logps/chosen": -700.0, "logps/rejected": -1004.0, "loss": 0.2273, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.46875, "rewards/margins": 3.046875, "rewards/rejected": -8.5, "step": 4830 }, { "epoch": 0.3641561959220525, "grad_norm": 10.534790254504463, "learning_rate": 4.0107445195334244e-07, "logits/chosen": -2.546875, "logits/rejected": -2.203125, "logps/chosen": -704.0, "logps/rejected": -1024.0, "loss": 0.2303, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3125, "rewards/margins": 3.4375, "rewards/rejected": -8.75, "step": 4840 }, { "epoch": 0.3649085847566022, "grad_norm": 7.871168672929195, "learning_rate": 4.005507538821894e-07, "logits/chosen": -2.53125, "logits/rejected": -2.1875, "logps/chosen": -692.0, "logps/rejected": -1012.0, "loss": 0.2123, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 3.375, "rewards/rejected": -8.4375, "step": 4850 }, { "epoch": 0.3656609735911519, "grad_norm": 8.593711676069107, "learning_rate": 4.0002601721311393e-07, "logits/chosen": -2.515625, "logits/rejected": -2.25, "logps/chosen": -680.0, "logps/rejected": -1012.0, "loss": 0.1999, "rewards/accuracies": 0.90625, "rewards/chosen": -5.125, "rewards/margins": 3.53125, "rewards/rejected": -8.6875, "step": 4860 }, { "epoch": 0.3664133624257016, "grad_norm": 8.635194837292705, "learning_rate": 3.995002455660939e-07, "logits/chosen": -2.671875, "logits/rejected": -2.359375, "logps/chosen": -680.0, "logps/rejected": -1064.0, "loss": 0.2006, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 3.953125, "rewards/rejected": -9.125, "step": 4870 }, { "epoch": 0.3671657512602513, "grad_norm": 9.553334243561997, "learning_rate": 3.989734425682473e-07, "logits/chosen": -2.484375, "logits/rejected": -2.140625, "logps/chosen": -740.0, "logps/rejected": -1064.0, "loss": 0.2232, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.875, "rewards/margins": 3.34375, "rewards/rejected": -9.1875, "step": 4880 }, { "epoch": 0.367918140094801, "grad_norm": 9.264445545978198, "learning_rate": 3.984456118538071e-07, "logits/chosen": -2.5625, "logits/rejected": -2.359375, "logps/chosen": -688.0, "logps/rejected": -1064.0, "loss": 0.2264, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.3125, "rewards/margins": 3.6875, "rewards/rejected": -9.0, "step": 4890 }, { "epoch": 0.3686705289293507, "grad_norm": 7.532750216761137, "learning_rate": 3.9791675706409593e-07, "logits/chosen": -2.6875, "logits/rejected": -2.296875, "logps/chosen": -600.0, "logps/rejected": -992.0, "loss": 0.2365, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.46875, "rewards/margins": 3.921875, "rewards/rejected": -8.375, "step": 4900 }, { "epoch": 0.3694229177639004, "grad_norm": 6.92523330567057, "learning_rate": 3.9738688184750125e-07, "logits/chosen": -2.59375, "logits/rejected": -2.34375, "logps/chosen": -644.0, "logps/rejected": -964.0, "loss": 0.2077, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.65625, "rewards/margins": 3.359375, "rewards/rejected": -8.0, "step": 4910 }, { "epoch": 0.3701753065984501, "grad_norm": 10.657909024859954, "learning_rate": 3.968559898594502e-07, "logits/chosen": -2.546875, "logits/rejected": -2.28125, "logps/chosen": -620.0, "logps/rejected": -948.0, "loss": 0.2223, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.78125, "rewards/margins": 3.4375, "rewards/rejected": -8.1875, "step": 4920 }, { "epoch": 0.37092769543299975, "grad_norm": 6.720765504539305, "learning_rate": 3.9632408476238416e-07, "logits/chosen": -2.59375, "logits/rejected": -2.3125, "logps/chosen": -648.0, "logps/rejected": -940.0, "loss": 0.2112, "rewards/accuracies": 0.875, "rewards/chosen": -4.875, "rewards/margins": 2.984375, "rewards/rejected": -7.875, "step": 4930 }, { "epoch": 0.37168008426754945, "grad_norm": 7.963706047021695, "learning_rate": 3.957911702257337e-07, "logits/chosen": -2.71875, "logits/rejected": -2.40625, "logps/chosen": -656.0, "logps/rejected": -1016.0, "loss": 0.2229, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.75, "rewards/margins": 3.71875, "rewards/rejected": -8.4375, "step": 4940 }, { "epoch": 0.37243247310209915, "grad_norm": 9.314200881993933, "learning_rate": 3.952572499258929e-07, "logits/chosen": -2.703125, "logits/rejected": -2.375, "logps/chosen": -628.0, "logps/rejected": -952.0, "loss": 0.2239, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.65625, "rewards/margins": 3.421875, "rewards/rejected": -8.0625, "step": 4950 }, { "epoch": 0.37318486193664885, "grad_norm": 8.115661280449446, "learning_rate": 3.9472232754619474e-07, "logits/chosen": -2.6875, "logits/rejected": -2.421875, "logps/chosen": -632.0, "logps/rejected": -960.0, "loss": 0.216, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.59375, "rewards/margins": 3.53125, "rewards/rejected": -8.125, "step": 4960 }, { "epoch": 0.37393725077119855, "grad_norm": 7.013722897295247, "learning_rate": 3.9418640677688464e-07, "logits/chosen": -2.546875, "logits/rejected": -2.3125, "logps/chosen": -680.0, "logps/rejected": -960.0, "loss": 0.2489, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.03125, "rewards/margins": 3.015625, "rewards/rejected": -8.0625, "step": 4970 }, { "epoch": 0.37468963960574825, "grad_norm": 9.184692139331514, "learning_rate": 3.936494913150961e-07, "logits/chosen": -2.5, "logits/rejected": -2.328125, "logps/chosen": -648.0, "logps/rejected": -912.0, "loss": 0.198, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.75, "rewards/margins": 2.875, "rewards/rejected": -7.625, "step": 4980 }, { "epoch": 0.37544202844029795, "grad_norm": 9.392069646399577, "learning_rate": 3.931115848648242e-07, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -648.0, "logps/rejected": -996.0, "loss": 0.2036, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.875, "rewards/margins": 3.671875, "rewards/rejected": -8.5625, "step": 4990 }, { "epoch": 0.37619441727484765, "grad_norm": 9.776594081515965, "learning_rate": 3.925726911369008e-07, "logits/chosen": -2.8125, "logits/rejected": -2.640625, "logps/chosen": -672.0, "logps/rejected": -1024.0, "loss": 0.1966, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.0, "rewards/margins": 3.734375, "rewards/rejected": -8.75, "step": 5000 }, { "epoch": 0.37619441727484765, "eval_logits/chosen": -2.84375, "eval_logits/rejected": -2.5625, "eval_logps/chosen": -688.0, "eval_logps/rejected": -1032.0, "eval_loss": 0.25248709321022034, "eval_rewards/accuracies": 0.888202428817749, "eval_rewards/chosen": -5.21875, "eval_rewards/margins": 3.578125, "eval_rewards/rejected": -8.8125, "eval_runtime": 3417.1912, "eval_samples_per_second": 27.652, "eval_steps_per_second": 0.432, "step": 5000 }, { "epoch": 0.37694680610939735, "grad_norm": 8.537262562794556, "learning_rate": 3.9203281384896856e-07, "logits/chosen": -2.8125, "logits/rejected": -2.640625, "logps/chosen": -640.0, "logps/rejected": -960.0, "loss": 0.2204, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.9375, "rewards/margins": 3.34375, "rewards/rejected": -8.25, "step": 5010 }, { "epoch": 0.37769919494394705, "grad_norm": 8.964447569309238, "learning_rate": 3.9149195672545547e-07, "logits/chosen": -2.765625, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1016.0, "loss": 0.2281, "rewards/accuracies": 0.875, "rewards/chosen": -5.15625, "rewards/margins": 3.390625, "rewards/rejected": -8.5625, "step": 5020 }, { "epoch": 0.37845158377849675, "grad_norm": 7.045933538630638, "learning_rate": 3.9095012349754897e-07, "logits/chosen": -2.59375, "logits/rejected": -2.484375, "logps/chosen": -688.0, "logps/rejected": -1020.0, "loss": 0.2218, "rewards/accuracies": 0.9375, "rewards/chosen": -5.21875, "rewards/margins": 3.5625, "rewards/rejected": -8.75, "step": 5030 }, { "epoch": 0.37920397261304645, "grad_norm": 7.612458090206155, "learning_rate": 3.904073179031702e-07, "logits/chosen": -2.59375, "logits/rejected": -2.265625, "logps/chosen": -660.0, "logps/rejected": -980.0, "loss": 0.2149, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 3.1875, "rewards/rejected": -8.0625, "step": 5040 }, { "epoch": 0.3799563614475961, "grad_norm": 8.655710429881823, "learning_rate": 3.898635436869485e-07, "logits/chosen": -2.65625, "logits/rejected": -2.359375, "logps/chosen": -684.0, "logps/rejected": -1024.0, "loss": 0.2207, "rewards/accuracies": 0.875, "rewards/chosen": -5.28125, "rewards/margins": 3.53125, "rewards/rejected": -8.8125, "step": 5050 }, { "epoch": 0.3807087502821458, "grad_norm": 10.581734628831592, "learning_rate": 3.8931880460019537e-07, "logits/chosen": -2.609375, "logits/rejected": -2.328125, "logps/chosen": -644.0, "logps/rejected": -928.0, "loss": 0.2345, "rewards/accuracies": 0.90625, "rewards/chosen": -4.90625, "rewards/margins": 2.984375, "rewards/rejected": -7.875, "step": 5060 }, { "epoch": 0.3814611391166955, "grad_norm": 7.882854152082069, "learning_rate": 3.887731044008787e-07, "logits/chosen": -2.734375, "logits/rejected": -2.53125, "logps/chosen": -656.0, "logps/rejected": -976.0, "loss": 0.1997, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 3.171875, "rewards/rejected": -8.125, "step": 5070 }, { "epoch": 0.3822135279512452, "grad_norm": 9.906732797744418, "learning_rate": 3.8822644685359655e-07, "logits/chosen": -2.65625, "logits/rejected": -2.421875, "logps/chosen": -672.0, "logps/rejected": -1040.0, "loss": 0.2419, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.0625, "rewards/margins": 3.71875, "rewards/rejected": -8.8125, "step": 5080 }, { "epoch": 0.3829659167857949, "grad_norm": 8.193426118810464, "learning_rate": 3.876788357295516e-07, "logits/chosen": -2.53125, "logits/rejected": -2.265625, "logps/chosen": -604.0, "logps/rejected": -924.0, "loss": 0.2228, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.53125, "rewards/margins": 3.203125, "rewards/rejected": -7.71875, "step": 5090 }, { "epoch": 0.3837183056203446, "grad_norm": 7.671834236636164, "learning_rate": 3.871302748065246e-07, "logits/chosen": -2.546875, "logits/rejected": -2.171875, "logps/chosen": -576.0, "logps/rejected": -936.0, "loss": 0.2184, "rewards/accuracies": 0.90625, "rewards/chosen": -4.34375, "rewards/margins": 3.25, "rewards/rejected": -7.625, "step": 5100 }, { "epoch": 0.3844706944548943, "grad_norm": 8.643588666196429, "learning_rate": 3.8658076786884917e-07, "logits/chosen": -2.765625, "logits/rejected": -2.375, "logps/chosen": -648.0, "logps/rejected": -1024.0, "loss": 0.1989, "rewards/accuracies": 0.90625, "rewards/chosen": -4.90625, "rewards/margins": 3.796875, "rewards/rejected": -8.6875, "step": 5110 }, { "epoch": 0.385223083289444, "grad_norm": 9.75416624908225, "learning_rate": 3.860303187073848e-07, "logits/chosen": -2.71875, "logits/rejected": -2.375, "logps/chosen": -652.0, "logps/rejected": -1012.0, "loss": 0.1922, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.90625, "rewards/margins": 3.671875, "rewards/rejected": -8.625, "step": 5120 }, { "epoch": 0.3859754721239937, "grad_norm": 11.127358228566324, "learning_rate": 3.854789311194909e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -632.0, "logps/rejected": -968.0, "loss": 0.2132, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.75, "rewards/margins": 3.53125, "rewards/rejected": -8.3125, "step": 5130 }, { "epoch": 0.3867278609585434, "grad_norm": 9.280144402951457, "learning_rate": 3.849266089090013e-07, "logits/chosen": -2.65625, "logits/rejected": -2.34375, "logps/chosen": -612.0, "logps/rejected": -996.0, "loss": 0.2038, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.46875, "rewards/margins": 3.890625, "rewards/rejected": -8.375, "step": 5140 }, { "epoch": 0.3874802497930931, "grad_norm": 11.614194573881234, "learning_rate": 3.843733558861969e-07, "logits/chosen": -2.734375, "logits/rejected": -2.40625, "logps/chosen": -620.0, "logps/rejected": -980.0, "loss": 0.2072, "rewards/accuracies": 0.9375, "rewards/chosen": -4.625, "rewards/margins": 3.59375, "rewards/rejected": -8.25, "step": 5150 }, { "epoch": 0.3882326386276428, "grad_norm": 8.34285723391702, "learning_rate": 3.8381917586778036e-07, "logits/chosen": -2.640625, "logits/rejected": -2.359375, "logps/chosen": -624.0, "logps/rejected": -952.0, "loss": 0.2061, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.65625, "rewards/margins": 3.40625, "rewards/rejected": -8.0625, "step": 5160 }, { "epoch": 0.38898502746219243, "grad_norm": 6.882057609993868, "learning_rate": 3.8326407267684916e-07, "logits/chosen": -2.734375, "logits/rejected": -2.328125, "logps/chosen": -640.0, "logps/rejected": -996.0, "loss": 0.2104, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.625, "rewards/margins": 3.75, "rewards/rejected": -8.375, "step": 5170 }, { "epoch": 0.38973741629674213, "grad_norm": 10.713709287777075, "learning_rate": 3.827080501428695e-07, "logits/chosen": -2.65625, "logits/rejected": -2.390625, "logps/chosen": -640.0, "logps/rejected": -984.0, "loss": 0.2094, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 3.59375, "rewards/rejected": -8.375, "step": 5180 }, { "epoch": 0.39048980513129183, "grad_norm": 7.105350641020761, "learning_rate": 3.821511121016498e-07, "logits/chosen": -2.765625, "logits/rejected": -2.5, "logps/chosen": -604.0, "logps/rejected": -936.0, "loss": 0.2157, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.5625, "rewards/margins": 3.359375, "rewards/rejected": -7.90625, "step": 5190 }, { "epoch": 0.39124219396584153, "grad_norm": 8.837764335158008, "learning_rate": 3.815932623953142e-07, "logits/chosen": -2.75, "logits/rejected": -2.4375, "logps/chosen": -632.0, "logps/rejected": -960.0, "loss": 0.2075, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.625, "rewards/margins": 3.4375, "rewards/rejected": -8.0625, "step": 5200 }, { "epoch": 0.39199458280039123, "grad_norm": 9.041698753486347, "learning_rate": 3.810345048722762e-07, "logits/chosen": -2.796875, "logits/rejected": -2.484375, "logps/chosen": -628.0, "logps/rejected": -940.0, "loss": 0.2086, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.59375, "rewards/margins": 3.28125, "rewards/rejected": -7.84375, "step": 5210 }, { "epoch": 0.39274697163494093, "grad_norm": 8.84149882552823, "learning_rate": 3.804748433872119e-07, "logits/chosen": -2.625, "logits/rejected": -2.390625, "logps/chosen": -600.0, "logps/rejected": -912.0, "loss": 0.1973, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.375, "rewards/margins": 3.234375, "rewards/rejected": -7.625, "step": 5220 }, { "epoch": 0.39349936046949063, "grad_norm": 10.67174010897917, "learning_rate": 3.7991428180103375e-07, "logits/chosen": -2.71875, "logits/rejected": -2.359375, "logps/chosen": -664.0, "logps/rejected": -1008.0, "loss": 0.2128, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.84375, "rewards/margins": 3.640625, "rewards/rejected": -8.5, "step": 5230 }, { "epoch": 0.39425174930404033, "grad_norm": 9.519966003861786, "learning_rate": 3.7935282398086335e-07, "logits/chosen": -2.640625, "logits/rejected": -2.4375, "logps/chosen": -652.0, "logps/rejected": -968.0, "loss": 0.2288, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.78125, "rewards/margins": 3.453125, "rewards/rejected": -8.25, "step": 5240 }, { "epoch": 0.39500413813859003, "grad_norm": 7.417461935131493, "learning_rate": 3.787904738000053e-07, "logits/chosen": -2.5625, "logits/rejected": -2.34375, "logps/chosen": -620.0, "logps/rejected": -956.0, "loss": 0.2207, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5, "rewards/margins": 3.328125, "rewards/rejected": -7.8125, "step": 5250 }, { "epoch": 0.39575652697313973, "grad_norm": 12.351645854988238, "learning_rate": 3.7822723513792036e-07, "logits/chosen": -2.546875, "logits/rejected": -2.234375, "logps/chosen": -620.0, "logps/rejected": -980.0, "loss": 0.1946, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.5625, "rewards/margins": 3.84375, "rewards/rejected": -8.375, "step": 5260 }, { "epoch": 0.39650891580768943, "grad_norm": 8.905945193049313, "learning_rate": 3.776631118801985e-07, "logits/chosen": -2.65625, "logits/rejected": -2.40625, "logps/chosen": -644.0, "logps/rejected": -992.0, "loss": 0.1959, "rewards/accuracies": 0.90625, "rewards/chosen": -4.875, "rewards/margins": 3.578125, "rewards/rejected": -8.4375, "step": 5270 }, { "epoch": 0.39726130464223913, "grad_norm": 10.875357145554204, "learning_rate": 3.7709810791853224e-07, "logits/chosen": -2.703125, "logits/rejected": -2.421875, "logps/chosen": -632.0, "logps/rejected": -1032.0, "loss": 0.2058, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.0625, "rewards/rejected": -8.75, "step": 5280 }, { "epoch": 0.3980136934767888, "grad_norm": 10.155296544076947, "learning_rate": 3.765322271506898e-07, "logits/chosen": -2.640625, "logits/rejected": -2.296875, "logps/chosen": -628.0, "logps/rejected": -968.0, "loss": 0.1956, "rewards/accuracies": 0.9375, "rewards/chosen": -4.78125, "rewards/margins": 3.53125, "rewards/rejected": -8.3125, "step": 5290 }, { "epoch": 0.3987660823113385, "grad_norm": 8.845714187658283, "learning_rate": 3.7596547348048806e-07, "logits/chosen": -2.421875, "logits/rejected": -2.1875, "logps/chosen": -692.0, "logps/rejected": -1024.0, "loss": 0.1965, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.46875, "rewards/margins": 3.15625, "rewards/rejected": -8.625, "step": 5300 }, { "epoch": 0.3995184711458882, "grad_norm": 9.208429894417057, "learning_rate": 3.753978508177661e-07, "logits/chosen": -2.53125, "logits/rejected": -2.265625, "logps/chosen": -720.0, "logps/rejected": -1072.0, "loss": 0.19, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.53125, "rewards/margins": 3.546875, "rewards/rejected": -9.0625, "step": 5310 }, { "epoch": 0.4002708599804379, "grad_norm": 9.199388800164709, "learning_rate": 3.7482936307835737e-07, "logits/chosen": -2.703125, "logits/rejected": -2.453125, "logps/chosen": -668.0, "logps/rejected": -1016.0, "loss": 0.19, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.0, "rewards/margins": 3.640625, "rewards/rejected": -8.625, "step": 5320 }, { "epoch": 0.4010232488149876, "grad_norm": 11.99410111092291, "learning_rate": 3.7426001418406356e-07, "logits/chosen": -2.65625, "logits/rejected": -2.359375, "logps/chosen": -660.0, "logps/rejected": -1064.0, "loss": 0.2066, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.125, "rewards/rejected": -9.0625, "step": 5330 }, { "epoch": 0.4017756376495373, "grad_norm": 8.94621660275642, "learning_rate": 3.7368980806262707e-07, "logits/chosen": -2.703125, "logits/rejected": -2.390625, "logps/chosen": -664.0, "logps/rejected": -1000.0, "loss": 0.1974, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.1875, "rewards/margins": 3.40625, "rewards/rejected": -8.625, "step": 5340 }, { "epoch": 0.402528026484087, "grad_norm": 9.30671459047822, "learning_rate": 3.73118748647704e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -716.0, "logps/rejected": -1064.0, "loss": 0.2078, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.53125, "rewards/margins": 3.59375, "rewards/rejected": -9.125, "step": 5350 }, { "epoch": 0.4032804153186367, "grad_norm": 8.596185905238903, "learning_rate": 3.725468398788372e-07, "logits/chosen": -2.75, "logits/rejected": -2.453125, "logps/chosen": -716.0, "logps/rejected": -1040.0, "loss": 0.2174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.53125, "rewards/margins": 3.453125, "rewards/rejected": -9.0, "step": 5360 }, { "epoch": 0.4040328041531864, "grad_norm": 9.304031175627815, "learning_rate": 3.719740857014284e-07, "logits/chosen": -2.75, "logits/rejected": -2.46875, "logps/chosen": -724.0, "logps/rejected": -1072.0, "loss": 0.2106, "rewards/accuracies": 0.90625, "rewards/chosen": -5.5625, "rewards/margins": 3.625, "rewards/rejected": -9.1875, "step": 5370 }, { "epoch": 0.4047851929877361, "grad_norm": 9.684060809443176, "learning_rate": 3.714004900667122e-07, "logits/chosen": -2.84375, "logits/rejected": -2.609375, "logps/chosen": -660.0, "logps/rejected": -1020.0, "loss": 0.214, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 3.8125, "rewards/rejected": -8.8125, "step": 5380 }, { "epoch": 0.4055375818222858, "grad_norm": 9.201676557442841, "learning_rate": 3.708260569317277e-07, "logits/chosen": -2.75, "logits/rejected": -2.4375, "logps/chosen": -672.0, "logps/rejected": -996.0, "loss": 0.2026, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.15625, "rewards/margins": 3.34375, "rewards/rejected": -8.5, "step": 5390 }, { "epoch": 0.4062899706568355, "grad_norm": 7.524754017547156, "learning_rate": 3.702507902592917e-07, "logits/chosen": -2.6875, "logits/rejected": -2.359375, "logps/chosen": -656.0, "logps/rejected": -1032.0, "loss": 0.1825, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 3.796875, "rewards/rejected": -8.8125, "step": 5400 }, { "epoch": 0.4070423594913852, "grad_norm": 8.356584739363438, "learning_rate": 3.696746940179713e-07, "logits/chosen": -2.75, "logits/rejected": -2.5, "logps/chosen": -688.0, "logps/rejected": -1032.0, "loss": 0.2052, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.21875, "rewards/margins": 3.453125, "rewards/rejected": -8.6875, "step": 5410 }, { "epoch": 0.4077947483259348, "grad_norm": 9.40662099033202, "learning_rate": 3.690977721820565e-07, "logits/chosen": -2.609375, "logits/rejected": -2.390625, "logps/chosen": -648.0, "logps/rejected": -980.0, "loss": 0.2068, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.6875, "rewards/margins": 3.546875, "rewards/rejected": -8.25, "step": 5420 }, { "epoch": 0.4085471371604845, "grad_norm": 48.40020616188885, "learning_rate": 3.6852002873153293e-07, "logits/chosen": -2.671875, "logits/rejected": -2.46875, "logps/chosen": -720.0, "logps/rejected": -1064.0, "loss": 0.2153, "rewards/accuracies": 0.90625, "rewards/chosen": -5.6875, "rewards/margins": 3.453125, "rewards/rejected": -9.125, "step": 5430 }, { "epoch": 0.4092995259950342, "grad_norm": 12.662814287211088, "learning_rate": 3.679414676520541e-07, "logits/chosen": -2.765625, "logits/rejected": -2.546875, "logps/chosen": -724.0, "logps/rejected": -1104.0, "loss": 0.2081, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.5625, "rewards/margins": 3.8125, "rewards/rejected": -9.375, "step": 5440 }, { "epoch": 0.4100519148295839, "grad_norm": 8.362044095992177, "learning_rate": 3.673620929349141e-07, "logits/chosen": -2.703125, "logits/rejected": -2.375, "logps/chosen": -680.0, "logps/rejected": -1080.0, "loss": 0.1942, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3125, "rewards/margins": 4.0625, "rewards/rejected": -9.375, "step": 5450 }, { "epoch": 0.4108043036641336, "grad_norm": 7.768648809813202, "learning_rate": 3.6678190857701996e-07, "logits/chosen": -2.796875, "logits/rejected": -2.59375, "logps/chosen": -684.0, "logps/rejected": -1008.0, "loss": 0.2033, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 3.375, "rewards/rejected": -8.5625, "step": 5460 }, { "epoch": 0.4115566924986833, "grad_norm": 9.699525227338295, "learning_rate": 3.6620091858086426e-07, "logits/chosen": -2.875, "logits/rejected": -2.609375, "logps/chosen": -696.0, "logps/rejected": -1088.0, "loss": 0.2035, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.3125, "rewards/margins": 3.8125, "rewards/rejected": -9.125, "step": 5470 }, { "epoch": 0.412309081333233, "grad_norm": 8.507906559213271, "learning_rate": 3.6561912695449747e-07, "logits/chosen": -2.796875, "logits/rejected": -2.578125, "logps/chosen": -692.0, "logps/rejected": -1016.0, "loss": 0.2238, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.28125, "rewards/rejected": -8.5, "step": 5480 }, { "epoch": 0.4130614701677827, "grad_norm": 7.2410857606643715, "learning_rate": 3.6503653771150003e-07, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -696.0, "logps/rejected": -1008.0, "loss": 0.2034, "rewards/accuracies": 0.90625, "rewards/chosen": -5.25, "rewards/margins": 3.21875, "rewards/rejected": -8.5, "step": 5490 }, { "epoch": 0.4138138590023324, "grad_norm": 10.453403141917368, "learning_rate": 3.6445315487095487e-07, "logits/chosen": -2.625, "logits/rejected": -2.625, "logps/chosen": -732.0, "logps/rejected": -1020.0, "loss": 0.196, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.6875, "rewards/margins": 3.140625, "rewards/rejected": -8.8125, "step": 5500 }, { "epoch": 0.4145662478368821, "grad_norm": 9.877038756567623, "learning_rate": 3.6386898245741987e-07, "logits/chosen": -2.78125, "logits/rejected": -2.390625, "logps/chosen": -636.0, "logps/rejected": -1056.0, "loss": 0.2096, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.125, "rewards/rejected": -8.9375, "step": 5510 }, { "epoch": 0.4153186366714318, "grad_norm": 10.571725785138156, "learning_rate": 3.632840245008998e-07, "logits/chosen": -2.703125, "logits/rejected": -2.46875, "logps/chosen": -612.0, "logps/rejected": -932.0, "loss": 0.2066, "rewards/accuracies": 0.90625, "rewards/chosen": -4.5, "rewards/margins": 3.25, "rewards/rejected": -7.75, "step": 5520 }, { "epoch": 0.4160710255059815, "grad_norm": 9.723399391845161, "learning_rate": 3.626982850368185e-07, "logits/chosen": -2.8125, "logits/rejected": -2.4375, "logps/chosen": -664.0, "logps/rejected": -1020.0, "loss": 0.2157, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.125, "rewards/margins": 3.515625, "rewards/rejected": -8.6875, "step": 5530 }, { "epoch": 0.41682341434053116, "grad_norm": 10.430224163906624, "learning_rate": 3.621117681059914e-07, "logits/chosen": -2.765625, "logits/rejected": -2.421875, "logps/chosen": -664.0, "logps/rejected": -1056.0, "loss": 0.1964, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.9375, "rewards/margins": 3.953125, "rewards/rejected": -8.875, "step": 5540 }, { "epoch": 0.41757580317508086, "grad_norm": 8.436788642136804, "learning_rate": 3.6152447775459715e-07, "logits/chosen": -2.78125, "logits/rejected": -2.421875, "logps/chosen": -632.0, "logps/rejected": -968.0, "loss": 0.2039, "rewards/accuracies": 0.875, "rewards/chosen": -4.84375, "rewards/margins": 3.53125, "rewards/rejected": -8.375, "step": 5550 }, { "epoch": 0.41832819200963056, "grad_norm": 9.121013611607667, "learning_rate": 3.609364180341503e-07, "logits/chosen": -2.734375, "logits/rejected": -2.609375, "logps/chosen": -676.0, "logps/rejected": -1064.0, "loss": 0.2005, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.0, "rewards/rejected": -9.125, "step": 5560 }, { "epoch": 0.41908058084418026, "grad_norm": 8.201889530984163, "learning_rate": 3.603475930014726e-07, "logits/chosen": -2.625, "logits/rejected": -2.328125, "logps/chosen": -728.0, "logps/rejected": -1072.0, "loss": 0.1941, "rewards/accuracies": 0.9375, "rewards/chosen": -5.5625, "rewards/margins": 3.6875, "rewards/rejected": -9.25, "step": 5570 }, { "epoch": 0.41983296967872996, "grad_norm": 9.310384454726352, "learning_rate": 3.5975800671866575e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -740.0, "logps/rejected": -1088.0, "loss": 0.2214, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.71875, "rewards/margins": 3.59375, "rewards/rejected": -9.3125, "step": 5580 }, { "epoch": 0.42058535851327966, "grad_norm": 10.912009015261056, "learning_rate": 3.591676632530829e-07, "logits/chosen": -2.6875, "logits/rejected": -2.375, "logps/chosen": -676.0, "logps/rejected": -1040.0, "loss": 0.1968, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 3.828125, "rewards/rejected": -9.0, "step": 5590 }, { "epoch": 0.42133774734782936, "grad_norm": 10.375001718266384, "learning_rate": 3.5857656667730074e-07, "logits/chosen": -2.625, "logits/rejected": -2.328125, "logps/chosen": -700.0, "logps/rejected": -1104.0, "loss": 0.185, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.1875, "rewards/rejected": -9.4375, "step": 5600 }, { "epoch": 0.42209013618237906, "grad_norm": 9.32593833031528, "learning_rate": 3.579847210690915e-07, "logits/chosen": -2.6875, "logits/rejected": -2.390625, "logps/chosen": -736.0, "logps/rejected": -1048.0, "loss": 0.2146, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.5625, "rewards/margins": 3.4375, "rewards/rejected": -9.0, "step": 5610 }, { "epoch": 0.42284252501692876, "grad_norm": 10.072222773238604, "learning_rate": 3.573921305113947e-07, "logits/chosen": -2.625, "logits/rejected": -2.359375, "logps/chosen": -644.0, "logps/rejected": -996.0, "loss": 0.1781, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 3.5625, "rewards/rejected": -8.5, "step": 5620 }, { "epoch": 0.42359491385147846, "grad_norm": 8.272122929093275, "learning_rate": 3.5679879909228895e-07, "logits/chosen": -2.703125, "logits/rejected": -2.375, "logps/chosen": -668.0, "logps/rejected": -1024.0, "loss": 0.222, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 3.6875, "rewards/rejected": -8.75, "step": 5630 }, { "epoch": 0.42434730268602816, "grad_norm": 9.659485715922361, "learning_rate": 3.562047309049638e-07, "logits/chosen": -2.671875, "logits/rejected": -2.46875, "logps/chosen": -672.0, "logps/rejected": -1024.0, "loss": 0.2115, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 3.625, "rewards/rejected": -8.8125, "step": 5640 }, { "epoch": 0.42509969152057786, "grad_norm": 6.267941272449572, "learning_rate": 3.5560993004769156e-07, "logits/chosen": -2.703125, "logits/rejected": -2.46875, "logps/chosen": -632.0, "logps/rejected": -976.0, "loss": 0.2074, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.75, "rewards/margins": 3.5, "rewards/rejected": -8.25, "step": 5650 }, { "epoch": 0.4258520803551275, "grad_norm": 27.41279541635157, "learning_rate": 3.550144006237991e-07, "logits/chosen": -2.671875, "logits/rejected": -2.28125, "logps/chosen": -660.0, "logps/rejected": -980.0, "loss": 0.205, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.9375, "rewards/margins": 3.328125, "rewards/rejected": -8.25, "step": 5660 }, { "epoch": 0.4266044691896772, "grad_norm": 13.511045743963125, "learning_rate": 3.5441814674163914e-07, "logits/chosen": -2.734375, "logits/rejected": -2.4375, "logps/chosen": -672.0, "logps/rejected": -1072.0, "loss": 0.191, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 4.21875, "rewards/rejected": -9.125, "step": 5670 }, { "epoch": 0.4273568580242269, "grad_norm": 7.938326652173086, "learning_rate": 3.5382117251456245e-07, "logits/chosen": -2.6875, "logits/rejected": -2.4375, "logps/chosen": -680.0, "logps/rejected": -1048.0, "loss": 0.194, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 3.8125, "rewards/rejected": -8.9375, "step": 5680 }, { "epoch": 0.4281092468587766, "grad_norm": 9.761973101389064, "learning_rate": 3.5322348206088903e-07, "logits/chosen": -2.671875, "logits/rejected": -2.328125, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.1848, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0625, "rewards/margins": 4.03125, "rewards/rejected": -9.125, "step": 5690 }, { "epoch": 0.4288616356933263, "grad_norm": 7.12764182792613, "learning_rate": 3.526250795038801e-07, "logits/chosen": -2.640625, "logits/rejected": -2.3125, "logps/chosen": -708.0, "logps/rejected": -1064.0, "loss": 0.1999, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.5625, "rewards/margins": 3.625, "rewards/rejected": -9.1875, "step": 5700 }, { "epoch": 0.429614024527876, "grad_norm": 9.484679834050828, "learning_rate": 3.520259689717091e-07, "logits/chosen": -2.78125, "logits/rejected": -2.4375, "logps/chosen": -748.0, "logps/rejected": -1080.0, "loss": 0.2158, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.9375, "rewards/margins": 3.296875, "rewards/rejected": -9.25, "step": 5710 }, { "epoch": 0.4303664133624257, "grad_norm": 12.28777473146823, "learning_rate": 3.514261545974338e-07, "logits/chosen": -2.671875, "logits/rejected": -2.40625, "logps/chosen": -688.0, "logps/rejected": -1064.0, "loss": 0.2056, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.125, "rewards/rejected": -9.125, "step": 5720 }, { "epoch": 0.4311188021969754, "grad_norm": 8.808299496135412, "learning_rate": 3.508256405189675e-07, "logits/chosen": -2.71875, "logits/rejected": -2.53125, "logps/chosen": -696.0, "logps/rejected": -996.0, "loss": 0.2138, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.390625, "rewards/rejected": -8.5625, "step": 5730 }, { "epoch": 0.4318711910315251, "grad_norm": 8.00492738993031, "learning_rate": 3.502244308790506e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5625, "logps/chosen": -668.0, "logps/rejected": -1032.0, "loss": 0.188, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 3.765625, "rewards/rejected": -8.75, "step": 5740 }, { "epoch": 0.4326235798660748, "grad_norm": 13.4475750058397, "learning_rate": 3.496225298252216e-07, "logits/chosen": -2.828125, "logits/rejected": -2.453125, "logps/chosen": -704.0, "logps/rejected": -1032.0, "loss": 0.2117, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.4375, "rewards/margins": 3.390625, "rewards/rejected": -8.8125, "step": 5750 }, { "epoch": 0.4333759687006245, "grad_norm": 9.233762716910919, "learning_rate": 3.490199415097892e-07, "logits/chosen": -2.671875, "logits/rejected": -2.390625, "logps/chosen": -676.0, "logps/rejected": -1072.0, "loss": 0.2074, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.34375, "rewards/margins": 3.9375, "rewards/rejected": -9.3125, "step": 5760 }, { "epoch": 0.4341283575351742, "grad_norm": 10.459722572084084, "learning_rate": 3.4841667008980316e-07, "logits/chosen": -2.65625, "logits/rejected": -2.40625, "logps/chosen": -672.0, "logps/rejected": -1048.0, "loss": 0.198, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 3.859375, "rewards/rejected": -9.125, "step": 5770 }, { "epoch": 0.43488074636972385, "grad_norm": 5.794376913507954, "learning_rate": 3.478127197270257e-07, "logits/chosen": -2.734375, "logits/rejected": -2.4375, "logps/chosen": -680.0, "logps/rejected": -1080.0, "loss": 0.1787, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.28125, "rewards/margins": 3.984375, "rewards/rejected": -9.25, "step": 5780 }, { "epoch": 0.43563313520427355, "grad_norm": 7.350470215517664, "learning_rate": 3.4720809458790277e-07, "logits/chosen": -2.75, "logits/rejected": -2.421875, "logps/chosen": -708.0, "logps/rejected": -1176.0, "loss": 0.2117, "rewards/accuracies": 0.9375, "rewards/chosen": -5.40625, "rewards/margins": 4.71875, "rewards/rejected": -10.125, "step": 5790 }, { "epoch": 0.43638552403882325, "grad_norm": 7.922786635736011, "learning_rate": 3.466027988435356e-07, "logits/chosen": -2.859375, "logits/rejected": -2.359375, "logps/chosen": -684.0, "logps/rejected": -1072.0, "loss": 0.1993, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.21875, "rewards/margins": 3.828125, "rewards/rejected": -9.0625, "step": 5800 }, { "epoch": 0.43713791287337295, "grad_norm": 8.764459781816843, "learning_rate": 3.459968366696515e-07, "logits/chosen": -2.78125, "logits/rejected": -2.46875, "logps/chosen": -648.0, "logps/rejected": -1032.0, "loss": 0.1983, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.84375, "rewards/margins": 3.96875, "rewards/rejected": -8.8125, "step": 5810 }, { "epoch": 0.43789030170792265, "grad_norm": 9.693612208902172, "learning_rate": 3.453902122465753e-07, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -652.0, "logps/rejected": -1008.0, "loss": 0.2172, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.96875, "rewards/margins": 3.6875, "rewards/rejected": -8.6875, "step": 5820 }, { "epoch": 0.43864269054247235, "grad_norm": 10.013718419535769, "learning_rate": 3.4478292975920043e-07, "logits/chosen": -2.75, "logits/rejected": -2.53125, "logps/chosen": -652.0, "logps/rejected": -992.0, "loss": 0.2155, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.9375, "rewards/margins": 3.609375, "rewards/rejected": -8.5625, "step": 5830 }, { "epoch": 0.43939507937702205, "grad_norm": 9.831133197345066, "learning_rate": 3.441749933969601e-07, "logits/chosen": -2.59375, "logits/rejected": -2.375, "logps/chosen": -668.0, "logps/rejected": -960.0, "loss": 0.2001, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 3.25, "rewards/rejected": -8.1875, "step": 5840 }, { "epoch": 0.44014746821157175, "grad_norm": 8.309385028382076, "learning_rate": 3.435664073537985e-07, "logits/chosen": -2.6875, "logits/rejected": -2.5, "logps/chosen": -704.0, "logps/rejected": -1020.0, "loss": 0.2087, "rewards/accuracies": 0.875, "rewards/chosen": -5.375, "rewards/margins": 3.359375, "rewards/rejected": -8.75, "step": 5850 }, { "epoch": 0.44089985704612145, "grad_norm": 9.600889681406729, "learning_rate": 3.4295717582814143e-07, "logits/chosen": -2.5625, "logits/rejected": -2.3125, "logps/chosen": -712.0, "logps/rejected": -1020.0, "loss": 0.1912, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.46875, "rewards/margins": 3.203125, "rewards/rejected": -8.6875, "step": 5860 }, { "epoch": 0.44165224588067115, "grad_norm": 9.084849358967578, "learning_rate": 3.4234730302286814e-07, "logits/chosen": -2.609375, "logits/rejected": -2.40625, "logps/chosen": -656.0, "logps/rejected": -980.0, "loss": 0.1878, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.96875, "rewards/margins": 3.453125, "rewards/rejected": -8.4375, "step": 5870 }, { "epoch": 0.44240463471522085, "grad_norm": 11.796957707433691, "learning_rate": 3.417367931452813e-07, "logits/chosen": -2.609375, "logits/rejected": -2.328125, "logps/chosen": -700.0, "logps/rejected": -1096.0, "loss": 0.1898, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.21875, "rewards/margins": 3.953125, "rewards/rejected": -9.1875, "step": 5880 }, { "epoch": 0.44315702354977055, "grad_norm": 8.20973312387044, "learning_rate": 3.4112565040707893e-07, "logits/chosen": -2.625, "logits/rejected": -2.421875, "logps/chosen": -668.0, "logps/rejected": -1012.0, "loss": 0.2096, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -5.1875, "rewards/margins": 3.46875, "rewards/rejected": -8.6875, "step": 5890 }, { "epoch": 0.4439094123843202, "grad_norm": 7.879754564921931, "learning_rate": 3.405138790243248e-07, "logits/chosen": -2.578125, "logits/rejected": -2.3125, "logps/chosen": -660.0, "logps/rejected": -988.0, "loss": 0.2132, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.90625, "rewards/margins": 3.40625, "rewards/rejected": -8.3125, "step": 5900 }, { "epoch": 0.4446618012188699, "grad_norm": 10.339986830006676, "learning_rate": 3.399014832174194e-07, "logits/chosen": -2.734375, "logits/rejected": -2.28125, "logps/chosen": -652.0, "logps/rejected": -984.0, "loss": 0.2051, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.96875, "rewards/margins": 3.421875, "rewards/rejected": -8.375, "step": 5910 }, { "epoch": 0.4454141900534196, "grad_norm": 9.630730722355262, "learning_rate": 3.3928846721107104e-07, "logits/chosen": -2.6875, "logits/rejected": -2.46875, "logps/chosen": -660.0, "logps/rejected": -1024.0, "loss": 0.1756, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 3.734375, "rewards/rejected": -8.6875, "step": 5920 }, { "epoch": 0.4461665788879693, "grad_norm": 9.37045022556023, "learning_rate": 3.386748352342664e-07, "logits/chosen": -2.796875, "logits/rejected": -2.609375, "logps/chosen": -724.0, "logps/rejected": -1080.0, "loss": 0.2018, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.625, "rewards/margins": 3.765625, "rewards/rejected": -9.375, "step": 5930 }, { "epoch": 0.446918967722519, "grad_norm": 12.326318807630711, "learning_rate": 3.380605915202419e-07, "logits/chosen": -2.796875, "logits/rejected": -2.46875, "logps/chosen": -696.0, "logps/rejected": -1048.0, "loss": 0.2138, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.34375, "rewards/margins": 3.703125, "rewards/rejected": -9.0625, "step": 5940 }, { "epoch": 0.4476713565570687, "grad_norm": 7.9992465107079935, "learning_rate": 3.3744574030645366e-07, "logits/chosen": -2.734375, "logits/rejected": -2.484375, "logps/chosen": -688.0, "logps/rejected": -984.0, "loss": 0.2101, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.28125, "rewards/margins": 3.078125, "rewards/rejected": -8.375, "step": 5950 }, { "epoch": 0.4484237453916184, "grad_norm": 10.267488592156788, "learning_rate": 3.3683028583454896e-07, "logits/chosen": -2.84375, "logits/rejected": -2.53125, "logps/chosen": -664.0, "logps/rejected": -1024.0, "loss": 0.2049, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 3.65625, "rewards/rejected": -8.625, "step": 5960 }, { "epoch": 0.4491761342261681, "grad_norm": 9.587336120051534, "learning_rate": 3.3621423235033687e-07, "logits/chosen": -2.875, "logits/rejected": -2.5625, "logps/chosen": -680.0, "logps/rejected": -1040.0, "loss": 0.2091, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 3.734375, "rewards/rejected": -8.875, "step": 5970 }, { "epoch": 0.4499285230607178, "grad_norm": 14.237958212512485, "learning_rate": 3.355975841037585e-07, "logits/chosen": -2.71875, "logits/rejected": -2.390625, "logps/chosen": -664.0, "logps/rejected": -1016.0, "loss": 0.2047, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.46875, "rewards/rejected": -8.625, "step": 5980 }, { "epoch": 0.4506809118952675, "grad_norm": 7.320491760776586, "learning_rate": 3.3498034534885844e-07, "logits/chosen": -2.6875, "logits/rejected": -2.421875, "logps/chosen": -684.0, "logps/rejected": -1024.0, "loss": 0.1951, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 3.5625, "rewards/rejected": -8.875, "step": 5990 }, { "epoch": 0.4514333007298172, "grad_norm": 7.2013732968721085, "learning_rate": 3.343625203437547e-07, "logits/chosen": -2.734375, "logits/rejected": -2.4375, "logps/chosen": -676.0, "logps/rejected": -1048.0, "loss": 0.1931, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.25, "rewards/margins": 3.84375, "rewards/rejected": -9.125, "step": 6000 }, { "epoch": 0.4521856895643669, "grad_norm": 7.647738073826342, "learning_rate": 3.337441133506096e-07, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -680.0, "logps/rejected": -1040.0, "loss": 0.2241, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.25, "rewards/margins": 3.640625, "rewards/rejected": -8.875, "step": 6010 }, { "epoch": 0.45293807839891653, "grad_norm": 12.023931746185132, "learning_rate": 3.3312512863560053e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -648.0, "logps/rejected": -1008.0, "loss": 0.202, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.0, "rewards/margins": 3.703125, "rewards/rejected": -8.6875, "step": 6020 }, { "epoch": 0.45369046723346623, "grad_norm": 6.179188085390876, "learning_rate": 3.325055704688906e-07, "logits/chosen": -2.65625, "logits/rejected": -2.5, "logps/chosen": -656.0, "logps/rejected": -1000.0, "loss": 0.1926, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0, "rewards/margins": 3.640625, "rewards/rejected": -8.625, "step": 6030 }, { "epoch": 0.45444285606801593, "grad_norm": 12.492144030308038, "learning_rate": 3.318854431245984e-07, "logits/chosen": -2.65625, "logits/rejected": -2.421875, "logps/chosen": -668.0, "logps/rejected": -1040.0, "loss": 0.2073, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.09375, "rewards/margins": 3.875, "rewards/rejected": -9.0, "step": 6040 }, { "epoch": 0.45519524490256563, "grad_norm": 8.812424602807239, "learning_rate": 3.312647508807696e-07, "logits/chosen": -2.734375, "logits/rejected": -2.5625, "logps/chosen": -700.0, "logps/rejected": -1048.0, "loss": 0.2168, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.46875, "rewards/margins": 3.421875, "rewards/rejected": -8.875, "step": 6050 }, { "epoch": 0.45594763373711533, "grad_norm": 13.63172423692031, "learning_rate": 3.3064349801934664e-07, "logits/chosen": -2.65625, "logits/rejected": -2.328125, "logps/chosen": -680.0, "logps/rejected": -1072.0, "loss": 0.1776, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 4.0625, "rewards/rejected": -9.1875, "step": 6060 }, { "epoch": 0.45670002257166503, "grad_norm": 8.121800577128553, "learning_rate": 3.300216888261396e-07, "logits/chosen": -2.796875, "logits/rejected": -2.453125, "logps/chosen": -752.0, "logps/rejected": -1128.0, "loss": 0.1985, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.78125, "rewards/margins": 3.875, "rewards/rejected": -9.625, "step": 6070 }, { "epoch": 0.45745241140621473, "grad_norm": 8.190803307707759, "learning_rate": 3.2939932759079655e-07, "logits/chosen": -2.6875, "logits/rejected": -2.40625, "logps/chosen": -716.0, "logps/rejected": -1064.0, "loss": 0.1866, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.46875, "rewards/margins": 3.640625, "rewards/rejected": -9.0625, "step": 6080 }, { "epoch": 0.45820480024076443, "grad_norm": 8.516271503772927, "learning_rate": 3.2877641860677357e-07, "logits/chosen": -2.65625, "logits/rejected": -2.40625, "logps/chosen": -696.0, "logps/rejected": -1096.0, "loss": 0.1821, "rewards/accuracies": 0.9375, "rewards/chosen": -5.65625, "rewards/margins": 3.796875, "rewards/rejected": -9.4375, "step": 6090 }, { "epoch": 0.45895718907531413, "grad_norm": 9.771485569804804, "learning_rate": 3.2815296617130585e-07, "logits/chosen": -2.625, "logits/rejected": -2.28125, "logps/chosen": -676.0, "logps/rejected": -1032.0, "loss": 0.2016, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.1875, "rewards/margins": 3.484375, "rewards/rejected": -8.6875, "step": 6100 }, { "epoch": 0.45970957790986383, "grad_norm": 9.188812610635678, "learning_rate": 3.275289745853775e-07, "logits/chosen": -2.71875, "logits/rejected": -2.421875, "logps/chosen": -672.0, "logps/rejected": -1024.0, "loss": 0.1841, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 3.6875, "rewards/rejected": -8.625, "step": 6110 }, { "epoch": 0.46046196674441353, "grad_norm": 8.341594160581772, "learning_rate": 3.269044481536921e-07, "logits/chosen": -2.796875, "logits/rejected": -2.609375, "logps/chosen": -688.0, "logps/rejected": -1056.0, "loss": 0.2123, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.28125, "rewards/margins": 3.859375, "rewards/rejected": -9.125, "step": 6120 }, { "epoch": 0.46121435557896323, "grad_norm": 10.532580186377736, "learning_rate": 3.2627939118464296e-07, "logits/chosen": -2.71875, "logits/rejected": -2.515625, "logps/chosen": -648.0, "logps/rejected": -1032.0, "loss": 0.1958, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.90625, "rewards/margins": 3.984375, "rewards/rejected": -8.875, "step": 6130 }, { "epoch": 0.46196674441351293, "grad_norm": 7.320156752221225, "learning_rate": 3.256538079902833e-07, "logits/chosen": -2.78125, "logits/rejected": -2.46875, "logps/chosen": -676.0, "logps/rejected": -1088.0, "loss": 0.2068, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 4.21875, "rewards/rejected": -9.375, "step": 6140 }, { "epoch": 0.4627191332480626, "grad_norm": 12.347346631437645, "learning_rate": 3.2502770288629655e-07, "logits/chosen": -2.828125, "logits/rejected": -2.5, "logps/chosen": -708.0, "logps/rejected": -1032.0, "loss": 0.2143, "rewards/accuracies": 0.90625, "rewards/chosen": -5.375, "rewards/margins": 3.421875, "rewards/rejected": -8.8125, "step": 6150 }, { "epoch": 0.4634715220826123, "grad_norm": 7.790431152555406, "learning_rate": 3.244010801919668e-07, "logits/chosen": -2.84375, "logits/rejected": -2.484375, "logps/chosen": -708.0, "logps/rejected": -1072.0, "loss": 0.2069, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.46875, "rewards/margins": 3.84375, "rewards/rejected": -9.3125, "step": 6160 }, { "epoch": 0.464223910917162, "grad_norm": 9.668021425592368, "learning_rate": 3.237739442301487e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1088.0, "loss": 0.1832, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.34375, "rewards/rejected": -9.4375, "step": 6170 }, { "epoch": 0.4649762997517117, "grad_norm": 7.34718078248086, "learning_rate": 3.231462993272377e-07, "logits/chosen": -2.765625, "logits/rejected": -2.546875, "logps/chosen": -660.0, "logps/rejected": -1024.0, "loss": 0.1953, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 3.703125, "rewards/rejected": -8.625, "step": 6180 }, { "epoch": 0.4657286885862614, "grad_norm": 6.688761060186978, "learning_rate": 3.225181498131404e-07, "logits/chosen": -2.75, "logits/rejected": -2.484375, "logps/chosen": -616.0, "logps/rejected": -1032.0, "loss": 0.1804, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.59375, "rewards/margins": 4.09375, "rewards/rejected": -8.6875, "step": 6190 }, { "epoch": 0.4664810774208111, "grad_norm": 9.071499105270645, "learning_rate": 3.218895000212445e-07, "logits/chosen": -2.71875, "logits/rejected": -2.484375, "logps/chosen": -628.0, "logps/rejected": -1020.0, "loss": 0.2185, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.6875, "rewards/margins": 3.84375, "rewards/rejected": -8.5, "step": 6200 }, { "epoch": 0.4672334662553608, "grad_norm": 7.959567822906466, "learning_rate": 3.2126035428838874e-07, "logits/chosen": -2.796875, "logits/rejected": -2.65625, "logps/chosen": -676.0, "logps/rejected": -1012.0, "loss": 0.1909, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.90625, "rewards/margins": 3.515625, "rewards/rejected": -8.4375, "step": 6210 }, { "epoch": 0.4679858550899105, "grad_norm": 10.87084790524432, "learning_rate": 3.2063071695483354e-07, "logits/chosen": -2.796875, "logits/rejected": -2.546875, "logps/chosen": -676.0, "logps/rejected": -1016.0, "loss": 0.2136, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -5.21875, "rewards/margins": 3.265625, "rewards/rejected": -8.5, "step": 6220 }, { "epoch": 0.4687382439244602, "grad_norm": 8.853900543247734, "learning_rate": 3.200005923642305e-07, "logits/chosen": -2.765625, "logits/rejected": -2.421875, "logps/chosen": -672.0, "logps/rejected": -1032.0, "loss": 0.2018, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 3.515625, "rewards/rejected": -8.75, "step": 6230 }, { "epoch": 0.4694906327590099, "grad_norm": 7.092157700338752, "learning_rate": 3.193699848635925e-07, "logits/chosen": -2.578125, "logits/rejected": -2.296875, "logps/chosen": -704.0, "logps/rejected": -1056.0, "loss": 0.1805, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.40625, "rewards/margins": 3.640625, "rewards/rejected": -9.0625, "step": 6240 }, { "epoch": 0.4702430215935596, "grad_norm": 6.085087527736119, "learning_rate": 3.1873889880326425e-07, "logits/chosen": -2.59375, "logits/rejected": -2.359375, "logps/chosen": -644.0, "logps/rejected": -968.0, "loss": 0.2104, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.875, "rewards/margins": 3.359375, "rewards/rejected": -8.25, "step": 6250 }, { "epoch": 0.4709954104281093, "grad_norm": 8.454444908810444, "learning_rate": 3.1810733853689153e-07, "logits/chosen": -2.84375, "logits/rejected": -2.328125, "logps/chosen": -656.0, "logps/rejected": -1012.0, "loss": 0.1951, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.59375, "rewards/margins": 3.65625, "rewards/rejected": -8.25, "step": 6260 }, { "epoch": 0.4717477992626589, "grad_norm": 11.017228443394977, "learning_rate": 3.1747530842139157e-07, "logits/chosen": -2.859375, "logits/rejected": -2.546875, "logps/chosen": -620.0, "logps/rejected": -972.0, "loss": 0.2342, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.59375, "rewards/margins": 3.5625, "rewards/rejected": -8.1875, "step": 6270 }, { "epoch": 0.4725001880972086, "grad_norm": 7.775119852317317, "learning_rate": 3.1684281281692305e-07, "logits/chosen": -2.84375, "logits/rejected": -2.53125, "logps/chosen": -616.0, "logps/rejected": -924.0, "loss": 0.2249, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.625, "rewards/margins": 3.125, "rewards/rejected": -7.71875, "step": 6280 }, { "epoch": 0.4732525769317583, "grad_norm": 10.2283707956898, "learning_rate": 3.162098560868558e-07, "logits/chosen": -2.546875, "logits/rejected": -2.40625, "logps/chosen": -608.0, "logps/rejected": -936.0, "loss": 0.1865, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.4375, "rewards/margins": 3.484375, "rewards/rejected": -7.90625, "step": 6290 }, { "epoch": 0.474004965766308, "grad_norm": 9.541214997477915, "learning_rate": 3.155764425977408e-07, "logits/chosen": -2.796875, "logits/rejected": -2.515625, "logps/chosen": -664.0, "logps/rejected": -996.0, "loss": 0.1723, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.9375, "rewards/margins": 3.703125, "rewards/rejected": -8.625, "step": 6300 }, { "epoch": 0.4747573546008577, "grad_norm": 8.776310012064307, "learning_rate": 3.1494257671928006e-07, "logits/chosen": -2.703125, "logits/rejected": -2.546875, "logps/chosen": -708.0, "logps/rejected": -1040.0, "loss": 0.2202, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.625, "rewards/margins": 3.390625, "rewards/rejected": -9.0, "step": 6310 }, { "epoch": 0.4755097434354074, "grad_norm": 10.364964175951227, "learning_rate": 3.1430826282429653e-07, "logits/chosen": -2.734375, "logits/rejected": -2.4375, "logps/chosen": -692.0, "logps/rejected": -1032.0, "loss": 0.1931, "rewards/accuracies": 0.90625, "rewards/chosen": -5.34375, "rewards/margins": 3.609375, "rewards/rejected": -8.9375, "step": 6320 }, { "epoch": 0.4762621322699571, "grad_norm": 9.44277278928709, "learning_rate": 3.136735052887038e-07, "logits/chosen": -2.671875, "logits/rejected": -2.484375, "logps/chosen": -644.0, "logps/rejected": -956.0, "loss": 0.208, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.625, "rewards/margins": 3.5, "rewards/rejected": -8.125, "step": 6330 }, { "epoch": 0.4770145211045068, "grad_norm": 9.715746849550793, "learning_rate": 3.1303830849147594e-07, "logits/chosen": -2.71875, "logits/rejected": -2.390625, "logps/chosen": -676.0, "logps/rejected": -972.0, "loss": 0.2013, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.09375, "rewards/margins": 3.1875, "rewards/rejected": -8.3125, "step": 6340 }, { "epoch": 0.4777669099390565, "grad_norm": 10.84726188854446, "learning_rate": 3.124026768146176e-07, "logits/chosen": -2.578125, "logits/rejected": -2.359375, "logps/chosen": -660.0, "logps/rejected": -1016.0, "loss": 0.2007, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 3.59375, "rewards/rejected": -8.5625, "step": 6350 }, { "epoch": 0.4785192987736062, "grad_norm": 9.066014191689984, "learning_rate": 3.117666146431331e-07, "logits/chosen": -2.515625, "logits/rejected": -2.296875, "logps/chosen": -700.0, "logps/rejected": -1020.0, "loss": 0.2123, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.375, "rewards/margins": 3.25, "rewards/rejected": -8.625, "step": 6360 }, { "epoch": 0.4792716876081559, "grad_norm": 6.842520890359096, "learning_rate": 3.111301263649969e-07, "logits/chosen": -2.4375, "logits/rejected": -2.140625, "logps/chosen": -680.0, "logps/rejected": -1012.0, "loss": 0.2037, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.28125, "rewards/margins": 3.34375, "rewards/rejected": -8.625, "step": 6370 }, { "epoch": 0.4800240764427056, "grad_norm": 12.209356701860676, "learning_rate": 3.1049321637112303e-07, "logits/chosen": -2.546875, "logits/rejected": -2.328125, "logps/chosen": -696.0, "logps/rejected": -1056.0, "loss": 0.2112, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 3.671875, "rewards/rejected": -9.0, "step": 6380 }, { "epoch": 0.48077646527725526, "grad_norm": 9.512552820232575, "learning_rate": 3.0985588905533456e-07, "logits/chosen": -2.53125, "logits/rejected": -2.234375, "logps/chosen": -652.0, "logps/rejected": -1012.0, "loss": 0.1866, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.84375, "rewards/margins": 3.84375, "rewards/rejected": -8.6875, "step": 6390 }, { "epoch": 0.48152885411180496, "grad_norm": 10.059637531130333, "learning_rate": 3.0921814881433373e-07, "logits/chosen": -2.546875, "logits/rejected": -2.265625, "logps/chosen": -668.0, "logps/rejected": -1008.0, "loss": 0.2111, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0625, "rewards/margins": 3.390625, "rewards/rejected": -8.4375, "step": 6400 }, { "epoch": 0.48228124294635466, "grad_norm": 8.644085022609678, "learning_rate": 3.0858000004767125e-07, "logits/chosen": -2.546875, "logits/rejected": -2.234375, "logps/chosen": -704.0, "logps/rejected": -1032.0, "loss": 0.1879, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.4375, "rewards/margins": 3.40625, "rewards/rejected": -8.875, "step": 6410 }, { "epoch": 0.48303363178090436, "grad_norm": 12.066282329982663, "learning_rate": 3.079414471577163e-07, "logits/chosen": -2.65625, "logits/rejected": -2.4375, "logps/chosen": -636.0, "logps/rejected": -992.0, "loss": 0.2003, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.78125, "rewards/margins": 3.765625, "rewards/rejected": -8.5625, "step": 6420 }, { "epoch": 0.48378602061545406, "grad_norm": 7.856803020586083, "learning_rate": 3.073024945496258e-07, "logits/chosen": -2.671875, "logits/rejected": -2.453125, "logps/chosen": -640.0, "logps/rejected": -1008.0, "loss": 0.1913, "rewards/accuracies": 0.90625, "rewards/chosen": -4.625, "rewards/margins": 3.859375, "rewards/rejected": -8.5, "step": 6430 }, { "epoch": 0.48453840945000376, "grad_norm": 11.09236658150085, "learning_rate": 3.0666314663131435e-07, "logits/chosen": -2.71875, "logits/rejected": -2.390625, "logps/chosen": -688.0, "logps/rejected": -1040.0, "loss": 0.1982, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3125, "rewards/margins": 3.6875, "rewards/rejected": -9.0, "step": 6440 }, { "epoch": 0.48529079828455346, "grad_norm": 9.143651153138784, "learning_rate": 3.0602340781342343e-07, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -684.0, "logps/rejected": -996.0, "loss": 0.1978, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 3.453125, "rewards/rejected": -8.5, "step": 6450 }, { "epoch": 0.48604318711910316, "grad_norm": 7.950015653785829, "learning_rate": 3.0538328250929147e-07, "logits/chosen": -2.796875, "logits/rejected": -2.453125, "logps/chosen": -668.0, "logps/rejected": -1096.0, "loss": 0.2173, "rewards/accuracies": 0.9375, "rewards/chosen": -5.21875, "rewards/margins": 4.34375, "rewards/rejected": -9.5625, "step": 6460 }, { "epoch": 0.48679557595365286, "grad_norm": 6.443015459033557, "learning_rate": 3.047427751349228e-07, "logits/chosen": -2.671875, "logits/rejected": -2.40625, "logps/chosen": -676.0, "logps/rejected": -980.0, "loss": 0.1794, "rewards/accuracies": 0.90625, "rewards/chosen": -4.90625, "rewards/margins": 3.40625, "rewards/rejected": -8.3125, "step": 6470 }, { "epoch": 0.48754796478820256, "grad_norm": 10.78015035404081, "learning_rate": 3.041018901089579e-07, "logits/chosen": -2.75, "logits/rejected": -2.3125, "logps/chosen": -640.0, "logps/rejected": -1004.0, "loss": 0.1987, "rewards/accuracies": 0.875, "rewards/chosen": -4.9375, "rewards/margins": 3.5, "rewards/rejected": -8.4375, "step": 6480 }, { "epoch": 0.48830035362275226, "grad_norm": 10.682871563209549, "learning_rate": 3.034606318526423e-07, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -628.0, "logps/rejected": -980.0, "loss": 0.1876, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.6875, "rewards/margins": 3.671875, "rewards/rejected": -8.375, "step": 6490 }, { "epoch": 0.48905274245730196, "grad_norm": 8.745725840708767, "learning_rate": 3.028190047897964e-07, "logits/chosen": -2.84375, "logits/rejected": -2.53125, "logps/chosen": -628.0, "logps/rejected": -984.0, "loss": 0.2016, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.78125, "rewards/margins": 3.640625, "rewards/rejected": -8.4375, "step": 6500 }, { "epoch": 0.4898051312918516, "grad_norm": 10.212810861352294, "learning_rate": 3.021770133467848e-07, "logits/chosen": -2.640625, "logits/rejected": -2.359375, "logps/chosen": -648.0, "logps/rejected": -1000.0, "loss": 0.2123, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.8125, "rewards/margins": 3.390625, "rewards/rejected": -8.25, "step": 6510 }, { "epoch": 0.4905575201264013, "grad_norm": 10.969586831458997, "learning_rate": 3.015346619524859e-07, "logits/chosen": -2.65625, "logits/rejected": -2.375, "logps/chosen": -660.0, "logps/rejected": -1020.0, "loss": 0.1798, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.9375, "rewards/margins": 3.765625, "rewards/rejected": -8.6875, "step": 6520 }, { "epoch": 0.491309908960951, "grad_norm": 10.171261226144825, "learning_rate": 3.008919550382613e-07, "logits/chosen": -2.6875, "logits/rejected": -2.34375, "logps/chosen": -640.0, "logps/rejected": -1004.0, "loss": 0.1922, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.71875, "rewards/margins": 3.65625, "rewards/rejected": -8.375, "step": 6530 }, { "epoch": 0.4920622977955007, "grad_norm": 8.057760830640836, "learning_rate": 3.0024889703792513e-07, "logits/chosen": -2.65625, "logits/rejected": -2.34375, "logps/chosen": -652.0, "logps/rejected": -996.0, "loss": 0.1943, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.84375, "rewards/margins": 3.578125, "rewards/rejected": -8.375, "step": 6540 }, { "epoch": 0.4928146866300504, "grad_norm": 11.694363321978567, "learning_rate": 2.996054923877136e-07, "logits/chosen": -2.65625, "logits/rejected": -2.328125, "logps/chosen": -644.0, "logps/rejected": -996.0, "loss": 0.2085, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.0, "rewards/margins": 3.546875, "rewards/rejected": -8.5, "step": 6550 }, { "epoch": 0.4935670754646001, "grad_norm": 7.79473303639143, "learning_rate": 2.9896174552625427e-07, "logits/chosen": -2.765625, "logits/rejected": -2.4375, "logps/chosen": -608.0, "logps/rejected": -988.0, "loss": 0.1914, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.34375, "rewards/margins": 3.9375, "rewards/rejected": -8.25, "step": 6560 }, { "epoch": 0.4943194642991498, "grad_norm": 9.037599359618401, "learning_rate": 2.983176608945356e-07, "logits/chosen": -2.65625, "logits/rejected": -2.34375, "logps/chosen": -620.0, "logps/rejected": -1020.0, "loss": 0.209, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 3.984375, "rewards/rejected": -8.625, "step": 6570 }, { "epoch": 0.4950718531336995, "grad_norm": 9.352384133988807, "learning_rate": 2.9767324293587617e-07, "logits/chosen": -2.546875, "logits/rejected": -2.25, "logps/chosen": -612.0, "logps/rejected": -976.0, "loss": 0.2026, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.40625, "rewards/margins": 3.71875, "rewards/rejected": -8.125, "step": 6580 }, { "epoch": 0.4958242419682492, "grad_norm": 9.59161054448862, "learning_rate": 2.9702849609589403e-07, "logits/chosen": -2.65625, "logits/rejected": -2.28125, "logps/chosen": -588.0, "logps/rejected": -952.0, "loss": 0.2098, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.375, "rewards/margins": 3.46875, "rewards/rejected": -7.84375, "step": 6590 }, { "epoch": 0.4965766308027989, "grad_norm": 8.118843911163207, "learning_rate": 2.96383424822476e-07, "logits/chosen": -2.578125, "logits/rejected": -2.25, "logps/chosen": -624.0, "logps/rejected": -964.0, "loss": 0.188, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5625, "rewards/margins": 3.671875, "rewards/rejected": -8.25, "step": 6600 }, { "epoch": 0.4973290196373486, "grad_norm": 11.423896632062768, "learning_rate": 2.957380335657473e-07, "logits/chosen": -2.671875, "logits/rejected": -2.375, "logps/chosen": -624.0, "logps/rejected": -1032.0, "loss": 0.1888, "rewards/accuracies": 0.90625, "rewards/chosen": -4.6875, "rewards/margins": 4.0625, "rewards/rejected": -8.75, "step": 6610 }, { "epoch": 0.4980814084718983, "grad_norm": 9.96296597496019, "learning_rate": 2.950923267780405e-07, "logits/chosen": -2.71875, "logits/rejected": -2.453125, "logps/chosen": -668.0, "logps/rejected": -1020.0, "loss": 0.1912, "rewards/accuracies": 0.90625, "rewards/chosen": -5.125, "rewards/margins": 3.578125, "rewards/rejected": -8.6875, "step": 6620 }, { "epoch": 0.49883379730644795, "grad_norm": 9.8686695233564, "learning_rate": 2.944463089138648e-07, "logits/chosen": -2.625, "logits/rejected": -2.375, "logps/chosen": -620.0, "logps/rejected": -976.0, "loss": 0.2012, "rewards/accuracies": 0.9375, "rewards/chosen": -4.625, "rewards/margins": 3.703125, "rewards/rejected": -8.3125, "step": 6630 }, { "epoch": 0.49958618614099765, "grad_norm": 6.179640034047122, "learning_rate": 2.937999844298753e-07, "logits/chosen": -2.6875, "logits/rejected": -2.421875, "logps/chosen": -640.0, "logps/rejected": -1004.0, "loss": 0.1938, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.9375, "rewards/margins": 3.765625, "rewards/rejected": -8.6875, "step": 6640 }, { "epoch": 0.5003385749755473, "grad_norm": 7.675131257102138, "learning_rate": 2.931533577848428e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -696.0, "logps/rejected": -1056.0, "loss": 0.1997, "rewards/accuracies": 0.90625, "rewards/chosen": -5.25, "rewards/margins": 3.890625, "rewards/rejected": -9.125, "step": 6650 }, { "epoch": 0.5010909638100971, "grad_norm": 8.313739766862003, "learning_rate": 2.9250643343962216e-07, "logits/chosen": -2.703125, "logits/rejected": -2.40625, "logps/chosen": -692.0, "logps/rejected": -1072.0, "loss": 0.2064, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 3.9375, "rewards/rejected": -9.125, "step": 6660 }, { "epoch": 0.5018433526446467, "grad_norm": 11.372548847751178, "learning_rate": 2.918592158571223e-07, "logits/chosen": -2.71875, "logits/rejected": -2.453125, "logps/chosen": -672.0, "logps/rejected": -1032.0, "loss": 0.1949, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.15625, "rewards/margins": 3.59375, "rewards/rejected": -8.75, "step": 6670 }, { "epoch": 0.5025957414791965, "grad_norm": 11.266999098170334, "learning_rate": 2.912117095022749e-07, "logits/chosen": -2.84375, "logits/rejected": -2.453125, "logps/chosen": -676.0, "logps/rejected": -1056.0, "loss": 0.1877, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.03125, "rewards/margins": 3.84375, "rewards/rejected": -8.875, "step": 6680 }, { "epoch": 0.5033481303137461, "grad_norm": 9.136922702076072, "learning_rate": 2.9056391884200374e-07, "logits/chosen": -2.796875, "logits/rejected": -2.375, "logps/chosen": -652.0, "logps/rejected": -1012.0, "loss": 0.1963, "rewards/accuracies": 0.9375, "rewards/chosen": -4.90625, "rewards/margins": 3.734375, "rewards/rejected": -8.625, "step": 6690 }, { "epoch": 0.5041005191482958, "grad_norm": 9.283390523324467, "learning_rate": 2.8991584834519405e-07, "logits/chosen": -2.65625, "logits/rejected": -2.46875, "logps/chosen": -664.0, "logps/rejected": -1000.0, "loss": 0.2166, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.90625, "rewards/margins": 3.40625, "rewards/rejected": -8.3125, "step": 6700 }, { "epoch": 0.5048529079828455, "grad_norm": 9.816366740181282, "learning_rate": 2.8926750248266165e-07, "logits/chosen": -2.734375, "logits/rejected": -2.375, "logps/chosen": -696.0, "logps/rejected": -1040.0, "loss": 0.1932, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.125, "rewards/margins": 3.65625, "rewards/rejected": -8.8125, "step": 6710 }, { "epoch": 0.5056052968173952, "grad_norm": 22.193797590045083, "learning_rate": 2.886188857271217e-07, "logits/chosen": -2.75, "logits/rejected": -2.375, "logps/chosen": -680.0, "logps/rejected": -1056.0, "loss": 0.1897, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.15625, "rewards/margins": 3.984375, "rewards/rejected": -9.125, "step": 6720 }, { "epoch": 0.5063576856519449, "grad_norm": 10.433022889811856, "learning_rate": 2.879700025531583e-07, "logits/chosen": -2.71875, "logits/rejected": -2.34375, "logps/chosen": -644.0, "logps/rejected": -980.0, "loss": 0.1868, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 3.609375, "rewards/rejected": -8.5625, "step": 6730 }, { "epoch": 0.5071100744864946, "grad_norm": 9.7116225698569, "learning_rate": 2.873208574371937e-07, "logits/chosen": -2.671875, "logits/rejected": -2.40625, "logps/chosen": -692.0, "logps/rejected": -1032.0, "loss": 0.2132, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.28125, "rewards/margins": 3.515625, "rewards/rejected": -8.8125, "step": 6740 }, { "epoch": 0.5078624633210443, "grad_norm": 9.385981192467634, "learning_rate": 2.8667145485745684e-07, "logits/chosen": -2.78125, "logits/rejected": -2.40625, "logps/chosen": -660.0, "logps/rejected": -1016.0, "loss": 0.1993, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.75, "rewards/margins": 3.8125, "rewards/rejected": -8.5625, "step": 6750 }, { "epoch": 0.508614852155594, "grad_norm": 8.52486755295249, "learning_rate": 2.860217992939532e-07, "logits/chosen": -2.5625, "logits/rejected": -2.3125, "logps/chosen": -624.0, "logps/rejected": -980.0, "loss": 0.1908, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.625, "rewards/margins": 3.59375, "rewards/rejected": -8.25, "step": 6760 }, { "epoch": 0.5093672409901437, "grad_norm": 16.725319922468845, "learning_rate": 2.853718952284331e-07, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -636.0, "logps/rejected": -992.0, "loss": 0.1945, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.625, "rewards/margins": 3.84375, "rewards/rejected": -8.4375, "step": 6770 }, { "epoch": 0.5101196298246934, "grad_norm": 10.766654597956364, "learning_rate": 2.8472174714436137e-07, "logits/chosen": -2.75, "logits/rejected": -2.359375, "logps/chosen": -632.0, "logps/rejected": -1008.0, "loss": 0.2055, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.78125, "rewards/margins": 3.859375, "rewards/rejected": -8.625, "step": 6780 }, { "epoch": 0.5108720186592431, "grad_norm": 10.372433884469864, "learning_rate": 2.8407135952688634e-07, "logits/chosen": -2.71875, "logits/rejected": -2.453125, "logps/chosen": -648.0, "logps/rejected": -968.0, "loss": 0.1685, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.90625, "rewards/margins": 3.34375, "rewards/rejected": -8.25, "step": 6790 }, { "epoch": 0.5116244074937928, "grad_norm": 7.884038617129319, "learning_rate": 2.834207368628088e-07, "logits/chosen": -2.78125, "logits/rejected": -2.453125, "logps/chosen": -648.0, "logps/rejected": -1040.0, "loss": 0.2004, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.90625, "rewards/margins": 3.9375, "rewards/rejected": -8.875, "step": 6800 }, { "epoch": 0.5123767963283424, "grad_norm": 8.052127827588135, "learning_rate": 2.8276988364055076e-07, "logits/chosen": -2.875, "logits/rejected": -2.5625, "logps/chosen": -652.0, "logps/rejected": -1112.0, "loss": 0.1753, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 4.84375, "rewards/rejected": -9.5625, "step": 6810 }, { "epoch": 0.5131291851628922, "grad_norm": 9.558785990781068, "learning_rate": 2.821188043501251e-07, "logits/chosen": -2.9375, "logits/rejected": -2.671875, "logps/chosen": -664.0, "logps/rejected": -1032.0, "loss": 0.1988, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.15625, "rewards/margins": 3.78125, "rewards/rejected": -8.9375, "step": 6820 }, { "epoch": 0.5138815739974418, "grad_norm": 10.191515513058384, "learning_rate": 2.8146750348310406e-07, "logits/chosen": -2.8125, "logits/rejected": -2.640625, "logps/chosen": -652.0, "logps/rejected": -1000.0, "loss": 0.1967, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.78125, "rewards/margins": 3.65625, "rewards/rejected": -8.4375, "step": 6830 }, { "epoch": 0.5146339628319916, "grad_norm": 9.206459675501884, "learning_rate": 2.8081598553258863e-07, "logits/chosen": -2.9375, "logits/rejected": -2.59375, "logps/chosen": -696.0, "logps/rejected": -1088.0, "loss": 0.1693, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.0625, "rewards/rejected": -9.25, "step": 6840 }, { "epoch": 0.5153863516665412, "grad_norm": 8.438051844247585, "learning_rate": 2.801642549931773e-07, "logits/chosen": -2.921875, "logits/rejected": -2.5625, "logps/chosen": -696.0, "logps/rejected": -1056.0, "loss": 0.166, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.21875, "rewards/margins": 3.765625, "rewards/rejected": -9.0, "step": 6850 }, { "epoch": 0.516138740501091, "grad_norm": 8.411109863368345, "learning_rate": 2.7951231636093496e-07, "logits/chosen": -2.875, "logits/rejected": -2.625, "logps/chosen": -648.0, "logps/rejected": -1040.0, "loss": 0.1671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.75, "rewards/margins": 4.1875, "rewards/rejected": -8.9375, "step": 6860 }, { "epoch": 0.5168911293356406, "grad_norm": 9.113752670152984, "learning_rate": 2.788601741333623e-07, "logits/chosen": -2.859375, "logits/rejected": -2.46875, "logps/chosen": -652.0, "logps/rejected": -1056.0, "loss": 0.1667, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 4.15625, "rewards/rejected": -9.0, "step": 6870 }, { "epoch": 0.5176435181701904, "grad_norm": 9.555789599756581, "learning_rate": 2.782078328093646e-07, "logits/chosen": -2.9375, "logits/rejected": -2.46875, "logps/chosen": -620.0, "logps/rejected": -1056.0, "loss": 0.1981, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.25, "rewards/rejected": -8.875, "step": 6880 }, { "epoch": 0.51839590700474, "grad_norm": 9.329766699762661, "learning_rate": 2.775552968892205e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -616.0, "logps/rejected": -972.0, "loss": 0.1863, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.59375, "rewards/margins": 3.703125, "rewards/rejected": -8.3125, "step": 6890 }, { "epoch": 0.5191482958392898, "grad_norm": 10.161837533103533, "learning_rate": 2.76902570874551e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -632.0, "logps/rejected": -1024.0, "loss": 0.1849, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.03125, "rewards/rejected": -8.75, "step": 6900 }, { "epoch": 0.5199006846738394, "grad_norm": 10.983457686576111, "learning_rate": 2.7624965926828856e-07, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -604.0, "logps/rejected": -984.0, "loss": 0.2057, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5625, "rewards/margins": 3.828125, "rewards/rejected": -8.375, "step": 6910 }, { "epoch": 0.5206530735083892, "grad_norm": 9.70579239659648, "learning_rate": 2.7559656657464615e-07, "logits/chosen": -2.734375, "logits/rejected": -2.515625, "logps/chosen": -652.0, "logps/rejected": -968.0, "loss": 0.2082, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.8125, "rewards/margins": 3.3125, "rewards/rejected": -8.125, "step": 6920 }, { "epoch": 0.5214054623429388, "grad_norm": 11.541497496246196, "learning_rate": 2.7494329729908585e-07, "logits/chosen": -2.765625, "logits/rejected": -2.4375, "logps/chosen": -652.0, "logps/rejected": -1072.0, "loss": 0.1915, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.03125, "rewards/rejected": -9.125, "step": 6930 }, { "epoch": 0.5221578511774885, "grad_norm": 7.92010618143839, "learning_rate": 2.7428985594828785e-07, "logits/chosen": -2.875, "logits/rejected": -2.65625, "logps/chosen": -664.0, "logps/rejected": -1064.0, "loss": 0.2087, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 4.03125, "rewards/rejected": -8.9375, "step": 6940 }, { "epoch": 0.5229102400120382, "grad_norm": 9.537578165888158, "learning_rate": 2.736362470301195e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -660.0, "logps/rejected": -1008.0, "loss": 0.1796, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.15625, "rewards/margins": 3.59375, "rewards/rejected": -8.75, "step": 6950 }, { "epoch": 0.5236626288465879, "grad_norm": 10.626977421177818, "learning_rate": 2.7298247505360414e-07, "logits/chosen": -2.90625, "logits/rejected": -2.578125, "logps/chosen": -672.0, "logps/rejected": -1080.0, "loss": 0.1903, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.21875, "rewards/rejected": -9.25, "step": 6960 }, { "epoch": 0.5244150176811376, "grad_norm": 8.431235259296844, "learning_rate": 2.723285445288902e-07, "logits/chosen": -2.78125, "logits/rejected": -2.609375, "logps/chosen": -640.0, "logps/rejected": -1004.0, "loss": 0.1986, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.8125, "rewards/margins": 3.765625, "rewards/rejected": -8.5625, "step": 6970 }, { "epoch": 0.5251674065156873, "grad_norm": 10.03229995476381, "learning_rate": 2.7167445996721954e-07, "logits/chosen": -2.859375, "logits/rejected": -2.625, "logps/chosen": -660.0, "logps/rejected": -976.0, "loss": 0.1903, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.9375, "rewards/margins": 3.265625, "rewards/rejected": -8.1875, "step": 6980 }, { "epoch": 0.525919795350237, "grad_norm": 7.476779653388887, "learning_rate": 2.710202258808967e-07, "logits/chosen": -2.828125, "logits/rejected": -2.5625, "logps/chosen": -616.0, "logps/rejected": -976.0, "loss": 0.186, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.46875, "rewards/margins": 3.765625, "rewards/rejected": -8.25, "step": 6990 }, { "epoch": 0.5266721841847867, "grad_norm": 9.682232421431149, "learning_rate": 2.70365846783258e-07, "logits/chosen": -2.75, "logits/rejected": -2.46875, "logps/chosen": -660.0, "logps/rejected": -1032.0, "loss": 0.1915, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0, "rewards/margins": 3.703125, "rewards/rejected": -8.6875, "step": 7000 }, { "epoch": 0.5274245730193364, "grad_norm": 6.954785345590251, "learning_rate": 2.6971132718864005e-07, "logits/chosen": -2.609375, "logits/rejected": -2.375, "logps/chosen": -648.0, "logps/rejected": -1012.0, "loss": 0.1862, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 3.828125, "rewards/rejected": -8.5625, "step": 7010 }, { "epoch": 0.5281769618538861, "grad_norm": 10.205161787990173, "learning_rate": 2.6905667161234844e-07, "logits/chosen": -2.703125, "logits/rejected": -2.46875, "logps/chosen": -640.0, "logps/rejected": -996.0, "loss": 0.2083, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 3.609375, "rewards/rejected": -8.4375, "step": 7020 }, { "epoch": 0.5289293506884358, "grad_norm": 6.3527477235968215, "learning_rate": 2.6840188457062725e-07, "logits/chosen": -2.71875, "logits/rejected": -2.515625, "logps/chosen": -672.0, "logps/rejected": -1024.0, "loss": 0.1782, "rewards/accuracies": 0.9375, "rewards/chosen": -4.78125, "rewards/margins": 3.96875, "rewards/rejected": -8.75, "step": 7030 }, { "epoch": 0.5296817395229855, "grad_norm": 12.108587765802628, "learning_rate": 2.6774697058062713e-07, "logits/chosen": -2.765625, "logits/rejected": -2.421875, "logps/chosen": -640.0, "logps/rejected": -1024.0, "loss": 0.1955, "rewards/accuracies": 0.90625, "rewards/chosen": -4.8125, "rewards/margins": 3.96875, "rewards/rejected": -8.8125, "step": 7040 }, { "epoch": 0.5304341283575352, "grad_norm": 8.516026422713471, "learning_rate": 2.6709193416037475e-07, "logits/chosen": -2.765625, "logits/rejected": -2.4375, "logps/chosen": -600.0, "logps/rejected": -992.0, "loss": 0.1814, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.4375, "rewards/margins": 3.90625, "rewards/rejected": -8.3125, "step": 7050 }, { "epoch": 0.5311865171920849, "grad_norm": 9.720629632343826, "learning_rate": 2.6643677982874133e-07, "logits/chosen": -2.859375, "logits/rejected": -2.546875, "logps/chosen": -588.0, "logps/rejected": -952.0, "loss": 0.2008, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.46875, "rewards/margins": 3.75, "rewards/rejected": -8.1875, "step": 7060 }, { "epoch": 0.5319389060266345, "grad_norm": 9.509223667047454, "learning_rate": 2.657815121054115e-07, "logits/chosen": -3.03125, "logits/rejected": -2.703125, "logps/chosen": -616.0, "logps/rejected": -984.0, "loss": 0.1894, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.4375, "rewards/margins": 3.734375, "rewards/rejected": -8.1875, "step": 7070 }, { "epoch": 0.5326912948611843, "grad_norm": 9.245136404559346, "learning_rate": 2.6512613551085214e-07, "logits/chosen": -2.765625, "logits/rejected": -2.609375, "logps/chosen": -624.0, "logps/rejected": -960.0, "loss": 0.1823, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.4375, "rewards/margins": 3.640625, "rewards/rejected": -8.0625, "step": 7080 }, { "epoch": 0.5334436836957339, "grad_norm": 9.894052286966772, "learning_rate": 2.6447065456628105e-07, "logits/chosen": -2.859375, "logits/rejected": -2.484375, "logps/chosen": -624.0, "logps/rejected": -984.0, "loss": 0.1845, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.5625, "rewards/margins": 3.75, "rewards/rejected": -8.3125, "step": 7090 }, { "epoch": 0.5341960725302837, "grad_norm": 9.553544752988854, "learning_rate": 2.6381507379363627e-07, "logits/chosen": -2.8125, "logits/rejected": -2.53125, "logps/chosen": -584.0, "logps/rejected": -976.0, "loss": 0.1861, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.09375, "rewards/margins": 4.125, "rewards/rejected": -8.25, "step": 7100 }, { "epoch": 0.5349484613648333, "grad_norm": 10.193963283951748, "learning_rate": 2.6315939771554407e-07, "logits/chosen": -2.828125, "logits/rejected": -2.5, "logps/chosen": -604.0, "logps/rejected": -972.0, "loss": 0.1899, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.375, "rewards/margins": 3.84375, "rewards/rejected": -8.25, "step": 7110 }, { "epoch": 0.5357008501993831, "grad_norm": 9.637160856140365, "learning_rate": 2.6250363085528867e-07, "logits/chosen": -2.65625, "logits/rejected": -2.375, "logps/chosen": -632.0, "logps/rejected": -988.0, "loss": 0.2127, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.8125, "rewards/margins": 3.578125, "rewards/rejected": -8.375, "step": 7120 }, { "epoch": 0.5364532390339327, "grad_norm": 8.700264918397732, "learning_rate": 2.618477777367801e-07, "logits/chosen": -2.671875, "logits/rejected": -2.25, "logps/chosen": -640.0, "logps/rejected": -1012.0, "loss": 0.2108, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.6875, "rewards/margins": 3.921875, "rewards/rejected": -8.625, "step": 7130 }, { "epoch": 0.5372056278684825, "grad_norm": 8.134833894710736, "learning_rate": 2.6119184288452377e-07, "logits/chosen": -2.6875, "logits/rejected": -2.28125, "logps/chosen": -656.0, "logps/rejected": -1032.0, "loss": 0.1793, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 3.75, "rewards/rejected": -8.6875, "step": 7140 }, { "epoch": 0.5379580167030321, "grad_norm": 6.603760811837454, "learning_rate": 2.6053583082358887e-07, "logits/chosen": -2.765625, "logits/rejected": -2.484375, "logps/chosen": -616.0, "logps/rejected": -984.0, "loss": 0.1813, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.625, "rewards/margins": 3.78125, "rewards/rejected": -8.375, "step": 7150 }, { "epoch": 0.5387104055375819, "grad_norm": 12.925965387407317, "learning_rate": 2.598797460795772e-07, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -628.0, "logps/rejected": -960.0, "loss": 0.1801, "rewards/accuracies": 0.90625, "rewards/chosen": -4.5625, "rewards/margins": 3.6875, "rewards/rejected": -8.25, "step": 7160 }, { "epoch": 0.5394627943721315, "grad_norm": 7.161982538239592, "learning_rate": 2.5922359317859195e-07, "logits/chosen": -2.859375, "logits/rejected": -2.53125, "logps/chosen": -616.0, "logps/rejected": -1008.0, "loss": 0.1821, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.5, "rewards/margins": 4.03125, "rewards/rejected": -8.5625, "step": 7170 }, { "epoch": 0.5402151832066812, "grad_norm": 9.670500724682935, "learning_rate": 2.585673766472065e-07, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -652.0, "logps/rejected": -1048.0, "loss": 0.1662, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.125, "rewards/rejected": -8.9375, "step": 7180 }, { "epoch": 0.5409675720412309, "grad_norm": 10.653981044551228, "learning_rate": 2.5791110101243337e-07, "logits/chosen": -2.796875, "logits/rejected": -2.46875, "logps/chosen": -720.0, "logps/rejected": -1088.0, "loss": 0.2079, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.65625, "rewards/margins": 3.828125, "rewards/rejected": -9.4375, "step": 7190 }, { "epoch": 0.5417199608757806, "grad_norm": 8.566022489278872, "learning_rate": 2.5725477080169243e-07, "logits/chosen": -2.78125, "logits/rejected": -2.421875, "logps/chosen": -728.0, "logps/rejected": -1080.0, "loss": 0.209, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.59375, "rewards/margins": 3.59375, "rewards/rejected": -9.1875, "step": 7200 }, { "epoch": 0.5424723497103303, "grad_norm": 9.199996930661813, "learning_rate": 2.565983905427806e-07, "logits/chosen": -2.8125, "logits/rejected": -2.453125, "logps/chosen": -676.0, "logps/rejected": -1004.0, "loss": 0.1991, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.28125, "rewards/margins": 3.28125, "rewards/rejected": -8.5625, "step": 7210 }, { "epoch": 0.54322473854488, "grad_norm": 10.610837436282099, "learning_rate": 2.559419647638395e-07, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -660.0, "logps/rejected": -1000.0, "loss": 0.2025, "rewards/accuracies": 0.875, "rewards/chosen": -5.09375, "rewards/margins": 3.5625, "rewards/rejected": -8.625, "step": 7220 }, { "epoch": 0.5439771273794297, "grad_norm": 7.6443150182433035, "learning_rate": 2.5528549799332515e-07, "logits/chosen": -2.71875, "logits/rejected": -2.375, "logps/chosen": -688.0, "logps/rejected": -1048.0, "loss": 0.202, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.40625, "rewards/margins": 3.6875, "rewards/rejected": -9.0625, "step": 7230 }, { "epoch": 0.5447295162139794, "grad_norm": 10.089524522681916, "learning_rate": 2.5462899475997617e-07, "logits/chosen": -2.75, "logits/rejected": -2.421875, "logps/chosen": -684.0, "logps/rejected": -1056.0, "loss": 0.2053, "rewards/accuracies": 0.9375, "rewards/chosen": -5.21875, "rewards/margins": 3.921875, "rewards/rejected": -9.125, "step": 7240 }, { "epoch": 0.5454819050485291, "grad_norm": 11.14010778305217, "learning_rate": 2.5397245959278284e-07, "logits/chosen": -2.8125, "logits/rejected": -2.453125, "logps/chosen": -680.0, "logps/rejected": -1040.0, "loss": 0.2039, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.1875, "rewards/margins": 3.640625, "rewards/rejected": -8.8125, "step": 7250 }, { "epoch": 0.5462342938830788, "grad_norm": 9.959907622564678, "learning_rate": 2.533158970209558e-07, "logits/chosen": -2.515625, "logits/rejected": -2.25, "logps/chosen": -696.0, "logps/rejected": -1048.0, "loss": 0.1998, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 3.640625, "rewards/rejected": -8.8125, "step": 7260 }, { "epoch": 0.5469866827176285, "grad_norm": 9.70323307418387, "learning_rate": 2.526593115738945e-07, "logits/chosen": -2.65625, "logits/rejected": -2.3125, "logps/chosen": -716.0, "logps/rejected": -1056.0, "loss": 0.1889, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.53125, "rewards/margins": 3.59375, "rewards/rejected": -9.125, "step": 7270 }, { "epoch": 0.5477390715521782, "grad_norm": 12.429429648330792, "learning_rate": 2.5200270778115634e-07, "logits/chosen": -2.71875, "logits/rejected": -2.390625, "logps/chosen": -692.0, "logps/rejected": -1064.0, "loss": 0.1802, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 4.0625, "rewards/rejected": -9.1875, "step": 7280 }, { "epoch": 0.5484914603867279, "grad_norm": 12.425165028439498, "learning_rate": 2.513460901724253e-07, "logits/chosen": -2.671875, "logits/rejected": -2.53125, "logps/chosen": -692.0, "logps/rejected": -1064.0, "loss": 0.1761, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3125, "rewards/margins": 3.8125, "rewards/rejected": -9.125, "step": 7290 }, { "epoch": 0.5492438492212776, "grad_norm": 6.42017367577843, "learning_rate": 2.5068946327748073e-07, "logits/chosen": -2.734375, "logits/rejected": -2.34375, "logps/chosen": -652.0, "logps/rejected": -1128.0, "loss": 0.182, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.65625, "rewards/rejected": -9.625, "step": 7300 }, { "epoch": 0.5499962380558272, "grad_norm": 10.146319490014568, "learning_rate": 2.5003283162616585e-07, "logits/chosen": -2.703125, "logits/rejected": -2.40625, "logps/chosen": -696.0, "logps/rejected": -1080.0, "loss": 0.1667, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.21875, "rewards/margins": 4.03125, "rewards/rejected": -9.25, "step": 7310 }, { "epoch": 0.550748626890377, "grad_norm": 9.133315990487066, "learning_rate": 2.493761997483569e-07, "logits/chosen": -2.734375, "logits/rejected": -2.5, "logps/chosen": -700.0, "logps/rejected": -1096.0, "loss": 0.1965, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.5, "rewards/margins": 3.9375, "rewards/rejected": -9.4375, "step": 7320 }, { "epoch": 0.5515010157249266, "grad_norm": 9.96126188722454, "learning_rate": 2.4871957217393155e-07, "logits/chosen": -2.734375, "logits/rejected": -2.4375, "logps/chosen": -720.0, "logps/rejected": -1080.0, "loss": 0.2008, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.46875, "rewards/margins": 3.6875, "rewards/rejected": -9.125, "step": 7330 }, { "epoch": 0.5522534045594764, "grad_norm": 7.703266984369524, "learning_rate": 2.480629534327378e-07, "logits/chosen": -2.703125, "logits/rejected": -2.34375, "logps/chosen": -668.0, "logps/rejected": -1056.0, "loss": 0.1767, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.03125, "rewards/margins": 3.96875, "rewards/rejected": -9.0, "step": 7340 }, { "epoch": 0.553005793394026, "grad_norm": 11.036520110749903, "learning_rate": 2.474063480545628e-07, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -636.0, "logps/rejected": -1012.0, "loss": 0.1908, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 3.8125, "rewards/rejected": -8.625, "step": 7350 }, { "epoch": 0.5537581822285758, "grad_norm": 10.372930262089573, "learning_rate": 2.4674976056910136e-07, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -628.0, "logps/rejected": -992.0, "loss": 0.1867, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.65625, "rewards/margins": 3.765625, "rewards/rejected": -8.4375, "step": 7360 }, { "epoch": 0.5545105710631254, "grad_norm": 7.469778055442896, "learning_rate": 2.460931955059251e-07, "logits/chosen": -2.84375, "logits/rejected": -2.5625, "logps/chosen": -628.0, "logps/rejected": -1032.0, "loss": 0.1806, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 4.03125, "rewards/rejected": -8.875, "step": 7370 }, { "epoch": 0.5552629598976752, "grad_norm": 8.649079014691724, "learning_rate": 2.4543665739445054e-07, "logits/chosen": -2.8125, "logits/rejected": -2.53125, "logps/chosen": -624.0, "logps/rejected": -1032.0, "loss": 0.1668, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 4.1875, "rewards/rejected": -8.8125, "step": 7380 }, { "epoch": 0.5560153487322248, "grad_norm": 8.931574849238933, "learning_rate": 2.447801507639087e-07, "logits/chosen": -2.8125, "logits/rejected": -2.515625, "logps/chosen": -664.0, "logps/rejected": -1040.0, "loss": 0.2007, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.09375, "rewards/margins": 3.75, "rewards/rejected": -8.875, "step": 7390 }, { "epoch": 0.5567677375667746, "grad_norm": 13.202148704345227, "learning_rate": 2.4412368014331326e-07, "logits/chosen": -2.703125, "logits/rejected": -2.359375, "logps/chosen": -644.0, "logps/rejected": -1048.0, "loss": 0.214, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.03125, "rewards/margins": 4.03125, "rewards/rejected": -9.0625, "step": 7400 }, { "epoch": 0.5575201264013242, "grad_norm": 5.9662166661161775, "learning_rate": 2.434672500614294e-07, "logits/chosen": -2.671875, "logits/rejected": -2.421875, "logps/chosen": -632.0, "logps/rejected": -988.0, "loss": 0.1808, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 3.65625, "rewards/rejected": -8.375, "step": 7410 }, { "epoch": 0.5582725152358738, "grad_norm": 8.568530720173657, "learning_rate": 2.428108650467427e-07, "logits/chosen": -2.75, "logits/rejected": -2.46875, "logps/chosen": -700.0, "logps/rejected": -1032.0, "loss": 0.1973, "rewards/accuracies": 0.90625, "rewards/chosen": -5.28125, "rewards/margins": 3.40625, "rewards/rejected": -8.6875, "step": 7420 }, { "epoch": 0.5590249040704236, "grad_norm": 9.518151204978215, "learning_rate": 2.4215452962742795e-07, "logits/chosen": -2.734375, "logits/rejected": -2.515625, "logps/chosen": -676.0, "logps/rejected": -1056.0, "loss": 0.176, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.25, "rewards/margins": 3.890625, "rewards/rejected": -9.125, "step": 7430 }, { "epoch": 0.5597772929049732, "grad_norm": 9.381073859805955, "learning_rate": 2.4149824833131766e-07, "logits/chosen": -2.796875, "logits/rejected": -2.546875, "logps/chosen": -648.0, "logps/rejected": -1064.0, "loss": 0.1927, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.34375, "rewards/rejected": -9.3125, "step": 7440 }, { "epoch": 0.560529681739523, "grad_norm": 8.53592549544634, "learning_rate": 2.408420256858709e-07, "logits/chosen": -2.84375, "logits/rejected": -2.578125, "logps/chosen": -632.0, "logps/rejected": -996.0, "loss": 0.1949, "rewards/accuracies": 0.90625, "rewards/chosen": -4.84375, "rewards/margins": 3.609375, "rewards/rejected": -8.4375, "step": 7450 }, { "epoch": 0.5612820705740726, "grad_norm": 7.3386201760751915, "learning_rate": 2.4018586621814245e-07, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -660.0, "logps/rejected": -1032.0, "loss": 0.174, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 3.78125, "rewards/rejected": -8.6875, "step": 7460 }, { "epoch": 0.5620344594086224, "grad_norm": 9.509046000241845, "learning_rate": 2.395297744547507e-07, "logits/chosen": -2.59375, "logits/rejected": -2.4375, "logps/chosen": -660.0, "logps/rejected": -996.0, "loss": 0.2118, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.03125, "rewards/margins": 3.4375, "rewards/rejected": -8.5, "step": 7470 }, { "epoch": 0.562786848243172, "grad_norm": 6.611905514599548, "learning_rate": 2.3887375492184754e-07, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -672.0, "logps/rejected": -1040.0, "loss": 0.1714, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 3.75, "rewards/rejected": -8.875, "step": 7480 }, { "epoch": 0.5635392370777218, "grad_norm": 10.287896987883586, "learning_rate": 2.3821781214508624e-07, "logits/chosen": -2.71875, "logits/rejected": -2.5625, "logps/chosen": -692.0, "logps/rejected": -1024.0, "loss": 0.1993, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 3.46875, "rewards/rejected": -8.625, "step": 7490 }, { "epoch": 0.5642916259122714, "grad_norm": 7.132483843089514, "learning_rate": 2.3756195064959062e-07, "logits/chosen": -2.8125, "logits/rejected": -2.453125, "logps/chosen": -660.0, "logps/rejected": -1056.0, "loss": 0.1616, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.1875, "rewards/rejected": -9.125, "step": 7500 }, { "epoch": 0.5650440147468212, "grad_norm": 9.38473146664766, "learning_rate": 2.369061749599238e-07, "logits/chosen": -2.84375, "logits/rejected": -2.5625, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.1822, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.0625, "rewards/rejected": -9.125, "step": 7510 }, { "epoch": 0.5657964035813708, "grad_norm": 11.06870568028871, "learning_rate": 2.36250489600057e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -656.0, "logps/rejected": -1020.0, "loss": 0.1714, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 3.84375, "rewards/rejected": -8.75, "step": 7520 }, { "epoch": 0.5665487924159206, "grad_norm": 13.638691339075915, "learning_rate": 2.3559489909333812e-07, "logits/chosen": -2.78125, "logits/rejected": -2.453125, "logps/chosen": -676.0, "logps/rejected": -1048.0, "loss": 0.1795, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 3.765625, "rewards/rejected": -8.9375, "step": 7530 }, { "epoch": 0.5673011812504702, "grad_norm": 12.446245321810611, "learning_rate": 2.3493940796246088e-07, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.1821, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.34375, "rewards/margins": 4.09375, "rewards/rejected": -9.4375, "step": 7540 }, { "epoch": 0.5680535700850199, "grad_norm": 10.008249249632032, "learning_rate": 2.342840207294335e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -692.0, "logps/rejected": -1088.0, "loss": 0.1748, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 4.03125, "rewards/rejected": -9.375, "step": 7550 }, { "epoch": 0.5688059589195696, "grad_norm": 7.044727387892051, "learning_rate": 2.33628741915547e-07, "logits/chosen": -2.6875, "logits/rejected": -2.34375, "logps/chosen": -672.0, "logps/rejected": -1072.0, "loss": 0.1548, "rewards/accuracies": 0.9375, "rewards/chosen": -5.34375, "rewards/margins": 3.953125, "rewards/rejected": -9.25, "step": 7560 }, { "epoch": 0.5695583477541193, "grad_norm": 8.545277753714306, "learning_rate": 2.32973576041345e-07, "logits/chosen": -2.8125, "logits/rejected": -2.40625, "logps/chosen": -696.0, "logps/rejected": -1120.0, "loss": 0.1877, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.3125, "rewards/margins": 4.28125, "rewards/rejected": -9.625, "step": 7570 }, { "epoch": 0.570310736588669, "grad_norm": 8.15048607027103, "learning_rate": 2.3231852762659165e-07, "logits/chosen": -2.8125, "logits/rejected": -2.515625, "logps/chosen": -672.0, "logps/rejected": -1072.0, "loss": 0.1849, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.125, "rewards/margins": 4.125, "rewards/rejected": -9.25, "step": 7580 }, { "epoch": 0.5710631254232187, "grad_norm": 8.425289105784666, "learning_rate": 2.3166360119024094e-07, "logits/chosen": -2.734375, "logits/rejected": -2.328125, "logps/chosen": -752.0, "logps/rejected": -1112.0, "loss": 0.1735, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.65625, "rewards/margins": 3.875, "rewards/rejected": -9.5, "step": 7590 }, { "epoch": 0.5718155142577684, "grad_norm": 7.974257289934345, "learning_rate": 2.310088012504053e-07, "logits/chosen": -2.703125, "logits/rejected": -2.3125, "logps/chosen": -720.0, "logps/rejected": -1104.0, "loss": 0.1985, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.6875, "rewards/margins": 3.78125, "rewards/rejected": -9.5, "step": 7600 }, { "epoch": 0.5725679030923181, "grad_norm": 7.55519028519371, "learning_rate": 2.303541323243246e-07, "logits/chosen": -2.671875, "logits/rejected": -2.375, "logps/chosen": -732.0, "logps/rejected": -1088.0, "loss": 0.1793, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.75, "rewards/margins": 3.671875, "rewards/rejected": -9.4375, "step": 7610 }, { "epoch": 0.5733202919268678, "grad_norm": 8.69426857579776, "learning_rate": 2.2969959892833474e-07, "logits/chosen": -2.734375, "logits/rejected": -2.296875, "logps/chosen": -720.0, "logps/rejected": -1152.0, "loss": 0.1844, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.71875, "rewards/margins": 4.15625, "rewards/rejected": -9.875, "step": 7620 }, { "epoch": 0.5740726807614175, "grad_norm": 7.995168153719178, "learning_rate": 2.290452055778367e-07, "logits/chosen": -2.765625, "logits/rejected": -2.4375, "logps/chosen": -728.0, "logps/rejected": -1104.0, "loss": 0.1952, "rewards/accuracies": 0.9375, "rewards/chosen": -5.65625, "rewards/margins": 3.96875, "rewards/rejected": -9.625, "step": 7630 }, { "epoch": 0.5748250695959672, "grad_norm": 9.47978147859696, "learning_rate": 2.2839095678726553e-07, "logits/chosen": -2.765625, "logits/rejected": -2.46875, "logps/chosen": -680.0, "logps/rejected": -1080.0, "loss": 0.1751, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.3125, "rewards/margins": 3.890625, "rewards/rejected": -9.1875, "step": 7640 }, { "epoch": 0.5755774584305169, "grad_norm": 8.068045603290606, "learning_rate": 2.2773685707005863e-07, "logits/chosen": -2.703125, "logits/rejected": -2.5, "logps/chosen": -744.0, "logps/rejected": -1112.0, "loss": 0.1766, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.6875, "rewards/margins": 3.84375, "rewards/rejected": -9.5, "step": 7650 }, { "epoch": 0.5763298472650666, "grad_norm": 10.132016914801373, "learning_rate": 2.270829109386253e-07, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -712.0, "logps/rejected": -1080.0, "loss": 0.1861, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.40625, "rewards/margins": 3.8125, "rewards/rejected": -9.25, "step": 7660 }, { "epoch": 0.5770822360996163, "grad_norm": 7.187891351598791, "learning_rate": 2.2642912290431518e-07, "logits/chosen": -2.640625, "logits/rejected": -2.453125, "logps/chosen": -688.0, "logps/rejected": -1120.0, "loss": 0.184, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.3125, "rewards/margins": 4.375, "rewards/rejected": -9.6875, "step": 7670 }, { "epoch": 0.5778346249341659, "grad_norm": 7.084610122669814, "learning_rate": 2.257754974773873e-07, "logits/chosen": -2.65625, "logits/rejected": -2.359375, "logps/chosen": -744.0, "logps/rejected": -1160.0, "loss": 0.2082, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.78125, "rewards/margins": 4.25, "rewards/rejected": -10.0, "step": 7680 }, { "epoch": 0.5785870137687157, "grad_norm": 7.898558900449446, "learning_rate": 2.2512203916697892e-07, "logits/chosen": -2.578125, "logits/rejected": -2.359375, "logps/chosen": -712.0, "logps/rejected": -1080.0, "loss": 0.1747, "rewards/accuracies": 0.875, "rewards/chosen": -5.46875, "rewards/margins": 3.75, "rewards/rejected": -9.1875, "step": 7690 }, { "epoch": 0.5793394026032653, "grad_norm": 7.349864652389393, "learning_rate": 2.2446875248107432e-07, "logits/chosen": -2.6875, "logits/rejected": -2.296875, "logps/chosen": -704.0, "logps/rejected": -1112.0, "loss": 0.188, "rewards/accuracies": 0.9375, "rewards/chosen": -5.46875, "rewards/margins": 4.125, "rewards/rejected": -9.5625, "step": 7700 }, { "epoch": 0.5800917914378151, "grad_norm": 7.426858571337103, "learning_rate": 2.2381564192647396e-07, "logits/chosen": -2.703125, "logits/rejected": -2.328125, "logps/chosen": -676.0, "logps/rejected": -1048.0, "loss": 0.1832, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 3.78125, "rewards/rejected": -8.9375, "step": 7710 }, { "epoch": 0.5808441802723647, "grad_norm": 9.780007794128954, "learning_rate": 2.231627120087631e-07, "logits/chosen": -2.765625, "logits/rejected": -2.328125, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.1757, "rewards/accuracies": 0.9375, "rewards/chosen": -5.09375, "rewards/margins": 4.25, "rewards/rejected": -9.3125, "step": 7720 }, { "epoch": 0.5815965691069145, "grad_norm": 9.862081709378993, "learning_rate": 2.2250996723228104e-07, "logits/chosen": -2.71875, "logits/rejected": -2.484375, "logps/chosen": -700.0, "logps/rejected": -1104.0, "loss": 0.1918, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.4375, "rewards/margins": 4.03125, "rewards/rejected": -9.5, "step": 7730 }, { "epoch": 0.5823489579414641, "grad_norm": 9.796114568966233, "learning_rate": 2.2185741210008947e-07, "logits/chosen": -2.6875, "logits/rejected": -2.4375, "logps/chosen": -684.0, "logps/rejected": -1072.0, "loss": 0.1859, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 3.90625, "rewards/rejected": -9.0, "step": 7740 }, { "epoch": 0.5831013467760139, "grad_norm": 9.088441129163732, "learning_rate": 2.2120505111394224e-07, "logits/chosen": -2.65625, "logits/rejected": -2.5, "logps/chosen": -692.0, "logps/rejected": -1048.0, "loss": 0.1815, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.21875, "rewards/margins": 3.609375, "rewards/rejected": -8.8125, "step": 7750 }, { "epoch": 0.5838537356105635, "grad_norm": 11.190049192399824, "learning_rate": 2.205528887742535e-07, "logits/chosen": -2.640625, "logits/rejected": -2.328125, "logps/chosen": -720.0, "logps/rejected": -1112.0, "loss": 0.1963, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.375, "rewards/margins": 4.0, "rewards/rejected": -9.375, "step": 7760 }, { "epoch": 0.5846061244451133, "grad_norm": 7.882822688571665, "learning_rate": 2.199009295800672e-07, "logits/chosen": -2.65625, "logits/rejected": -2.34375, "logps/chosen": -712.0, "logps/rejected": -1096.0, "loss": 0.1811, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.5625, "rewards/margins": 3.9375, "rewards/rejected": -9.5, "step": 7770 }, { "epoch": 0.5853585132796629, "grad_norm": 12.163265635174126, "learning_rate": 2.192491780290259e-07, "logits/chosen": -2.734375, "logits/rejected": -2.40625, "logps/chosen": -740.0, "logps/rejected": -1144.0, "loss": 0.18, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.65625, "rewards/margins": 4.25, "rewards/rejected": -9.875, "step": 7780 }, { "epoch": 0.5861109021142126, "grad_norm": 11.023139609307764, "learning_rate": 2.1859763861733947e-07, "logits/chosen": -2.75, "logits/rejected": -2.375, "logps/chosen": -688.0, "logps/rejected": -1040.0, "loss": 0.1954, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.34375, "rewards/margins": 3.6875, "rewards/rejected": -9.0, "step": 7790 }, { "epoch": 0.5868632909487623, "grad_norm": 8.846784626177584, "learning_rate": 2.179463158397545e-07, "logits/chosen": -2.703125, "logits/rejected": -2.4375, "logps/chosen": -712.0, "logps/rejected": -1072.0, "loss": 0.1868, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.625, "rewards/margins": 3.703125, "rewards/rejected": -9.3125, "step": 7800 }, { "epoch": 0.587615679783312, "grad_norm": 9.983411475943397, "learning_rate": 2.1729521418952304e-07, "logits/chosen": -2.8125, "logits/rejected": -2.53125, "logps/chosen": -668.0, "logps/rejected": -1104.0, "loss": 0.1717, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 4.4375, "rewards/rejected": -9.5, "step": 7810 }, { "epoch": 0.5883680686178617, "grad_norm": 7.96272848222726, "learning_rate": 2.166443381583718e-07, "logits/chosen": -2.703125, "logits/rejected": -2.453125, "logps/chosen": -716.0, "logps/rejected": -1112.0, "loss": 0.1632, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.6875, "rewards/margins": 3.8125, "rewards/rejected": -9.5, "step": 7820 }, { "epoch": 0.5891204574524114, "grad_norm": 8.079780562583833, "learning_rate": 2.1599369223647068e-07, "logits/chosen": -2.765625, "logits/rejected": -2.546875, "logps/chosen": -716.0, "logps/rejected": -1072.0, "loss": 0.2225, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.53125, "rewards/margins": 3.78125, "rewards/rejected": -9.3125, "step": 7830 }, { "epoch": 0.5898728462869611, "grad_norm": 12.685745100141066, "learning_rate": 2.1534328091240238e-07, "logits/chosen": -2.8125, "logits/rejected": -2.546875, "logps/chosen": -720.0, "logps/rejected": -1096.0, "loss": 0.198, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.5625, "rewards/margins": 3.765625, "rewards/rejected": -9.3125, "step": 7840 }, { "epoch": 0.5906252351215108, "grad_norm": 9.694926490966695, "learning_rate": 2.1469310867313118e-07, "logits/chosen": -2.796875, "logits/rejected": -2.53125, "logps/chosen": -720.0, "logps/rejected": -1088.0, "loss": 0.1962, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.3125, "rewards/margins": 4.0, "rewards/rejected": -9.3125, "step": 7850 }, { "epoch": 0.5913776239560605, "grad_norm": 7.470161379455044, "learning_rate": 2.1404318000397192e-07, "logits/chosen": -2.71875, "logits/rejected": -2.359375, "logps/chosen": -644.0, "logps/rejected": -1008.0, "loss": 0.1719, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 3.875, "rewards/rejected": -8.625, "step": 7860 }, { "epoch": 0.5921300127906102, "grad_norm": 15.635656624979678, "learning_rate": 2.1339349938855929e-07, "logits/chosen": -2.859375, "logits/rejected": -2.46875, "logps/chosen": -624.0, "logps/rejected": -1000.0, "loss": 0.189, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 3.84375, "rewards/rejected": -8.6875, "step": 7870 }, { "epoch": 0.5928824016251599, "grad_norm": 8.84669114325579, "learning_rate": 2.127440713088165e-07, "logits/chosen": -2.796875, "logits/rejected": -2.515625, "logps/chosen": -676.0, "logps/rejected": -1080.0, "loss": 0.1959, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 4.125, "rewards/rejected": -9.25, "step": 7880 }, { "epoch": 0.5936347904597096, "grad_norm": 10.608839437296322, "learning_rate": 2.1209490024492477e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1040.0, "loss": 0.19, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 3.828125, "rewards/rejected": -8.9375, "step": 7890 }, { "epoch": 0.5943871792942593, "grad_norm": 11.261167695549236, "learning_rate": 2.114459906752923e-07, "logits/chosen": -2.71875, "logits/rejected": -2.484375, "logps/chosen": -696.0, "logps/rejected": -1056.0, "loss": 0.1984, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.28125, "rewards/margins": 3.8125, "rewards/rejected": -9.125, "step": 7900 }, { "epoch": 0.595139568128809, "grad_norm": 7.217595152184596, "learning_rate": 2.1079734707652315e-07, "logits/chosen": -2.65625, "logits/rejected": -2.421875, "logps/chosen": -704.0, "logps/rejected": -1040.0, "loss": 0.206, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.3125, "rewards/margins": 3.578125, "rewards/rejected": -8.875, "step": 7910 }, { "epoch": 0.5958919569633586, "grad_norm": 6.7734942212408855, "learning_rate": 2.101489739233867e-07, "logits/chosen": -2.640625, "logits/rejected": -2.3125, "logps/chosen": -656.0, "logps/rejected": -1072.0, "loss": 0.1851, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.25, "rewards/rejected": -9.1875, "step": 7920 }, { "epoch": 0.5966443457979084, "grad_norm": 8.189998016070621, "learning_rate": 2.0950087568878643e-07, "logits/chosen": -2.78125, "logits/rejected": -2.375, "logps/chosen": -696.0, "logps/rejected": -1080.0, "loss": 0.1908, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.28125, "rewards/margins": 4.0625, "rewards/rejected": -9.3125, "step": 7930 }, { "epoch": 0.597396734632458, "grad_norm": 8.188292839949094, "learning_rate": 2.088530568437295e-07, "logits/chosen": -2.78125, "logits/rejected": -2.40625, "logps/chosen": -712.0, "logps/rejected": -1088.0, "loss": 0.1911, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.5625, "rewards/margins": 3.84375, "rewards/rejected": -9.4375, "step": 7940 }, { "epoch": 0.5981491234670078, "grad_norm": 9.397579034044462, "learning_rate": 2.0820552185729553e-07, "logits/chosen": -2.75, "logits/rejected": -2.484375, "logps/chosen": -696.0, "logps/rejected": -1064.0, "loss": 0.1733, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.25, "rewards/margins": 3.890625, "rewards/rejected": -9.125, "step": 7950 }, { "epoch": 0.5989015123015574, "grad_norm": 9.047964513784462, "learning_rate": 2.0755827519660585e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -668.0, "logps/rejected": -1040.0, "loss": 0.1839, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.0, "rewards/rejected": -8.9375, "step": 7960 }, { "epoch": 0.5996539011361072, "grad_norm": 11.445154601848932, "learning_rate": 2.069113213267928e-07, "logits/chosen": -2.765625, "logits/rejected": -2.484375, "logps/chosen": -672.0, "logps/rejected": -1032.0, "loss": 0.1736, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 3.71875, "rewards/rejected": -8.8125, "step": 7970 }, { "epoch": 0.6004062899706568, "grad_norm": 10.501805165582523, "learning_rate": 2.062646647109688e-07, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -632.0, "logps/rejected": -1040.0, "loss": 0.1783, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 4.1875, "rewards/rejected": -9.0, "step": 7980 }, { "epoch": 0.6011586788052066, "grad_norm": 10.339753662857872, "learning_rate": 2.0561830981019582e-07, "logits/chosen": -2.65625, "logits/rejected": -2.421875, "logps/chosen": -704.0, "logps/rejected": -1128.0, "loss": 0.185, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.40625, "rewards/margins": 4.3125, "rewards/rejected": -9.75, "step": 7990 }, { "epoch": 0.6019110676397562, "grad_norm": 5.477932125415138, "learning_rate": 2.04972261083454e-07, "logits/chosen": -2.796875, "logits/rejected": -2.484375, "logps/chosen": -704.0, "logps/rejected": -1088.0, "loss": 0.1581, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.3125, "rewards/margins": 3.890625, "rewards/rejected": -9.1875, "step": 8000 }, { "epoch": 0.602663456474306, "grad_norm": 10.860208268593594, "learning_rate": 2.0432652298761176e-07, "logits/chosen": -2.75, "logits/rejected": -2.484375, "logps/chosen": -684.0, "logps/rejected": -1056.0, "loss": 0.2022, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.25, "rewards/margins": 3.71875, "rewards/rejected": -9.0, "step": 8010 }, { "epoch": 0.6034158453088556, "grad_norm": 8.134360329596852, "learning_rate": 2.0368109997739415e-07, "logits/chosen": -2.734375, "logits/rejected": -2.5, "logps/chosen": -664.0, "logps/rejected": -1104.0, "loss": 0.1554, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.34375, "rewards/rejected": -9.4375, "step": 8020 }, { "epoch": 0.6041682341434053, "grad_norm": 8.877210411595883, "learning_rate": 2.0303599650535283e-07, "logits/chosen": -2.78125, "logits/rejected": -2.34375, "logps/chosen": -724.0, "logps/rejected": -1096.0, "loss": 0.1929, "rewards/accuracies": 0.875, "rewards/chosen": -5.53125, "rewards/margins": 3.875, "rewards/rejected": -9.375, "step": 8030 }, { "epoch": 0.604920622977955, "grad_norm": 8.533779164861182, "learning_rate": 2.0239121702183505e-07, "logits/chosen": -2.71875, "logits/rejected": -2.40625, "logps/chosen": -652.0, "logps/rejected": -1072.0, "loss": 0.1867, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.1875, "rewards/rejected": -9.25, "step": 8040 }, { "epoch": 0.6056730118125047, "grad_norm": 7.678079728481108, "learning_rate": 2.017467659749528e-07, "logits/chosen": -2.796875, "logits/rejected": -2.46875, "logps/chosen": -740.0, "logps/rejected": -1112.0, "loss": 0.1775, "rewards/accuracies": 0.9375, "rewards/chosen": -5.4375, "rewards/margins": 4.1875, "rewards/rejected": -9.625, "step": 8050 }, { "epoch": 0.6064254006470544, "grad_norm": 7.362214903304492, "learning_rate": 2.011026478105525e-07, "logits/chosen": -2.6875, "logits/rejected": -2.578125, "logps/chosen": -664.0, "logps/rejected": -1064.0, "loss": 0.1865, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0, "rewards/margins": 4.03125, "rewards/rejected": -9.0, "step": 8060 }, { "epoch": 0.6071777894816041, "grad_norm": 6.554803722933717, "learning_rate": 2.004588669721841e-07, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -684.0, "logps/rejected": -1096.0, "loss": 0.1729, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.28125, "rewards/margins": 4.09375, "rewards/rejected": -9.375, "step": 8070 }, { "epoch": 0.6079301783161538, "grad_norm": 7.301637306446734, "learning_rate": 1.9981542790107032e-07, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -676.0, "logps/rejected": -1080.0, "loss": 0.1685, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.15625, "rewards/rejected": -9.0625, "step": 8080 }, { "epoch": 0.6086825671507035, "grad_norm": 9.634613626300386, "learning_rate": 1.9917233503607625e-07, "logits/chosen": -2.796875, "logits/rejected": -2.453125, "logps/chosen": -660.0, "logps/rejected": -1088.0, "loss": 0.1636, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 4.40625, "rewards/rejected": -9.4375, "step": 8090 }, { "epoch": 0.6094349559852532, "grad_norm": 9.650023332883672, "learning_rate": 1.9852959281367872e-07, "logits/chosen": -2.8125, "logits/rejected": -2.46875, "logps/chosen": -720.0, "logps/rejected": -1088.0, "loss": 0.1797, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.65625, "rewards/margins": 3.78125, "rewards/rejected": -9.4375, "step": 8100 }, { "epoch": 0.6101873448198029, "grad_norm": 8.50128482310566, "learning_rate": 1.9788720566793527e-07, "logits/chosen": -2.8125, "logits/rejected": -2.421875, "logps/chosen": -704.0, "logps/rejected": -1056.0, "loss": 0.1888, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -5.5, "rewards/margins": 3.59375, "rewards/rejected": -9.125, "step": 8110 }, { "epoch": 0.6109397336543526, "grad_norm": 7.556379718476982, "learning_rate": 1.9724517803045417e-07, "logits/chosen": -2.671875, "logits/rejected": -2.453125, "logps/chosen": -712.0, "logps/rejected": -1080.0, "loss": 0.1978, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.5625, "rewards/margins": 3.5625, "rewards/rejected": -9.125, "step": 8120 }, { "epoch": 0.6116921224889023, "grad_norm": 9.095065944956065, "learning_rate": 1.966035143303636e-07, "logits/chosen": -2.78125, "logits/rejected": -2.421875, "logps/chosen": -700.0, "logps/rejected": -1024.0, "loss": 0.1675, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 3.53125, "rewards/rejected": -8.75, "step": 8130 }, { "epoch": 0.612444511323452, "grad_norm": 8.963937404932498, "learning_rate": 1.959622189942808e-07, "logits/chosen": -2.734375, "logits/rejected": -2.515625, "logps/chosen": -684.0, "logps/rejected": -1072.0, "loss": 0.17, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 3.828125, "rewards/rejected": -9.0625, "step": 8140 }, { "epoch": 0.6131969001580017, "grad_norm": 8.450587169477771, "learning_rate": 1.9532129644628204e-07, "logits/chosen": -2.765625, "logits/rejected": -2.453125, "logps/chosen": -680.0, "logps/rejected": -1020.0, "loss": 0.1888, "rewards/accuracies": 0.9375, "rewards/chosen": -5.25, "rewards/margins": 3.46875, "rewards/rejected": -8.6875, "step": 8150 }, { "epoch": 0.6139492889925513, "grad_norm": 10.13442209524969, "learning_rate": 1.946807511078718e-07, "logits/chosen": -2.6875, "logits/rejected": -2.46875, "logps/chosen": -712.0, "logps/rejected": -1088.0, "loss": 0.1807, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.34375, "rewards/margins": 4.0625, "rewards/rejected": -9.375, "step": 8160 }, { "epoch": 0.6147016778271011, "grad_norm": 13.447019109532024, "learning_rate": 1.9404058739795217e-07, "logits/chosen": -2.71875, "logits/rejected": -2.5, "logps/chosen": -708.0, "logps/rejected": -1088.0, "loss": 0.1775, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.40625, "rewards/margins": 4.0, "rewards/rejected": -9.4375, "step": 8170 }, { "epoch": 0.6154540666616507, "grad_norm": 8.774865738407245, "learning_rate": 1.9340080973279268e-07, "logits/chosen": -2.828125, "logits/rejected": -2.4375, "logps/chosen": -696.0, "logps/rejected": -1120.0, "loss": 0.164, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 4.3125, "rewards/rejected": -9.625, "step": 8180 }, { "epoch": 0.6162064554962005, "grad_norm": 11.89253519639188, "learning_rate": 1.9276142252599971e-07, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -704.0, "logps/rejected": -1080.0, "loss": 0.1704, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.4375, "rewards/margins": 3.953125, "rewards/rejected": -9.375, "step": 8190 }, { "epoch": 0.6169588443307501, "grad_norm": 11.177241943586392, "learning_rate": 1.9212243018848572e-07, "logits/chosen": -2.84375, "logits/rejected": -2.546875, "logps/chosen": -692.0, "logps/rejected": -1128.0, "loss": 0.1966, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3125, "rewards/margins": 4.375, "rewards/rejected": -9.6875, "step": 8200 }, { "epoch": 0.6177112331652999, "grad_norm": 7.67297793851783, "learning_rate": 1.9148383712843946e-07, "logits/chosen": -2.828125, "logits/rejected": -2.578125, "logps/chosen": -724.0, "logps/rejected": -1096.0, "loss": 0.1746, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.5, "rewards/margins": 4.0, "rewards/rejected": -9.5, "step": 8210 }, { "epoch": 0.6184636219998495, "grad_norm": 6.957493941018748, "learning_rate": 1.908456477512949e-07, "logits/chosen": -2.671875, "logits/rejected": -2.421875, "logps/chosen": -708.0, "logps/rejected": -1120.0, "loss": 0.1758, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.40625, "rewards/margins": 4.25, "rewards/rejected": -9.6875, "step": 8220 }, { "epoch": 0.6192160108343993, "grad_norm": 8.931938647060058, "learning_rate": 1.9020786645970132e-07, "logits/chosen": -2.625, "logits/rejected": -2.359375, "logps/chosen": -716.0, "logps/rejected": -1096.0, "loss": 0.2052, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.46875, "rewards/margins": 3.90625, "rewards/rejected": -9.375, "step": 8230 }, { "epoch": 0.6199683996689489, "grad_norm": 11.634130049543522, "learning_rate": 1.8957049765349275e-07, "logits/chosen": -2.671875, "logits/rejected": -2.34375, "logps/chosen": -684.0, "logps/rejected": -1056.0, "loss": 0.1936, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.3125, "rewards/margins": 3.75, "rewards/rejected": -9.0625, "step": 8240 }, { "epoch": 0.6207207885034987, "grad_norm": 7.675316481738456, "learning_rate": 1.8893354572965764e-07, "logits/chosen": -2.65625, "logits/rejected": -2.515625, "logps/chosen": -684.0, "logps/rejected": -1072.0, "loss": 0.1721, "rewards/accuracies": 0.9375, "rewards/chosen": -5.46875, "rewards/margins": 3.78125, "rewards/rejected": -9.25, "step": 8250 }, { "epoch": 0.6214731773380483, "grad_norm": 9.973255437064827, "learning_rate": 1.882970150823083e-07, "logits/chosen": -2.875, "logits/rejected": -2.5625, "logps/chosen": -700.0, "logps/rejected": -1088.0, "loss": 0.1814, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.3125, "rewards/margins": 3.953125, "rewards/rejected": -9.25, "step": 8260 }, { "epoch": 0.6222255661725979, "grad_norm": 5.705753808858491, "learning_rate": 1.8766091010265107e-07, "logits/chosen": -2.75, "logits/rejected": -2.4375, "logps/chosen": -688.0, "logps/rejected": -1088.0, "loss": 0.1667, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 4.25, "rewards/rejected": -9.4375, "step": 8270 }, { "epoch": 0.6229779550071477, "grad_norm": 9.144340823375144, "learning_rate": 1.870252351789557e-07, "logits/chosen": -2.828125, "logits/rejected": -2.59375, "logps/chosen": -676.0, "logps/rejected": -1072.0, "loss": 0.1827, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 4.09375, "rewards/rejected": -9.25, "step": 8280 }, { "epoch": 0.6237303438416973, "grad_norm": 8.597966874281301, "learning_rate": 1.8638999469652485e-07, "logits/chosen": -2.75, "logits/rejected": -2.46875, "logps/chosen": -680.0, "logps/rejected": -1072.0, "loss": 0.1901, "rewards/accuracies": 0.875, "rewards/chosen": -5.3125, "rewards/margins": 3.875, "rewards/rejected": -9.1875, "step": 8290 }, { "epoch": 0.6244827326762471, "grad_norm": 8.356923305837611, "learning_rate": 1.8575519303766456e-07, "logits/chosen": -2.734375, "logits/rejected": -2.484375, "logps/chosen": -732.0, "logps/rejected": -1144.0, "loss": 0.2041, "rewards/accuracies": 0.90625, "rewards/chosen": -5.59375, "rewards/margins": 4.34375, "rewards/rejected": -9.9375, "step": 8300 }, { "epoch": 0.6252351215107967, "grad_norm": 8.330276529550803, "learning_rate": 1.8512083458165322e-07, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -696.0, "logps/rejected": -1080.0, "loss": 0.1908, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.375, "rewards/margins": 3.9375, "rewards/rejected": -9.3125, "step": 8310 }, { "epoch": 0.6259875103453465, "grad_norm": 9.745317202075682, "learning_rate": 1.8448692370471197e-07, "logits/chosen": -2.671875, "logits/rejected": -2.375, "logps/chosen": -680.0, "logps/rejected": -1064.0, "loss": 0.1772, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.21875, "rewards/margins": 3.875, "rewards/rejected": -9.0625, "step": 8320 }, { "epoch": 0.6267398991798961, "grad_norm": 10.412831393628213, "learning_rate": 1.838534647799741e-07, "logits/chosen": -2.75, "logits/rejected": -2.421875, "logps/chosen": -708.0, "logps/rejected": -1032.0, "loss": 0.1868, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.34375, "rewards/margins": 3.40625, "rewards/rejected": -8.75, "step": 8330 }, { "epoch": 0.6274922880144459, "grad_norm": 11.039358030397528, "learning_rate": 1.83220462177455e-07, "logits/chosen": -2.796875, "logits/rejected": -2.375, "logps/chosen": -704.0, "logps/rejected": -1104.0, "loss": 0.1793, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.46875, "rewards/margins": 4.125, "rewards/rejected": -9.5625, "step": 8340 }, { "epoch": 0.6282446768489955, "grad_norm": 8.167227600473227, "learning_rate": 1.825879202640222e-07, "logits/chosen": -2.8125, "logits/rejected": -2.421875, "logps/chosen": -704.0, "logps/rejected": -1088.0, "loss": 0.1827, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.40625, "rewards/margins": 3.765625, "rewards/rejected": -9.1875, "step": 8350 }, { "epoch": 0.6289970656835453, "grad_norm": 9.909887900151999, "learning_rate": 1.81955843403365e-07, "logits/chosen": -2.6875, "logits/rejected": -2.40625, "logps/chosen": -656.0, "logps/rejected": -1064.0, "loss": 0.1742, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 4.15625, "rewards/rejected": -9.25, "step": 8360 }, { "epoch": 0.6297494545180949, "grad_norm": 6.664940260255985, "learning_rate": 1.8132423595596464e-07, "logits/chosen": -2.671875, "logits/rejected": -2.40625, "logps/chosen": -700.0, "logps/rejected": -1080.0, "loss": 0.1782, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.34375, "rewards/margins": 3.875, "rewards/rejected": -9.25, "step": 8370 }, { "epoch": 0.6305018433526447, "grad_norm": 8.040497731217704, "learning_rate": 1.8069310227906365e-07, "logits/chosen": -2.671875, "logits/rejected": -2.40625, "logps/chosen": -728.0, "logps/rejected": -1088.0, "loss": 0.1883, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.40625, "rewards/margins": 3.921875, "rewards/rejected": -9.3125, "step": 8380 }, { "epoch": 0.6312542321871943, "grad_norm": 7.855479759367712, "learning_rate": 1.8006244672663644e-07, "logits/chosen": -2.59375, "logits/rejected": -2.40625, "logps/chosen": -724.0, "logps/rejected": -1096.0, "loss": 0.1708, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.65625, "rewards/margins": 3.78125, "rewards/rejected": -9.4375, "step": 8390 }, { "epoch": 0.632006621021744, "grad_norm": 10.782491204548583, "learning_rate": 1.794322736493591e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -732.0, "logps/rejected": -1112.0, "loss": 0.1897, "rewards/accuracies": 0.90625, "rewards/chosen": -5.53125, "rewards/margins": 4.03125, "rewards/rejected": -9.5625, "step": 8400 }, { "epoch": 0.6327590098562937, "grad_norm": 6.395845132416539, "learning_rate": 1.7880258739457905e-07, "logits/chosen": -2.875, "logits/rejected": -2.625, "logps/chosen": -708.0, "logps/rejected": -1088.0, "loss": 0.1809, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.4375, "rewards/margins": 3.859375, "rewards/rejected": -9.3125, "step": 8410 }, { "epoch": 0.6335113986908434, "grad_norm": 11.500493086122349, "learning_rate": 1.7817339230628553e-07, "logits/chosen": -2.828125, "logits/rejected": -2.53125, "logps/chosen": -684.0, "logps/rejected": -1048.0, "loss": 0.1948, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.09375, "rewards/margins": 3.921875, "rewards/rejected": -9.0, "step": 8420 }, { "epoch": 0.6342637875253931, "grad_norm": 9.480030645590286, "learning_rate": 1.7754469272507914e-07, "logits/chosen": -2.6875, "logits/rejected": -2.265625, "logps/chosen": -700.0, "logps/rejected": -1152.0, "loss": 0.1784, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.4375, "rewards/margins": 4.4375, "rewards/rejected": -9.875, "step": 8430 }, { "epoch": 0.6350161763599428, "grad_norm": 9.324651618478956, "learning_rate": 1.7691649298814232e-07, "logits/chosen": -2.75, "logits/rejected": -2.59375, "logps/chosen": -660.0, "logps/rejected": -1056.0, "loss": 0.168, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 3.921875, "rewards/rejected": -9.0625, "step": 8440 }, { "epoch": 0.6357685651944925, "grad_norm": 8.774893284452308, "learning_rate": 1.7628879742920928e-07, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -656.0, "logps/rejected": -1136.0, "loss": 0.1803, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.0, "rewards/margins": 4.71875, "rewards/rejected": -9.75, "step": 8450 }, { "epoch": 0.6365209540290422, "grad_norm": 9.340652243790641, "learning_rate": 1.7566161037853594e-07, "logits/chosen": -2.921875, "logits/rejected": -2.5625, "logps/chosen": -656.0, "logps/rejected": -1056.0, "loss": 0.1712, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 4.125, "rewards/rejected": -9.0, "step": 8460 }, { "epoch": 0.6372733428635919, "grad_norm": 8.096050960575042, "learning_rate": 1.7503493616287023e-07, "logits/chosen": -2.984375, "logits/rejected": -2.703125, "logps/chosen": -660.0, "logps/rejected": -1088.0, "loss": 0.202, "rewards/accuracies": 0.9375, "rewards/chosen": -5.09375, "rewards/margins": 4.25, "rewards/rejected": -9.375, "step": 8470 }, { "epoch": 0.6380257316981416, "grad_norm": 7.937269964263276, "learning_rate": 1.7440877910542225e-07, "logits/chosen": -2.90625, "logits/rejected": -2.453125, "logps/chosen": -684.0, "logps/rejected": -1072.0, "loss": 0.1962, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.21875, "rewards/rejected": -9.1875, "step": 8480 }, { "epoch": 0.6387781205326913, "grad_norm": 9.754686407049281, "learning_rate": 1.7378314352583446e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1080.0, "loss": 0.1633, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 4.0, "rewards/rejected": -8.9375, "step": 8490 }, { "epoch": 0.639530509367241, "grad_norm": 9.467168252886747, "learning_rate": 1.731580337401517e-07, "logits/chosen": -2.96875, "logits/rejected": -2.609375, "logps/chosen": -648.0, "logps/rejected": -1048.0, "loss": 0.1861, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.0, "rewards/rejected": -8.8125, "step": 8500 }, { "epoch": 0.6402828982017907, "grad_norm": 11.663198748608533, "learning_rate": 1.7253345406079161e-07, "logits/chosen": -2.875, "logits/rejected": -2.671875, "logps/chosen": -712.0, "logps/rejected": -1072.0, "loss": 0.1913, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.375, "rewards/margins": 3.765625, "rewards/rejected": -9.125, "step": 8510 }, { "epoch": 0.6410352870363404, "grad_norm": 9.445575901580403, "learning_rate": 1.719094087965148e-07, "logits/chosen": -2.859375, "logits/rejected": -2.515625, "logps/chosen": -688.0, "logps/rejected": -1104.0, "loss": 0.1721, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.125, "rewards/rejected": -9.375, "step": 8520 }, { "epoch": 0.64178767587089, "grad_norm": 7.603228701217873, "learning_rate": 1.7128590225239515e-07, "logits/chosen": -2.9375, "logits/rejected": -2.6875, "logps/chosen": -656.0, "logps/rejected": -1048.0, "loss": 0.1947, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.03125, "rewards/rejected": -9.0, "step": 8530 }, { "epoch": 0.6425400647054398, "grad_norm": 11.514964298081834, "learning_rate": 1.7066293872979005e-07, "logits/chosen": -2.765625, "logits/rejected": -2.5, "logps/chosen": -700.0, "logps/rejected": -1080.0, "loss": 0.2049, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 3.90625, "rewards/rejected": -9.0625, "step": 8540 }, { "epoch": 0.6432924535399894, "grad_norm": 8.976859425848769, "learning_rate": 1.7004052252631085e-07, "logits/chosen": -2.8125, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1064.0, "loss": 0.1992, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.0, "rewards/rejected": -9.0625, "step": 8550 }, { "epoch": 0.6440448423745392, "grad_norm": 8.397828268335257, "learning_rate": 1.6941865793579302e-07, "logits/chosen": -2.75, "logits/rejected": -2.40625, "logps/chosen": -620.0, "logps/rejected": -1048.0, "loss": 0.177, "rewards/accuracies": 0.9375, "rewards/chosen": -4.625, "rewards/margins": 4.25, "rewards/rejected": -8.8125, "step": 8560 }, { "epoch": 0.6447972312090888, "grad_norm": 7.577018016800494, "learning_rate": 1.687973492482666e-07, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -644.0, "logps/rejected": -1024.0, "loss": 0.1597, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 3.8125, "rewards/rejected": -8.5625, "step": 8570 }, { "epoch": 0.6455496200436386, "grad_norm": 11.235843949131914, "learning_rate": 1.6817660074992694e-07, "logits/chosen": -2.84375, "logits/rejected": -2.65625, "logps/chosen": -644.0, "logps/rejected": -1016.0, "loss": 0.1608, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.875, "rewards/margins": 3.984375, "rewards/rejected": -8.875, "step": 8580 }, { "epoch": 0.6463020088781882, "grad_norm": 9.991635501784412, "learning_rate": 1.6755641672310456e-07, "logits/chosen": -2.84375, "logits/rejected": -2.53125, "logps/chosen": -664.0, "logps/rejected": -1080.0, "loss": 0.17, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 4.125, "rewards/rejected": -9.4375, "step": 8590 }, { "epoch": 0.647054397712738, "grad_norm": 10.353997306005274, "learning_rate": 1.6693680144623594e-07, "logits/chosen": -2.9375, "logits/rejected": -2.6875, "logps/chosen": -692.0, "logps/rejected": -1088.0, "loss": 0.1757, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.3125, "rewards/margins": 4.0625, "rewards/rejected": -9.375, "step": 8600 }, { "epoch": 0.6478067865472876, "grad_norm": 10.78521221480445, "learning_rate": 1.6631775919383398e-07, "logits/chosen": -2.890625, "logits/rejected": -2.46875, "logps/chosen": -688.0, "logps/rejected": -1112.0, "loss": 0.1858, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 4.4375, "rewards/rejected": -9.625, "step": 8610 }, { "epoch": 0.6485591753818374, "grad_norm": 9.525179269161058, "learning_rate": 1.656992942364585e-07, "logits/chosen": -2.828125, "logits/rejected": -2.515625, "logps/chosen": -688.0, "logps/rejected": -1096.0, "loss": 0.1698, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.25, "rewards/margins": 3.90625, "rewards/rejected": -9.1875, "step": 8620 }, { "epoch": 0.649311564216387, "grad_norm": 8.303050969135796, "learning_rate": 1.6508141084068682e-07, "logits/chosen": -2.8125, "logits/rejected": -2.484375, "logps/chosen": -664.0, "logps/rejected": -1056.0, "loss": 0.1718, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.0, "rewards/rejected": -9.125, "step": 8630 }, { "epoch": 0.6500639530509367, "grad_norm": 10.120955544801877, "learning_rate": 1.6446411326908415e-07, "logits/chosen": -2.890625, "logits/rejected": -2.5625, "logps/chosen": -704.0, "logps/rejected": -1120.0, "loss": 0.1684, "rewards/accuracies": 0.9375, "rewards/chosen": -5.4375, "rewards/margins": 4.375, "rewards/rejected": -9.8125, "step": 8640 }, { "epoch": 0.6508163418854864, "grad_norm": 7.469156784372811, "learning_rate": 1.6384740578017419e-07, "logits/chosen": -2.828125, "logits/rejected": -2.53125, "logps/chosen": -708.0, "logps/rejected": -1128.0, "loss": 0.1837, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.5, "rewards/margins": 4.28125, "rewards/rejected": -9.75, "step": 8650 }, { "epoch": 0.6515687307200361, "grad_norm": 10.847781757212124, "learning_rate": 1.6323129262841016e-07, "logits/chosen": -2.828125, "logits/rejected": -2.609375, "logps/chosen": -672.0, "logps/rejected": -1080.0, "loss": 0.1601, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.21875, "rewards/margins": 4.21875, "rewards/rejected": -9.4375, "step": 8660 }, { "epoch": 0.6523211195545858, "grad_norm": 11.914460280940185, "learning_rate": 1.626157780641449e-07, "logits/chosen": -2.90625, "logits/rejected": -2.546875, "logps/chosen": -676.0, "logps/rejected": -1048.0, "loss": 0.1815, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 3.953125, "rewards/rejected": -9.0625, "step": 8670 }, { "epoch": 0.6530735083891355, "grad_norm": 6.998242989519436, "learning_rate": 1.6200086633360203e-07, "logits/chosen": -2.828125, "logits/rejected": -2.453125, "logps/chosen": -664.0, "logps/rejected": -1072.0, "loss": 0.1642, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.1875, "rewards/rejected": -9.25, "step": 8680 }, { "epoch": 0.6538258972236852, "grad_norm": 11.464774461391972, "learning_rate": 1.6138656167884615e-07, "logits/chosen": -2.890625, "logits/rejected": -2.4375, "logps/chosen": -688.0, "logps/rejected": -1088.0, "loss": 0.2046, "rewards/accuracies": 0.90625, "rewards/chosen": -5.46875, "rewards/margins": 4.03125, "rewards/rejected": -9.5, "step": 8690 }, { "epoch": 0.6545782860582349, "grad_norm": 8.25921588629842, "learning_rate": 1.6077286833775407e-07, "logits/chosen": -2.828125, "logits/rejected": -2.5625, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.1626, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.09375, "rewards/rejected": -9.25, "step": 8700 }, { "epoch": 0.6553306748927846, "grad_norm": 8.075272226411743, "learning_rate": 1.6015979054398537e-07, "logits/chosen": -2.828125, "logits/rejected": -2.46875, "logps/chosen": -656.0, "logps/rejected": -1024.0, "loss": 0.1733, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0, "rewards/margins": 3.890625, "rewards/rejected": -8.875, "step": 8710 }, { "epoch": 0.6560830637273343, "grad_norm": 6.549709126315134, "learning_rate": 1.5954733252695297e-07, "logits/chosen": -2.8125, "logits/rejected": -2.484375, "logps/chosen": -628.0, "logps/rejected": -1000.0, "loss": 0.1884, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.84375, "rewards/margins": 3.734375, "rewards/rejected": -8.5625, "step": 8720 }, { "epoch": 0.656835452561884, "grad_norm": 6.948138061116944, "learning_rate": 1.5893549851179443e-07, "logits/chosen": -2.703125, "logits/rejected": -2.4375, "logps/chosen": -656.0, "logps/rejected": -1072.0, "loss": 0.1691, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.125, "rewards/rejected": -9.1875, "step": 8730 }, { "epoch": 0.6575878413964337, "grad_norm": 10.007531481704225, "learning_rate": 1.5832429271934216e-07, "logits/chosen": -2.765625, "logits/rejected": -2.453125, "logps/chosen": -688.0, "logps/rejected": -1048.0, "loss": 0.1776, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.28125, "rewards/margins": 3.71875, "rewards/rejected": -9.0, "step": 8740 }, { "epoch": 0.6583402302309834, "grad_norm": 11.37778642015286, "learning_rate": 1.5771371936609512e-07, "logits/chosen": -2.765625, "logits/rejected": -2.484375, "logps/chosen": -692.0, "logps/rejected": -1064.0, "loss": 0.1982, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.3125, "rewards/margins": 3.875, "rewards/rejected": -9.1875, "step": 8750 }, { "epoch": 0.6590926190655331, "grad_norm": 9.387826228804798, "learning_rate": 1.5710378266418909e-07, "logits/chosen": -2.859375, "logits/rejected": -2.453125, "logps/chosen": -684.0, "logps/rejected": -1120.0, "loss": 0.1817, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.28125, "rewards/margins": 4.40625, "rewards/rejected": -9.6875, "step": 8760 }, { "epoch": 0.6598450079000827, "grad_norm": 9.516184140445384, "learning_rate": 1.5649448682136768e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -648.0, "logps/rejected": -1032.0, "loss": 0.1815, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 3.953125, "rewards/rejected": -8.9375, "step": 8770 }, { "epoch": 0.6605973967346325, "grad_norm": 11.06259854813874, "learning_rate": 1.558858360409536e-07, "logits/chosen": -2.8125, "logits/rejected": -2.4375, "logps/chosen": -660.0, "logps/rejected": -1072.0, "loss": 0.1983, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.15625, "rewards/margins": 3.96875, "rewards/rejected": -9.125, "step": 8780 }, { "epoch": 0.6613497855691821, "grad_norm": 8.901877165227225, "learning_rate": 1.5527783452181947e-07, "logits/chosen": -2.71875, "logits/rejected": -2.578125, "logps/chosen": -700.0, "logps/rejected": -1080.0, "loss": 0.1985, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.3125, "rewards/margins": 4.0, "rewards/rejected": -9.3125, "step": 8790 }, { "epoch": 0.6621021744037319, "grad_norm": 9.179292363251772, "learning_rate": 1.5467048645835895e-07, "logits/chosen": -2.734375, "logits/rejected": -2.5, "logps/chosen": -684.0, "logps/rejected": -1048.0, "loss": 0.1684, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 3.828125, "rewards/rejected": -8.9375, "step": 8800 }, { "epoch": 0.6628545632382815, "grad_norm": 9.274499894853587, "learning_rate": 1.540637960404575e-07, "logits/chosen": -2.703125, "logits/rejected": -2.484375, "logps/chosen": -652.0, "logps/rejected": -992.0, "loss": 0.183, "rewards/accuracies": 0.90625, "rewards/chosen": -4.96875, "rewards/margins": 3.4375, "rewards/rejected": -8.375, "step": 8810 }, { "epoch": 0.6636069520728313, "grad_norm": 9.77106572903292, "learning_rate": 1.53457767453464e-07, "logits/chosen": -2.6875, "logits/rejected": -2.390625, "logps/chosen": -652.0, "logps/rejected": -1012.0, "loss": 0.1981, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 3.796875, "rewards/rejected": -8.6875, "step": 8820 }, { "epoch": 0.6643593409073809, "grad_norm": 9.949300022505936, "learning_rate": 1.528524048781613e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -616.0, "logps/rejected": -1032.0, "loss": 0.1754, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.1875, "rewards/rejected": -8.875, "step": 8830 }, { "epoch": 0.6651117297419307, "grad_norm": 8.253969377620539, "learning_rate": 1.5224771249073787e-07, "logits/chosen": -2.8125, "logits/rejected": -2.515625, "logps/chosen": -676.0, "logps/rejected": -1072.0, "loss": 0.183, "rewards/accuracies": 0.90625, "rewards/chosen": -5.1875, "rewards/margins": 3.953125, "rewards/rejected": -9.125, "step": 8840 }, { "epoch": 0.6658641185764803, "grad_norm": 12.424443319980968, "learning_rate": 1.5164369446275878e-07, "logits/chosen": -2.78125, "logits/rejected": -2.609375, "logps/chosen": -676.0, "logps/rejected": -1020.0, "loss": 0.2074, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.09375, "rewards/margins": 3.71875, "rewards/rejected": -8.8125, "step": 8850 }, { "epoch": 0.6666165074110301, "grad_norm": 8.456340145386433, "learning_rate": 1.5104035496113672e-07, "logits/chosen": -2.8125, "logits/rejected": -2.59375, "logps/chosen": -656.0, "logps/rejected": -1024.0, "loss": 0.1718, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 3.921875, "rewards/rejected": -8.75, "step": 8860 }, { "epoch": 0.6673688962455797, "grad_norm": 6.214576997509315, "learning_rate": 1.5043769814810374e-07, "logits/chosen": -2.65625, "logits/rejected": -2.28125, "logps/chosen": -636.0, "logps/rejected": -1048.0, "loss": 0.1833, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.71875, "rewards/margins": 4.125, "rewards/rejected": -8.875, "step": 8870 }, { "epoch": 0.6681212850801294, "grad_norm": 10.767908060045015, "learning_rate": 1.4983572818118213e-07, "logits/chosen": -2.671875, "logits/rejected": -2.28125, "logps/chosen": -656.0, "logps/rejected": -1048.0, "loss": 0.1946, "rewards/accuracies": 0.90625, "rewards/chosen": -4.9375, "rewards/margins": 3.8125, "rewards/rejected": -8.75, "step": 8880 }, { "epoch": 0.6688736739146791, "grad_norm": 10.280012486077073, "learning_rate": 1.4923444921315575e-07, "logits/chosen": -2.703125, "logits/rejected": -2.40625, "logps/chosen": -636.0, "logps/rejected": -1040.0, "loss": 0.1904, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.75, "rewards/margins": 4.0, "rewards/rejected": -8.75, "step": 8890 }, { "epoch": 0.6696260627492288, "grad_norm": 6.968371247196364, "learning_rate": 1.4863386539204167e-07, "logits/chosen": -2.625, "logits/rejected": -2.484375, "logps/chosen": -656.0, "logps/rejected": -1012.0, "loss": 0.1847, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 3.71875, "rewards/rejected": -8.5625, "step": 8900 }, { "epoch": 0.6703784515837785, "grad_norm": 8.052669523242836, "learning_rate": 1.480339808610614e-07, "logits/chosen": -2.828125, "logits/rejected": -2.515625, "logps/chosen": -624.0, "logps/rejected": -1040.0, "loss": 0.1789, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 4.125, "rewards/rejected": -8.8125, "step": 8910 }, { "epoch": 0.6711308404183282, "grad_norm": 11.694396467204639, "learning_rate": 1.47434799758612e-07, "logits/chosen": -2.6875, "logits/rejected": -2.40625, "logps/chosen": -668.0, "logps/rejected": -1012.0, "loss": 0.1953, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0, "rewards/margins": 3.71875, "rewards/rejected": -8.75, "step": 8920 }, { "epoch": 0.6718832292528779, "grad_norm": 10.816484640860933, "learning_rate": 1.4683632621823822e-07, "logits/chosen": -2.5625, "logits/rejected": -2.375, "logps/chosen": -668.0, "logps/rejected": -1020.0, "loss": 0.1864, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 3.796875, "rewards/rejected": -8.8125, "step": 8930 }, { "epoch": 0.6726356180874276, "grad_norm": 10.54852159257338, "learning_rate": 1.4623856436860322e-07, "logits/chosen": -2.625, "logits/rejected": -2.390625, "logps/chosen": -640.0, "logps/rejected": -1056.0, "loss": 0.2048, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.875, "rewards/margins": 3.96875, "rewards/rejected": -8.875, "step": 8940 }, { "epoch": 0.6733880069219773, "grad_norm": 8.950735778962162, "learning_rate": 1.4564151833346072e-07, "logits/chosen": -2.671875, "logits/rejected": -2.421875, "logps/chosen": -696.0, "logps/rejected": -1064.0, "loss": 0.2021, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.21875, "rewards/margins": 3.84375, "rewards/rejected": -9.0625, "step": 8950 }, { "epoch": 0.674140395756527, "grad_norm": 8.68936003944014, "learning_rate": 1.4504519223162602e-07, "logits/chosen": -2.671875, "logits/rejected": -2.40625, "logps/chosen": -680.0, "logps/rejected": -1032.0, "loss": 0.1832, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.578125, "rewards/rejected": -8.75, "step": 8960 }, { "epoch": 0.6748927845910767, "grad_norm": 9.489397991424491, "learning_rate": 1.4444959017694827e-07, "logits/chosen": -2.640625, "logits/rejected": -2.484375, "logps/chosen": -668.0, "logps/rejected": -1032.0, "loss": 0.1752, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 3.8125, "rewards/rejected": -8.8125, "step": 8970 }, { "epoch": 0.6756451734256264, "grad_norm": 9.236883649300923, "learning_rate": 1.4385471627828112e-07, "logits/chosen": -2.71875, "logits/rejected": -2.390625, "logps/chosen": -700.0, "logps/rejected": -1072.0, "loss": 0.1826, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.3125, "rewards/margins": 3.78125, "rewards/rejected": -9.0625, "step": 8980 }, { "epoch": 0.6763975622601761, "grad_norm": 9.774332294200663, "learning_rate": 1.4326057463945544e-07, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -660.0, "logps/rejected": -1088.0, "loss": 0.1658, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.125, "rewards/rejected": -9.1875, "step": 8990 }, { "epoch": 0.6771499510947258, "grad_norm": 8.167283611622954, "learning_rate": 1.4266716935925025e-07, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1024.0, "loss": 0.1437, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 3.90625, "rewards/rejected": -9.0, "step": 9000 }, { "epoch": 0.6779023399292754, "grad_norm": 8.628674441098303, "learning_rate": 1.4207450453136434e-07, "logits/chosen": -2.8125, "logits/rejected": -2.59375, "logps/chosen": -704.0, "logps/rejected": -1128.0, "loss": 0.186, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.375, "rewards/margins": 4.375, "rewards/rejected": -9.75, "step": 9010 }, { "epoch": 0.6786547287638252, "grad_norm": 10.650440588006587, "learning_rate": 1.4148258424438912e-07, "logits/chosen": -2.828125, "logits/rejected": -2.578125, "logps/chosen": -736.0, "logps/rejected": -1104.0, "loss": 0.1771, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.65625, "rewards/margins": 3.828125, "rewards/rejected": -9.5, "step": 9020 }, { "epoch": 0.6794071175983748, "grad_norm": 8.996896689300682, "learning_rate": 1.4089141258177894e-07, "logits/chosen": -2.765625, "logits/rejected": -2.5625, "logps/chosen": -672.0, "logps/rejected": -1048.0, "loss": 0.1569, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.125, "rewards/margins": 3.9375, "rewards/rejected": -9.0625, "step": 9030 }, { "epoch": 0.6801595064329246, "grad_norm": 13.995169040536673, "learning_rate": 1.4030099362182429e-07, "logits/chosen": -2.90625, "logits/rejected": -2.546875, "logps/chosen": -716.0, "logps/rejected": -1160.0, "loss": 0.1598, "rewards/accuracies": 0.9375, "rewards/chosen": -5.5625, "rewards/margins": 4.5625, "rewards/rejected": -10.125, "step": 9040 }, { "epoch": 0.6809118952674742, "grad_norm": 10.307881060915332, "learning_rate": 1.3971133143762255e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -696.0, "logps/rejected": -1152.0, "loss": 0.1824, "rewards/accuracies": 0.9375, "rewards/chosen": -5.375, "rewards/margins": 4.53125, "rewards/rejected": -9.875, "step": 9050 }, { "epoch": 0.681664284102024, "grad_norm": 10.583713776190244, "learning_rate": 1.3912243009705043e-07, "logits/chosen": -2.765625, "logits/rejected": -2.546875, "logps/chosen": -752.0, "logps/rejected": -1128.0, "loss": 0.1965, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.875, "rewards/margins": 3.875, "rewards/rejected": -9.75, "step": 9060 }, { "epoch": 0.6824166729365736, "grad_norm": 8.823124004557704, "learning_rate": 1.3853429366273617e-07, "logits/chosen": -2.890625, "logits/rejected": -2.453125, "logps/chosen": -700.0, "logps/rejected": -1096.0, "loss": 0.192, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.375, "rewards/margins": 4.0625, "rewards/rejected": -9.4375, "step": 9070 }, { "epoch": 0.6831690617711234, "grad_norm": 11.27009671510071, "learning_rate": 1.379469261920308e-07, "logits/chosen": -2.671875, "logits/rejected": -2.421875, "logps/chosen": -704.0, "logps/rejected": -1088.0, "loss": 0.1684, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.3125, "rewards/margins": 4.03125, "rewards/rejected": -9.375, "step": 9080 }, { "epoch": 0.683921450605673, "grad_norm": 7.825518749938118, "learning_rate": 1.373603317369807e-07, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -708.0, "logps/rejected": -1136.0, "loss": 0.1885, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.4375, "rewards/margins": 4.375, "rewards/rejected": -9.8125, "step": 9090 }, { "epoch": 0.6846738394402228, "grad_norm": 8.501497320045095, "learning_rate": 1.3677451434429945e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5625, "logps/chosen": -676.0, "logps/rejected": -1088.0, "loss": 0.173, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.1875, "rewards/rejected": -9.5, "step": 9100 }, { "epoch": 0.6854262282747724, "grad_norm": 7.399791243037484, "learning_rate": 1.3618947805533993e-07, "logits/chosen": -2.65625, "logits/rejected": -2.484375, "logps/chosen": -700.0, "logps/rejected": -1080.0, "loss": 0.1628, "rewards/accuracies": 0.9375, "rewards/chosen": -5.40625, "rewards/margins": 3.90625, "rewards/rejected": -9.3125, "step": 9110 }, { "epoch": 0.6861786171093222, "grad_norm": 7.0064300761151745, "learning_rate": 1.3560522690606657e-07, "logits/chosen": -2.765625, "logits/rejected": -2.484375, "logps/chosen": -708.0, "logps/rejected": -1152.0, "loss": 0.1753, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.625, "rewards/margins": 4.4375, "rewards/rejected": -10.0625, "step": 9120 }, { "epoch": 0.6869310059438718, "grad_norm": 9.795949518702054, "learning_rate": 1.3502176492702732e-07, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -700.0, "logps/rejected": -1128.0, "loss": 0.1783, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.46875, "rewards/rejected": -9.75, "step": 9130 }, { "epoch": 0.6876833947784214, "grad_norm": 11.25158699559909, "learning_rate": 1.344390961433257e-07, "logits/chosen": -2.75, "logits/rejected": -2.4375, "logps/chosen": -676.0, "logps/rejected": -1056.0, "loss": 0.1681, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 3.96875, "rewards/rejected": -9.0625, "step": 9140 }, { "epoch": 0.6884357836129712, "grad_norm": 6.844617722011671, "learning_rate": 1.338572245745937e-07, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -660.0, "logps/rejected": -1072.0, "loss": 0.171, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.125, "rewards/margins": 4.1875, "rewards/rejected": -9.3125, "step": 9150 }, { "epoch": 0.6891881724475208, "grad_norm": 11.525771622235945, "learning_rate": 1.3327615423496324e-07, "logits/chosen": -2.703125, "logits/rejected": -2.5, "logps/chosen": -688.0, "logps/rejected": -1080.0, "loss": 0.1894, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.125, "rewards/margins": 4.1875, "rewards/rejected": -9.3125, "step": 9160 }, { "epoch": 0.6899405612820706, "grad_norm": 7.667288119796913, "learning_rate": 1.326958891330388e-07, "logits/chosen": -2.8125, "logits/rejected": -2.453125, "logps/chosen": -712.0, "logps/rejected": -1112.0, "loss": 0.1762, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.5, "rewards/margins": 3.890625, "rewards/rejected": -9.375, "step": 9170 }, { "epoch": 0.6906929501166202, "grad_norm": 8.78840029740877, "learning_rate": 1.3211643327187028e-07, "logits/chosen": -2.796875, "logits/rejected": -2.53125, "logps/chosen": -704.0, "logps/rejected": -1104.0, "loss": 0.1767, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.5, "rewards/margins": 4.03125, "rewards/rejected": -9.5625, "step": 9180 }, { "epoch": 0.69144533895117, "grad_norm": 10.296498754282055, "learning_rate": 1.3153779064892417e-07, "logits/chosen": -2.78125, "logits/rejected": -2.609375, "logps/chosen": -692.0, "logps/rejected": -1104.0, "loss": 0.187, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.21875, "rewards/margins": 4.3125, "rewards/rejected": -9.5625, "step": 9190 }, { "epoch": 0.6921977277857196, "grad_norm": 13.779489718183672, "learning_rate": 1.309599652560574e-07, "logits/chosen": -2.859375, "logits/rejected": -2.578125, "logps/chosen": -652.0, "logps/rejected": -1088.0, "loss": 0.1735, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.3125, "rewards/rejected": -9.375, "step": 9200 }, { "epoch": 0.6929501166202694, "grad_norm": 9.16509339113319, "learning_rate": 1.3038296107948872e-07, "logits/chosen": -2.890625, "logits/rejected": -2.515625, "logps/chosen": -700.0, "logps/rejected": -1104.0, "loss": 0.1817, "rewards/accuracies": 0.90625, "rewards/chosen": -5.25, "rewards/margins": 4.09375, "rewards/rejected": -9.3125, "step": 9210 }, { "epoch": 0.693702505454819, "grad_norm": 10.21618698882234, "learning_rate": 1.2980678209977158e-07, "logits/chosen": -2.8125, "logits/rejected": -2.578125, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.1713, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 4.15625, "rewards/rejected": -9.375, "step": 9220 }, { "epoch": 0.6944548942893688, "grad_norm": 11.642858173571353, "learning_rate": 1.2923143229176696e-07, "logits/chosen": -2.734375, "logits/rejected": -2.4375, "logps/chosen": -716.0, "logps/rejected": -1080.0, "loss": 0.2089, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.4375, "rewards/margins": 3.890625, "rewards/rejected": -9.3125, "step": 9230 }, { "epoch": 0.6952072831239184, "grad_norm": 7.082808348259589, "learning_rate": 1.286569156246154e-07, "logits/chosen": -2.890625, "logits/rejected": -2.515625, "logps/chosen": -716.0, "logps/rejected": -1128.0, "loss": 0.15, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.625, "rewards/margins": 4.15625, "rewards/rejected": -9.8125, "step": 9240 }, { "epoch": 0.6959596719584681, "grad_norm": 8.349773826508297, "learning_rate": 1.2808323606171006e-07, "logits/chosen": -2.859375, "logits/rejected": -2.515625, "logps/chosen": -708.0, "logps/rejected": -1080.0, "loss": 0.1748, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.46875, "rewards/margins": 4.03125, "rewards/rejected": -9.5, "step": 9250 }, { "epoch": 0.6967120607930178, "grad_norm": 9.112706785252943, "learning_rate": 1.2751039756066907e-07, "logits/chosen": -2.71875, "logits/rejected": -2.453125, "logps/chosen": -692.0, "logps/rejected": -1088.0, "loss": 0.1645, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.0625, "rewards/rejected": -9.25, "step": 9260 }, { "epoch": 0.6974644496275675, "grad_norm": 10.470465403023757, "learning_rate": 1.2693840407330838e-07, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1935, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.21875, "rewards/margins": 3.9375, "rewards/rejected": -9.125, "step": 9270 }, { "epoch": 0.6982168384621172, "grad_norm": 9.337580317905838, "learning_rate": 1.2636725954561479e-07, "logits/chosen": -2.828125, "logits/rejected": -2.46875, "logps/chosen": -680.0, "logps/rejected": -1120.0, "loss": 0.1863, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.59375, "rewards/rejected": -9.625, "step": 9280 }, { "epoch": 0.6989692272966669, "grad_norm": 9.891066885435336, "learning_rate": 1.257969679177177e-07, "logits/chosen": -2.8125, "logits/rejected": -2.421875, "logps/chosen": -648.0, "logps/rejected": -1096.0, "loss": 0.1777, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.375, "rewards/rejected": -9.25, "step": 9290 }, { "epoch": 0.6997216161312166, "grad_norm": 8.508062070892356, "learning_rate": 1.2522753312386347e-07, "logits/chosen": -2.875, "logits/rejected": -2.46875, "logps/chosen": -664.0, "logps/rejected": -1056.0, "loss": 0.1799, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.0625, "rewards/rejected": -9.0, "step": 9300 }, { "epoch": 0.7004740049657663, "grad_norm": 9.102667875750308, "learning_rate": 1.2465895909238698e-07, "logits/chosen": -2.75, "logits/rejected": -2.546875, "logps/chosen": -680.0, "logps/rejected": -1032.0, "loss": 0.1791, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 3.796875, "rewards/rejected": -8.8125, "step": 9310 }, { "epoch": 0.701226393800316, "grad_norm": 11.374327168541502, "learning_rate": 1.24091249745685e-07, "logits/chosen": -2.6875, "logits/rejected": -2.46875, "logps/chosen": -656.0, "logps/rejected": -1056.0, "loss": 0.1778, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 4.21875, "rewards/rejected": -9.0625, "step": 9320 }, { "epoch": 0.7019787826348657, "grad_norm": 8.211157418676647, "learning_rate": 1.2352440900018943e-07, "logits/chosen": -2.828125, "logits/rejected": -2.40625, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.1684, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.0625, "rewards/rejected": -9.1875, "step": 9330 }, { "epoch": 0.7027311714694154, "grad_norm": 10.469371653774871, "learning_rate": 1.2295844076633973e-07, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -660.0, "logps/rejected": -1072.0, "loss": 0.191, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.25, "rewards/rejected": -9.3125, "step": 9340 }, { "epoch": 0.7034835603039651, "grad_norm": 9.42145146901656, "learning_rate": 1.2239334894855622e-07, "logits/chosen": -2.75, "logits/rejected": -2.4375, "logps/chosen": -636.0, "logps/rejected": -1040.0, "loss": 0.1921, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 4.1875, "rewards/rejected": -9.0, "step": 9350 }, { "epoch": 0.7042359491385148, "grad_norm": 8.009475055610869, "learning_rate": 1.2182913744521332e-07, "logits/chosen": -2.828125, "logits/rejected": -2.46875, "logps/chosen": -652.0, "logps/rejected": -992.0, "loss": 0.1896, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.9375, "rewards/margins": 3.5625, "rewards/rejected": -8.5, "step": 9360 }, { "epoch": 0.7049883379730645, "grad_norm": 10.882108464272054, "learning_rate": 1.2126581014861227e-07, "logits/chosen": -2.6875, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1024.0, "loss": 0.1932, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.96875, "rewards/margins": 3.75, "rewards/rejected": -8.6875, "step": 9370 }, { "epoch": 0.7057407268076141, "grad_norm": 6.842288897849046, "learning_rate": 1.207033709449545e-07, "logits/chosen": -2.65625, "logits/rejected": -2.484375, "logps/chosen": -652.0, "logps/rejected": -1016.0, "loss": 0.1937, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.0, "rewards/margins": 3.8125, "rewards/rejected": -8.8125, "step": 9380 }, { "epoch": 0.7064931156421639, "grad_norm": 9.393673021501025, "learning_rate": 1.2014182371431486e-07, "logits/chosen": -2.765625, "logits/rejected": -2.40625, "logps/chosen": -640.0, "logps/rejected": -1024.0, "loss": 0.1577, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.125, "rewards/rejected": -8.875, "step": 9390 }, { "epoch": 0.7072455044767135, "grad_norm": 10.770423337068612, "learning_rate": 1.1958117233061465e-07, "logits/chosen": -2.75, "logits/rejected": -2.390625, "logps/chosen": -664.0, "logps/rejected": -1056.0, "loss": 0.1626, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.09375, "rewards/rejected": -9.0625, "step": 9400 }, { "epoch": 0.7079978933112633, "grad_norm": 21.808391597326604, "learning_rate": 1.1902142066159529e-07, "logits/chosen": -2.796875, "logits/rejected": -2.5625, "logps/chosen": -668.0, "logps/rejected": -1104.0, "loss": 0.1889, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0, "rewards/margins": 4.3125, "rewards/rejected": -9.3125, "step": 9410 }, { "epoch": 0.7087502821458129, "grad_norm": 10.702203908885501, "learning_rate": 1.1846257256879116e-07, "logits/chosen": -2.796875, "logits/rejected": -2.4375, "logps/chosen": -680.0, "logps/rejected": -1072.0, "loss": 0.1845, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.15625, "rewards/margins": 4.0625, "rewards/rejected": -9.25, "step": 9420 }, { "epoch": 0.7095026709803627, "grad_norm": 11.364598529357458, "learning_rate": 1.1790463190750313e-07, "logits/chosen": -2.90625, "logits/rejected": -2.578125, "logps/chosen": -660.0, "logps/rejected": -1056.0, "loss": 0.1853, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.28125, "rewards/rejected": -9.25, "step": 9430 }, { "epoch": 0.7102550598149123, "grad_norm": 8.753631634329706, "learning_rate": 1.1734760252677228e-07, "logits/chosen": -2.703125, "logits/rejected": -2.484375, "logps/chosen": -652.0, "logps/rejected": -1032.0, "loss": 0.1669, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.84375, "rewards/margins": 3.953125, "rewards/rejected": -8.8125, "step": 9440 }, { "epoch": 0.7110074486494621, "grad_norm": 9.94382735225419, "learning_rate": 1.1679148826935284e-07, "logits/chosen": -2.71875, "logits/rejected": -2.515625, "logps/chosen": -656.0, "logps/rejected": -1032.0, "loss": 0.1847, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 3.796875, "rewards/rejected": -8.8125, "step": 9450 }, { "epoch": 0.7117598374840117, "grad_norm": 9.838493336923607, "learning_rate": 1.1623629297168599e-07, "logits/chosen": -2.78125, "logits/rejected": -2.4375, "logps/chosen": -616.0, "logps/rejected": -1024.0, "loss": 0.1896, "rewards/accuracies": 0.90625, "rewards/chosen": -4.625, "rewards/margins": 4.09375, "rewards/rejected": -8.6875, "step": 9460 }, { "epoch": 0.7125122263185615, "grad_norm": 9.585492473960032, "learning_rate": 1.1568202046387334e-07, "logits/chosen": -2.71875, "logits/rejected": -2.484375, "logps/chosen": -664.0, "logps/rejected": -1016.0, "loss": 0.1807, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.03125, "rewards/margins": 3.546875, "rewards/rejected": -8.5625, "step": 9470 }, { "epoch": 0.7132646151531111, "grad_norm": 9.265983416511775, "learning_rate": 1.1512867456965036e-07, "logits/chosen": -2.890625, "logits/rejected": -2.59375, "logps/chosen": -644.0, "logps/rejected": -1056.0, "loss": 0.1676, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 4.34375, "rewards/rejected": -9.0625, "step": 9480 }, { "epoch": 0.7140170039876608, "grad_norm": 9.23155140930734, "learning_rate": 1.1457625910636042e-07, "logits/chosen": -2.84375, "logits/rejected": -2.359375, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.1809, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 4.21875, "rewards/rejected": -9.3125, "step": 9490 }, { "epoch": 0.7147693928222105, "grad_norm": 10.055006739194052, "learning_rate": 1.1402477788492796e-07, "logits/chosen": -2.859375, "logits/rejected": -2.546875, "logps/chosen": -664.0, "logps/rejected": -1072.0, "loss": 0.2063, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.125, "rewards/margins": 4.0625, "rewards/rejected": -9.1875, "step": 9500 }, { "epoch": 0.7155217816567602, "grad_norm": 9.051779429722119, "learning_rate": 1.134742347098323e-07, "logits/chosen": -2.765625, "logits/rejected": -2.40625, "logps/chosen": -644.0, "logps/rejected": -1040.0, "loss": 0.1737, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.15625, "rewards/rejected": -9.0625, "step": 9510 }, { "epoch": 0.7162741704913099, "grad_norm": 9.738489694790811, "learning_rate": 1.1292463337908185e-07, "logits/chosen": -2.71875, "logits/rejected": -2.46875, "logps/chosen": -708.0, "logps/rejected": -1056.0, "loss": 0.1818, "rewards/accuracies": 0.90625, "rewards/chosen": -5.46875, "rewards/margins": 3.671875, "rewards/rejected": -9.125, "step": 9520 }, { "epoch": 0.7170265593258596, "grad_norm": 7.3835263007077785, "learning_rate": 1.1237597768418714e-07, "logits/chosen": -2.734375, "logits/rejected": -2.390625, "logps/chosen": -684.0, "logps/rejected": -1040.0, "loss": 0.1705, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 3.796875, "rewards/rejected": -8.875, "step": 9530 }, { "epoch": 0.7177789481604093, "grad_norm": 9.635982346463374, "learning_rate": 1.1182827141013549e-07, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -664.0, "logps/rejected": -1048.0, "loss": 0.1891, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.0625, "rewards/rejected": -9.125, "step": 9540 }, { "epoch": 0.718531336994959, "grad_norm": 9.078747354114716, "learning_rate": 1.112815183353642e-07, "logits/chosen": -2.671875, "logits/rejected": -2.4375, "logps/chosen": -684.0, "logps/rejected": -1080.0, "loss": 0.1714, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.15625, "rewards/margins": 4.1875, "rewards/rejected": -9.3125, "step": 9550 }, { "epoch": 0.7192837258295087, "grad_norm": 7.906910158972716, "learning_rate": 1.1073572223173489e-07, "logits/chosen": -2.6875, "logits/rejected": -2.40625, "logps/chosen": -672.0, "logps/rejected": -1048.0, "loss": 0.1691, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0625, "rewards/margins": 3.875, "rewards/rejected": -8.9375, "step": 9560 }, { "epoch": 0.7200361146640584, "grad_norm": 9.668892587581858, "learning_rate": 1.1019088686450731e-07, "logits/chosen": -2.84375, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1004.0, "loss": 0.1994, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.0625, "rewards/margins": 3.4375, "rewards/rejected": -8.5, "step": 9570 }, { "epoch": 0.7207885034986081, "grad_norm": 10.8770164787015, "learning_rate": 1.0964701599231341e-07, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -644.0, "logps/rejected": -1056.0, "loss": 0.1706, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.25, "rewards/rejected": -9.0625, "step": 9580 }, { "epoch": 0.7215408923331578, "grad_norm": 8.388635696346707, "learning_rate": 1.091041133671316e-07, "logits/chosen": -2.8125, "logits/rejected": -2.65625, "logps/chosen": -668.0, "logps/rejected": -1040.0, "loss": 0.1709, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.15625, "rewards/margins": 3.828125, "rewards/rejected": -9.0, "step": 9590 }, { "epoch": 0.7222932811677075, "grad_norm": 7.955366283806861, "learning_rate": 1.0856218273426049e-07, "logits/chosen": -2.78125, "logits/rejected": -2.546875, "logps/chosen": -628.0, "logps/rejected": -1072.0, "loss": 0.1787, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 4.6875, "rewards/rejected": -9.4375, "step": 9600 }, { "epoch": 0.7230456700022572, "grad_norm": 10.223005539532137, "learning_rate": 1.0802122783229323e-07, "logits/chosen": -2.828125, "logits/rejected": -2.5, "logps/chosen": -652.0, "logps/rejected": -1048.0, "loss": 0.1659, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.0625, "rewards/rejected": -9.0625, "step": 9610 }, { "epoch": 0.7237980588368068, "grad_norm": 10.96860806650249, "learning_rate": 1.0748125239309197e-07, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -692.0, "logps/rejected": -1104.0, "loss": 0.1728, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.3125, "rewards/rejected": -9.5625, "step": 9620 }, { "epoch": 0.7245504476713566, "grad_norm": 8.692095566318107, "learning_rate": 1.0694226014176167e-07, "logits/chosen": -2.828125, "logits/rejected": -2.4375, "logps/chosen": -640.0, "logps/rejected": -1056.0, "loss": 0.1753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.875, "rewards/margins": 4.21875, "rewards/rejected": -9.0625, "step": 9630 }, { "epoch": 0.7253028365059062, "grad_norm": 9.805529480204843, "learning_rate": 1.0640425479662465e-07, "logits/chosen": -2.8125, "logits/rejected": -2.484375, "logps/chosen": -644.0, "logps/rejected": -1088.0, "loss": 0.1719, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 4.40625, "rewards/rejected": -9.25, "step": 9640 }, { "epoch": 0.726055225340456, "grad_norm": 8.46465203189475, "learning_rate": 1.0586724006919496e-07, "logits/chosen": -2.859375, "logits/rejected": -2.53125, "logps/chosen": -636.0, "logps/rejected": -1024.0, "loss": 0.162, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 3.828125, "rewards/rejected": -8.6875, "step": 9650 }, { "epoch": 0.7268076141750056, "grad_norm": 8.467009438269589, "learning_rate": 1.0533121966415257e-07, "logits/chosen": -2.84375, "logits/rejected": -2.578125, "logps/chosen": -648.0, "logps/rejected": -1032.0, "loss": 0.1656, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.90625, "rewards/margins": 3.984375, "rewards/rejected": -8.875, "step": 9660 }, { "epoch": 0.7275600030095554, "grad_norm": 9.939899833719256, "learning_rate": 1.0479619727931827e-07, "logits/chosen": -2.875, "logits/rejected": -2.5625, "logps/chosen": -664.0, "logps/rejected": -1080.0, "loss": 0.1605, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.28125, "rewards/rejected": -9.3125, "step": 9670 }, { "epoch": 0.728312391844105, "grad_norm": 9.86220382562208, "learning_rate": 1.0426217660562758e-07, "logits/chosen": -2.734375, "logits/rejected": -2.34375, "logps/chosen": -676.0, "logps/rejected": -1040.0, "loss": 0.1803, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 3.796875, "rewards/rejected": -8.875, "step": 9680 }, { "epoch": 0.7290647806786548, "grad_norm": 7.5288526501323005, "learning_rate": 1.0372916132710555e-07, "logits/chosen": -2.703125, "logits/rejected": -2.484375, "logps/chosen": -696.0, "logps/rejected": -1016.0, "loss": 0.1591, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.0625, "rewards/margins": 3.65625, "rewards/rejected": -8.6875, "step": 9690 }, { "epoch": 0.7298171695132044, "grad_norm": 12.651261448629873, "learning_rate": 1.031971551208416e-07, "logits/chosen": -2.703125, "logits/rejected": -2.46875, "logps/chosen": -620.0, "logps/rejected": -1032.0, "loss": 0.1852, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.1875, "rewards/rejected": -8.8125, "step": 9700 }, { "epoch": 0.7305695583477542, "grad_norm": 6.553804927250272, "learning_rate": 1.026661616569637e-07, "logits/chosen": -2.671875, "logits/rejected": -2.375, "logps/chosen": -680.0, "logps/rejected": -1040.0, "loss": 0.1711, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.25, "rewards/margins": 3.609375, "rewards/rejected": -8.875, "step": 9710 }, { "epoch": 0.7313219471823038, "grad_norm": 6.778448048748569, "learning_rate": 1.0213618459861321e-07, "logits/chosen": -2.8125, "logits/rejected": -2.40625, "logps/chosen": -652.0, "logps/rejected": -1032.0, "loss": 0.1722, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0625, "rewards/margins": 3.78125, "rewards/rejected": -8.8125, "step": 9720 }, { "epoch": 0.7320743360168535, "grad_norm": 10.188751496163695, "learning_rate": 1.0160722760192e-07, "logits/chosen": -2.734375, "logits/rejected": -2.46875, "logps/chosen": -660.0, "logps/rejected": -1056.0, "loss": 0.1754, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.1875, "rewards/rejected": -9.1875, "step": 9730 }, { "epoch": 0.7328267248514032, "grad_norm": 9.468220885030195, "learning_rate": 1.010792943159763e-07, "logits/chosen": -2.8125, "logits/rejected": -2.546875, "logps/chosen": -700.0, "logps/rejected": -1072.0, "loss": 0.1825, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 3.78125, "rewards/rejected": -9.0, "step": 9740 }, { "epoch": 0.7335791136859529, "grad_norm": 10.86395812337371, "learning_rate": 1.0055238838281275e-07, "logits/chosen": -2.75, "logits/rejected": -2.421875, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.1707, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 3.90625, "rewards/rejected": -9.0625, "step": 9750 }, { "epoch": 0.7343315025205026, "grad_norm": 8.879896833215565, "learning_rate": 1.000265134373722e-07, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -692.0, "logps/rejected": -1032.0, "loss": 0.1886, "rewards/accuracies": 0.90625, "rewards/chosen": -5.28125, "rewards/margins": 3.546875, "rewards/rejected": -8.8125, "step": 9760 }, { "epoch": 0.7350838913550523, "grad_norm": 11.673191469930098, "learning_rate": 9.950167310748516e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -680.0, "logps/rejected": -1024.0, "loss": 0.2021, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0, "rewards/margins": 3.71875, "rewards/rejected": -8.75, "step": 9770 }, { "epoch": 0.735836280189602, "grad_norm": 8.480948647247656, "learning_rate": 9.897787101384492e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5, "logps/chosen": -676.0, "logps/rejected": -1072.0, "loss": 0.1879, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.0, "rewards/rejected": -9.1875, "step": 9780 }, { "epoch": 0.7365886690241517, "grad_norm": 11.316343261425263, "learning_rate": 9.845511076998195e-08, "logits/chosen": -2.734375, "logits/rejected": -2.5, "logps/chosen": -652.0, "logps/rejected": -1024.0, "loss": 0.1941, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 3.84375, "rewards/rejected": -8.75, "step": 9790 }, { "epoch": 0.7373410578587014, "grad_norm": 7.946997283617458, "learning_rate": 9.79333959822397e-08, "logits/chosen": -2.796875, "logits/rejected": -2.421875, "logps/chosen": -652.0, "logps/rejected": -1088.0, "loss": 0.1749, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.34375, "rewards/rejected": -9.25, "step": 9800 }, { "epoch": 0.738093446693251, "grad_norm": 11.014704537409067, "learning_rate": 9.741273024974919e-08, "logits/chosen": -2.78125, "logits/rejected": -2.4375, "logps/chosen": -700.0, "logps/rejected": -1096.0, "loss": 0.1855, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.4375, "rewards/margins": 3.921875, "rewards/rejected": -9.3125, "step": 9810 }, { "epoch": 0.7388458355278008, "grad_norm": 7.359825088225063, "learning_rate": 9.68931171644044e-08, "logits/chosen": -2.703125, "logits/rejected": -2.390625, "logps/chosen": -692.0, "logps/rejected": -1048.0, "loss": 0.1879, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.375, "rewards/margins": 3.609375, "rewards/rejected": -9.0, "step": 9820 }, { "epoch": 0.7395982243623505, "grad_norm": 13.258771156337893, "learning_rate": 9.637456031083746e-08, "logits/chosen": -2.640625, "logits/rejected": -2.34375, "logps/chosen": -660.0, "logps/rejected": -1024.0, "loss": 0.1734, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 3.8125, "rewards/rejected": -8.625, "step": 9830 }, { "epoch": 0.7403506131969002, "grad_norm": 8.149307767061735, "learning_rate": 9.585706326639383e-08, "logits/chosen": -2.734375, "logits/rejected": -2.515625, "logps/chosen": -656.0, "logps/rejected": -996.0, "loss": 0.1707, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 3.578125, "rewards/rejected": -8.4375, "step": 9840 }, { "epoch": 0.7411030020314499, "grad_norm": 7.294894487594937, "learning_rate": 9.534062960110803e-08, "logits/chosen": -2.765625, "logits/rejected": -2.4375, "logps/chosen": -668.0, "logps/rejected": -1048.0, "loss": 0.1913, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 3.828125, "rewards/rejected": -8.9375, "step": 9850 }, { "epoch": 0.7418553908659995, "grad_norm": 6.963836197185632, "learning_rate": 9.482526287767836e-08, "logits/chosen": -2.71875, "logits/rejected": -2.4375, "logps/chosen": -676.0, "logps/rejected": -1032.0, "loss": 0.1938, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.640625, "rewards/rejected": -8.8125, "step": 9860 }, { "epoch": 0.7426077797005493, "grad_norm": 10.019178889430904, "learning_rate": 9.431096665144268e-08, "logits/chosen": -2.703125, "logits/rejected": -2.46875, "logps/chosen": -712.0, "logps/rejected": -1040.0, "loss": 0.18, "rewards/accuracies": 0.90625, "rewards/chosen": -5.40625, "rewards/margins": 3.609375, "rewards/rejected": -9.0, "step": 9870 }, { "epoch": 0.7433601685350989, "grad_norm": 11.771486703890151, "learning_rate": 9.379774447035408e-08, "logits/chosen": -2.9375, "logits/rejected": -2.40625, "logps/chosen": -656.0, "logps/rejected": -1080.0, "loss": 0.1924, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.28125, "rewards/rejected": -9.25, "step": 9880 }, { "epoch": 0.7441125573696487, "grad_norm": 8.416236345319431, "learning_rate": 9.328559987495602e-08, "logits/chosen": -2.8125, "logits/rejected": -2.4375, "logps/chosen": -648.0, "logps/rejected": -1064.0, "loss": 0.2017, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.03125, "rewards/rejected": -8.9375, "step": 9890 }, { "epoch": 0.7448649462041983, "grad_norm": 9.134436384409844, "learning_rate": 9.277453639835795e-08, "logits/chosen": -2.796875, "logits/rejected": -2.609375, "logps/chosen": -700.0, "logps/rejected": -1024.0, "loss": 0.1766, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.28125, "rewards/margins": 3.390625, "rewards/rejected": -8.6875, "step": 9900 }, { "epoch": 0.745617335038748, "grad_norm": 9.151728884375531, "learning_rate": 9.226455756621152e-08, "logits/chosen": -2.65625, "logits/rejected": -2.515625, "logps/chosen": -684.0, "logps/rejected": -1008.0, "loss": 0.1791, "rewards/accuracies": 0.90625, "rewards/chosen": -5.03125, "rewards/margins": 3.640625, "rewards/rejected": -8.6875, "step": 9910 }, { "epoch": 0.7463697238732977, "grad_norm": 10.840220850853532, "learning_rate": 9.175566689668506e-08, "logits/chosen": -2.71875, "logits/rejected": -2.40625, "logps/chosen": -636.0, "logps/rejected": -1024.0, "loss": 0.2038, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.09375, "rewards/rejected": -8.75, "step": 9920 }, { "epoch": 0.7471221127078475, "grad_norm": 9.358295766377482, "learning_rate": 9.124786790044076e-08, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -636.0, "logps/rejected": -1072.0, "loss": 0.1776, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 4.40625, "rewards/rejected": -9.1875, "step": 9930 }, { "epoch": 0.7478745015423971, "grad_norm": 7.152463775059365, "learning_rate": 9.074116408060931e-08, "logits/chosen": -2.75, "logits/rejected": -2.359375, "logps/chosen": -688.0, "logps/rejected": -1064.0, "loss": 0.1562, "rewards/accuracies": 0.90625, "rewards/chosen": -5.21875, "rewards/margins": 3.890625, "rewards/rejected": -9.125, "step": 9940 }, { "epoch": 0.7486268903769469, "grad_norm": 12.766664981308125, "learning_rate": 9.023555893276613e-08, "logits/chosen": -2.671875, "logits/rejected": -2.515625, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.1911, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.0, "rewards/rejected": -8.875, "step": 9950 }, { "epoch": 0.7493792792114965, "grad_norm": 10.858578672050218, "learning_rate": 8.973105594490766e-08, "logits/chosen": -2.71875, "logits/rejected": -2.453125, "logps/chosen": -640.0, "logps/rejected": -1032.0, "loss": 0.1734, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8125, "rewards/margins": 4.0625, "rewards/rejected": -8.875, "step": 9960 }, { "epoch": 0.7501316680460463, "grad_norm": 10.227524754185085, "learning_rate": 8.922765859742654e-08, "logits/chosen": -2.78125, "logits/rejected": -2.40625, "logps/chosen": -632.0, "logps/rejected": -1024.0, "loss": 0.1805, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 4.03125, "rewards/rejected": -8.8125, "step": 9970 }, { "epoch": 0.7508840568805959, "grad_norm": 7.220708688993811, "learning_rate": 8.872537036308802e-08, "logits/chosen": -2.71875, "logits/rejected": -2.546875, "logps/chosen": -680.0, "logps/rejected": -1072.0, "loss": 0.1596, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.03125, "rewards/rejected": -9.0, "step": 9980 }, { "epoch": 0.7516364457151455, "grad_norm": 8.982727497144737, "learning_rate": 8.822419470700624e-08, "logits/chosen": -2.84375, "logits/rejected": -2.53125, "logps/chosen": -644.0, "logps/rejected": -1072.0, "loss": 0.1833, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 4.34375, "rewards/rejected": -9.1875, "step": 9990 }, { "epoch": 0.7523888345496953, "grad_norm": 7.428998419527347, "learning_rate": 8.772413508661972e-08, "logits/chosen": -2.6875, "logits/rejected": -2.421875, "logps/chosen": -668.0, "logps/rejected": -1048.0, "loss": 0.1665, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.03125, "rewards/margins": 3.796875, "rewards/rejected": -8.8125, "step": 10000 }, { "epoch": 0.7523888345496953, "eval_logits/chosen": -2.75, "eval_logits/rejected": -2.46875, "eval_logps/chosen": -680.0, "eval_logps/rejected": -1048.0, "eval_loss": 0.23313412070274353, "eval_rewards/accuracies": 0.8960731029510498, "eval_rewards/chosen": -5.125, "eval_rewards/margins": 3.78125, "eval_rewards/rejected": -8.9375, "eval_runtime": 3416.6122, "eval_samples_per_second": 27.657, "eval_steps_per_second": 0.432, "step": 10000 }, { "epoch": 0.7531412233842449, "grad_norm": 9.12618473241598, "learning_rate": 8.722519495166799e-08, "logits/chosen": -2.765625, "logits/rejected": -2.46875, "logps/chosen": -676.0, "logps/rejected": -1020.0, "loss": 0.1664, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.1875, "rewards/margins": 3.6875, "rewards/rejected": -8.875, "step": 10010 }, { "epoch": 0.7538936122187947, "grad_norm": 10.399997997659609, "learning_rate": 8.672737774416761e-08, "logits/chosen": -2.8125, "logits/rejected": -2.40625, "logps/chosen": -680.0, "logps/rejected": -1072.0, "loss": 0.1755, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 3.953125, "rewards/rejected": -9.0625, "step": 10020 }, { "epoch": 0.7546460010533443, "grad_norm": 10.652603126033481, "learning_rate": 8.623068689838836e-08, "logits/chosen": -2.734375, "logits/rejected": -2.453125, "logps/chosen": -656.0, "logps/rejected": -1056.0, "loss": 0.1669, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.875, "rewards/margins": 4.09375, "rewards/rejected": -9.0, "step": 10030 }, { "epoch": 0.7553983898878941, "grad_norm": 9.186947252524327, "learning_rate": 8.57351258408299e-08, "logits/chosen": -2.890625, "logits/rejected": -2.59375, "logps/chosen": -648.0, "logps/rejected": -1072.0, "loss": 0.1674, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.3125, "rewards/rejected": -9.125, "step": 10040 }, { "epoch": 0.7561507787224437, "grad_norm": 9.65754958699827, "learning_rate": 8.52406979901975e-08, "logits/chosen": -2.78125, "logits/rejected": -2.421875, "logps/chosen": -664.0, "logps/rejected": -1072.0, "loss": 0.1753, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.125, "rewards/rejected": -9.125, "step": 10050 }, { "epoch": 0.7569031675569935, "grad_norm": 8.950269011710796, "learning_rate": 8.474740675737921e-08, "logits/chosen": -2.78125, "logits/rejected": -2.40625, "logps/chosen": -636.0, "logps/rejected": -1088.0, "loss": 0.1593, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 4.40625, "rewards/rejected": -9.3125, "step": 10060 }, { "epoch": 0.7576555563915431, "grad_norm": 8.508586377653767, "learning_rate": 8.425525554542167e-08, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -688.0, "logps/rejected": -1064.0, "loss": 0.2017, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.15625, "rewards/margins": 3.78125, "rewards/rejected": -8.9375, "step": 10070 }, { "epoch": 0.7584079452260929, "grad_norm": 10.66256478967201, "learning_rate": 8.376424774950691e-08, "logits/chosen": -2.8125, "logits/rejected": -2.484375, "logps/chosen": -596.0, "logps/rejected": -1032.0, "loss": 0.1692, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.59375, "rewards/margins": 4.125, "rewards/rejected": -8.6875, "step": 10080 }, { "epoch": 0.7591603340606425, "grad_norm": 10.36167482015296, "learning_rate": 8.327438675692921e-08, "logits/chosen": -2.765625, "logits/rejected": -2.515625, "logps/chosen": -636.0, "logps/rejected": -1032.0, "loss": 0.1943, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.03125, "rewards/margins": 3.75, "rewards/rejected": -8.75, "step": 10090 }, { "epoch": 0.7599127228951922, "grad_norm": 7.697043573657919, "learning_rate": 8.278567594707098e-08, "logits/chosen": -2.828125, "logits/rejected": -2.640625, "logps/chosen": -656.0, "logps/rejected": -1024.0, "loss": 0.1738, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 3.90625, "rewards/rejected": -8.6875, "step": 10100 }, { "epoch": 0.7606651117297419, "grad_norm": 9.904598280078455, "learning_rate": 8.229811869138036e-08, "logits/chosen": -2.875, "logits/rejected": -2.5625, "logps/chosen": -636.0, "logps/rejected": -1056.0, "loss": 0.1916, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.6875, "rewards/margins": 4.15625, "rewards/rejected": -8.875, "step": 10110 }, { "epoch": 0.7614175005642916, "grad_norm": 11.8921082637588, "learning_rate": 8.181171835334733e-08, "logits/chosen": -2.859375, "logits/rejected": -2.546875, "logps/chosen": -624.0, "logps/rejected": -1048.0, "loss": 0.1758, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.75, "rewards/margins": 4.21875, "rewards/rejected": -8.9375, "step": 10120 }, { "epoch": 0.7621698893988413, "grad_norm": 10.341136589624496, "learning_rate": 8.132647828848052e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -636.0, "logps/rejected": -1072.0, "loss": 0.1756, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 4.28125, "rewards/rejected": -9.0625, "step": 10130 }, { "epoch": 0.762922278233391, "grad_norm": 7.466247090163396, "learning_rate": 8.084240184428465e-08, "logits/chosen": -2.75, "logits/rejected": -2.4375, "logps/chosen": -652.0, "logps/rejected": -1072.0, "loss": 0.1578, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 4.40625, "rewards/rejected": -9.1875, "step": 10140 }, { "epoch": 0.7636746670679407, "grad_norm": 8.628893990755198, "learning_rate": 8.035949236023668e-08, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -664.0, "logps/rejected": -1080.0, "loss": 0.163, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 4.125, "rewards/rejected": -9.1875, "step": 10150 }, { "epoch": 0.7644270559024904, "grad_norm": 9.207818771970539, "learning_rate": 7.987775316776311e-08, "logits/chosen": -2.8125, "logits/rejected": -2.4375, "logps/chosen": -644.0, "logps/rejected": -1048.0, "loss": 0.1953, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 4.15625, "rewards/rejected": -9.0625, "step": 10160 }, { "epoch": 0.7651794447370401, "grad_norm": 12.719931264853546, "learning_rate": 7.939718759021729e-08, "logits/chosen": -2.765625, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -996.0, "loss": 0.1853, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.90625, "rewards/margins": 3.46875, "rewards/rejected": -8.375, "step": 10170 }, { "epoch": 0.7659318335715898, "grad_norm": 11.616782082982464, "learning_rate": 7.891779894285597e-08, "logits/chosen": -2.734375, "logits/rejected": -2.53125, "logps/chosen": -684.0, "logps/rejected": -1056.0, "loss": 0.1897, "rewards/accuracies": 0.90625, "rewards/chosen": -5.125, "rewards/margins": 3.921875, "rewards/rejected": -9.0625, "step": 10180 }, { "epoch": 0.7666842224061395, "grad_norm": 9.305716216830726, "learning_rate": 7.843959053281663e-08, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -700.0, "logps/rejected": -1072.0, "loss": 0.1689, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.3125, "rewards/margins": 3.96875, "rewards/rejected": -9.25, "step": 10190 }, { "epoch": 0.7674366112406892, "grad_norm": 10.881026555203078, "learning_rate": 7.796256565909487e-08, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -704.0, "logps/rejected": -1104.0, "loss": 0.1971, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.34375, "rewards/margins": 4.03125, "rewards/rejected": -9.375, "step": 10200 }, { "epoch": 0.7681890000752389, "grad_norm": 7.835757600755162, "learning_rate": 7.748672761252123e-08, "logits/chosen": -2.796875, "logits/rejected": -2.515625, "logps/chosen": -648.0, "logps/rejected": -1056.0, "loss": 0.1631, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.15625, "rewards/rejected": -9.1875, "step": 10210 }, { "epoch": 0.7689413889097886, "grad_norm": 7.770590581726989, "learning_rate": 7.701207967573911e-08, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1064.0, "loss": 0.1755, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.09375, "rewards/rejected": -9.1875, "step": 10220 }, { "epoch": 0.7696937777443382, "grad_norm": 9.940083520036907, "learning_rate": 7.653862512318146e-08, "logits/chosen": -2.84375, "logits/rejected": -2.625, "logps/chosen": -652.0, "logps/rejected": -1040.0, "loss": 0.1714, "rewards/accuracies": 0.9375, "rewards/chosen": -5.125, "rewards/margins": 3.734375, "rewards/rejected": -8.875, "step": 10230 }, { "epoch": 0.770446166578888, "grad_norm": 8.925579343601644, "learning_rate": 7.606636722104845e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5, "logps/chosen": -644.0, "logps/rejected": -1064.0, "loss": 0.163, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.1875, "rewards/rejected": -9.1875, "step": 10240 }, { "epoch": 0.7711985554134376, "grad_norm": 9.764468368540816, "learning_rate": 7.559530922728527e-08, "logits/chosen": -2.671875, "logits/rejected": -2.46875, "logps/chosen": -692.0, "logps/rejected": -1072.0, "loss": 0.176, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.25, "rewards/margins": 3.84375, "rewards/rejected": -9.125, "step": 10250 }, { "epoch": 0.7719509442479874, "grad_norm": 9.206715004074338, "learning_rate": 7.512545439155904e-08, "logits/chosen": -2.75, "logits/rejected": -2.484375, "logps/chosen": -664.0, "logps/rejected": -1064.0, "loss": 0.1848, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.0, "rewards/margins": 3.953125, "rewards/rejected": -8.9375, "step": 10260 }, { "epoch": 0.772703333082537, "grad_norm": 9.644715781394588, "learning_rate": 7.46568059552369e-08, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -704.0, "logps/rejected": -1040.0, "loss": 0.18, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.375, "rewards/margins": 3.59375, "rewards/rejected": -8.9375, "step": 10270 }, { "epoch": 0.7734557219170868, "grad_norm": 10.280130422025161, "learning_rate": 7.418936715136334e-08, "logits/chosen": -2.828125, "logits/rejected": -2.65625, "logps/chosen": -688.0, "logps/rejected": -1104.0, "loss": 0.1621, "rewards/accuracies": 0.9375, "rewards/chosen": -5.28125, "rewards/margins": 4.3125, "rewards/rejected": -9.5625, "step": 10280 }, { "epoch": 0.7742081107516364, "grad_norm": 8.298236385227671, "learning_rate": 7.372314120463798e-08, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -692.0, "logps/rejected": -1080.0, "loss": 0.1501, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.34375, "rewards/margins": 4.03125, "rewards/rejected": -9.375, "step": 10290 }, { "epoch": 0.7749604995861862, "grad_norm": 8.344930602682298, "learning_rate": 7.325813133139361e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -704.0, "logps/rejected": -1096.0, "loss": 0.1752, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.40625, "rewards/margins": 3.90625, "rewards/rejected": -9.3125, "step": 10300 }, { "epoch": 0.7757128884207358, "grad_norm": 10.961934877649696, "learning_rate": 7.279434073957349e-08, "logits/chosen": -2.734375, "logits/rejected": -2.5625, "logps/chosen": -720.0, "logps/rejected": -1104.0, "loss": 0.1666, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.4375, "rewards/margins": 3.96875, "rewards/rejected": -9.375, "step": 10310 }, { "epoch": 0.7764652772552856, "grad_norm": 11.538299228218866, "learning_rate": 7.233177262870946e-08, "logits/chosen": -2.84375, "logits/rejected": -2.453125, "logps/chosen": -688.0, "logps/rejected": -1080.0, "loss": 0.1968, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.28125, "rewards/margins": 3.875, "rewards/rejected": -9.125, "step": 10320 }, { "epoch": 0.7772176660898352, "grad_norm": 11.703461632053674, "learning_rate": 7.187043018990016e-08, "logits/chosen": -2.703125, "logits/rejected": -2.453125, "logps/chosen": -704.0, "logps/rejected": -1064.0, "loss": 0.1778, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 3.78125, "rewards/rejected": -9.0625, "step": 10330 }, { "epoch": 0.7779700549243849, "grad_norm": 7.898266863613665, "learning_rate": 7.141031660578839e-08, "logits/chosen": -2.796875, "logits/rejected": -2.640625, "logps/chosen": -668.0, "logps/rejected": -1040.0, "loss": 0.1737, "rewards/accuracies": 0.90625, "rewards/chosen": -4.96875, "rewards/margins": 3.875, "rewards/rejected": -8.8125, "step": 10340 }, { "epoch": 0.7787224437589346, "grad_norm": 9.458205276937695, "learning_rate": 7.095143505053982e-08, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -676.0, "logps/rejected": -1048.0, "loss": 0.159, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 3.796875, "rewards/rejected": -8.875, "step": 10350 }, { "epoch": 0.7794748325934843, "grad_norm": 11.045545809015394, "learning_rate": 7.049378868982065e-08, "logits/chosen": -2.828125, "logits/rejected": -2.625, "logps/chosen": -624.0, "logps/rejected": -1032.0, "loss": 0.1707, "rewards/accuracies": 0.90625, "rewards/chosen": -4.71875, "rewards/margins": 4.125, "rewards/rejected": -8.875, "step": 10360 }, { "epoch": 0.780227221428034, "grad_norm": 11.507523995363746, "learning_rate": 7.003738068077564e-08, "logits/chosen": -2.84375, "logits/rejected": -2.484375, "logps/chosen": -656.0, "logps/rejected": -1056.0, "loss": 0.1632, "rewards/accuracies": 0.90625, "rewards/chosen": -5.1875, "rewards/margins": 3.828125, "rewards/rejected": -9.0, "step": 10370 }, { "epoch": 0.7809796102625837, "grad_norm": 8.411601445371018, "learning_rate": 6.958221417200705e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.1845, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.03125, "rewards/rejected": -9.125, "step": 10380 }, { "epoch": 0.7817319990971334, "grad_norm": 8.117821038200702, "learning_rate": 6.912829230355208e-08, "logits/chosen": -2.953125, "logits/rejected": -2.671875, "logps/chosen": -708.0, "logps/rejected": -1120.0, "loss": 0.1559, "rewards/accuracies": 0.9375, "rewards/chosen": -5.28125, "rewards/margins": 4.15625, "rewards/rejected": -9.4375, "step": 10390 }, { "epoch": 0.7824843879316831, "grad_norm": 6.6385483505424245, "learning_rate": 6.867561820686187e-08, "logits/chosen": -2.859375, "logits/rejected": -2.640625, "logps/chosen": -660.0, "logps/rejected": -1080.0, "loss": 0.1499, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.34375, "rewards/rejected": -9.25, "step": 10400 }, { "epoch": 0.7832367767662328, "grad_norm": 10.661984680299412, "learning_rate": 6.822419500477947e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -656.0, "logps/rejected": -1080.0, "loss": 0.1695, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.4375, "rewards/rejected": -9.3125, "step": 10410 }, { "epoch": 0.7839891656007825, "grad_norm": 13.134179720755919, "learning_rate": 6.777402581151825e-08, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -688.0, "logps/rejected": -1096.0, "loss": 0.2094, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.3125, "rewards/margins": 4.09375, "rewards/rejected": -9.375, "step": 10420 }, { "epoch": 0.7847415544353322, "grad_norm": 9.01643760423085, "learning_rate": 6.732511373264107e-08, "logits/chosen": -2.828125, "logits/rejected": -2.484375, "logps/chosen": -664.0, "logps/rejected": -1072.0, "loss": 0.1777, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.28125, "rewards/rejected": -9.25, "step": 10430 }, { "epoch": 0.7854939432698819, "grad_norm": 7.900787859549136, "learning_rate": 6.687746186503796e-08, "logits/chosen": -2.90625, "logits/rejected": -2.546875, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.1801, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.15625, "rewards/rejected": -9.25, "step": 10440 }, { "epoch": 0.7862463321044316, "grad_norm": 9.382459092264867, "learning_rate": 6.64310732969053e-08, "logits/chosen": -2.75, "logits/rejected": -2.515625, "logps/chosen": -664.0, "logps/rejected": -1016.0, "loss": 0.181, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.03125, "rewards/margins": 3.78125, "rewards/rejected": -8.8125, "step": 10450 }, { "epoch": 0.7869987209389813, "grad_norm": 12.887404631186007, "learning_rate": 6.598595110772467e-08, "logits/chosen": -2.9375, "logits/rejected": -2.546875, "logps/chosen": -660.0, "logps/rejected": -1032.0, "loss": 0.1593, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 3.796875, "rewards/rejected": -8.8125, "step": 10460 }, { "epoch": 0.7877511097735309, "grad_norm": 8.834351335913821, "learning_rate": 6.554209836824081e-08, "logits/chosen": -2.78125, "logits/rejected": -2.453125, "logps/chosen": -656.0, "logps/rejected": -1072.0, "loss": 0.1638, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 4.25, "rewards/rejected": -9.1875, "step": 10470 }, { "epoch": 0.7885034986080807, "grad_norm": 9.336234266274053, "learning_rate": 6.509951814044151e-08, "logits/chosen": -2.75, "logits/rejected": -2.5625, "logps/chosen": -648.0, "logps/rejected": -1032.0, "loss": 0.1612, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 3.921875, "rewards/rejected": -8.8125, "step": 10480 }, { "epoch": 0.7892558874426303, "grad_norm": 10.852858815315502, "learning_rate": 6.465821347753555e-08, "logits/chosen": -2.8125, "logits/rejected": -2.578125, "logps/chosen": -664.0, "logps/rejected": -1032.0, "loss": 0.1837, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.09375, "rewards/margins": 3.765625, "rewards/rejected": -8.875, "step": 10490 }, { "epoch": 0.7900082762771801, "grad_norm": 10.147163952583437, "learning_rate": 6.421818742393217e-08, "logits/chosen": -2.890625, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1088.0, "loss": 0.1676, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.125, "rewards/margins": 4.0625, "rewards/rejected": -9.1875, "step": 10500 }, { "epoch": 0.7907606651117297, "grad_norm": 9.461367255730154, "learning_rate": 6.377944301522004e-08, "logits/chosen": -2.78125, "logits/rejected": -2.4375, "logps/chosen": -644.0, "logps/rejected": -1048.0, "loss": 0.1758, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 4.21875, "rewards/rejected": -8.9375, "step": 10510 }, { "epoch": 0.7915130539462795, "grad_norm": 9.031095981425155, "learning_rate": 6.3341983278146e-08, "logits/chosen": -2.734375, "logits/rejected": -2.484375, "logps/chosen": -676.0, "logps/rejected": -1064.0, "loss": 0.1839, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.25, "rewards/margins": 3.90625, "rewards/rejected": -9.125, "step": 10520 }, { "epoch": 0.7922654427808291, "grad_norm": 10.11293213806407, "learning_rate": 6.290581123059441e-08, "logits/chosen": -2.875, "logits/rejected": -2.515625, "logps/chosen": -656.0, "logps/rejected": -1064.0, "loss": 0.1972, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.25, "rewards/rejected": -9.25, "step": 10530 }, { "epoch": 0.7930178316153789, "grad_norm": 11.536533803810315, "learning_rate": 6.247092988156652e-08, "logits/chosen": -2.859375, "logits/rejected": -2.609375, "logps/chosen": -692.0, "logps/rejected": -1032.0, "loss": 0.1765, "rewards/accuracies": 0.90625, "rewards/chosen": -5.21875, "rewards/margins": 3.75, "rewards/rejected": -8.9375, "step": 10540 }, { "epoch": 0.7937702204499285, "grad_norm": 7.784005981266443, "learning_rate": 6.203734223115922e-08, "logits/chosen": -2.84375, "logits/rejected": -2.5625, "logps/chosen": -684.0, "logps/rejected": -1088.0, "loss": 0.1575, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.03125, "rewards/rejected": -9.1875, "step": 10550 }, { "epoch": 0.7945226092844783, "grad_norm": 7.660367703984741, "learning_rate": 6.160505127054475e-08, "logits/chosen": -2.734375, "logits/rejected": -2.390625, "logps/chosen": -632.0, "logps/rejected": -1056.0, "loss": 0.1671, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 4.09375, "rewards/rejected": -8.8125, "step": 10560 }, { "epoch": 0.7952749981190279, "grad_norm": 8.482696088273697, "learning_rate": 6.117405998194991e-08, "logits/chosen": -2.796875, "logits/rejected": -2.5625, "logps/chosen": -644.0, "logps/rejected": -1048.0, "loss": 0.1697, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.875, "rewards/margins": 4.0625, "rewards/rejected": -8.9375, "step": 10570 }, { "epoch": 0.7960273869535776, "grad_norm": 12.695180267533518, "learning_rate": 6.074437133863547e-08, "logits/chosen": -2.828125, "logits/rejected": -2.484375, "logps/chosen": -648.0, "logps/rejected": -1112.0, "loss": 0.2106, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.8125, "rewards/rejected": -9.75, "step": 10580 }, { "epoch": 0.7967797757881273, "grad_norm": 8.256129867392023, "learning_rate": 6.031598830487586e-08, "logits/chosen": -2.734375, "logits/rejected": -2.46875, "logps/chosen": -688.0, "logps/rejected": -1056.0, "loss": 0.1579, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 3.796875, "rewards/rejected": -9.0625, "step": 10590 }, { "epoch": 0.797532164622677, "grad_norm": 7.850039749386082, "learning_rate": 5.98889138359383e-08, "logits/chosen": -2.890625, "logits/rejected": -2.53125, "logps/chosen": -684.0, "logps/rejected": -1104.0, "loss": 0.176, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.40625, "rewards/rejected": -9.5625, "step": 10600 }, { "epoch": 0.7982845534572267, "grad_norm": 13.624212582198435, "learning_rate": 5.946315087806294e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -676.0, "logps/rejected": -1048.0, "loss": 0.2059, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.28125, "rewards/margins": 3.640625, "rewards/rejected": -8.9375, "step": 10610 }, { "epoch": 0.7990369422917764, "grad_norm": 11.196949925098338, "learning_rate": 5.903870236844211e-08, "logits/chosen": -2.71875, "logits/rejected": -2.578125, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.2102, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.28125, "rewards/margins": 3.9375, "rewards/rejected": -9.25, "step": 10620 }, { "epoch": 0.7997893311263261, "grad_norm": 6.793839166449602, "learning_rate": 5.861557123520011e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -644.0, "logps/rejected": -1096.0, "loss": 0.1713, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.4375, "rewards/rejected": -9.5, "step": 10630 }, { "epoch": 0.8005417199608758, "grad_norm": 7.673695410301518, "learning_rate": 5.819376039737348e-08, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -632.0, "logps/rejected": -1048.0, "loss": 0.1578, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 4.34375, "rewards/rejected": -9.0625, "step": 10640 }, { "epoch": 0.8012941087954255, "grad_norm": 8.67499552802156, "learning_rate": 5.7773272764889966e-08, "logits/chosen": -2.875, "logits/rejected": -2.546875, "logps/chosen": -640.0, "logps/rejected": -1064.0, "loss": 0.165, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 4.21875, "rewards/rejected": -9.125, "step": 10650 }, { "epoch": 0.8020464976299752, "grad_norm": 7.852442101853427, "learning_rate": 5.735411123854952e-08, "logits/chosen": -2.8125, "logits/rejected": -2.53125, "logps/chosen": -668.0, "logps/rejected": -1096.0, "loss": 0.1599, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.3125, "rewards/rejected": -9.3125, "step": 10660 }, { "epoch": 0.8027988864645249, "grad_norm": 9.696644089395818, "learning_rate": 5.693627871000337e-08, "logits/chosen": -2.84375, "logits/rejected": -2.5, "logps/chosen": -640.0, "logps/rejected": -1016.0, "loss": 0.1779, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.78125, "rewards/margins": 4.03125, "rewards/rejected": -8.8125, "step": 10670 }, { "epoch": 0.8035512752990746, "grad_norm": 6.7252195818272185, "learning_rate": 5.651977806173452e-08, "logits/chosen": -2.875, "logits/rejected": -2.671875, "logps/chosen": -684.0, "logps/rejected": -1032.0, "loss": 0.1761, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.15625, "rewards/margins": 3.640625, "rewards/rejected": -8.8125, "step": 10680 }, { "epoch": 0.8043036641336243, "grad_norm": 8.597160574235266, "learning_rate": 5.610461216703796e-08, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -664.0, "logps/rejected": -1048.0, "loss": 0.176, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.09375, "rewards/margins": 3.796875, "rewards/rejected": -8.875, "step": 10690 }, { "epoch": 0.805056052968174, "grad_norm": 10.04590241742574, "learning_rate": 5.569078389000048e-08, "logits/chosen": -2.6875, "logits/rejected": -2.421875, "logps/chosen": -648.0, "logps/rejected": -1056.0, "loss": 0.1688, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.0, "rewards/rejected": -8.8125, "step": 10700 }, { "epoch": 0.8058084418027236, "grad_norm": 8.503730122679245, "learning_rate": 5.5278296085481125e-08, "logits/chosen": -2.859375, "logits/rejected": -2.53125, "logps/chosen": -680.0, "logps/rejected": -1080.0, "loss": 0.1557, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.125, "rewards/margins": 4.15625, "rewards/rejected": -9.25, "step": 10710 }, { "epoch": 0.8065608306372734, "grad_norm": 7.149590329647261, "learning_rate": 5.486715159909166e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -640.0, "logps/rejected": -1040.0, "loss": 0.1607, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.875, "rewards/margins": 4.03125, "rewards/rejected": -8.875, "step": 10720 }, { "epoch": 0.807313219471823, "grad_norm": 7.476268448786567, "learning_rate": 5.4457353267176545e-08, "logits/chosen": -2.78125, "logits/rejected": -2.4375, "logps/chosen": -668.0, "logps/rejected": -1088.0, "loss": 0.1619, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.4375, "rewards/rejected": -9.4375, "step": 10730 }, { "epoch": 0.8080656083063728, "grad_norm": 11.564976188523628, "learning_rate": 5.4048903916793676e-08, "logits/chosen": -2.890625, "logits/rejected": -2.546875, "logps/chosen": -672.0, "logps/rejected": -1072.0, "loss": 0.1814, "rewards/accuracies": 0.9375, "rewards/chosen": -5.09375, "rewards/margins": 4.03125, "rewards/rejected": -9.125, "step": 10740 }, { "epoch": 0.8088179971409224, "grad_norm": 10.303911357962447, "learning_rate": 5.3641806365694765e-08, "logits/chosen": -2.859375, "logits/rejected": -2.59375, "logps/chosen": -656.0, "logps/rejected": -1072.0, "loss": 0.174, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.3125, "rewards/rejected": -9.25, "step": 10750 }, { "epoch": 0.8095703859754722, "grad_norm": 7.784240955972202, "learning_rate": 5.323606342230591e-08, "logits/chosen": -2.90625, "logits/rejected": -2.5625, "logps/chosen": -628.0, "logps/rejected": -1048.0, "loss": 0.1892, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.21875, "rewards/rejected": -8.9375, "step": 10760 }, { "epoch": 0.8103227748100218, "grad_norm": 9.701191360009647, "learning_rate": 5.283167788570836e-08, "logits/chosen": -2.890625, "logits/rejected": -2.5625, "logps/chosen": -640.0, "logps/rejected": -1056.0, "loss": 0.1731, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.875, "rewards/margins": 4.28125, "rewards/rejected": -9.125, "step": 10770 }, { "epoch": 0.8110751636445716, "grad_norm": 8.88445324171346, "learning_rate": 5.2428652545618954e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.1908, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.0, "rewards/margins": 4.15625, "rewards/rejected": -9.1875, "step": 10780 }, { "epoch": 0.8118275524791212, "grad_norm": 10.392887677029911, "learning_rate": 5.202699018237094e-08, "logits/chosen": -2.859375, "logits/rejected": -2.59375, "logps/chosen": -660.0, "logps/rejected": -1032.0, "loss": 0.1865, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.03125, "rewards/margins": 3.90625, "rewards/rejected": -8.9375, "step": 10790 }, { "epoch": 0.812579941313671, "grad_norm": 9.936782354151353, "learning_rate": 5.162669356689511e-08, "logits/chosen": -2.859375, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1056.0, "loss": 0.1762, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 3.9375, "rewards/rejected": -9.125, "step": 10800 }, { "epoch": 0.8133323301482206, "grad_norm": 9.017218482098691, "learning_rate": 5.122776546070015e-08, "logits/chosen": -2.796875, "logits/rejected": -2.453125, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1872, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 3.9375, "rewards/rejected": -9.0, "step": 10810 }, { "epoch": 0.8140847189827704, "grad_norm": 9.518886032561776, "learning_rate": 5.083020861585413e-08, "logits/chosen": -2.796875, "logits/rejected": -2.46875, "logps/chosen": -636.0, "logps/rejected": -1080.0, "loss": 0.1532, "rewards/accuracies": 0.9375, "rewards/chosen": -4.90625, "rewards/margins": 4.34375, "rewards/rejected": -9.25, "step": 10820 }, { "epoch": 0.81483710781732, "grad_norm": 11.627287727491849, "learning_rate": 5.0434025774965e-08, "logits/chosen": -2.84375, "logits/rejected": -2.5625, "logps/chosen": -656.0, "logps/rejected": -1040.0, "loss": 0.1553, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0, "rewards/margins": 4.03125, "rewards/rejected": -9.0, "step": 10830 }, { "epoch": 0.8155894966518696, "grad_norm": 7.12655931930796, "learning_rate": 5.003921967116201e-08, "logits/chosen": -2.71875, "logits/rejected": -2.609375, "logps/chosen": -676.0, "logps/rejected": -1056.0, "loss": 0.1703, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -5.21875, "rewards/margins": 3.828125, "rewards/rejected": -9.0625, "step": 10840 }, { "epoch": 0.8163418854864194, "grad_norm": 8.237459465070174, "learning_rate": 4.9645793028076975e-08, "logits/chosen": -2.6875, "logits/rejected": -2.484375, "logps/chosen": -668.0, "logps/rejected": -1096.0, "loss": 0.2007, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 4.40625, "rewards/rejected": -9.5, "step": 10850 }, { "epoch": 0.817094274320969, "grad_norm": 11.592954634006631, "learning_rate": 4.925374855982495e-08, "logits/chosen": -2.828125, "logits/rejected": -2.515625, "logps/chosen": -700.0, "logps/rejected": -1112.0, "loss": 0.19, "rewards/accuracies": 0.9375, "rewards/chosen": -5.375, "rewards/margins": 4.3125, "rewards/rejected": -9.6875, "step": 10860 }, { "epoch": 0.8178466631555188, "grad_norm": 8.609947938917319, "learning_rate": 4.886308897098621e-08, "logits/chosen": -2.71875, "logits/rejected": -2.390625, "logps/chosen": -668.0, "logps/rejected": -1096.0, "loss": 0.1616, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.4375, "rewards/rejected": -9.4375, "step": 10870 }, { "epoch": 0.8185990519900684, "grad_norm": 8.749544796066038, "learning_rate": 4.847381695658692e-08, "logits/chosen": -2.65625, "logits/rejected": -2.484375, "logps/chosen": -696.0, "logps/rejected": -1064.0, "loss": 0.1763, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.28125, "rewards/margins": 3.8125, "rewards/rejected": -9.0625, "step": 10880 }, { "epoch": 0.8193514408246182, "grad_norm": 8.822730343025961, "learning_rate": 4.80859352020809e-08, "logits/chosen": -2.6875, "logits/rejected": -2.5625, "logps/chosen": -740.0, "logps/rejected": -1104.0, "loss": 0.1698, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.6875, "rewards/margins": 3.875, "rewards/rejected": -9.5625, "step": 10890 }, { "epoch": 0.8201038296591678, "grad_norm": 11.982401358379029, "learning_rate": 4.769944638333123e-08, "logits/chosen": -2.671875, "logits/rejected": -2.453125, "logps/chosen": -708.0, "logps/rejected": -1096.0, "loss": 0.166, "rewards/accuracies": 0.9375, "rewards/chosen": -5.40625, "rewards/margins": 4.03125, "rewards/rejected": -9.4375, "step": 10900 }, { "epoch": 0.8208562184937176, "grad_norm": 9.080737731925156, "learning_rate": 4.731435316659141e-08, "logits/chosen": -2.75, "logits/rejected": -2.359375, "logps/chosen": -688.0, "logps/rejected": -1104.0, "loss": 0.1783, "rewards/accuracies": 0.9375, "rewards/chosen": -5.25, "rewards/margins": 4.25, "rewards/rejected": -9.5, "step": 10910 }, { "epoch": 0.8216086073282672, "grad_norm": 10.021211634373925, "learning_rate": 4.693065820848724e-08, "logits/chosen": -2.78125, "logits/rejected": -2.421875, "logps/chosen": -676.0, "logps/rejected": -1104.0, "loss": 0.1783, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 4.21875, "rewards/rejected": -9.375, "step": 10920 }, { "epoch": 0.822360996162817, "grad_norm": 7.732573710084663, "learning_rate": 4.654836415599836e-08, "logits/chosen": -2.78125, "logits/rejected": -2.625, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.1698, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.28125, "rewards/rejected": -9.5, "step": 10930 }, { "epoch": 0.8231133849973666, "grad_norm": 12.638930481142362, "learning_rate": 4.616747364644008e-08, "logits/chosen": -2.796875, "logits/rejected": -2.53125, "logps/chosen": -688.0, "logps/rejected": -1104.0, "loss": 0.1876, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.21875, "rewards/margins": 4.1875, "rewards/rejected": -9.4375, "step": 10940 }, { "epoch": 0.8238657738319163, "grad_norm": 9.7288094717691, "learning_rate": 4.578798930744523e-08, "logits/chosen": -2.703125, "logits/rejected": -2.46875, "logps/chosen": -660.0, "logps/rejected": -1056.0, "loss": 0.1914, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.03125, "rewards/margins": 3.953125, "rewards/rejected": -8.9375, "step": 10950 }, { "epoch": 0.824618162666466, "grad_norm": 9.23054954568939, "learning_rate": 4.5409913756945835e-08, "logits/chosen": -2.78125, "logits/rejected": -2.453125, "logps/chosen": -640.0, "logps/rejected": -1024.0, "loss": 0.1575, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.0625, "rewards/rejected": -8.8125, "step": 10960 }, { "epoch": 0.8253705515010157, "grad_norm": 13.864283595671754, "learning_rate": 4.5033249603155135e-08, "logits/chosen": -2.71875, "logits/rejected": -2.515625, "logps/chosen": -732.0, "logps/rejected": -1048.0, "loss": 0.1893, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.59375, "rewards/margins": 3.5, "rewards/rejected": -9.0625, "step": 10970 }, { "epoch": 0.8261229403355654, "grad_norm": 7.7996723112653585, "learning_rate": 4.465799944454984e-08, "logits/chosen": -2.765625, "logits/rejected": -2.5625, "logps/chosen": -700.0, "logps/rejected": -1064.0, "loss": 0.1978, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 3.859375, "rewards/rejected": -9.0, "step": 10980 }, { "epoch": 0.8268753291701151, "grad_norm": 9.098140876721343, "learning_rate": 4.428416586985184e-08, "logits/chosen": -2.828125, "logits/rejected": -2.484375, "logps/chosen": -680.0, "logps/rejected": -1096.0, "loss": 0.1728, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.125, "rewards/margins": 4.25, "rewards/rejected": -9.375, "step": 10990 }, { "epoch": 0.8276277180046648, "grad_norm": 11.851825273753372, "learning_rate": 4.3911751458010434e-08, "logits/chosen": -2.796875, "logits/rejected": -2.5625, "logps/chosen": -660.0, "logps/rejected": -1040.0, "loss": 0.1533, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 3.953125, "rewards/rejected": -8.9375, "step": 11000 }, { "epoch": 0.8283801068392145, "grad_norm": 8.179511099338784, "learning_rate": 4.354075877818475e-08, "logits/chosen": -2.84375, "logits/rejected": -2.515625, "logps/chosen": -696.0, "logps/rejected": -1056.0, "loss": 0.1911, "rewards/accuracies": 0.90625, "rewards/chosen": -5.21875, "rewards/margins": 3.9375, "rewards/rejected": -9.1875, "step": 11010 }, { "epoch": 0.8291324956737642, "grad_norm": 9.377746486833292, "learning_rate": 4.3171190389725746e-08, "logits/chosen": -2.890625, "logits/rejected": -2.46875, "logps/chosen": -632.0, "logps/rejected": -1064.0, "loss": 0.158, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.75, "rewards/margins": 4.5, "rewards/rejected": -9.25, "step": 11020 }, { "epoch": 0.8298848845083139, "grad_norm": 14.070670305936215, "learning_rate": 4.280304884215885e-08, "logits/chosen": -2.84375, "logits/rejected": -2.59375, "logps/chosen": -696.0, "logps/rejected": -1096.0, "loss": 0.1846, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.21875, "rewards/rejected": -9.4375, "step": 11030 }, { "epoch": 0.8306372733428636, "grad_norm": 9.44927650967451, "learning_rate": 4.2436336675166076e-08, "logits/chosen": -2.71875, "logits/rejected": -2.359375, "logps/chosen": -664.0, "logps/rejected": -1080.0, "loss": 0.196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.21875, "rewards/rejected": -9.25, "step": 11040 }, { "epoch": 0.8313896621774133, "grad_norm": 10.48472725820432, "learning_rate": 4.207105641856859e-08, "logits/chosen": -2.84375, "logits/rejected": -2.40625, "logps/chosen": -656.0, "logps/rejected": -1048.0, "loss": 0.1791, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.03125, "rewards/margins": 3.96875, "rewards/rejected": -9.0, "step": 11050 }, { "epoch": 0.832142051011963, "grad_norm": 8.158161132513957, "learning_rate": 4.17072105923095e-08, "logits/chosen": -2.703125, "logits/rejected": -2.484375, "logps/chosen": -680.0, "logps/rejected": -1040.0, "loss": 0.1711, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.25, "rewards/margins": 3.796875, "rewards/rejected": -9.0625, "step": 11060 }, { "epoch": 0.8328944398465127, "grad_norm": 11.113624875557074, "learning_rate": 4.134480170643606e-08, "logits/chosen": -2.84375, "logits/rejected": -2.46875, "logps/chosen": -696.0, "logps/rejected": -1088.0, "loss": 0.1984, "rewards/accuracies": 0.875, "rewards/chosen": -5.40625, "rewards/margins": 3.90625, "rewards/rejected": -9.3125, "step": 11070 }, { "epoch": 0.8336468286810623, "grad_norm": 7.777023552204875, "learning_rate": 4.0983832261082624e-08, "logits/chosen": -2.765625, "logits/rejected": -2.484375, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.1824, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.15625, "rewards/margins": 3.96875, "rewards/rejected": -9.125, "step": 11080 }, { "epoch": 0.8343992175156121, "grad_norm": 8.75340034607004, "learning_rate": 4.062430474645353e-08, "logits/chosen": -2.875, "logits/rejected": -2.53125, "logps/chosen": -652.0, "logps/rejected": -1080.0, "loss": 0.1551, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.4375, "rewards/rejected": -9.4375, "step": 11090 }, { "epoch": 0.8351516063501617, "grad_norm": 8.520444397583018, "learning_rate": 4.0266221642805355e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -664.0, "logps/rejected": -1040.0, "loss": 0.1722, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.03125, "rewards/margins": 3.90625, "rewards/rejected": -8.9375, "step": 11100 }, { "epoch": 0.8359039951847115, "grad_norm": 9.367313275601472, "learning_rate": 3.990958542043052e-08, "logits/chosen": -2.875, "logits/rejected": -2.546875, "logps/chosen": -684.0, "logps/rejected": -1048.0, "loss": 0.1758, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.3125, "rewards/margins": 3.625, "rewards/rejected": -8.9375, "step": 11110 }, { "epoch": 0.8366563840192611, "grad_norm": 9.259122704271554, "learning_rate": 3.9554398539639766e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1032.0, "loss": 0.1702, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 3.875, "rewards/rejected": -9.0, "step": 11120 }, { "epoch": 0.8374087728538109, "grad_norm": 10.851613409601374, "learning_rate": 3.92006634507453e-08, "logits/chosen": -2.78125, "logits/rejected": -2.453125, "logps/chosen": -688.0, "logps/rejected": -1080.0, "loss": 0.1846, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.15625, "rewards/rejected": -9.3125, "step": 11130 }, { "epoch": 0.8381611616883605, "grad_norm": 8.37141175351515, "learning_rate": 3.884838259404402e-08, "logits/chosen": -2.734375, "logits/rejected": -2.546875, "logps/chosen": -648.0, "logps/rejected": -1072.0, "loss": 0.1939, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.25, "rewards/rejected": -9.0625, "step": 11140 }, { "epoch": 0.8389135505229103, "grad_norm": 7.38861113494188, "learning_rate": 3.8497558399800454e-08, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.1748, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.375, "rewards/rejected": -9.3125, "step": 11150 }, { "epoch": 0.8396659393574599, "grad_norm": 9.33147790540556, "learning_rate": 3.814819328823027e-08, "logits/chosen": -2.734375, "logits/rejected": -2.53125, "logps/chosen": -708.0, "logps/rejected": -1056.0, "loss": 0.1715, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.375, "rewards/margins": 3.625, "rewards/rejected": -9.0, "step": 11160 }, { "epoch": 0.8404183281920097, "grad_norm": 9.575575536830598, "learning_rate": 3.780028966948326e-08, "logits/chosen": -2.78125, "logits/rejected": -2.453125, "logps/chosen": -680.0, "logps/rejected": -1048.0, "loss": 0.2003, "rewards/accuracies": 0.90625, "rewards/chosen": -5.1875, "rewards/margins": 3.828125, "rewards/rejected": -9.0, "step": 11170 }, { "epoch": 0.8411707170265593, "grad_norm": 8.51804493221108, "learning_rate": 3.7453849943626964e-08, "logits/chosen": -2.734375, "logits/rejected": -2.578125, "logps/chosen": -696.0, "logps/rejected": -1032.0, "loss": 0.1759, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.3125, "rewards/margins": 3.4375, "rewards/rejected": -8.75, "step": 11180 }, { "epoch": 0.841923105861109, "grad_norm": 8.009274265381986, "learning_rate": 3.710887650063005e-08, "logits/chosen": -2.734375, "logits/rejected": -2.46875, "logps/chosen": -688.0, "logps/rejected": -1016.0, "loss": 0.1843, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.28125, "rewards/margins": 3.5625, "rewards/rejected": -8.8125, "step": 11190 }, { "epoch": 0.8426754946956587, "grad_norm": 12.126048736945192, "learning_rate": 3.676537172034563e-08, "logits/chosen": -2.859375, "logits/rejected": -2.46875, "logps/chosen": -684.0, "logps/rejected": -1088.0, "loss": 0.2028, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.21875, "rewards/rejected": -9.25, "step": 11200 }, { "epoch": 0.8434278835302084, "grad_norm": 8.424633740939056, "learning_rate": 3.642333797249536e-08, "logits/chosen": -2.84375, "logits/rejected": -2.5, "logps/chosen": -692.0, "logps/rejected": -1072.0, "loss": 0.172, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.28125, "rewards/margins": 3.9375, "rewards/rejected": -9.25, "step": 11210 }, { "epoch": 0.8441802723647581, "grad_norm": 8.620373688970624, "learning_rate": 3.608277761665243e-08, "logits/chosen": -2.828125, "logits/rejected": -2.515625, "logps/chosen": -668.0, "logps/rejected": -1064.0, "loss": 0.1898, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 3.71875, "rewards/rejected": -9.0, "step": 11220 }, { "epoch": 0.8449326611993078, "grad_norm": 9.64829790776722, "learning_rate": 3.574369300222568e-08, "logits/chosen": -2.796875, "logits/rejected": -2.546875, "logps/chosen": -656.0, "logps/rejected": -1056.0, "loss": 0.1646, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.09375, "rewards/rejected": -9.125, "step": 11230 }, { "epoch": 0.8456850500338575, "grad_norm": 10.526184041499265, "learning_rate": 3.540608646844348e-08, "logits/chosen": -2.75, "logits/rejected": -2.515625, "logps/chosen": -652.0, "logps/rejected": -1072.0, "loss": 0.1729, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 3.984375, "rewards/rejected": -8.9375, "step": 11240 }, { "epoch": 0.8464374388684072, "grad_norm": 10.666131037832978, "learning_rate": 3.506996034433723e-08, "logits/chosen": -2.84375, "logits/rejected": -2.546875, "logps/chosen": -668.0, "logps/rejected": -1048.0, "loss": 0.1745, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.09375, "rewards/rejected": -9.0625, "step": 11250 }, { "epoch": 0.8471898277029569, "grad_norm": 9.557541053018198, "learning_rate": 3.473531694872556e-08, "logits/chosen": -2.890625, "logits/rejected": -2.640625, "logps/chosen": -676.0, "logps/rejected": -1112.0, "loss": 0.1879, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.21875, "rewards/margins": 4.1875, "rewards/rejected": -9.4375, "step": 11260 }, { "epoch": 0.8479422165375066, "grad_norm": 8.85528266983337, "learning_rate": 3.440215859019838e-08, "logits/chosen": -2.953125, "logits/rejected": -2.578125, "logps/chosen": -652.0, "logps/rejected": -1056.0, "loss": 0.1539, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.1875, "rewards/rejected": -9.1875, "step": 11270 }, { "epoch": 0.8486946053720563, "grad_norm": 9.819789173917103, "learning_rate": 3.4070487567100516e-08, "logits/chosen": -2.859375, "logits/rejected": -2.578125, "logps/chosen": -688.0, "logps/rejected": -1056.0, "loss": 0.1617, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.21875, "rewards/margins": 3.953125, "rewards/rejected": -9.1875, "step": 11280 }, { "epoch": 0.849446994206606, "grad_norm": 10.266384251602961, "learning_rate": 3.374030616751661e-08, "logits/chosen": -2.84375, "logits/rejected": -2.5625, "logps/chosen": -680.0, "logps/rejected": -1048.0, "loss": 0.167, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.0625, "rewards/margins": 3.84375, "rewards/rejected": -8.875, "step": 11290 }, { "epoch": 0.8501993830411557, "grad_norm": 8.611853836848319, "learning_rate": 3.3411616669254627e-08, "logits/chosen": -2.84375, "logits/rejected": -2.5, "logps/chosen": -660.0, "logps/rejected": -1012.0, "loss": 0.194, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.03125, "rewards/margins": 3.609375, "rewards/rejected": -8.625, "step": 11300 }, { "epoch": 0.8509517718757054, "grad_norm": 6.137956172545956, "learning_rate": 3.308442133983036e-08, "logits/chosen": -2.859375, "logits/rejected": -2.609375, "logps/chosen": -684.0, "logps/rejected": -1048.0, "loss": 0.1634, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.15625, "rewards/rejected": -9.0625, "step": 11310 }, { "epoch": 0.851704160710255, "grad_norm": 10.655708813688022, "learning_rate": 3.275872243645214e-08, "logits/chosen": -2.734375, "logits/rejected": -2.59375, "logps/chosen": -704.0, "logps/rejected": -1024.0, "loss": 0.1835, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 3.671875, "rewards/rejected": -8.875, "step": 11320 }, { "epoch": 0.8524565495448048, "grad_norm": 11.650077476080263, "learning_rate": 3.243452220600473e-08, "logits/chosen": -2.765625, "logits/rejected": -2.578125, "logps/chosen": -684.0, "logps/rejected": -1088.0, "loss": 0.1754, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 3.96875, "rewards/rejected": -9.1875, "step": 11330 }, { "epoch": 0.8532089383793544, "grad_norm": 7.208242396686533, "learning_rate": 3.2111822885034054e-08, "logits/chosen": -2.71875, "logits/rejected": -2.46875, "logps/chosen": -704.0, "logps/rejected": -1104.0, "loss": 0.1767, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -5.53125, "rewards/margins": 3.953125, "rewards/rejected": -9.5, "step": 11340 }, { "epoch": 0.8539613272139042, "grad_norm": 11.211657721439165, "learning_rate": 3.1790626699731955e-08, "logits/chosen": -2.859375, "logits/rejected": -2.578125, "logps/chosen": -688.0, "logps/rejected": -1096.0, "loss": 0.1795, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.21875, "rewards/margins": 4.125, "rewards/rejected": -9.375, "step": 11350 }, { "epoch": 0.8547137160484538, "grad_norm": 12.234856938959442, "learning_rate": 3.1470935865920505e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5, "logps/chosen": -660.0, "logps/rejected": -1056.0, "loss": 0.1618, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.0625, "rewards/rejected": -9.0, "step": 11360 }, { "epoch": 0.8554661048830036, "grad_norm": 15.466946869514333, "learning_rate": 3.1152752589036904e-08, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -640.0, "logps/rejected": -1056.0, "loss": 0.1741, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.25, "rewards/rejected": -9.0625, "step": 11370 }, { "epoch": 0.8562184937175532, "grad_norm": 6.788421806575068, "learning_rate": 3.083607906411831e-08, "logits/chosen": -2.859375, "logits/rejected": -2.546875, "logps/chosen": -684.0, "logps/rejected": -1048.0, "loss": 0.1575, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.40625, "rewards/margins": 3.46875, "rewards/rejected": -8.875, "step": 11380 }, { "epoch": 0.856970882552103, "grad_norm": 11.985197080093016, "learning_rate": 3.052091747578644e-08, "logits/chosen": -2.921875, "logits/rejected": -2.65625, "logps/chosen": -692.0, "logps/rejected": -1080.0, "loss": 0.1745, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 3.9375, "rewards/rejected": -9.0625, "step": 11390 }, { "epoch": 0.8577232713866526, "grad_norm": 8.951594779960894, "learning_rate": 3.020726999823298e-08, "logits/chosen": -2.90625, "logits/rejected": -2.609375, "logps/chosen": -668.0, "logps/rejected": -1072.0, "loss": 0.1794, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.15625, "rewards/margins": 4.0, "rewards/rejected": -9.125, "step": 11400 }, { "epoch": 0.8584756602212024, "grad_norm": 9.728312858882287, "learning_rate": 2.989513879520394e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.18, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.25, "rewards/rejected": -9.1875, "step": 11410 }, { "epoch": 0.859228049055752, "grad_norm": 6.553941987888495, "learning_rate": 2.9584526019985373e-08, "logits/chosen": -2.84375, "logits/rejected": -2.390625, "logps/chosen": -672.0, "logps/rejected": -1088.0, "loss": 0.1901, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.0625, "rewards/rejected": -9.25, "step": 11420 }, { "epoch": 0.8599804378903018, "grad_norm": 8.759913978594696, "learning_rate": 2.927543381538805e-08, "logits/chosen": -2.75, "logits/rejected": -2.4375, "logps/chosen": -688.0, "logps/rejected": -1112.0, "loss": 0.1755, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.3125, "rewards/margins": 4.1875, "rewards/rejected": -9.5, "step": 11430 }, { "epoch": 0.8607328267248514, "grad_norm": 9.368246443496433, "learning_rate": 2.8967864313732826e-08, "logits/chosen": -2.859375, "logits/rejected": -2.59375, "logps/chosen": -688.0, "logps/rejected": -1096.0, "loss": 0.17, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.28125, "rewards/margins": 4.34375, "rewards/rejected": -9.625, "step": 11440 }, { "epoch": 0.861485215559401, "grad_norm": 6.348542476671366, "learning_rate": 2.866181963683617e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -696.0, "logps/rejected": -1112.0, "loss": 0.1792, "rewards/accuracies": 0.9375, "rewards/chosen": -5.28125, "rewards/margins": 4.34375, "rewards/rejected": -9.625, "step": 11450 }, { "epoch": 0.8622376043939508, "grad_norm": 8.260700798228266, "learning_rate": 2.8357301895994946e-08, "logits/chosen": -2.8125, "logits/rejected": -2.53125, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.1493, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.3125, "rewards/margins": 3.96875, "rewards/rejected": -9.25, "step": 11460 }, { "epoch": 0.8629899932285005, "grad_norm": 9.656102185845027, "learning_rate": 2.8054313191972574e-08, "logits/chosen": -2.828125, "logits/rejected": -2.5, "logps/chosen": -704.0, "logps/rejected": -1096.0, "loss": 0.1557, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 4.03125, "rewards/rejected": -9.25, "step": 11470 }, { "epoch": 0.8637423820630502, "grad_norm": 9.844846763630663, "learning_rate": 2.775285561498397e-08, "logits/chosen": -2.78125, "logits/rejected": -2.578125, "logps/chosen": -716.0, "logps/rejected": -1048.0, "loss": 0.1772, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.4375, "rewards/margins": 3.5625, "rewards/rejected": -9.0, "step": 11480 }, { "epoch": 0.8644947708975999, "grad_norm": 8.95652161004821, "learning_rate": 2.7452931244681314e-08, "logits/chosen": -2.796875, "logits/rejected": -2.5625, "logps/chosen": -696.0, "logps/rejected": -1096.0, "loss": 0.1821, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.25, "rewards/margins": 4.25, "rewards/rejected": -9.5, "step": 11490 }, { "epoch": 0.8652471597321496, "grad_norm": 9.166404931212753, "learning_rate": 2.7154542150139876e-08, "logits/chosen": -2.796875, "logits/rejected": -2.5625, "logps/chosen": -692.0, "logps/rejected": -1024.0, "loss": 0.1976, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.46875, "rewards/margins": 3.390625, "rewards/rejected": -8.875, "step": 11500 }, { "epoch": 0.8659995485666993, "grad_norm": 11.402523424142428, "learning_rate": 2.6857690389843478e-08, "logits/chosen": -2.765625, "logits/rejected": -2.484375, "logps/chosen": -688.0, "logps/rejected": -1088.0, "loss": 0.1709, "rewards/accuracies": 0.9375, "rewards/chosen": -5.25, "rewards/margins": 4.0, "rewards/rejected": -9.25, "step": 11510 }, { "epoch": 0.866751937401249, "grad_norm": 9.619687076845858, "learning_rate": 2.656237801167033e-08, "logits/chosen": -2.890625, "logits/rejected": -2.578125, "logps/chosen": -668.0, "logps/rejected": -1088.0, "loss": 0.1767, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.25, "rewards/rejected": -9.3125, "step": 11520 }, { "epoch": 0.8675043262357987, "grad_norm": 10.768747265202173, "learning_rate": 2.62686070528792e-08, "logits/chosen": -2.875, "logits/rejected": -2.625, "logps/chosen": -680.0, "logps/rejected": -1072.0, "loss": 0.1698, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 3.828125, "rewards/rejected": -9.0625, "step": 11530 }, { "epoch": 0.8682567150703484, "grad_norm": 9.997525141089067, "learning_rate": 2.5976379540094907e-08, "logits/chosen": -2.921875, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1080.0, "loss": 0.1666, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.1875, "rewards/rejected": -9.1875, "step": 11540 }, { "epoch": 0.869009103904898, "grad_norm": 9.701661283852008, "learning_rate": 2.5685697489294665e-08, "logits/chosen": -2.765625, "logits/rejected": -2.5, "logps/chosen": -680.0, "logps/rejected": -1040.0, "loss": 0.1525, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0, "rewards/margins": 3.828125, "rewards/rejected": -8.8125, "step": 11550 }, { "epoch": 0.8697614927394477, "grad_norm": 8.930382495018732, "learning_rate": 2.539656290579409e-08, "logits/chosen": -2.734375, "logits/rejected": -2.53125, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.172, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.21875, "rewards/margins": 3.921875, "rewards/rejected": -9.1875, "step": 11560 }, { "epoch": 0.8705138815739975, "grad_norm": 10.827313254904702, "learning_rate": 2.510897778423324e-08, "logits/chosen": -2.734375, "logits/rejected": -2.4375, "logps/chosen": -664.0, "logps/rejected": -1048.0, "loss": 0.183, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 3.984375, "rewards/rejected": -9.0625, "step": 11570 }, { "epoch": 0.8712662704085471, "grad_norm": 9.488737499710968, "learning_rate": 2.482294410856317e-08, "logits/chosen": -2.921875, "logits/rejected": -2.453125, "logps/chosen": -648.0, "logps/rejected": -1088.0, "loss": 0.1836, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 4.4375, "rewards/rejected": -9.3125, "step": 11580 }, { "epoch": 0.8720186592430968, "grad_norm": 9.51048440384545, "learning_rate": 2.4538463852031897e-08, "logits/chosen": -2.796875, "logits/rejected": -2.546875, "logps/chosen": -676.0, "logps/rejected": -1056.0, "loss": 0.1959, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.03125, "rewards/rejected": -9.1875, "step": 11590 }, { "epoch": 0.8727710480776465, "grad_norm": 11.27212047058623, "learning_rate": 2.425553897717092e-08, "logits/chosen": -2.84375, "logits/rejected": -2.5625, "logps/chosen": -692.0, "logps/rejected": -1064.0, "loss": 0.1648, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 4.03125, "rewards/rejected": -9.25, "step": 11600 }, { "epoch": 0.8735234369121962, "grad_norm": 9.945973374384252, "learning_rate": 2.3974171435781787e-08, "logits/chosen": -2.78125, "logits/rejected": -2.453125, "logps/chosen": -656.0, "logps/rejected": -1040.0, "loss": 0.1752, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.0625, "rewards/rejected": -9.0, "step": 11610 }, { "epoch": 0.8742758257467459, "grad_norm": 9.631724774327521, "learning_rate": 2.3694363168922454e-08, "logits/chosen": -2.796875, "logits/rejected": -2.546875, "logps/chosen": -656.0, "logps/rejected": -1088.0, "loss": 0.1699, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.28125, "rewards/rejected": -9.25, "step": 11620 }, { "epoch": 0.8750282145812956, "grad_norm": 8.480092534292009, "learning_rate": 2.3416116106894062e-08, "logits/chosen": -2.859375, "logits/rejected": -2.515625, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1725, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.28125, "rewards/margins": 4.03125, "rewards/rejected": -9.3125, "step": 11630 }, { "epoch": 0.8757806034158453, "grad_norm": 9.050081151967134, "learning_rate": 2.3139432169227507e-08, "logits/chosen": -2.8125, "logits/rejected": -2.515625, "logps/chosen": -668.0, "logps/rejected": -1088.0, "loss": 0.1704, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.28125, "rewards/margins": 4.15625, "rewards/rejected": -9.4375, "step": 11640 }, { "epoch": 0.876532992250395, "grad_norm": 8.49269512787991, "learning_rate": 2.2864313264670058e-08, "logits/chosen": -2.78125, "logits/rejected": -2.5625, "logps/chosen": -688.0, "logps/rejected": -1112.0, "loss": 0.176, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.34375, "rewards/rejected": -9.5, "step": 11650 }, { "epoch": 0.8772853810849447, "grad_norm": 7.742171750152288, "learning_rate": 2.2590761291172655e-08, "logits/chosen": -2.90625, "logits/rejected": -2.46875, "logps/chosen": -684.0, "logps/rejected": -1112.0, "loss": 0.1604, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.3125, "rewards/rejected": -9.5, "step": 11660 }, { "epoch": 0.8780377699194944, "grad_norm": 6.937668342015903, "learning_rate": 2.2318778135876292e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -696.0, "logps/rejected": -1072.0, "loss": 0.1609, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.90625, "rewards/rejected": -9.125, "step": 11670 }, { "epoch": 0.8787901587540441, "grad_norm": 9.903710675002216, "learning_rate": 2.2048365675099378e-08, "logits/chosen": -2.8125, "logits/rejected": -2.609375, "logps/chosen": -676.0, "logps/rejected": -1104.0, "loss": 0.1899, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 3.90625, "rewards/rejected": -9.1875, "step": 11680 }, { "epoch": 0.8795425475885937, "grad_norm": 9.459220782338425, "learning_rate": 2.1779525774324514e-08, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -688.0, "logps/rejected": -1064.0, "loss": 0.1595, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.25, "rewards/margins": 3.9375, "rewards/rejected": -9.1875, "step": 11690 }, { "epoch": 0.8802949364231435, "grad_norm": 10.508603108648265, "learning_rate": 2.1512260288185786e-08, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -696.0, "logps/rejected": -1048.0, "loss": 0.171, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 3.734375, "rewards/rejected": -8.9375, "step": 11700 }, { "epoch": 0.8810473252576931, "grad_norm": 7.98824665789973, "learning_rate": 2.124657106045602e-08, "logits/chosen": -2.84375, "logits/rejected": -2.546875, "logps/chosen": -672.0, "logps/rejected": -1048.0, "loss": 0.1698, "rewards/accuracies": 0.90625, "rewards/chosen": -5.25, "rewards/margins": 3.84375, "rewards/rejected": -9.125, "step": 11710 }, { "epoch": 0.8817997140922429, "grad_norm": 12.741703044040593, "learning_rate": 2.0982459924033857e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.18, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 3.875, "rewards/rejected": -9.0625, "step": 11720 }, { "epoch": 0.8825521029267925, "grad_norm": 9.119204165231253, "learning_rate": 2.071992870093131e-08, "logits/chosen": -2.828125, "logits/rejected": -2.640625, "logps/chosen": -708.0, "logps/rejected": -1104.0, "loss": 0.1571, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.40625, "rewards/margins": 3.953125, "rewards/rejected": -9.375, "step": 11730 }, { "epoch": 0.8833044917613423, "grad_norm": 8.920017594497281, "learning_rate": 2.0458979202261057e-08, "logits/chosen": -2.796875, "logits/rejected": -2.484375, "logps/chosen": -688.0, "logps/rejected": -1096.0, "loss": 0.1638, "rewards/accuracies": 0.9375, "rewards/chosen": -5.375, "rewards/margins": 4.0, "rewards/rejected": -9.375, "step": 11740 }, { "epoch": 0.8840568805958919, "grad_norm": 11.421461434254617, "learning_rate": 2.0199613228224e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5, "logps/chosen": -692.0, "logps/rejected": -1080.0, "loss": 0.1692, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.25, "rewards/margins": 4.125, "rewards/rejected": -9.375, "step": 11750 }, { "epoch": 0.8848092694304417, "grad_norm": 8.47197431942457, "learning_rate": 1.9941832568096978e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -664.0, "logps/rejected": -1048.0, "loss": 0.1611, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0625, "rewards/margins": 3.90625, "rewards/rejected": -9.0, "step": 11760 }, { "epoch": 0.8855616582649913, "grad_norm": 9.776945257168322, "learning_rate": 1.9685639000220105e-08, "logits/chosen": -2.828125, "logits/rejected": -2.5, "logps/chosen": -684.0, "logps/rejected": -1128.0, "loss": 0.1696, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.21875, "rewards/margins": 4.28125, "rewards/rejected": -9.5, "step": 11770 }, { "epoch": 0.8863140470995411, "grad_norm": 10.754537890756227, "learning_rate": 1.9431034291984755e-08, "logits/chosen": -2.859375, "logits/rejected": -2.59375, "logps/chosen": -692.0, "logps/rejected": -1080.0, "loss": 0.17, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 4.03125, "rewards/rejected": -9.3125, "step": 11780 }, { "epoch": 0.8870664359340907, "grad_norm": 10.240761740962872, "learning_rate": 1.9178020199821426e-08, "logits/chosen": -2.859375, "logits/rejected": -2.5, "logps/chosen": -684.0, "logps/rejected": -1112.0, "loss": 0.1738, "rewards/accuracies": 0.9375, "rewards/chosen": -5.125, "rewards/margins": 4.25, "rewards/rejected": -9.375, "step": 11790 }, { "epoch": 0.8878188247686404, "grad_norm": 10.945704341837086, "learning_rate": 1.892659846918737e-08, "logits/chosen": -2.859375, "logits/rejected": -2.59375, "logps/chosen": -668.0, "logps/rejected": -1064.0, "loss": 0.1623, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.03125, "rewards/margins": 3.96875, "rewards/rejected": -9.0, "step": 11800 }, { "epoch": 0.8885712136031901, "grad_norm": 8.986638733858724, "learning_rate": 1.8676770834554655e-08, "logits/chosen": -2.6875, "logits/rejected": -2.546875, "logps/chosen": -680.0, "logps/rejected": -1104.0, "loss": 0.1518, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.3125, "rewards/rejected": -9.375, "step": 11810 }, { "epoch": 0.8893236024377398, "grad_norm": 9.787168413631637, "learning_rate": 1.842853901939842e-08, "logits/chosen": -2.75, "logits/rejected": -2.53125, "logps/chosen": -684.0, "logps/rejected": -1080.0, "loss": 0.163, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.0625, "rewards/rejected": -9.3125, "step": 11820 }, { "epoch": 0.8900759912722895, "grad_norm": 28.312725671618338, "learning_rate": 1.8181904736184528e-08, "logits/chosen": -2.71875, "logits/rejected": -2.40625, "logps/chosen": -700.0, "logps/rejected": -1104.0, "loss": 0.1733, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.34375, "rewards/margins": 4.21875, "rewards/rejected": -9.5625, "step": 11830 }, { "epoch": 0.8908283801068392, "grad_norm": 8.905561472725662, "learning_rate": 1.793686968635824e-08, "logits/chosen": -2.71875, "logits/rejected": -2.515625, "logps/chosen": -712.0, "logps/rejected": -1080.0, "loss": 0.15, "rewards/accuracies": 0.9375, "rewards/chosen": -5.34375, "rewards/margins": 4.125, "rewards/rejected": -9.4375, "step": 11840 }, { "epoch": 0.8915807689413889, "grad_norm": 8.867421479210398, "learning_rate": 1.7693435560332148e-08, "logits/chosen": -2.8125, "logits/rejected": -2.53125, "logps/chosen": -648.0, "logps/rejected": -1032.0, "loss": 0.1484, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.03125, "rewards/margins": 3.875, "rewards/rejected": -8.875, "step": 11850 }, { "epoch": 0.8923331577759386, "grad_norm": 7.853192080340564, "learning_rate": 1.7451604037474616e-08, "logits/chosen": -2.921875, "logits/rejected": -2.59375, "logps/chosen": -692.0, "logps/rejected": -1080.0, "loss": 0.1661, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.34375, "rewards/margins": 4.03125, "rewards/rejected": -9.375, "step": 11860 }, { "epoch": 0.8930855466104883, "grad_norm": 13.407487143279933, "learning_rate": 1.72113767860983e-08, "logits/chosen": -2.796875, "logits/rejected": -2.484375, "logps/chosen": -660.0, "logps/rejected": -1088.0, "loss": 0.1803, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.28125, "rewards/rejected": -9.25, "step": 11870 }, { "epoch": 0.893837935445038, "grad_norm": 8.613004172224727, "learning_rate": 1.6972755463448423e-08, "logits/chosen": -2.84375, "logits/rejected": -2.578125, "logps/chosen": -696.0, "logps/rejected": -1040.0, "loss": 0.1893, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.15625, "rewards/margins": 3.84375, "rewards/rejected": -9.0, "step": 11880 }, { "epoch": 0.8945903242795877, "grad_norm": 9.501855426931444, "learning_rate": 1.6735741715691448e-08, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -684.0, "logps/rejected": -1056.0, "loss": 0.177, "rewards/accuracies": 0.875, "rewards/chosen": -5.1875, "rewards/margins": 3.828125, "rewards/rejected": -9.0, "step": 11890 }, { "epoch": 0.8953427131141374, "grad_norm": 15.804445726258075, "learning_rate": 1.650033717790389e-08, "logits/chosen": -2.875, "logits/rejected": -2.453125, "logps/chosen": -664.0, "logps/rejected": -1048.0, "loss": 0.1847, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.09375, "rewards/margins": 3.9375, "rewards/rejected": -9.0625, "step": 11900 }, { "epoch": 0.8960951019486871, "grad_norm": 9.911905372689894, "learning_rate": 1.626654347406073e-08, "logits/chosen": -2.828125, "logits/rejected": -2.53125, "logps/chosen": -684.0, "logps/rejected": -1128.0, "loss": 0.1622, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.25, "rewards/margins": 4.28125, "rewards/rejected": -9.5, "step": 11910 }, { "epoch": 0.8968474907832368, "grad_norm": 12.851328969910908, "learning_rate": 1.6034362217024396e-08, "logits/chosen": -2.6875, "logits/rejected": -2.46875, "logps/chosen": -652.0, "logps/rejected": -1032.0, "loss": 0.1855, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.0, "rewards/rejected": -8.9375, "step": 11920 }, { "epoch": 0.8975998796177864, "grad_norm": 9.778822695359391, "learning_rate": 1.580379500853357e-08, "logits/chosen": -2.734375, "logits/rejected": -2.609375, "logps/chosen": -704.0, "logps/rejected": -1104.0, "loss": 0.1782, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.34375, "rewards/margins": 4.1875, "rewards/rejected": -9.5625, "step": 11930 }, { "epoch": 0.8983522684523362, "grad_norm": 9.092371664695987, "learning_rate": 1.5574843439192213e-08, "logits/chosen": -2.84375, "logits/rejected": -2.5625, "logps/chosen": -644.0, "logps/rejected": -1032.0, "loss": 0.163, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 3.9375, "rewards/rejected": -8.9375, "step": 11940 }, { "epoch": 0.8991046572868858, "grad_norm": 11.004774481394646, "learning_rate": 1.534750908845858e-08, "logits/chosen": -2.8125, "logits/rejected": -2.46875, "logps/chosen": -688.0, "logps/rejected": -1088.0, "loss": 0.1807, "rewards/accuracies": 0.9375, "rewards/chosen": -5.28125, "rewards/margins": 4.125, "rewards/rejected": -9.375, "step": 11950 }, { "epoch": 0.8998570461214356, "grad_norm": 9.550468656787656, "learning_rate": 1.512179352463419e-08, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -672.0, "logps/rejected": -1048.0, "loss": 0.1488, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.765625, "rewards/rejected": -8.9375, "step": 11960 }, { "epoch": 0.9006094349559852, "grad_norm": 11.023045635964674, "learning_rate": 1.4897698304853213e-08, "logits/chosen": -2.796875, "logits/rejected": -2.515625, "logps/chosen": -712.0, "logps/rejected": -1128.0, "loss": 0.1749, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.375, "rewards/margins": 4.25, "rewards/rejected": -9.625, "step": 11970 }, { "epoch": 0.901361823790535, "grad_norm": 8.830743392254266, "learning_rate": 1.4675224975071565e-08, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -660.0, "logps/rejected": -1088.0, "loss": 0.1806, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 4.21875, "rewards/rejected": -9.375, "step": 11980 }, { "epoch": 0.9021142126250846, "grad_norm": 12.444841541494037, "learning_rate": 1.445437507005623e-08, "logits/chosen": -2.796875, "logits/rejected": -2.4375, "logps/chosen": -660.0, "logps/rejected": -1080.0, "loss": 0.2042, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.15625, "rewards/rejected": -9.1875, "step": 11990 }, { "epoch": 0.9028666014596344, "grad_norm": 8.58099196313245, "learning_rate": 1.4235150113374977e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1766, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.3125, "rewards/margins": 4.0625, "rewards/rejected": -9.375, "step": 12000 }, { "epoch": 0.903618990294184, "grad_norm": 8.283440836287527, "learning_rate": 1.4017551617385298e-08, "logits/chosen": -2.734375, "logits/rejected": -2.515625, "logps/chosen": -676.0, "logps/rejected": -1048.0, "loss": 0.1564, "rewards/accuracies": 0.96875, "rewards/chosen": -5.125, "rewards/margins": 3.859375, "rewards/rejected": -9.0, "step": 12010 }, { "epoch": 0.9043713791287338, "grad_norm": 9.256828733022326, "learning_rate": 1.3801581083224544e-08, "logits/chosen": -2.828125, "logits/rejected": -2.5, "logps/chosen": -692.0, "logps/rejected": -1056.0, "loss": 0.1704, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.921875, "rewards/rejected": -9.125, "step": 12020 }, { "epoch": 0.9051237679632834, "grad_norm": 6.959164715689976, "learning_rate": 1.3587240000799166e-08, "logits/chosen": -2.84375, "logits/rejected": -2.453125, "logps/chosen": -696.0, "logps/rejected": -1088.0, "loss": 0.1602, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.4375, "rewards/margins": 3.9375, "rewards/rejected": -9.375, "step": 12030 }, { "epoch": 0.9058761567978331, "grad_norm": 8.402217612489448, "learning_rate": 1.3374529848774685e-08, "logits/chosen": -2.78125, "logits/rejected": -2.546875, "logps/chosen": -684.0, "logps/rejected": -1064.0, "loss": 0.1713, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 3.921875, "rewards/rejected": -9.125, "step": 12040 }, { "epoch": 0.9066285456323828, "grad_norm": 11.406670308993652, "learning_rate": 1.3163452094565348e-08, "logits/chosen": -2.90625, "logits/rejected": -2.5625, "logps/chosen": -732.0, "logps/rejected": -1064.0, "loss": 0.1787, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.53125, "rewards/margins": 3.5, "rewards/rejected": -9.0625, "step": 12050 }, { "epoch": 0.9073809344669325, "grad_norm": 7.6670132542671405, "learning_rate": 1.2954008194324046e-08, "logits/chosen": -2.6875, "logits/rejected": -2.5, "logps/chosen": -684.0, "logps/rejected": -1064.0, "loss": 0.1679, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.25, "rewards/margins": 3.984375, "rewards/rejected": -9.25, "step": 12060 }, { "epoch": 0.9081333233014822, "grad_norm": 11.445529436211473, "learning_rate": 1.2746199592932272e-08, "logits/chosen": -2.875, "logits/rejected": -2.5625, "logps/chosen": -692.0, "logps/rejected": -1112.0, "loss": 0.1787, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.34375, "rewards/margins": 4.21875, "rewards/rejected": -9.5625, "step": 12070 }, { "epoch": 0.9088857121360319, "grad_norm": 11.128326972262249, "learning_rate": 1.254002772399021e-08, "logits/chosen": -2.828125, "logits/rejected": -2.5, "logps/chosen": -652.0, "logps/rejected": -1048.0, "loss": 0.189, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0, "rewards/margins": 3.96875, "rewards/rejected": -9.0, "step": 12080 }, { "epoch": 0.9096381009705816, "grad_norm": 8.021484842797552, "learning_rate": 1.2335494009806712e-08, "logits/chosen": -2.765625, "logits/rejected": -2.5, "logps/chosen": -708.0, "logps/rejected": -1048.0, "loss": 0.192, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -5.3125, "rewards/margins": 3.765625, "rewards/rejected": -9.0625, "step": 12090 }, { "epoch": 0.9103904898051313, "grad_norm": 8.30130844967839, "learning_rate": 1.2132599861389591e-08, "logits/chosen": -2.859375, "logits/rejected": -2.578125, "logps/chosen": -696.0, "logps/rejected": -1080.0, "loss": 0.1723, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.25, "rewards/margins": 3.921875, "rewards/rejected": -9.1875, "step": 12100 }, { "epoch": 0.911142878639681, "grad_norm": 6.59121830849841, "learning_rate": 1.1931346678435872e-08, "logits/chosen": -2.890625, "logits/rejected": -2.53125, "logps/chosen": -684.0, "logps/rejected": -1120.0, "loss": 0.1573, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.34375, "rewards/rejected": -9.5625, "step": 12110 }, { "epoch": 0.9118952674742307, "grad_norm": 10.051925927498676, "learning_rate": 1.1731735849322077e-08, "logits/chosen": -2.75, "logits/rejected": -2.53125, "logps/chosen": -692.0, "logps/rejected": -1024.0, "loss": 0.1768, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.21875, "rewards/margins": 3.484375, "rewards/rejected": -8.6875, "step": 12120 }, { "epoch": 0.9126476563087804, "grad_norm": 12.836231379573508, "learning_rate": 1.1533768751094798e-08, "logits/chosen": -2.765625, "logits/rejected": -2.46875, "logps/chosen": -708.0, "logps/rejected": -1072.0, "loss": 0.1796, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.3125, "rewards/margins": 3.875, "rewards/rejected": -9.1875, "step": 12130 }, { "epoch": 0.9134000451433301, "grad_norm": 11.780097698601958, "learning_rate": 1.1337446749461021e-08, "logits/chosen": -2.703125, "logits/rejected": -2.453125, "logps/chosen": -684.0, "logps/rejected": -1080.0, "loss": 0.1554, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 3.96875, "rewards/rejected": -9.0625, "step": 12140 }, { "epoch": 0.9141524339778798, "grad_norm": 11.323879890359798, "learning_rate": 1.1142771198778683e-08, "logits/chosen": -2.828125, "logits/rejected": -2.515625, "logps/chosen": -692.0, "logps/rejected": -1056.0, "loss": 0.1793, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.375, "rewards/margins": 3.703125, "rewards/rejected": -9.0625, "step": 12150 }, { "epoch": 0.9149048228124295, "grad_norm": 8.854166431560051, "learning_rate": 1.0949743442047632e-08, "logits/chosen": -2.796875, "logits/rejected": -2.46875, "logps/chosen": -672.0, "logps/rejected": -1096.0, "loss": 0.1812, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.28125, "rewards/rejected": -9.3125, "step": 12160 }, { "epoch": 0.9156572116469791, "grad_norm": 7.318841061464719, "learning_rate": 1.0758364810899977e-08, "logits/chosen": -2.875, "logits/rejected": -2.5625, "logps/chosen": -660.0, "logps/rejected": -1040.0, "loss": 0.2023, "rewards/accuracies": 0.84375, "rewards/chosen": -5.15625, "rewards/margins": 3.6875, "rewards/rejected": -8.875, "step": 12170 }, { "epoch": 0.9164096004815289, "grad_norm": 8.797418576419995, "learning_rate": 1.056863662559121e-08, "logits/chosen": -2.734375, "logits/rejected": -2.40625, "logps/chosen": -700.0, "logps/rejected": -1096.0, "loss": 0.1832, "rewards/accuracies": 0.90625, "rewards/chosen": -5.34375, "rewards/margins": 4.03125, "rewards/rejected": -9.375, "step": 12180 }, { "epoch": 0.9171619893160785, "grad_norm": 10.595248494142249, "learning_rate": 1.0380560194990784e-08, "logits/chosen": -2.734375, "logits/rejected": -2.5625, "logps/chosen": -684.0, "logps/rejected": -1064.0, "loss": 0.1591, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 4.21875, "rewards/rejected": -9.25, "step": 12190 }, { "epoch": 0.9179143781506283, "grad_norm": 9.48087075502737, "learning_rate": 1.0194136816573411e-08, "logits/chosen": -2.734375, "logits/rejected": -2.546875, "logps/chosen": -708.0, "logps/rejected": -1048.0, "loss": 0.1835, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3125, "rewards/margins": 3.6875, "rewards/rejected": -9.0, "step": 12200 }, { "epoch": 0.9186667669851779, "grad_norm": 15.060095328155613, "learning_rate": 1.0009367776409982e-08, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.1676, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 4.25, "rewards/rejected": -9.375, "step": 12210 }, { "epoch": 0.9194191558197277, "grad_norm": 9.077832627600628, "learning_rate": 9.826254349158513e-09, "logits/chosen": -2.765625, "logits/rejected": -2.453125, "logps/chosen": -676.0, "logps/rejected": -1064.0, "loss": 0.1705, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.15625, "rewards/margins": 4.0625, "rewards/rejected": -9.1875, "step": 12220 }, { "epoch": 0.9201715446542773, "grad_norm": 11.714903815714866, "learning_rate": 9.644797798055743e-09, "logits/chosen": -2.84375, "logits/rejected": -2.5, "logps/chosen": -660.0, "logps/rejected": -1040.0, "loss": 0.1835, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 3.78125, "rewards/rejected": -8.9375, "step": 12230 }, { "epoch": 0.9209239334888271, "grad_norm": 9.860977528620435, "learning_rate": 9.464999374907994e-09, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -624.0, "logps/rejected": -1072.0, "loss": 0.1814, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 4.40625, "rewards/rejected": -9.25, "step": 12240 }, { "epoch": 0.9216763223233767, "grad_norm": 11.278295090450293, "learning_rate": 9.286860320082801e-09, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -684.0, "logps/rejected": -1064.0, "loss": 0.1631, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.25, "rewards/margins": 3.890625, "rewards/rejected": -9.125, "step": 12250 }, { "epoch": 0.9224287111579265, "grad_norm": 8.765467668684938, "learning_rate": 9.110381862500349e-09, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -680.0, "logps/rejected": -1088.0, "loss": 0.1513, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.03125, "rewards/rejected": -9.125, "step": 12260 }, { "epoch": 0.9231810999924761, "grad_norm": 9.612990200423665, "learning_rate": 8.935565219624852e-09, "logits/chosen": -2.703125, "logits/rejected": -2.40625, "logps/chosen": -668.0, "logps/rejected": -1096.0, "loss": 0.1787, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 4.40625, "rewards/rejected": -9.375, "step": 12270 }, { "epoch": 0.9239334888270259, "grad_norm": 12.84183839452451, "learning_rate": 8.762411597456249e-09, "logits/chosen": -2.8125, "logits/rejected": -2.546875, "logps/chosen": -676.0, "logps/rejected": -1088.0, "loss": 0.1957, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 4.0, "rewards/rejected": -9.1875, "step": 12280 }, { "epoch": 0.9246858776615755, "grad_norm": 13.471057164476878, "learning_rate": 8.590922190521904e-09, "logits/chosen": -2.625, "logits/rejected": -2.390625, "logps/chosen": -660.0, "logps/rejected": -1056.0, "loss": 0.1826, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.15625, "rewards/rejected": -9.125, "step": 12290 }, { "epoch": 0.9254382664961252, "grad_norm": 10.765908118724695, "learning_rate": 8.421098181868285e-09, "logits/chosen": -2.703125, "logits/rejected": -2.515625, "logps/chosen": -656.0, "logps/rejected": -1024.0, "loss": 0.1701, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 3.921875, "rewards/rejected": -8.8125, "step": 12300 }, { "epoch": 0.9261906553306749, "grad_norm": 10.962414203799439, "learning_rate": 8.25294074305291e-09, "logits/chosen": -2.75, "logits/rejected": -2.46875, "logps/chosen": -676.0, "logps/rejected": -1072.0, "loss": 0.1559, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.25, "rewards/margins": 4.0, "rewards/rejected": -9.25, "step": 12310 }, { "epoch": 0.9269430441652246, "grad_norm": 10.328841037991777, "learning_rate": 8.086451034136187e-09, "logits/chosen": -2.796875, "logits/rejected": -2.53125, "logps/chosen": -656.0, "logps/rejected": -1064.0, "loss": 0.175, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.09375, "rewards/rejected": -9.0625, "step": 12320 }, { "epoch": 0.9276954329997743, "grad_norm": 9.852893692279082, "learning_rate": 7.921630203673341e-09, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -656.0, "logps/rejected": -1056.0, "loss": 0.1582, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 4.25, "rewards/rejected": -9.125, "step": 12330 }, { "epoch": 0.928447821834324, "grad_norm": 12.913708626586196, "learning_rate": 7.758479388706718e-09, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -664.0, "logps/rejected": -1104.0, "loss": 0.1779, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.5, "rewards/rejected": -9.4375, "step": 12340 }, { "epoch": 0.9292002106688737, "grad_norm": 10.803705478831203, "learning_rate": 7.596999714757718e-09, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -660.0, "logps/rejected": -1088.0, "loss": 0.157, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 4.5, "rewards/rejected": -9.375, "step": 12350 }, { "epoch": 0.9299525995034233, "grad_norm": 9.133803700899708, "learning_rate": 7.4371922958191e-09, "logits/chosen": -2.640625, "logits/rejected": -2.34375, "logps/chosen": -672.0, "logps/rejected": -1040.0, "loss": 0.1665, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.09375, "rewards/margins": 3.75, "rewards/rejected": -8.875, "step": 12360 }, { "epoch": 0.9307049883379731, "grad_norm": 7.739557009401821, "learning_rate": 7.279058234347352e-09, "logits/chosen": -2.875, "logits/rejected": -2.46875, "logps/chosen": -656.0, "logps/rejected": -1072.0, "loss": 0.18, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.21875, "rewards/rejected": -9.125, "step": 12370 }, { "epoch": 0.9314573771725227, "grad_norm": 7.3684455019717525, "learning_rate": 7.122598621255027e-09, "logits/chosen": -2.75, "logits/rejected": -2.5, "logps/chosen": -692.0, "logps/rejected": -1072.0, "loss": 0.1753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.28125, "rewards/margins": 3.921875, "rewards/rejected": -9.1875, "step": 12380 }, { "epoch": 0.9322097660070725, "grad_norm": 8.923023356339863, "learning_rate": 6.967814535903283e-09, "logits/chosen": -2.640625, "logits/rejected": -2.40625, "logps/chosen": -676.0, "logps/rejected": -1056.0, "loss": 0.1755, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 3.96875, "rewards/rejected": -8.9375, "step": 12390 }, { "epoch": 0.9329621548416221, "grad_norm": 11.084018002442349, "learning_rate": 6.81470704609427e-09, "logits/chosen": -2.703125, "logits/rejected": -2.4375, "logps/chosen": -680.0, "logps/rejected": -1112.0, "loss": 0.1717, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 4.46875, "rewards/rejected": -9.5625, "step": 12400 }, { "epoch": 0.9337145436761718, "grad_norm": 8.85754879006267, "learning_rate": 6.6632772080639775e-09, "logits/chosen": -2.71875, "logits/rejected": -2.46875, "logps/chosen": -692.0, "logps/rejected": -1080.0, "loss": 0.1635, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.25, "rewards/margins": 4.03125, "rewards/rejected": -9.3125, "step": 12410 }, { "epoch": 0.9344669325107215, "grad_norm": 10.021606193158734, "learning_rate": 6.513526066474873e-09, "logits/chosen": -2.6875, "logits/rejected": -2.421875, "logps/chosen": -680.0, "logps/rejected": -1064.0, "loss": 0.1572, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 4.03125, "rewards/rejected": -9.125, "step": 12420 }, { "epoch": 0.9352193213452712, "grad_norm": 10.661648381304078, "learning_rate": 6.365454654408547e-09, "logits/chosen": -2.734375, "logits/rejected": -2.578125, "logps/chosen": -692.0, "logps/rejected": -1072.0, "loss": 0.1618, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.375, "rewards/margins": 3.9375, "rewards/rejected": -9.3125, "step": 12430 }, { "epoch": 0.935971710179821, "grad_norm": 9.809907560870254, "learning_rate": 6.219063993358864e-09, "logits/chosen": -2.734375, "logits/rejected": -2.390625, "logps/chosen": -708.0, "logps/rejected": -1096.0, "loss": 0.1667, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.28125, "rewards/margins": 4.0, "rewards/rejected": -9.3125, "step": 12440 }, { "epoch": 0.9367240990143706, "grad_norm": 7.609560734254073, "learning_rate": 6.0743550932247086e-09, "logits/chosen": -2.65625, "logits/rejected": -2.4375, "logps/chosen": -680.0, "logps/rejected": -1048.0, "loss": 0.1631, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 3.875, "rewards/rejected": -9.0625, "step": 12450 }, { "epoch": 0.9374764878489203, "grad_norm": 10.230258975163913, "learning_rate": 5.931328952302972e-09, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -708.0, "logps/rejected": -1080.0, "loss": 0.1852, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.40625, "rewards/margins": 3.765625, "rewards/rejected": -9.1875, "step": 12460 }, { "epoch": 0.93822887668347, "grad_norm": 10.554847554405203, "learning_rate": 5.7899865572819116e-09, "logits/chosen": -2.765625, "logits/rejected": -2.421875, "logps/chosen": -660.0, "logps/rejected": -1072.0, "loss": 0.1726, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.21875, "rewards/rejected": -9.1875, "step": 12470 }, { "epoch": 0.9389812655180197, "grad_norm": 9.40660496955398, "learning_rate": 5.650328883234134e-09, "logits/chosen": -2.6875, "logits/rejected": -2.515625, "logps/chosen": -656.0, "logps/rejected": -1072.0, "loss": 0.177, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.34375, "rewards/rejected": -9.375, "step": 12480 }, { "epoch": 0.9397336543525694, "grad_norm": 8.166728062833425, "learning_rate": 5.512356893609987e-09, "logits/chosen": -2.828125, "logits/rejected": -2.578125, "logps/chosen": -676.0, "logps/rejected": -1080.0, "loss": 0.1539, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.15625, "rewards/rejected": -9.125, "step": 12490 }, { "epoch": 0.9404860431871191, "grad_norm": 9.063833933658618, "learning_rate": 5.376071540230787e-09, "logits/chosen": -2.671875, "logits/rejected": -2.5, "logps/chosen": -676.0, "logps/rejected": -1080.0, "loss": 0.175, "rewards/accuracies": 0.96875, "rewards/chosen": -5.25, "rewards/margins": 4.0625, "rewards/rejected": -9.3125, "step": 12500 }, { "epoch": 0.9412384320216688, "grad_norm": 9.856453427481298, "learning_rate": 5.24147376328235e-09, "logits/chosen": -2.78125, "logits/rejected": -2.46875, "logps/chosen": -624.0, "logps/rejected": -1040.0, "loss": 0.1707, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.5625, "rewards/margins": 4.3125, "rewards/rejected": -8.875, "step": 12510 }, { "epoch": 0.9419908208562185, "grad_norm": 10.270735826552094, "learning_rate": 5.108564491308504e-09, "logits/chosen": -2.75, "logits/rejected": -2.375, "logps/chosen": -648.0, "logps/rejected": -1056.0, "loss": 0.1718, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.1875, "rewards/rejected": -9.0625, "step": 12520 }, { "epoch": 0.9427432096907682, "grad_norm": 9.55450789784712, "learning_rate": 4.9773446412046675e-09, "logits/chosen": -2.6875, "logits/rejected": -2.34375, "logps/chosen": -684.0, "logps/rejected": -1088.0, "loss": 0.1615, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.15625, "rewards/margins": 4.09375, "rewards/rejected": -9.25, "step": 12530 }, { "epoch": 0.9434955985253178, "grad_norm": 9.93778171098822, "learning_rate": 4.8478151182114735e-09, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.1743, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.46875, "rewards/rejected": -9.4375, "step": 12540 }, { "epoch": 0.9442479873598676, "grad_norm": 7.439495017959088, "learning_rate": 4.719976815908605e-09, "logits/chosen": -2.875, "logits/rejected": -2.578125, "logps/chosen": -692.0, "logps/rejected": -1104.0, "loss": 0.1854, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 4.28125, "rewards/rejected": -9.5625, "step": 12550 }, { "epoch": 0.9450003761944172, "grad_norm": 9.333331538675528, "learning_rate": 4.59383061620855e-09, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -696.0, "logps/rejected": -1096.0, "loss": 0.1617, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.25, "rewards/margins": 4.09375, "rewards/rejected": -9.3125, "step": 12560 }, { "epoch": 0.945752765028967, "grad_norm": 9.63025121862242, "learning_rate": 4.469377389350659e-09, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.166, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.21875, "rewards/rejected": -9.125, "step": 12570 }, { "epoch": 0.9465051538635166, "grad_norm": 8.390314616312889, "learning_rate": 4.3466179938949635e-09, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -648.0, "logps/rejected": -1040.0, "loss": 0.1697, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.90625, "rewards/margins": 4.15625, "rewards/rejected": -9.0625, "step": 12580 }, { "epoch": 0.9472575426980664, "grad_norm": 7.130887210053568, "learning_rate": 4.225553276716309e-09, "logits/chosen": -2.703125, "logits/rejected": -2.453125, "logps/chosen": -660.0, "logps/rejected": -1048.0, "loss": 0.1694, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.90625, "rewards/margins": 4.0, "rewards/rejected": -8.9375, "step": 12590 }, { "epoch": 0.948009931532616, "grad_norm": 9.029440417343197, "learning_rate": 4.106184072998647e-09, "logits/chosen": -2.84375, "logits/rejected": -2.578125, "logps/chosen": -696.0, "logps/rejected": -1072.0, "loss": 0.1647, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.34375, "rewards/margins": 4.0, "rewards/rejected": -9.375, "step": 12600 }, { "epoch": 0.9487623203671658, "grad_norm": 10.544602729465147, "learning_rate": 3.988511206229062e-09, "logits/chosen": -2.828125, "logits/rejected": -2.5625, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1712, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.1875, "rewards/margins": 3.890625, "rewards/rejected": -9.0625, "step": 12610 }, { "epoch": 0.9495147092017154, "grad_norm": 11.177767893526282, "learning_rate": 3.872535488192247e-09, "logits/chosen": -2.75, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1096.0, "loss": 0.1654, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 4.3125, "rewards/rejected": -9.3125, "step": 12620 }, { "epoch": 0.9502670980362652, "grad_norm": 9.846233049119817, "learning_rate": 3.7582577189648194e-09, "logits/chosen": -2.734375, "logits/rejected": -2.375, "logps/chosen": -668.0, "logps/rejected": -1048.0, "loss": 0.1748, "rewards/accuracies": 0.90625, "rewards/chosen": -5.125, "rewards/margins": 3.9375, "rewards/rejected": -9.0625, "step": 12630 }, { "epoch": 0.9510194868708148, "grad_norm": 14.322310291841655, "learning_rate": 3.64567868690982e-09, "logits/chosen": -2.859375, "logits/rejected": -2.53125, "logps/chosen": -708.0, "logps/rejected": -1080.0, "loss": 0.1851, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.34375, "rewards/margins": 3.890625, "rewards/rejected": -9.25, "step": 12640 }, { "epoch": 0.9517718757053645, "grad_norm": 12.197540376697233, "learning_rate": 3.53479916867136e-09, "logits/chosen": -2.84375, "logits/rejected": -2.515625, "logps/chosen": -668.0, "logps/rejected": -1056.0, "loss": 0.1772, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 3.90625, "rewards/rejected": -9.0625, "step": 12650 }, { "epoch": 0.9525242645399142, "grad_norm": 11.577378965434255, "learning_rate": 3.4256199291691214e-09, "logits/chosen": -2.796875, "logits/rejected": -2.53125, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.1762, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 4.21875, "rewards/rejected": -9.1875, "step": 12660 }, { "epoch": 0.9532766533744639, "grad_norm": 10.963729060307312, "learning_rate": 3.318141721593143e-09, "logits/chosen": -2.84375, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1064.0, "loss": 0.1752, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.03125, "rewards/rejected": -9.25, "step": 12670 }, { "epoch": 0.9540290422090136, "grad_norm": 10.098349431628977, "learning_rate": 3.212365287398655e-09, "logits/chosen": -2.65625, "logits/rejected": -2.453125, "logps/chosen": -692.0, "logps/rejected": -1072.0, "loss": 0.1861, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.3125, "rewards/margins": 3.703125, "rewards/rejected": -9.0, "step": 12680 }, { "epoch": 0.9547814310435633, "grad_norm": 9.904610664721758, "learning_rate": 3.1082913563009737e-09, "logits/chosen": -2.8125, "logits/rejected": -2.453125, "logps/chosen": -676.0, "logps/rejected": -1128.0, "loss": 0.1629, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.40625, "rewards/rejected": -9.4375, "step": 12690 }, { "epoch": 0.955533819878113, "grad_norm": 11.849514303033153, "learning_rate": 3.00592064627031e-09, "logits/chosen": -2.8125, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1080.0, "loss": 0.1866, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.15625, "rewards/margins": 4.15625, "rewards/rejected": -9.3125, "step": 12700 }, { "epoch": 0.9562862087126627, "grad_norm": 10.649296533290217, "learning_rate": 2.905253863527107e-09, "logits/chosen": -2.6875, "logits/rejected": -2.53125, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 3.921875, "rewards/rejected": -9.125, "step": 12710 }, { "epoch": 0.9570385975472124, "grad_norm": 10.221002971985236, "learning_rate": 2.8062917025368504e-09, "logits/chosen": -2.96875, "logits/rejected": -2.59375, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1775, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.09375, "rewards/margins": 3.921875, "rewards/rejected": -9.0, "step": 12720 }, { "epoch": 0.9577909863817621, "grad_norm": 8.549626003024416, "learning_rate": 2.7090348460055146e-09, "logits/chosen": -2.703125, "logits/rejected": -2.515625, "logps/chosen": -688.0, "logps/rejected": -1096.0, "loss": 0.1851, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.09375, "rewards/rejected": -9.25, "step": 12730 }, { "epoch": 0.9585433752163118, "grad_norm": 9.598846692225287, "learning_rate": 2.6134839648747075e-09, "logits/chosen": -2.71875, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1024.0, "loss": 0.177, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.125, "rewards/margins": 3.625, "rewards/rejected": -8.75, "step": 12740 }, { "epoch": 0.9592957640508615, "grad_norm": 10.652265686252678, "learning_rate": 2.519639718317146e-09, "logits/chosen": -2.765625, "logits/rejected": -2.390625, "logps/chosen": -704.0, "logps/rejected": -1128.0, "loss": 0.1879, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.375, "rewards/margins": 4.03125, "rewards/rejected": -9.375, "step": 12750 }, { "epoch": 0.9600481528854112, "grad_norm": 7.43864423486277, "learning_rate": 2.4275027537320745e-09, "logits/chosen": -2.6875, "logits/rejected": -2.3125, "logps/chosen": -680.0, "logps/rejected": -1080.0, "loss": 0.1641, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 4.15625, "rewards/rejected": -9.3125, "step": 12760 }, { "epoch": 0.9608005417199609, "grad_norm": 10.92322333796448, "learning_rate": 2.3370737067406886e-09, "logits/chosen": -2.890625, "logits/rejected": -2.578125, "logps/chosen": -668.0, "logps/rejected": -1056.0, "loss": 0.1608, "rewards/accuracies": 0.9375, "rewards/chosen": -5.09375, "rewards/margins": 3.984375, "rewards/rejected": -9.0625, "step": 12770 }, { "epoch": 0.9615529305545105, "grad_norm": 7.695758366364471, "learning_rate": 2.2483532011819408e-09, "logits/chosen": -2.78125, "logits/rejected": -2.421875, "logps/chosen": -656.0, "logps/rejected": -1096.0, "loss": 0.1652, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 4.5, "rewards/rejected": -9.375, "step": 12780 }, { "epoch": 0.9623053193890603, "grad_norm": 10.039606375467754, "learning_rate": 2.161341849108156e-09, "logits/chosen": -2.640625, "logits/rejected": -2.5, "logps/chosen": -680.0, "logps/rejected": -1056.0, "loss": 0.1672, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.0, "rewards/rejected": -9.1875, "step": 12790 }, { "epoch": 0.9630577082236099, "grad_norm": 10.089802968142102, "learning_rate": 2.076040250780675e-09, "logits/chosen": -2.765625, "logits/rejected": -2.640625, "logps/chosen": -660.0, "logps/rejected": -1048.0, "loss": 0.1575, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.09375, "rewards/rejected": -9.0625, "step": 12800 }, { "epoch": 0.9638100970581597, "grad_norm": 8.212913608496786, "learning_rate": 1.9924489946659963e-09, "logits/chosen": -2.84375, "logits/rejected": -2.5625, "logps/chosen": -676.0, "logps/rejected": -1072.0, "loss": 0.1511, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.125, "rewards/rejected": -9.3125, "step": 12810 }, { "epoch": 0.9645624858927093, "grad_norm": 6.098037778986118, "learning_rate": 1.910568657431416e-09, "logits/chosen": -2.828125, "logits/rejected": -2.5, "logps/chosen": -688.0, "logps/rejected": -1080.0, "loss": 0.1844, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.3125, "rewards/margins": 4.1875, "rewards/rejected": -9.5, "step": 12820 }, { "epoch": 0.9653148747272591, "grad_norm": 10.3664133785279, "learning_rate": 1.830399803941285e-09, "logits/chosen": -2.703125, "logits/rejected": -2.46875, "logps/chosen": -676.0, "logps/rejected": -1040.0, "loss": 0.16, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.3125, "rewards/margins": 3.71875, "rewards/rejected": -9.0, "step": 12830 }, { "epoch": 0.9660672635618087, "grad_norm": 8.508915620802393, "learning_rate": 1.7519429872529523e-09, "logits/chosen": -2.890625, "logits/rejected": -2.5, "logps/chosen": -648.0, "logps/rejected": -1048.0, "loss": 0.1875, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 3.96875, "rewards/rejected": -9.0625, "step": 12840 }, { "epoch": 0.9668196523963585, "grad_norm": 9.449344455166797, "learning_rate": 1.6751987486130493e-09, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -648.0, "logps/rejected": -1080.0, "loss": 0.1659, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.28125, "rewards/rejected": -9.25, "step": 12850 }, { "epoch": 0.9675720412309081, "grad_norm": 8.618861876185289, "learning_rate": 1.6001676174537127e-09, "logits/chosen": -2.75, "logits/rejected": -2.453125, "logps/chosen": -640.0, "logps/rejected": -1048.0, "loss": 0.17, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 3.9375, "rewards/rejected": -8.875, "step": 12860 }, { "epoch": 0.9683244300654579, "grad_norm": 10.343416257368734, "learning_rate": 1.526850111388922e-09, "logits/chosen": -2.859375, "logits/rejected": -2.5, "logps/chosen": -656.0, "logps/rejected": -1056.0, "loss": 0.19, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.09375, "rewards/rejected": -9.0, "step": 12870 }, { "epoch": 0.9690768189000075, "grad_norm": 8.442539800835934, "learning_rate": 1.4552467362109744e-09, "logits/chosen": -2.734375, "logits/rejected": -2.53125, "logps/chosen": -684.0, "logps/rejected": -1112.0, "loss": 0.1738, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.125, "rewards/margins": 4.40625, "rewards/rejected": -9.5, "step": 12880 }, { "epoch": 0.9698292077345572, "grad_norm": 12.578280505695382, "learning_rate": 1.3853579858869313e-09, "logits/chosen": -2.828125, "logits/rejected": -2.59375, "logps/chosen": -688.0, "logps/rejected": -1064.0, "loss": 0.1734, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 3.921875, "rewards/rejected": -9.125, "step": 12890 }, { "epoch": 0.9705815965691069, "grad_norm": 6.95902823734716, "learning_rate": 1.3171843425553163e-09, "logits/chosen": -2.78125, "logits/rejected": -2.546875, "logps/chosen": -672.0, "logps/rejected": -1072.0, "loss": 0.1687, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.21875, "rewards/margins": 4.125, "rewards/rejected": -9.3125, "step": 12900 }, { "epoch": 0.9713339854036566, "grad_norm": 10.13045073060942, "learning_rate": 1.250726276522618e-09, "logits/chosen": -2.71875, "logits/rejected": -2.5, "logps/chosen": -672.0, "logps/rejected": -1040.0, "loss": 0.1556, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.09375, "rewards/rejected": -9.0625, "step": 12910 }, { "epoch": 0.9720863742382063, "grad_norm": 9.135577188635157, "learning_rate": 1.1859842462602077e-09, "logits/chosen": -2.78125, "logits/rejected": -2.546875, "logps/chosen": -704.0, "logps/rejected": -1064.0, "loss": 0.1682, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.375, "rewards/margins": 3.671875, "rewards/rejected": -9.0625, "step": 12920 }, { "epoch": 0.972838763072756, "grad_norm": 10.483153971428983, "learning_rate": 1.1229586984011496e-09, "logits/chosen": -2.609375, "logits/rejected": -2.453125, "logps/chosen": -712.0, "logps/rejected": -1040.0, "loss": 0.1846, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.25, "rewards/margins": 3.71875, "rewards/rejected": -8.9375, "step": 12930 }, { "epoch": 0.9735911519073057, "grad_norm": 9.123570068855518, "learning_rate": 1.0616500677369799e-09, "logits/chosen": -2.84375, "logits/rejected": -2.578125, "logps/chosen": -696.0, "logps/rejected": -1104.0, "loss": 0.161, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.125, "rewards/rejected": -9.375, "step": 12940 }, { "epoch": 0.9743435407418554, "grad_norm": 6.0145425156514465, "learning_rate": 1.0020587772149314e-09, "logits/chosen": -2.84375, "logits/rejected": -2.53125, "logps/chosen": -660.0, "logps/rejected": -1048.0, "loss": 0.1853, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0625, "rewards/margins": 3.890625, "rewards/rejected": -8.9375, "step": 12950 }, { "epoch": 0.9750959295764051, "grad_norm": 10.69301968660796, "learning_rate": 9.441852379347969e-10, "logits/chosen": -2.8125, "logits/rejected": -2.515625, "logps/chosen": -668.0, "logps/rejected": -1064.0, "loss": 0.1476, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.0625, "rewards/rejected": -9.0625, "step": 12960 }, { "epoch": 0.9758483184109548, "grad_norm": 7.760772346097098, "learning_rate": 8.880298491462934e-10, "logits/chosen": -2.8125, "logits/rejected": -2.46875, "logps/chosen": -692.0, "logps/rejected": -1136.0, "loss": 0.1651, "rewards/accuracies": 0.96875, "rewards/chosen": -5.28125, "rewards/margins": 4.53125, "rewards/rejected": -9.8125, "step": 12970 }, { "epoch": 0.9766007072455045, "grad_norm": 11.644754110571947, "learning_rate": 8.335929982460909e-10, "logits/chosen": -2.75, "logits/rejected": -2.53125, "logps/chosen": -672.0, "logps/rejected": -1032.0, "loss": 0.1731, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0, "rewards/margins": 3.890625, "rewards/rejected": -8.875, "step": 12980 }, { "epoch": 0.9773530960800542, "grad_norm": 9.263755113715213, "learning_rate": 7.808750607753711e-10, "logits/chosen": -2.796875, "logits/rejected": -2.515625, "logps/chosen": -688.0, "logps/rejected": -1064.0, "loss": 0.17, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 4.0625, "rewards/rejected": -9.1875, "step": 12990 }, { "epoch": 0.9781054849146039, "grad_norm": 9.50070307974902, "learning_rate": 7.29876400417051e-10, "logits/chosen": -2.71875, "logits/rejected": -2.453125, "logps/chosen": -672.0, "logps/rejected": -1048.0, "loss": 0.1724, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 3.96875, "rewards/rejected": -9.0625, "step": 13000 }, { "epoch": 0.9788578737491536, "grad_norm": 8.404077485363258, "learning_rate": 6.805973689933408e-10, "logits/chosen": -2.6875, "logits/rejected": -2.421875, "logps/chosen": -676.0, "logps/rejected": -1072.0, "loss": 0.1806, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.03125, "rewards/rejected": -9.3125, "step": 13010 }, { "epoch": 0.9796102625837032, "grad_norm": 9.848873574736691, "learning_rate": 6.330383064633849e-10, "logits/chosen": -2.953125, "logits/rejected": -2.578125, "logps/chosen": -652.0, "logps/rejected": -1040.0, "loss": 0.1869, "rewards/accuracies": 0.9375, "rewards/chosen": -4.90625, "rewards/margins": 4.0625, "rewards/rejected": -8.9375, "step": 13020 }, { "epoch": 0.980362651418253, "grad_norm": 8.775804036394055, "learning_rate": 5.871995409207908e-10, "logits/chosen": -2.765625, "logits/rejected": -2.5, "logps/chosen": -684.0, "logps/rejected": -1072.0, "loss": 0.1737, "rewards/accuracies": 0.90625, "rewards/chosen": -5.25, "rewards/margins": 4.03125, "rewards/rejected": -9.3125, "step": 13030 }, { "epoch": 0.9811150402528026, "grad_norm": 6.549141119267545, "learning_rate": 5.430813885914653e-10, "logits/chosen": -2.78125, "logits/rejected": -2.484375, "logps/chosen": -676.0, "logps/rejected": -1080.0, "loss": 0.1762, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.125, "rewards/margins": 4.34375, "rewards/rejected": -9.5, "step": 13040 }, { "epoch": 0.9818674290873524, "grad_norm": 8.908476496818508, "learning_rate": 5.006841538313933e-10, "logits/chosen": -2.78125, "logits/rejected": -2.5, "logps/chosen": -700.0, "logps/rejected": -1064.0, "loss": 0.1734, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.375, "rewards/margins": 3.953125, "rewards/rejected": -9.3125, "step": 13050 }, { "epoch": 0.982619817921902, "grad_norm": 7.99784275885509, "learning_rate": 4.600081291245006e-10, "logits/chosen": -2.75, "logits/rejected": -2.46875, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1745, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.125, "rewards/margins": 3.828125, "rewards/rejected": -8.9375, "step": 13060 }, { "epoch": 0.9833722067564518, "grad_norm": 8.370615866150834, "learning_rate": 4.2105359508071147e-10, "logits/chosen": -2.890625, "logits/rejected": -2.5, "logps/chosen": -716.0, "logps/rejected": -1096.0, "loss": 0.1771, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.4375, "rewards/margins": 3.921875, "rewards/rejected": -9.375, "step": 13070 }, { "epoch": 0.9841245955910014, "grad_norm": 6.878257381305365, "learning_rate": 3.8382082043400544e-10, "logits/chosen": -2.75, "logits/rejected": -2.5, "logps/chosen": -700.0, "logps/rejected": -1104.0, "loss": 0.1811, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.25, "rewards/margins": 4.09375, "rewards/rejected": -9.3125, "step": 13080 }, { "epoch": 0.9848769844255512, "grad_norm": 9.941988022392518, "learning_rate": 3.4831006204044666e-10, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -648.0, "logps/rejected": -1088.0, "loss": 0.1718, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.28125, "rewards/rejected": -9.3125, "step": 13090 }, { "epoch": 0.9856293732601008, "grad_norm": 9.53275865183766, "learning_rate": 3.14521564876602e-10, "logits/chosen": -2.765625, "logits/rejected": -2.53125, "logps/chosen": -676.0, "logps/rejected": -1080.0, "loss": 0.1723, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.21875, "rewards/rejected": -9.25, "step": 13100 }, { "epoch": 0.9863817620946506, "grad_norm": 8.234062180211685, "learning_rate": 2.8245556203768116e-10, "logits/chosen": -2.8125, "logits/rejected": -2.453125, "logps/chosen": -696.0, "logps/rejected": -1072.0, "loss": 0.1864, "rewards/accuracies": 0.90625, "rewards/chosen": -5.375, "rewards/margins": 3.828125, "rewards/rejected": -9.1875, "step": 13110 }, { "epoch": 0.9871341509292002, "grad_norm": 7.0917821698046835, "learning_rate": 2.5211227473603826e-10, "logits/chosen": -2.84375, "logits/rejected": -2.546875, "logps/chosen": -724.0, "logps/rejected": -1104.0, "loss": 0.179, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.46875, "rewards/margins": 3.9375, "rewards/rejected": -9.375, "step": 13120 }, { "epoch": 0.98788653976375, "grad_norm": 9.771436542216827, "learning_rate": 2.2349191229956156e-10, "logits/chosen": -2.71875, "logits/rejected": -2.34375, "logps/chosen": -680.0, "logps/rejected": -1064.0, "loss": 0.1643, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.28125, "rewards/margins": 3.96875, "rewards/rejected": -9.25, "step": 13130 }, { "epoch": 0.9886389285982996, "grad_norm": 7.230807314397468, "learning_rate": 1.9659467217031377e-10, "logits/chosen": -2.859375, "logits/rejected": -2.5625, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.1736, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.21875, "rewards/margins": 4.0, "rewards/rejected": -9.25, "step": 13140 }, { "epoch": 0.9893913174328492, "grad_norm": 8.704818479628996, "learning_rate": 1.7142073990308868e-10, "logits/chosen": -2.859375, "logits/rejected": -2.625, "logps/chosen": -688.0, "logps/rejected": -1104.0, "loss": 0.1637, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.15625, "rewards/rejected": -9.4375, "step": 13150 }, { "epoch": 0.990143706267399, "grad_norm": 10.49614592647157, "learning_rate": 1.4797028916424536e-10, "logits/chosen": -2.734375, "logits/rejected": -2.46875, "logps/chosen": -680.0, "logps/rejected": -1080.0, "loss": 0.186, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.15625, "rewards/margins": 3.96875, "rewards/rejected": -9.125, "step": 13160 }, { "epoch": 0.9908960951019486, "grad_norm": 10.02961396465751, "learning_rate": 1.2624348173034814e-10, "logits/chosen": -2.84375, "logits/rejected": -2.46875, "logps/chosen": -684.0, "logps/rejected": -1080.0, "loss": 0.1968, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.21875, "rewards/margins": 4.0, "rewards/rejected": -9.25, "step": 13170 }, { "epoch": 0.9916484839364984, "grad_norm": 9.26215404228547, "learning_rate": 1.0624046748716753e-10, "logits/chosen": -2.84375, "logits/rejected": -2.46875, "logps/chosen": -668.0, "logps/rejected": -1088.0, "loss": 0.1474, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.25, "rewards/rejected": -9.25, "step": 13180 }, { "epoch": 0.992400872771048, "grad_norm": 8.231614202034578, "learning_rate": 8.796138442868084e-11, "logits/chosen": -2.8125, "logits/rejected": -2.4375, "logps/chosen": -668.0, "logps/rejected": -1072.0, "loss": 0.1815, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.3125, "rewards/rejected": -9.25, "step": 13190 }, { "epoch": 0.9931532616055978, "grad_norm": 8.949874822215708, "learning_rate": 7.140635865593437e-11, "logits/chosen": -2.78125, "logits/rejected": -2.546875, "logps/chosen": -672.0, "logps/rejected": -1056.0, "loss": 0.1889, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.25, "rewards/margins": 3.921875, "rewards/rejected": -9.1875, "step": 13200 }, { "epoch": 0.9939056504401474, "grad_norm": 9.932856767714386, "learning_rate": 5.65755043764049e-11, "logits/chosen": -2.859375, "logits/rejected": -2.640625, "logps/chosen": -700.0, "logps/rejected": -1072.0, "loss": 0.1676, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.21875, "rewards/margins": 3.96875, "rewards/rejected": -9.1875, "step": 13210 }, { "epoch": 0.9946580392746972, "grad_norm": 7.782030286808017, "learning_rate": 4.346892390302836e-11, "logits/chosen": -2.703125, "logits/rejected": -2.5625, "logps/chosen": -680.0, "logps/rejected": -1048.0, "loss": 0.1751, "rewards/accuracies": 0.9375, "rewards/chosen": -5.09375, "rewards/margins": 3.796875, "rewards/rejected": -8.875, "step": 13220 }, { "epoch": 0.9954104281092468, "grad_norm": 6.674392906651129, "learning_rate": 3.208670765364463e-11, "logits/chosen": -2.671875, "logits/rejected": -2.53125, "logps/chosen": -692.0, "logps/rejected": -1048.0, "loss": 0.1634, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.25, "rewards/margins": 3.65625, "rewards/rejected": -8.875, "step": 13230 }, { "epoch": 0.9961628169437966, "grad_norm": 8.217452800520352, "learning_rate": 2.2428934150192646e-11, "logits/chosen": -2.9375, "logits/rejected": -2.5625, "logps/chosen": -680.0, "logps/rejected": -1072.0, "loss": 0.1785, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.21875, "rewards/margins": 3.921875, "rewards/rejected": -9.125, "step": 13240 }, { "epoch": 0.9969152057783462, "grad_norm": 10.460235887809906, "learning_rate": 1.4495670018405125e-11, "logits/chosen": -2.828125, "logits/rejected": -2.484375, "logps/chosen": -688.0, "logps/rejected": -1072.0, "loss": 0.1784, "rewards/accuracies": 0.90625, "rewards/chosen": -5.4375, "rewards/margins": 3.90625, "rewards/rejected": -9.3125, "step": 13250 }, { "epoch": 0.9976675946128959, "grad_norm": 8.892182145767135, "learning_rate": 8.286969987086888e-12, "logits/chosen": -2.71875, "logits/rejected": -2.53125, "logps/chosen": -684.0, "logps/rejected": -1072.0, "loss": 0.1874, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.15625, "rewards/margins": 3.90625, "rewards/rejected": -9.0625, "step": 13260 }, { "epoch": 0.9984199834474456, "grad_norm": 11.14444056822175, "learning_rate": 3.802876887948336e-12, "logits/chosen": -2.703125, "logits/rejected": -2.453125, "logps/chosen": -688.0, "logps/rejected": -1048.0, "loss": 0.1651, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.34375, "rewards/margins": 3.578125, "rewards/rejected": -8.9375, "step": 13270 }, { "epoch": 0.9991723722819953, "grad_norm": 12.760341698047581, "learning_rate": 1.0434216552168695e-12, "logits/chosen": -2.796875, "logits/rejected": -2.5, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.1694, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.375, "rewards/rejected": -9.4375, "step": 13280 }, { "epoch": 0.999924761116545, "grad_norm": 9.271473709379865, "learning_rate": 8.623325414847116e-15, "logits/chosen": -2.796875, "logits/rejected": -2.421875, "logps/chosen": -684.0, "logps/rejected": -1064.0, "loss": 0.169, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.1875, "rewards/margins": 3.921875, "rewards/rejected": -9.125, "step": 13290 }, { "epoch": 1.0, "step": 13291, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 43.6247, "train_samples_per_second": 38996.291, "train_steps_per_second": 304.667 } ], "logging_steps": 10, "max_steps": 13291, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }