{ "best_metric": 0.3146648108959198, "best_model_checkpoint": "output/dpo/finetune-llama-3.2-1b-gsm8k/iself-gsm8k/epoch1.0-lr1e-6-verifier-v3/checkpoint-6811", "epoch": 0.7002158939035674, "eval_steps": 973, "global_step": 6811, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005140331037318803, "grad_norm": 26.625, "learning_rate": 5.13874614594039e-09, "logits/chosen": -0.4711429476737976, "logits/rejected": -0.4845251441001892, "logps/chosen": -46.204132080078125, "logps/rejected": -44.41657257080078, "loss": 0.6935, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.001767764100804925, "rewards/margins": 0.003719678148627281, "rewards/rejected": -0.0019519136985763907, "step": 5 }, { "epoch": 0.0010280662074637606, "grad_norm": 24.75, "learning_rate": 1.027749229188078e-08, "logits/chosen": -0.48729124665260315, "logits/rejected": -0.5160056948661804, "logps/chosen": -47.70532989501953, "logps/rejected": -44.788963317871094, "loss": 0.6923, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0022705462761223316, "rewards/margins": -0.00681522861123085, "rewards/rejected": 0.004544682335108519, "step": 10 }, { "epoch": 0.001542099311195641, "grad_norm": 26.125, "learning_rate": 1.5416238437821173e-08, "logits/chosen": -0.5396078824996948, "logits/rejected": -0.5188677310943604, "logps/chosen": -41.14617156982422, "logps/rejected": -42.69959259033203, "loss": 0.6952, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0018560737371444702, "rewards/margins": -0.003085608361288905, "rewards/rejected": 0.0012295341584831476, "step": 15 }, { "epoch": 0.002056132414927521, "grad_norm": 25.625, "learning_rate": 2.055498458376156e-08, "logits/chosen": -0.47882193326950073, "logits/rejected": -0.49616456031799316, "logps/chosen": -40.94513702392578, "logps/rejected": -43.03068542480469, "loss": 0.6933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006069836672395468, "rewards/margins": 0.005176500882953405, "rewards/rejected": 0.0008933353237807751, "step": 20 }, { "epoch": 0.0025701655186594016, "grad_norm": 25.5, "learning_rate": 2.569373072970195e-08, "logits/chosen": -0.5729622840881348, "logits/rejected": -0.5740376114845276, "logps/chosen": -43.77743148803711, "logps/rejected": -44.969825744628906, "loss": 0.6927, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0031908322125673294, "rewards/margins": -0.005262752063572407, "rewards/rejected": 0.0020719196181744337, "step": 25 }, { "epoch": 0.003084198622391282, "grad_norm": 26.5, "learning_rate": 3.0832476875642346e-08, "logits/chosen": -0.4574706554412842, "logits/rejected": -0.4841234087944031, "logps/chosen": -43.66632080078125, "logps/rejected": -42.249122619628906, "loss": 0.6947, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.694044426083565e-05, "rewards/margins": 0.00204763887450099, "rewards/rejected": -0.0020845793187618256, "step": 30 }, { "epoch": 0.0035982317261231624, "grad_norm": 25.5, "learning_rate": 3.597122302158273e-08, "logits/chosen": -0.5357931852340698, "logits/rejected": -0.536652684211731, "logps/chosen": -44.913734436035156, "logps/rejected": -41.85783767700195, "loss": 0.6951, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008510352927260101, "rewards/margins": -0.0027728986460715532, "rewards/rejected": 0.0036239332985132933, "step": 35 }, { "epoch": 0.004112264829855042, "grad_norm": 28.75, "learning_rate": 4.110996916752312e-08, "logits/chosen": -0.4459976255893707, "logits/rejected": -0.476590633392334, "logps/chosen": -45.914649963378906, "logps/rejected": -45.92125701904297, "loss": 0.693, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0004974032053723931, "rewards/margins": 0.0009790894109755754, "rewards/rejected": -0.0004816867003683001, "step": 40 }, { "epoch": 0.004626297933586923, "grad_norm": 29.0, "learning_rate": 4.624871531346351e-08, "logits/chosen": -0.4863155782222748, "logits/rejected": -0.4671866297721863, "logps/chosen": -46.881126403808594, "logps/rejected": -49.35586166381836, "loss": 0.6942, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0018791819456964731, "rewards/margins": -0.0068735405802726746, "rewards/rejected": 0.004994358867406845, "step": 45 }, { "epoch": 0.005140331037318803, "grad_norm": 28.75, "learning_rate": 5.13874614594039e-08, "logits/chosen": -0.5210267305374146, "logits/rejected": -0.5675699710845947, "logps/chosen": -49.350589752197266, "logps/rejected": -44.708213806152344, "loss": 0.6938, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0002384190447628498, "rewards/margins": 0.0063677458092570305, "rewards/rejected": -0.006129326298832893, "step": 50 }, { "epoch": 0.0056543641410506836, "grad_norm": 26.625, "learning_rate": 5.6526207605344294e-08, "logits/chosen": -0.5158163905143738, "logits/rejected": -0.4802324175834656, "logps/chosen": -48.64641189575195, "logps/rejected": -43.40717315673828, "loss": 0.6941, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004383726045489311, "rewards/margins": 0.008062734268605709, "rewards/rejected": -0.0036790084559470415, "step": 55 }, { "epoch": 0.006168397244782564, "grad_norm": 26.125, "learning_rate": 6.166495375128469e-08, "logits/chosen": -0.553433895111084, "logits/rejected": -0.5630972385406494, "logps/chosen": -38.46834945678711, "logps/rejected": -41.884254455566406, "loss": 0.6944, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.002529902383685112, "rewards/margins": 0.000525031122379005, "rewards/rejected": 0.0020048716105520725, "step": 60 }, { "epoch": 0.006682430348514444, "grad_norm": 26.875, "learning_rate": 6.680369989722508e-08, "logits/chosen": -0.4908978044986725, "logits/rejected": -0.49853163957595825, "logps/chosen": -48.81123733520508, "logps/rejected": -46.16844177246094, "loss": 0.6921, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0030208348762243986, "rewards/margins": -0.0017249680822715163, "rewards/rejected": 0.00474580330774188, "step": 65 }, { "epoch": 0.007196463452246325, "grad_norm": 25.25, "learning_rate": 7.194244604316546e-08, "logits/chosen": -0.5208503603935242, "logits/rejected": -0.4796935021877289, "logps/chosen": -49.59909439086914, "logps/rejected": -41.80225372314453, "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00012643302034121007, "rewards/margins": 0.003192401025444269, "rewards/rejected": -0.003318834351375699, "step": 70 }, { "epoch": 0.007710496555978205, "grad_norm": 24.5, "learning_rate": 7.708119218910586e-08, "logits/chosen": -0.5105139017105103, "logits/rejected": -0.5086828470230103, "logps/chosen": -40.67918014526367, "logps/rejected": -47.469642639160156, "loss": 0.6934, "rewards/accuracies": 0.375, "rewards/chosen": -0.0009156321175396442, "rewards/margins": -0.004579101223498583, "rewards/rejected": 0.0036634684074670076, "step": 75 }, { "epoch": 0.008224529659710085, "grad_norm": 24.625, "learning_rate": 8.221993833504624e-08, "logits/chosen": -0.463491290807724, "logits/rejected": -0.4281392991542816, "logps/chosen": -48.95800018310547, "logps/rejected": -41.99492645263672, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00619291327893734, "rewards/margins": 0.004972576629370451, "rewards/rejected": 0.0012203359510749578, "step": 80 }, { "epoch": 0.008738562763441966, "grad_norm": 26.625, "learning_rate": 8.735868448098664e-08, "logits/chosen": -0.48711472749710083, "logits/rejected": -0.5137075185775757, "logps/chosen": -45.183021545410156, "logps/rejected": -43.21146774291992, "loss": 0.6931, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.003010921645909548, "rewards/margins": -0.0032089711166918278, "rewards/rejected": 0.000198049281607382, "step": 85 }, { "epoch": 0.009252595867173845, "grad_norm": 24.625, "learning_rate": 9.249743062692702e-08, "logits/chosen": -0.442121684551239, "logits/rejected": -0.47005003690719604, "logps/chosen": -45.645572662353516, "logps/rejected": -43.540863037109375, "loss": 0.6918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0028347542975097895, "rewards/margins": 0.003199048340320587, "rewards/rejected": -0.0003642938972916454, "step": 90 }, { "epoch": 0.009766628970905727, "grad_norm": 27.75, "learning_rate": 9.763617677286741e-08, "logits/chosen": -0.44471701979637146, "logits/rejected": -0.4581942558288574, "logps/chosen": -43.32032012939453, "logps/rejected": -45.05426788330078, "loss": 0.6926, "rewards/accuracies": 0.625, "rewards/chosen": 0.006592039950191975, "rewards/margins": 0.006631531752645969, "rewards/rejected": -3.949133679270744e-05, "step": 95 }, { "epoch": 0.010280662074637606, "grad_norm": 25.625, "learning_rate": 1.027749229188078e-07, "logits/chosen": -0.5046353936195374, "logits/rejected": -0.5248790979385376, "logps/chosen": -53.102569580078125, "logps/rejected": -47.08808135986328, "loss": 0.6936, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.001970844343304634, "rewards/margins": 0.004899029619991779, "rewards/rejected": -0.002928185509517789, "step": 100 }, { "epoch": 0.010794695178369488, "grad_norm": 28.5, "learning_rate": 1.0791366906474819e-07, "logits/chosen": -0.5177448391914368, "logits/rejected": -0.5530094504356384, "logps/chosen": -51.18027877807617, "logps/rejected": -47.607688903808594, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 0.002196488669142127, "rewards/margins": -0.0010633041383698583, "rewards/rejected": 0.00325979245826602, "step": 105 }, { "epoch": 0.011308728282101367, "grad_norm": 25.5, "learning_rate": 1.1305241521068859e-07, "logits/chosen": -0.4905124306678772, "logits/rejected": -0.47289562225341797, "logps/chosen": -43.474098205566406, "logps/rejected": -41.02274703979492, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0010326718911528587, "rewards/margins": -0.0024478621780872345, "rewards/rejected": 0.0034805345349013805, "step": 110 }, { "epoch": 0.011822761385833248, "grad_norm": 26.25, "learning_rate": 1.1819116135662897e-07, "logits/chosen": -0.542976975440979, "logits/rejected": -0.4488591253757477, "logps/chosen": -49.40229034423828, "logps/rejected": -45.29668045043945, "loss": 0.6936, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0007848789682611823, "rewards/margins": -0.006581378635019064, "rewards/rejected": 0.007366256322711706, "step": 115 }, { "epoch": 0.012336794489565128, "grad_norm": 28.125, "learning_rate": 1.2332990750256938e-07, "logits/chosen": -0.4456765055656433, "logits/rejected": -0.48736634850502014, "logps/chosen": -44.41051483154297, "logps/rejected": -43.91301345825195, "loss": 0.6922, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0012469341745600104, "rewards/margins": 0.004232616629451513, "rewards/rejected": -0.002985682338476181, "step": 120 }, { "epoch": 0.01285082759329701, "grad_norm": 26.25, "learning_rate": 1.2846865364850975e-07, "logits/chosen": -0.570705771446228, "logits/rejected": -0.5827814340591431, "logps/chosen": -52.73783493041992, "logps/rejected": -44.6126708984375, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0018550775712355971, "rewards/margins": 0.0008317806059494615, "rewards/rejected": -0.0026868581771850586, "step": 125 }, { "epoch": 0.013364860697028889, "grad_norm": 27.5, "learning_rate": 1.3360739979445015e-07, "logits/chosen": -0.5795720815658569, "logits/rejected": -0.6144151091575623, "logps/chosen": -51.951751708984375, "logps/rejected": -46.758888244628906, "loss": 0.6931, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0029468678403645754, "rewards/margins": 0.0027336219791322947, "rewards/rejected": 0.00021324660337995738, "step": 130 }, { "epoch": 0.013878893800760768, "grad_norm": 25.125, "learning_rate": 1.3874614594039055e-07, "logits/chosen": -0.5207489728927612, "logits/rejected": -0.5364847779273987, "logps/chosen": -46.40230941772461, "logps/rejected": -45.097694396972656, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.002200732473284006, "rewards/margins": 0.0016380930319428444, "rewards/rejected": 0.0005626391503028572, "step": 135 }, { "epoch": 0.01439292690449265, "grad_norm": 27.625, "learning_rate": 1.4388489208633092e-07, "logits/chosen": -0.5434900522232056, "logits/rejected": -0.5947614908218384, "logps/chosen": -49.76659393310547, "logps/rejected": -41.96564483642578, "loss": 0.6918, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0056605772115290165, "rewards/margins": -0.0034701351542025805, "rewards/rejected": 0.009130711667239666, "step": 140 }, { "epoch": 0.014906960008224529, "grad_norm": 28.125, "learning_rate": 1.4902363823227132e-07, "logits/chosen": -0.5094398856163025, "logits/rejected": -0.5408573150634766, "logps/chosen": -44.48435974121094, "logps/rejected": -44.013553619384766, "loss": 0.6902, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.005452323239296675, "rewards/margins": 0.004191312473267317, "rewards/rejected": 0.0012610103003680706, "step": 145 }, { "epoch": 0.01542099311195641, "grad_norm": 27.25, "learning_rate": 1.5416238437821172e-07, "logits/chosen": -0.45197024941444397, "logits/rejected": -0.4677615165710449, "logps/chosen": -43.98979187011719, "logps/rejected": -40.97275161743164, "loss": 0.6923, "rewards/accuracies": 0.625, "rewards/chosen": 0.002554960548877716, "rewards/margins": 0.0043929144740104675, "rewards/rejected": -0.0018379546236246824, "step": 150 }, { "epoch": 0.01593502621568829, "grad_norm": 30.625, "learning_rate": 1.593011305241521e-07, "logits/chosen": -0.5512616634368896, "logits/rejected": -0.557027280330658, "logps/chosen": -41.731632232666016, "logps/rejected": -48.15312194824219, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004013709723949432, "rewards/margins": 0.0013138672802597284, "rewards/rejected": 0.002699842443689704, "step": 155 }, { "epoch": 0.01644905931942017, "grad_norm": 28.125, "learning_rate": 1.6443987667009249e-07, "logits/chosen": -0.5241793394088745, "logits/rejected": -0.5748635530471802, "logps/chosen": -48.35177230834961, "logps/rejected": -48.15678024291992, "loss": 0.6912, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.0028473520651459694, "rewards/margins": -0.0014995671808719635, "rewards/rejected": -0.0013477850006893277, "step": 160 }, { "epoch": 0.01696309242315205, "grad_norm": 25.875, "learning_rate": 1.6957862281603288e-07, "logits/chosen": -0.4488959312438965, "logits/rejected": -0.48494553565979004, "logps/chosen": -47.163787841796875, "logps/rejected": -47.575416564941406, "loss": 0.6925, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0053576515056192875, "rewards/margins": -0.0034027197398245335, "rewards/rejected": -0.0019549319986253977, "step": 165 }, { "epoch": 0.017477125526883932, "grad_norm": 28.0, "learning_rate": 1.7471736896197328e-07, "logits/chosen": -0.5358412861824036, "logits/rejected": -0.49765077233314514, "logps/chosen": -44.149147033691406, "logps/rejected": -36.848358154296875, "loss": 0.6907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007571063004434109, "rewards/margins": 0.011015111580491066, "rewards/rejected": -0.0034440464805811644, "step": 170 }, { "epoch": 0.017991158630615813, "grad_norm": 25.875, "learning_rate": 1.7985611510791365e-07, "logits/chosen": -0.4514164924621582, "logits/rejected": -0.44472751021385193, "logps/chosen": -48.17477798461914, "logps/rejected": -42.502559661865234, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004001913126558065, "rewards/margins": 0.0032674118410795927, "rewards/rejected": 0.0007345010526478291, "step": 175 }, { "epoch": 0.01850519173434769, "grad_norm": 26.875, "learning_rate": 1.8499486125385405e-07, "logits/chosen": -0.47482576966285706, "logits/rejected": -0.5241795778274536, "logps/chosen": -46.53056335449219, "logps/rejected": -40.08974838256836, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0014074135106056929, "rewards/margins": -0.00029100431129336357, "rewards/rejected": 0.0016984173562377691, "step": 180 }, { "epoch": 0.019019224838079572, "grad_norm": 25.5, "learning_rate": 1.9013360739979445e-07, "logits/chosen": -0.49966615438461304, "logits/rejected": -0.5137614011764526, "logps/chosen": -44.31159591674805, "logps/rejected": -42.841392517089844, "loss": 0.6934, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0024131631944328547, "rewards/margins": -0.009240898303687572, "rewards/rejected": 0.006827736739069223, "step": 185 }, { "epoch": 0.019533257941811454, "grad_norm": 25.125, "learning_rate": 1.9527235354573482e-07, "logits/chosen": -0.5224586129188538, "logits/rejected": -0.5343331098556519, "logps/chosen": -50.447120666503906, "logps/rejected": -44.22306823730469, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.003419180167838931, "rewards/margins": 0.0027483694721013308, "rewards/rejected": 0.0006708097644150257, "step": 190 }, { "epoch": 0.02004729104554333, "grad_norm": 27.0, "learning_rate": 2.0041109969167522e-07, "logits/chosen": -0.5053311586380005, "logits/rejected": -0.5488319396972656, "logps/chosen": -48.321449279785156, "logps/rejected": -44.96624755859375, "loss": 0.6926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.002887277863919735, "rewards/margins": 0.0007682610885240138, "rewards/rejected": 0.0021190166007727385, "step": 195 }, { "epoch": 0.020561324149275213, "grad_norm": 26.0, "learning_rate": 2.055498458376156e-07, "logits/chosen": -0.4777766168117523, "logits/rejected": -0.44057536125183105, "logps/chosen": -41.93425750732422, "logps/rejected": -44.166175842285156, "loss": 0.6919, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004490427672863007, "rewards/margins": 0.005314779933542013, "rewards/rejected": -0.0008243514457717538, "step": 200 }, { "epoch": 0.021075357253007094, "grad_norm": 26.375, "learning_rate": 2.10688591983556e-07, "logits/chosen": -0.5536308288574219, "logits/rejected": -0.609908938407898, "logps/chosen": -46.37224578857422, "logps/rejected": -46.44746780395508, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": 0.0028821658343076706, "rewards/margins": 0.007997493259608746, "rewards/rejected": -0.005115327890962362, "step": 205 }, { "epoch": 0.021589390356738975, "grad_norm": 26.625, "learning_rate": 2.1582733812949638e-07, "logits/chosen": -0.5162872672080994, "logits/rejected": -0.4827386438846588, "logps/chosen": -45.56764221191406, "logps/rejected": -41.18996047973633, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00026861648075282574, "rewards/margins": 0.0019447185331955552, "rewards/rejected": -0.002213334897533059, "step": 210 }, { "epoch": 0.022103423460470853, "grad_norm": 25.625, "learning_rate": 2.2096608427543678e-07, "logits/chosen": -0.4692758619785309, "logits/rejected": -0.4862975478172302, "logps/chosen": -46.68827819824219, "logps/rejected": -46.67244338989258, "loss": 0.6914, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.009469417855143547, "rewards/margins": 0.00625162199139595, "rewards/rejected": 0.0032177972607314587, "step": 215 }, { "epoch": 0.022617456564202734, "grad_norm": 26.375, "learning_rate": 2.2610483042137718e-07, "logits/chosen": -0.5159892439842224, "logits/rejected": -0.5249872207641602, "logps/chosen": -42.67253494262695, "logps/rejected": -42.87208938598633, "loss": 0.6915, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0008270788821391761, "rewards/margins": 0.0015653322916477919, "rewards/rejected": -0.002392411231994629, "step": 220 }, { "epoch": 0.023131489667934615, "grad_norm": 27.125, "learning_rate": 2.3124357656731757e-07, "logits/chosen": -0.4262077808380127, "logits/rejected": -0.4671955108642578, "logps/chosen": -50.68128204345703, "logps/rejected": -44.864479064941406, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.003470859257504344, "rewards/margins": 0.0041311695240437984, "rewards/rejected": -0.0006603100337088108, "step": 225 }, { "epoch": 0.023645522771666497, "grad_norm": 27.125, "learning_rate": 2.3638232271325795e-07, "logits/chosen": -0.4931139349937439, "logits/rejected": -0.45486631989479065, "logps/chosen": -49.90330505371094, "logps/rejected": -44.18583297729492, "loss": 0.6913, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.007043546997010708, "rewards/margins": 0.005053005181252956, "rewards/rejected": 0.0019905422814190388, "step": 230 }, { "epoch": 0.024159555875398375, "grad_norm": 26.375, "learning_rate": 2.4152106885919837e-07, "logits/chosen": -0.4569849967956543, "logits/rejected": -0.515161395072937, "logps/chosen": -47.67377471923828, "logps/rejected": -42.07758331298828, "loss": 0.6891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.004838242195546627, "rewards/margins": 0.006369642913341522, "rewards/rejected": -0.0015314004849642515, "step": 235 }, { "epoch": 0.024673588979130256, "grad_norm": 25.875, "learning_rate": 2.4665981500513877e-07, "logits/chosen": -0.5250482559204102, "logits/rejected": -0.5692823529243469, "logps/chosen": -47.11442184448242, "logps/rejected": -43.58592987060547, "loss": 0.6916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008311735466122627, "rewards/margins": 0.009756923653185368, "rewards/rejected": -0.0014451885363087058, "step": 240 }, { "epoch": 0.025187622082862137, "grad_norm": 28.375, "learning_rate": 2.517985611510791e-07, "logits/chosen": -0.4967048764228821, "logits/rejected": -0.5395949482917786, "logps/chosen": -48.00700759887695, "logps/rejected": -45.21251678466797, "loss": 0.6899, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0040191179141402245, "rewards/margins": 0.002224746160209179, "rewards/rejected": 0.0017943715211004019, "step": 245 }, { "epoch": 0.02570165518659402, "grad_norm": 29.5, "learning_rate": 2.569373072970195e-07, "logits/chosen": -0.46316757798194885, "logits/rejected": -0.4398517608642578, "logps/chosen": -49.78543472290039, "logps/rejected": -43.632347106933594, "loss": 0.6909, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.012596115469932556, "rewards/margins": 0.004999056458473206, "rewards/rejected": 0.0075970604084432125, "step": 250 }, { "epoch": 0.026215688290325896, "grad_norm": 28.75, "learning_rate": 2.620760534429599e-07, "logits/chosen": -0.5207998752593994, "logits/rejected": -0.5036340951919556, "logps/chosen": -45.04606628417969, "logps/rejected": -42.51580047607422, "loss": 0.6908, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00527938874438405, "rewards/margins": 0.002742548007518053, "rewards/rejected": 0.00253684027120471, "step": 255 }, { "epoch": 0.026729721394057777, "grad_norm": 26.25, "learning_rate": 2.672147995889003e-07, "logits/chosen": -0.41138577461242676, "logits/rejected": -0.38474565744400024, "logps/chosen": -43.42218017578125, "logps/rejected": -42.483497619628906, "loss": 0.6906, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0037311457563191652, "rewards/margins": -0.00048454804345965385, "rewards/rejected": 0.004215694032609463, "step": 260 }, { "epoch": 0.02724375449778966, "grad_norm": 28.75, "learning_rate": 2.723535457348407e-07, "logits/chosen": -0.5527512431144714, "logits/rejected": -0.5617319941520691, "logps/chosen": -47.81463623046875, "logps/rejected": -45.9075927734375, "loss": 0.691, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004240431822836399, "rewards/margins": 0.004871487617492676, "rewards/rejected": -0.0006310559110715985, "step": 265 }, { "epoch": 0.027757787601521536, "grad_norm": 26.375, "learning_rate": 2.774922918807811e-07, "logits/chosen": -0.4830349385738373, "logits/rejected": -0.4828927516937256, "logps/chosen": -47.45735168457031, "logps/rejected": -42.06171798706055, "loss": 0.6907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006635819561779499, "rewards/margins": 0.004589528776705265, "rewards/rejected": 0.002046289388090372, "step": 270 }, { "epoch": 0.028271820705253418, "grad_norm": 24.5, "learning_rate": 2.8263103802672144e-07, "logits/chosen": -0.5561514496803284, "logits/rejected": -0.5141309499740601, "logps/chosen": -48.25905227661133, "logps/rejected": -47.09813690185547, "loss": 0.6893, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005340857431292534, "rewards/margins": 0.004945469554513693, "rewards/rejected": 0.00039538851706311107, "step": 275 }, { "epoch": 0.0287858538089853, "grad_norm": 28.5, "learning_rate": 2.8776978417266184e-07, "logits/chosen": -0.4912126660346985, "logits/rejected": -0.49212852120399475, "logps/chosen": -45.4501953125, "logps/rejected": -49.1949348449707, "loss": 0.6896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01073845848441124, "rewards/margins": 0.01449015736579895, "rewards/rejected": -0.0037516974844038486, "step": 280 }, { "epoch": 0.02929988691271718, "grad_norm": 28.5, "learning_rate": 2.9290853031860224e-07, "logits/chosen": -0.4637204706668854, "logits/rejected": -0.44693416357040405, "logps/chosen": -48.0748405456543, "logps/rejected": -47.74013137817383, "loss": 0.6896, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.009842668659985065, "rewards/margins": 0.016693221405148506, "rewards/rejected": -0.006850552745163441, "step": 285 }, { "epoch": 0.029813920016449058, "grad_norm": 26.25, "learning_rate": 2.9804727646454264e-07, "logits/chosen": -0.5127094388008118, "logits/rejected": -0.5148364305496216, "logps/chosen": -44.210487365722656, "logps/rejected": -41.93080139160156, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006960840430110693, "rewards/margins": 0.008477607741951942, "rewards/rejected": -0.001516766962595284, "step": 290 }, { "epoch": 0.03032795312018094, "grad_norm": 25.75, "learning_rate": 3.0318602261048304e-07, "logits/chosen": -0.4951019287109375, "logits/rejected": -0.5386073589324951, "logps/chosen": -43.852088928222656, "logps/rejected": -43.57620620727539, "loss": 0.6884, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.010046077892184258, "rewards/margins": 0.010534048080444336, "rewards/rejected": -0.0004879712068941444, "step": 295 }, { "epoch": 0.03084198622391282, "grad_norm": 25.625, "learning_rate": 3.0832476875642343e-07, "logits/chosen": -0.585810661315918, "logits/rejected": -0.5672785639762878, "logps/chosen": -44.93952178955078, "logps/rejected": -46.53219223022461, "loss": 0.6898, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.013124431483447552, "rewards/margins": 0.01867709495127201, "rewards/rejected": -0.005552663933485746, "step": 300 }, { "epoch": 0.0313560193276447, "grad_norm": 26.625, "learning_rate": 3.1346351490236383e-07, "logits/chosen": -0.5154384970664978, "logits/rejected": -0.5614781975746155, "logps/chosen": -45.13372039794922, "logps/rejected": -47.46242141723633, "loss": 0.6875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0131938261911273, "rewards/margins": 0.016649987548589706, "rewards/rejected": -0.0034561636857688427, "step": 305 }, { "epoch": 0.03187005243137658, "grad_norm": 24.5, "learning_rate": 3.186022610483042e-07, "logits/chosen": -0.447089284658432, "logits/rejected": -0.4526313245296478, "logps/chosen": -43.729190826416016, "logps/rejected": -42.146568298339844, "loss": 0.6894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011664348654448986, "rewards/margins": 0.01359101664274931, "rewards/rejected": -0.0019266704330220819, "step": 310 }, { "epoch": 0.032384085535108464, "grad_norm": 28.5, "learning_rate": 3.2374100719424457e-07, "logits/chosen": -0.5490690469741821, "logits/rejected": -0.543434739112854, "logps/chosen": -45.7617073059082, "logps/rejected": -42.74955368041992, "loss": 0.6876, "rewards/accuracies": 0.625, "rewards/chosen": 0.012433048337697983, "rewards/margins": 0.006272668484598398, "rewards/rejected": 0.006160378456115723, "step": 315 }, { "epoch": 0.03289811863884034, "grad_norm": 24.875, "learning_rate": 3.2887975334018497e-07, "logits/chosen": -0.4678472578525543, "logits/rejected": -0.4483782649040222, "logps/chosen": -45.02315139770508, "logps/rejected": -41.11427688598633, "loss": 0.6882, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.008293437771499157, "rewards/margins": 0.011450320482254028, "rewards/rejected": -0.0031568817794322968, "step": 320 }, { "epoch": 0.03341215174257222, "grad_norm": 25.375, "learning_rate": 3.3401849948612537e-07, "logits/chosen": -0.4880601763725281, "logits/rejected": -0.48183387517929077, "logps/chosen": -46.551979064941406, "logps/rejected": -39.47237014770508, "loss": 0.6878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.013086204417049885, "rewards/margins": 0.017154041677713394, "rewards/rejected": -0.004067836329340935, "step": 325 }, { "epoch": 0.0339261848463041, "grad_norm": 27.75, "learning_rate": 3.3915724563206577e-07, "logits/chosen": -0.4977228045463562, "logits/rejected": -0.43751540780067444, "logps/chosen": -51.057823181152344, "logps/rejected": -40.45303726196289, "loss": 0.6879, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01387846004217863, "rewards/margins": 0.011770183220505714, "rewards/rejected": 0.0021082782186567783, "step": 330 }, { "epoch": 0.03444021795003598, "grad_norm": 27.125, "learning_rate": 3.4429599177800616e-07, "logits/chosen": -0.5317984819412231, "logits/rejected": -0.5307309627532959, "logps/chosen": -46.23440933227539, "logps/rejected": -39.79348373413086, "loss": 0.687, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.011821817606687546, "rewards/margins": 0.014034529216587543, "rewards/rejected": -0.002212710212916136, "step": 335 }, { "epoch": 0.034954251053767864, "grad_norm": 28.75, "learning_rate": 3.4943473792394656e-07, "logits/chosen": -0.508651077747345, "logits/rejected": -0.5829213261604309, "logps/chosen": -55.94213104248047, "logps/rejected": -43.97373580932617, "loss": 0.6862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.019808102399110794, "rewards/margins": 0.014452656731009483, "rewards/rejected": 0.0053554438054561615, "step": 340 }, { "epoch": 0.035468284157499745, "grad_norm": 27.5, "learning_rate": 3.545734840698869e-07, "logits/chosen": -0.46204233169555664, "logits/rejected": -0.46814584732055664, "logps/chosen": -45.85118865966797, "logps/rejected": -45.836158752441406, "loss": 0.6862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.012900441884994507, "rewards/margins": 0.011376372538506985, "rewards/rejected": 0.0015240715583786368, "step": 345 }, { "epoch": 0.035982317261231626, "grad_norm": 25.625, "learning_rate": 3.597122302158273e-07, "logits/chosen": -0.53444504737854, "logits/rejected": -0.5176218152046204, "logps/chosen": -45.691410064697266, "logps/rejected": -43.10414505004883, "loss": 0.6858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.019554611295461655, "rewards/margins": 0.01760965958237648, "rewards/rejected": 0.00194495206233114, "step": 350 }, { "epoch": 0.0364963503649635, "grad_norm": 26.0, "learning_rate": 3.648509763617677e-07, "logits/chosen": -0.48483046889305115, "logits/rejected": -0.5044042468070984, "logps/chosen": -47.60944747924805, "logps/rejected": -44.49134063720703, "loss": 0.6847, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01937175914645195, "rewards/margins": 0.018882866948843002, "rewards/rejected": 0.0004888916737399995, "step": 355 }, { "epoch": 0.03701038346869538, "grad_norm": 28.125, "learning_rate": 3.699897225077081e-07, "logits/chosen": -0.5828704833984375, "logits/rejected": -0.5791498422622681, "logps/chosen": -51.829627990722656, "logps/rejected": -46.47412109375, "loss": 0.6845, "rewards/accuracies": 0.75, "rewards/chosen": 0.016117200255393982, "rewards/margins": 0.01978086121380329, "rewards/rejected": -0.0036636595614254475, "step": 360 }, { "epoch": 0.03752441657242726, "grad_norm": 30.0, "learning_rate": 3.751284686536485e-07, "logits/chosen": -0.505221962928772, "logits/rejected": -0.4928968548774719, "logps/chosen": -44.02387619018555, "logps/rejected": -47.058006286621094, "loss": 0.6833, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02020454965531826, "rewards/margins": 0.016679178923368454, "rewards/rejected": 0.0035253718961030245, "step": 365 }, { "epoch": 0.038038449676159145, "grad_norm": 28.375, "learning_rate": 3.802672147995889e-07, "logits/chosen": -0.510718047618866, "logits/rejected": -0.4746573567390442, "logps/chosen": -47.281822204589844, "logps/rejected": -43.88661193847656, "loss": 0.6833, "rewards/accuracies": 0.75, "rewards/chosen": 0.01637912355363369, "rewards/margins": 0.01656152680516243, "rewards/rejected": -0.00018240450299344957, "step": 370 }, { "epoch": 0.038552482779891026, "grad_norm": 26.25, "learning_rate": 3.854059609455293e-07, "logits/chosen": -0.42101702094078064, "logits/rejected": -0.44628190994262695, "logps/chosen": -49.241207122802734, "logps/rejected": -44.722198486328125, "loss": 0.6836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.021957814693450928, "rewards/margins": 0.02004944533109665, "rewards/rejected": 0.001908369129523635, "step": 375 }, { "epoch": 0.03906651588362291, "grad_norm": 27.875, "learning_rate": 3.9054470709146964e-07, "logits/chosen": -0.5213592648506165, "logits/rejected": -0.5449898838996887, "logps/chosen": -52.734642028808594, "logps/rejected": -44.96174240112305, "loss": 0.683, "rewards/accuracies": 0.625, "rewards/chosen": 0.023487048223614693, "rewards/margins": 0.01700151339173317, "rewards/rejected": 0.0064855339005589485, "step": 380 }, { "epoch": 0.03958054898735479, "grad_norm": 26.0, "learning_rate": 3.9568345323741003e-07, "logits/chosen": -0.4739038050174713, "logits/rejected": -0.4557901918888092, "logps/chosen": -45.509056091308594, "logps/rejected": -43.408607482910156, "loss": 0.6817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.023487018421292305, "rewards/margins": 0.018131664022803307, "rewards/rejected": 0.0053553530015051365, "step": 385 }, { "epoch": 0.04009458209108666, "grad_norm": 26.625, "learning_rate": 4.0082219938335043e-07, "logits/chosen": -0.5418193340301514, "logits/rejected": -0.5201669335365295, "logps/chosen": -44.9716682434082, "logps/rejected": -42.58832550048828, "loss": 0.6823, "rewards/accuracies": 0.75, "rewards/chosen": 0.028062742203474045, "rewards/margins": 0.02600189484655857, "rewards/rejected": 0.002060852013528347, "step": 390 }, { "epoch": 0.040608615194818544, "grad_norm": 27.625, "learning_rate": 4.0596094552929083e-07, "logits/chosen": -0.5765486359596252, "logits/rejected": -0.5540743470191956, "logps/chosen": -48.902183532714844, "logps/rejected": -44.736900329589844, "loss": 0.6821, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0202605240046978, "rewards/margins": 0.019397493451833725, "rewards/rejected": 0.0008630324155092239, "step": 395 }, { "epoch": 0.041122648298550425, "grad_norm": 26.875, "learning_rate": 4.110996916752312e-07, "logits/chosen": -0.508124589920044, "logits/rejected": -0.5311577916145325, "logps/chosen": -47.83466720581055, "logps/rejected": -41.02056121826172, "loss": 0.6839, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03066617250442505, "rewards/margins": 0.024624215438961983, "rewards/rejected": 0.006041955202817917, "step": 400 }, { "epoch": 0.041636681402282306, "grad_norm": 28.125, "learning_rate": 4.162384378211716e-07, "logits/chosen": -0.5752208828926086, "logits/rejected": -0.5787706971168518, "logps/chosen": -48.02177810668945, "logps/rejected": -47.3804817199707, "loss": 0.6829, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02745751477777958, "rewards/margins": 0.027623314410448074, "rewards/rejected": -0.00016580075316596776, "step": 405 }, { "epoch": 0.04215071450601419, "grad_norm": 25.75, "learning_rate": 4.21377183967112e-07, "logits/chosen": -0.6047698259353638, "logits/rejected": -0.5706152319908142, "logps/chosen": -45.578704833984375, "logps/rejected": -44.50090408325195, "loss": 0.6823, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03414739668369293, "rewards/margins": 0.029389819130301476, "rewards/rejected": 0.004757576622068882, "step": 410 }, { "epoch": 0.04266474760974607, "grad_norm": 24.625, "learning_rate": 4.2651593011305237e-07, "logits/chosen": -0.4687598645687103, "logits/rejected": -0.5287759304046631, "logps/chosen": -43.25025939941406, "logps/rejected": -45.230281829833984, "loss": 0.6827, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03294065222144127, "rewards/margins": 0.03717473894357681, "rewards/rejected": -0.004234084859490395, "step": 415 }, { "epoch": 0.04317878071347795, "grad_norm": 26.0, "learning_rate": 4.3165467625899276e-07, "logits/chosen": -0.4836321771144867, "logits/rejected": -0.5369798541069031, "logps/chosen": -47.18671417236328, "logps/rejected": -47.465057373046875, "loss": 0.6806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.034452877938747406, "rewards/margins": 0.030393976718187332, "rewards/rejected": 0.004058904014527798, "step": 420 }, { "epoch": 0.04369281381720983, "grad_norm": 26.75, "learning_rate": 4.3679342240493316e-07, "logits/chosen": -0.5255634188652039, "logits/rejected": -0.5517128705978394, "logps/chosen": -46.662174224853516, "logps/rejected": -40.17934799194336, "loss": 0.6801, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.031456779688596725, "rewards/margins": 0.030252089723944664, "rewards/rejected": 0.0012046911288052797, "step": 425 }, { "epoch": 0.044206846920941706, "grad_norm": 27.125, "learning_rate": 4.4193216855087356e-07, "logits/chosen": -0.4304000437259674, "logits/rejected": -0.5370306372642517, "logps/chosen": -48.03946304321289, "logps/rejected": -43.679039001464844, "loss": 0.6797, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.026951367035508156, "rewards/margins": 0.030883466824889183, "rewards/rejected": -0.00393209932371974, "step": 430 }, { "epoch": 0.04472088002467359, "grad_norm": 30.25, "learning_rate": 4.4707091469681396e-07, "logits/chosen": -0.5269969701766968, "logits/rejected": -0.5630894899368286, "logps/chosen": -48.23720169067383, "logps/rejected": -48.55601119995117, "loss": 0.6812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0234702005982399, "rewards/margins": 0.02061503939330578, "rewards/rejected": 0.002855162601917982, "step": 435 }, { "epoch": 0.04523491312840547, "grad_norm": 28.75, "learning_rate": 4.5220966084275435e-07, "logits/chosen": -0.4956912398338318, "logits/rejected": -0.5181102156639099, "logps/chosen": -45.25495529174805, "logps/rejected": -45.9106559753418, "loss": 0.6778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.034591950476169586, "rewards/margins": 0.029671236872673035, "rewards/rejected": 0.004920716397464275, "step": 440 }, { "epoch": 0.04574894623213735, "grad_norm": 25.375, "learning_rate": 4.5734840698869475e-07, "logits/chosen": -0.5331254005432129, "logits/rejected": -0.5552483201026917, "logps/chosen": -45.05670166015625, "logps/rejected": -45.485801696777344, "loss": 0.6795, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03514258563518524, "rewards/margins": 0.0365949310362339, "rewards/rejected": -0.001452345633879304, "step": 445 }, { "epoch": 0.04626297933586923, "grad_norm": 25.0, "learning_rate": 4.6248715313463515e-07, "logits/chosen": -0.4824303686618805, "logits/rejected": -0.4695838391780853, "logps/chosen": -43.74272918701172, "logps/rejected": -39.747047424316406, "loss": 0.6795, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03708468750119209, "rewards/margins": 0.03387463092803955, "rewards/rejected": 0.003210053313523531, "step": 450 }, { "epoch": 0.04677701243960111, "grad_norm": 26.75, "learning_rate": 4.676258992805755e-07, "logits/chosen": -0.45794662833213806, "logits/rejected": -0.4536573886871338, "logps/chosen": -43.49451446533203, "logps/rejected": -37.92616653442383, "loss": 0.68, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03212534636259079, "rewards/margins": 0.022766735404729843, "rewards/rejected": 0.009358611889183521, "step": 455 }, { "epoch": 0.047291045543332993, "grad_norm": 27.375, "learning_rate": 4.727646454265159e-07, "logits/chosen": -0.5134713649749756, "logits/rejected": -0.5406021475791931, "logps/chosen": -45.326499938964844, "logps/rejected": -45.83544158935547, "loss": 0.6767, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.04057953134179115, "rewards/margins": 0.03847244381904602, "rewards/rejected": 0.002107086358591914, "step": 460 }, { "epoch": 0.04780507864706487, "grad_norm": 26.0, "learning_rate": 4.779033915724563e-07, "logits/chosen": -0.611420750617981, "logits/rejected": -0.5816811323165894, "logps/chosen": -49.832679748535156, "logps/rejected": -41.91526794433594, "loss": 0.6774, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03544536978006363, "rewards/margins": 0.031596891582012177, "rewards/rejected": 0.003848481923341751, "step": 465 }, { "epoch": 0.04831911175079675, "grad_norm": 26.375, "learning_rate": 4.830421377183967e-07, "logits/chosen": -0.4909031391143799, "logits/rejected": -0.4823439121246338, "logps/chosen": -48.8991813659668, "logps/rejected": -43.50041580200195, "loss": 0.6759, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.04381650313735008, "rewards/margins": 0.03712482005357742, "rewards/rejected": 0.006691685877740383, "step": 470 }, { "epoch": 0.04883314485452863, "grad_norm": 26.5, "learning_rate": 4.881808838643371e-07, "logits/chosen": -0.5548766255378723, "logits/rejected": -0.5131130814552307, "logps/chosen": -48.66565704345703, "logps/rejected": -42.445838928222656, "loss": 0.6775, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.042327068746089935, "rewards/margins": 0.031185006722807884, "rewards/rejected": 0.011142059229314327, "step": 475 }, { "epoch": 0.04934717795826051, "grad_norm": 26.25, "learning_rate": 4.933196300102775e-07, "logits/chosen": -0.6174675226211548, "logits/rejected": -0.6118943095207214, "logps/chosen": -45.056480407714844, "logps/rejected": -45.18384552001953, "loss": 0.6763, "rewards/accuracies": 0.75, "rewards/chosen": 0.03719925507903099, "rewards/margins": 0.033355433493852615, "rewards/rejected": 0.0038438127376139164, "step": 480 }, { "epoch": 0.04986121106199239, "grad_norm": 26.0, "learning_rate": 4.984583761562179e-07, "logits/chosen": -0.5000137090682983, "logits/rejected": -0.5083146095275879, "logps/chosen": -48.88059616088867, "logps/rejected": -46.710472106933594, "loss": 0.6774, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.040368981659412384, "rewards/margins": 0.037680696696043015, "rewards/rejected": 0.0026882889214903116, "step": 485 }, { "epoch": 0.050375244165724274, "grad_norm": 26.5, "learning_rate": 5.035971223021582e-07, "logits/chosen": -0.5114009380340576, "logits/rejected": -0.4864082932472229, "logps/chosen": -43.1078987121582, "logps/rejected": -41.390567779541016, "loss": 0.6744, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04831656441092491, "rewards/margins": 0.04123422130942345, "rewards/rejected": 0.007082338444888592, "step": 490 }, { "epoch": 0.050889277269456155, "grad_norm": 28.0, "learning_rate": 5.087358684480987e-07, "logits/chosen": -0.4545704424381256, "logits/rejected": -0.5083001255989075, "logps/chosen": -47.85335922241211, "logps/rejected": -43.91342544555664, "loss": 0.6737, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.05095354840159416, "rewards/margins": 0.054133933037519455, "rewards/rejected": -0.0031803841702640057, "step": 495 }, { "epoch": 0.05140331037318804, "grad_norm": 24.375, "learning_rate": 5.13874614594039e-07, "logits/chosen": -0.4743877053260803, "logits/rejected": -0.539908230304718, "logps/chosen": -44.78369140625, "logps/rejected": -45.05404281616211, "loss": 0.6734, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03945828229188919, "rewards/margins": 0.03410506993532181, "rewards/rejected": 0.0053532118909060955, "step": 500 }, { "epoch": 0.05191734347691991, "grad_norm": 25.75, "learning_rate": 5.190133607399794e-07, "logits/chosen": -0.5342231392860413, "logits/rejected": -0.527351975440979, "logps/chosen": -51.517059326171875, "logps/rejected": -46.72102355957031, "loss": 0.6722, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.05102931708097458, "rewards/margins": 0.045265696942806244, "rewards/rejected": 0.005763621535152197, "step": 505 }, { "epoch": 0.05243137658065179, "grad_norm": 26.375, "learning_rate": 5.241521068859198e-07, "logits/chosen": -0.5305172801017761, "logits/rejected": -0.5127025842666626, "logps/chosen": -41.901554107666016, "logps/rejected": -39.92859649658203, "loss": 0.6723, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.04082357883453369, "rewards/margins": 0.03722428157925606, "rewards/rejected": 0.003599300514906645, "step": 510 }, { "epoch": 0.052945409684383674, "grad_norm": 25.375, "learning_rate": 5.292908530318602e-07, "logits/chosen": -0.5150061845779419, "logits/rejected": -0.5059565901756287, "logps/chosen": -49.38332748413086, "logps/rejected": -45.272396087646484, "loss": 0.6712, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.055579204112291336, "rewards/margins": 0.05841914936900139, "rewards/rejected": -0.002839947585016489, "step": 515 }, { "epoch": 0.053459442788115555, "grad_norm": 26.0, "learning_rate": 5.344295991778006e-07, "logits/chosen": -0.5219481587409973, "logits/rejected": -0.5028671622276306, "logps/chosen": -53.82368087768555, "logps/rejected": -44.15641403198242, "loss": 0.6692, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.058307498693466187, "rewards/margins": 0.049201663583517075, "rewards/rejected": 0.009105834178626537, "step": 520 }, { "epoch": 0.053973475891847436, "grad_norm": 25.5, "learning_rate": 5.39568345323741e-07, "logits/chosen": -0.4817027449607849, "logits/rejected": -0.4652460515499115, "logps/chosen": -45.653987884521484, "logps/rejected": -43.90810775756836, "loss": 0.6706, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0621105432510376, "rewards/margins": 0.06496389955282211, "rewards/rejected": -0.002853355836123228, "step": 525 }, { "epoch": 0.05448750899557932, "grad_norm": 25.375, "learning_rate": 5.447070914696814e-07, "logits/chosen": -0.5817725658416748, "logits/rejected": -0.583831250667572, "logps/chosen": -43.09241485595703, "logps/rejected": -41.90944290161133, "loss": 0.6708, "rewards/accuracies": 0.875, "rewards/chosen": 0.053008757531642914, "rewards/margins": 0.05464207008481026, "rewards/rejected": -0.0016333151143044233, "step": 530 }, { "epoch": 0.0550015420993112, "grad_norm": 25.375, "learning_rate": 5.498458376156218e-07, "logits/chosen": -0.6109253764152527, "logits/rejected": -0.6273789405822754, "logps/chosen": -48.81477737426758, "logps/rejected": -40.780029296875, "loss": 0.6711, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.056519024074077606, "rewards/margins": 0.04170162230730057, "rewards/rejected": 0.01481739990413189, "step": 535 }, { "epoch": 0.05551557520304307, "grad_norm": 25.375, "learning_rate": 5.549845837615622e-07, "logits/chosen": -0.6146945357322693, "logits/rejected": -0.6046128869056702, "logps/chosen": -47.859375, "logps/rejected": -42.31548309326172, "loss": 0.669, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06374059617519379, "rewards/margins": 0.05304313823580742, "rewards/rejected": 0.010697446763515472, "step": 540 }, { "epoch": 0.056029608306774954, "grad_norm": 26.625, "learning_rate": 5.601233299075026e-07, "logits/chosen": -0.41786742210388184, "logits/rejected": -0.4587439000606537, "logps/chosen": -41.6313591003418, "logps/rejected": -40.682159423828125, "loss": 0.6678, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04819998890161514, "rewards/margins": 0.04530651122331619, "rewards/rejected": 0.002893476514145732, "step": 545 }, { "epoch": 0.056543641410506836, "grad_norm": 30.5, "learning_rate": 5.652620760534429e-07, "logits/chosen": -0.49062785506248474, "logits/rejected": -0.5483888387680054, "logps/chosen": -52.4312744140625, "logps/rejected": -47.17681121826172, "loss": 0.6676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.06597664207220078, "rewards/margins": 0.04898099973797798, "rewards/rejected": 0.01699564978480339, "step": 550 }, { "epoch": 0.05705767451423872, "grad_norm": 28.125, "learning_rate": 5.704008221993834e-07, "logits/chosen": -0.5821124911308289, "logits/rejected": -0.559248685836792, "logps/chosen": -45.64586639404297, "logps/rejected": -44.337425231933594, "loss": 0.6672, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.06010516732931137, "rewards/margins": 0.0447075292468071, "rewards/rejected": 0.015397635288536549, "step": 555 }, { "epoch": 0.0575717076179706, "grad_norm": 25.5, "learning_rate": 5.755395683453237e-07, "logits/chosen": -0.4848307967185974, "logits/rejected": -0.5035262703895569, "logps/chosen": -47.6322135925293, "logps/rejected": -44.67769241333008, "loss": 0.6662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.05576752871274948, "rewards/margins": 0.04481849819421768, "rewards/rejected": 0.010949034243822098, "step": 560 }, { "epoch": 0.05808574072170248, "grad_norm": 25.125, "learning_rate": 5.806783144912642e-07, "logits/chosen": -0.5306991338729858, "logits/rejected": -0.5228010416030884, "logps/chosen": -49.74065399169922, "logps/rejected": -43.71206283569336, "loss": 0.6671, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0664423480629921, "rewards/margins": 0.0603569857776165, "rewards/rejected": 0.006085357628762722, "step": 565 }, { "epoch": 0.05859977382543436, "grad_norm": 25.75, "learning_rate": 5.858170606372045e-07, "logits/chosen": -0.497631311416626, "logits/rejected": -0.5466474294662476, "logps/chosen": -43.012542724609375, "logps/rejected": -46.62449645996094, "loss": 0.6691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05486062914133072, "rewards/margins": 0.036226723343133926, "rewards/rejected": 0.01863391324877739, "step": 570 }, { "epoch": 0.059113806929166235, "grad_norm": 25.5, "learning_rate": 5.90955806783145e-07, "logits/chosen": -0.5157301425933838, "logits/rejected": -0.49960002303123474, "logps/chosen": -39.978843688964844, "logps/rejected": -39.647769927978516, "loss": 0.6655, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.062398433685302734, "rewards/margins": 0.04404488578438759, "rewards/rejected": 0.018353547900915146, "step": 575 }, { "epoch": 0.059627840032898116, "grad_norm": 25.0, "learning_rate": 5.960945529290853e-07, "logits/chosen": -0.5709384679794312, "logits/rejected": -0.5384314656257629, "logps/chosen": -45.90753173828125, "logps/rejected": -39.767398834228516, "loss": 0.6653, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.07308018207550049, "rewards/margins": 0.05762292072176933, "rewards/rejected": 0.015457254834473133, "step": 580 }, { "epoch": 0.06014187313663, "grad_norm": 26.875, "learning_rate": 6.012332990750257e-07, "logits/chosen": -0.5344622731208801, "logits/rejected": -0.559745192527771, "logps/chosen": -44.27277374267578, "logps/rejected": -39.91753387451172, "loss": 0.6631, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06943769752979279, "rewards/margins": 0.06558750569820404, "rewards/rejected": 0.0038501829840242863, "step": 585 }, { "epoch": 0.06065590624036188, "grad_norm": 26.125, "learning_rate": 6.063720452209661e-07, "logits/chosen": -0.5467257499694824, "logits/rejected": -0.5369237661361694, "logps/chosen": -46.22926330566406, "logps/rejected": -43.032432556152344, "loss": 0.6634, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.07820887118577957, "rewards/margins": 0.08122418075799942, "rewards/rejected": -0.003015313297510147, "step": 590 }, { "epoch": 0.06116993934409376, "grad_norm": 25.0, "learning_rate": 6.115107913669065e-07, "logits/chosen": -0.544464647769928, "logits/rejected": -0.5558351278305054, "logps/chosen": -46.67375946044922, "logps/rejected": -46.12049102783203, "loss": 0.6627, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0821080207824707, "rewards/margins": 0.05992167070508003, "rewards/rejected": 0.02218634821474552, "step": 595 }, { "epoch": 0.06168397244782564, "grad_norm": 25.25, "learning_rate": 6.166495375128469e-07, "logits/chosen": -0.563421905040741, "logits/rejected": -0.5573095083236694, "logps/chosen": -46.96693801879883, "logps/rejected": -43.01422882080078, "loss": 0.6595, "rewards/accuracies": 0.875, "rewards/chosen": 0.0783597081899643, "rewards/margins": 0.07695178687572479, "rewards/rejected": 0.001407914562150836, "step": 600 }, { "epoch": 0.06219800555155752, "grad_norm": 29.25, "learning_rate": 6.217882836587873e-07, "logits/chosen": -0.5611908435821533, "logits/rejected": -0.607645571231842, "logps/chosen": -44.93268585205078, "logps/rejected": -44.68094253540039, "loss": 0.6586, "rewards/accuracies": 0.875, "rewards/chosen": 0.08854435384273529, "rewards/margins": 0.07231277227401733, "rewards/rejected": 0.016231579706072807, "step": 605 }, { "epoch": 0.0627120386552894, "grad_norm": 25.375, "learning_rate": 6.269270298047277e-07, "logits/chosen": -0.4500214457511902, "logits/rejected": -0.45499172806739807, "logps/chosen": -49.58077621459961, "logps/rejected": -44.00128936767578, "loss": 0.6613, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.08128474652767181, "rewards/margins": 0.07924707233905792, "rewards/rejected": 0.0020376869942992926, "step": 610 }, { "epoch": 0.06322607175902129, "grad_norm": 28.375, "learning_rate": 6.320657759506681e-07, "logits/chosen": -0.5335690379142761, "logits/rejected": -0.5622512102127075, "logps/chosen": -41.53282165527344, "logps/rejected": -38.63495635986328, "loss": 0.6591, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.07654312252998352, "rewards/margins": 0.06970039755105972, "rewards/rejected": 0.006842723581939936, "step": 615 }, { "epoch": 0.06374010486275317, "grad_norm": 25.375, "learning_rate": 6.372045220966083e-07, "logits/chosen": -0.5400862693786621, "logits/rejected": -0.5449045896530151, "logps/chosen": -43.825748443603516, "logps/rejected": -40.282169342041016, "loss": 0.6562, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.07743750512599945, "rewards/margins": 0.06461333483457565, "rewards/rejected": 0.012824165634810925, "step": 620 }, { "epoch": 0.06425413796648505, "grad_norm": 24.25, "learning_rate": 6.423432682425489e-07, "logits/chosen": -0.5889990329742432, "logits/rejected": -0.57150799036026, "logps/chosen": -43.58386993408203, "logps/rejected": -41.79051971435547, "loss": 0.6593, "rewards/accuracies": 0.875, "rewards/chosen": 0.10206643491983414, "rewards/margins": 0.06871925294399261, "rewards/rejected": 0.03334718197584152, "step": 625 }, { "epoch": 0.06476817107021693, "grad_norm": 26.875, "learning_rate": 6.474820143884891e-07, "logits/chosen": -0.5010123252868652, "logits/rejected": -0.49935874342918396, "logps/chosen": -45.082496643066406, "logps/rejected": -46.53763961791992, "loss": 0.6545, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.09224720299243927, "rewards/margins": 0.07828235626220703, "rewards/rejected": 0.013964837417006493, "step": 630 }, { "epoch": 0.0652822041739488, "grad_norm": 27.625, "learning_rate": 6.526207605344296e-07, "logits/chosen": -0.6027558445930481, "logits/rejected": -0.6525936126708984, "logps/chosen": -45.5389289855957, "logps/rejected": -42.71904754638672, "loss": 0.6553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1029411181807518, "rewards/margins": 0.06894311308860779, "rewards/rejected": 0.03399800509214401, "step": 635 }, { "epoch": 0.06579623727768068, "grad_norm": 25.25, "learning_rate": 6.577595066803699e-07, "logits/chosen": -0.5269140005111694, "logits/rejected": -0.5494248270988464, "logps/chosen": -50.50534439086914, "logps/rejected": -47.678863525390625, "loss": 0.6528, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.09652815759181976, "rewards/margins": 0.08760304749011993, "rewards/rejected": 0.008925109170377254, "step": 640 }, { "epoch": 0.06631027038141256, "grad_norm": 24.625, "learning_rate": 6.628982528263104e-07, "logits/chosen": -0.5341302156448364, "logits/rejected": -0.6332500576972961, "logps/chosen": -43.9234733581543, "logps/rejected": -44.81741714477539, "loss": 0.6517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09060404449701309, "rewards/margins": 0.06428372859954834, "rewards/rejected": 0.026320312172174454, "step": 645 }, { "epoch": 0.06682430348514444, "grad_norm": 27.125, "learning_rate": 6.680369989722507e-07, "logits/chosen": -0.5383872985839844, "logits/rejected": -0.551451563835144, "logps/chosen": -42.72985076904297, "logps/rejected": -42.190162658691406, "loss": 0.651, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.09585133194923401, "rewards/margins": 0.06622285395860672, "rewards/rejected": 0.029628481715917587, "step": 650 }, { "epoch": 0.06733833658887632, "grad_norm": 26.375, "learning_rate": 6.731757451181911e-07, "logits/chosen": -0.5040010213851929, "logits/rejected": -0.5523461699485779, "logps/chosen": -40.53135681152344, "logps/rejected": -44.733158111572266, "loss": 0.6501, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.09501311928033829, "rewards/margins": 0.08449190109968185, "rewards/rejected": 0.010521212592720985, "step": 655 }, { "epoch": 0.0678523696926082, "grad_norm": 22.75, "learning_rate": 6.783144912641315e-07, "logits/chosen": -0.5167514085769653, "logits/rejected": -0.538071870803833, "logps/chosen": -46.6627197265625, "logps/rejected": -44.59030532836914, "loss": 0.651, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.11643459647893906, "rewards/margins": 0.10182207822799683, "rewards/rejected": 0.01461251825094223, "step": 660 }, { "epoch": 0.06836640279634008, "grad_norm": 22.625, "learning_rate": 6.834532374100719e-07, "logits/chosen": -0.5125952363014221, "logits/rejected": -0.5071858167648315, "logps/chosen": -41.30177307128906, "logps/rejected": -42.38993835449219, "loss": 0.6499, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.10925598442554474, "rewards/margins": 0.07605016231536865, "rewards/rejected": 0.03320581838488579, "step": 665 }, { "epoch": 0.06888043590007197, "grad_norm": 25.375, "learning_rate": 6.885919835560123e-07, "logits/chosen": -0.5518497824668884, "logits/rejected": -0.564605712890625, "logps/chosen": -44.73925018310547, "logps/rejected": -42.017494201660156, "loss": 0.6475, "rewards/accuracies": 0.875, "rewards/chosen": 0.12703107297420502, "rewards/margins": 0.09497301280498505, "rewards/rejected": 0.03205806761980057, "step": 670 }, { "epoch": 0.06939446900380385, "grad_norm": 28.375, "learning_rate": 6.937307297019527e-07, "logits/chosen": -0.5117191076278687, "logits/rejected": -0.5155859589576721, "logps/chosen": -47.258216857910156, "logps/rejected": -48.386207580566406, "loss": 0.647, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.1167471781373024, "rewards/margins": 0.08844326436519623, "rewards/rejected": 0.028303900733590126, "step": 675 }, { "epoch": 0.06990850210753573, "grad_norm": 26.375, "learning_rate": 6.988694758478931e-07, "logits/chosen": -0.5648790597915649, "logits/rejected": -0.5773806571960449, "logps/chosen": -54.9184455871582, "logps/rejected": -48.435768127441406, "loss": 0.6461, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.13382390141487122, "rewards/margins": 0.10929258912801743, "rewards/rejected": 0.024531327188014984, "step": 680 }, { "epoch": 0.07042253521126761, "grad_norm": 26.875, "learning_rate": 7.040082219938335e-07, "logits/chosen": -0.45843204855918884, "logits/rejected": -0.5463451147079468, "logps/chosen": -43.974403381347656, "logps/rejected": -41.56947326660156, "loss": 0.6419, "rewards/accuracies": 0.875, "rewards/chosen": 0.1364334225654602, "rewards/margins": 0.1075710654258728, "rewards/rejected": 0.028862357139587402, "step": 685 }, { "epoch": 0.07093656831499949, "grad_norm": 27.0, "learning_rate": 7.091469681397738e-07, "logits/chosen": -0.4703858494758606, "logits/rejected": -0.4219907224178314, "logps/chosen": -40.51359176635742, "logps/rejected": -49.21204376220703, "loss": 0.642, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.1347707211971283, "rewards/margins": 0.10831012576818466, "rewards/rejected": 0.026460599154233932, "step": 690 }, { "epoch": 0.07145060141873137, "grad_norm": 26.625, "learning_rate": 7.142857142857143e-07, "logits/chosen": -0.6139708161354065, "logits/rejected": -0.5133739113807678, "logps/chosen": -43.73206329345703, "logps/rejected": -43.279869079589844, "loss": 0.6441, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.13301515579223633, "rewards/margins": 0.11033841222524643, "rewards/rejected": 0.0226767435669899, "step": 695 }, { "epoch": 0.07196463452246325, "grad_norm": 26.5, "learning_rate": 7.194244604316546e-07, "logits/chosen": -0.5898646712303162, "logits/rejected": -0.6064137816429138, "logps/chosen": -42.343421936035156, "logps/rejected": -48.16742706298828, "loss": 0.6423, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.12604334950447083, "rewards/margins": 0.09903419017791748, "rewards/rejected": 0.02700917050242424, "step": 700 }, { "epoch": 0.07247866762619512, "grad_norm": 25.875, "learning_rate": 7.245632065775951e-07, "logits/chosen": -0.5698856115341187, "logits/rejected": -0.594225287437439, "logps/chosen": -44.28474044799805, "logps/rejected": -45.193870544433594, "loss": 0.6362, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.14130310714244843, "rewards/margins": 0.11438391357660294, "rewards/rejected": 0.026919201016426086, "step": 705 }, { "epoch": 0.072992700729927, "grad_norm": 26.625, "learning_rate": 7.297019527235354e-07, "logits/chosen": -0.4831959307193756, "logits/rejected": -0.49004945158958435, "logps/chosen": -43.622318267822266, "logps/rejected": -44.85218048095703, "loss": 0.6405, "rewards/accuracies": 0.875, "rewards/chosen": 0.14013539254665375, "rewards/margins": 0.1094023808836937, "rewards/rejected": 0.030733004212379456, "step": 710 }, { "epoch": 0.07350673383365888, "grad_norm": 26.375, "learning_rate": 7.348406988694759e-07, "logits/chosen": -0.511799156665802, "logits/rejected": -0.50799560546875, "logps/chosen": -50.011474609375, "logps/rejected": -49.09990692138672, "loss": 0.6375, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.16222240030765533, "rewards/margins": 0.1427510678768158, "rewards/rejected": 0.019471321254968643, "step": 715 }, { "epoch": 0.07402076693739076, "grad_norm": 22.75, "learning_rate": 7.399794450154162e-07, "logits/chosen": -0.660801887512207, "logits/rejected": -0.6382924318313599, "logps/chosen": -47.22163391113281, "logps/rejected": -45.54157638549805, "loss": 0.6365, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.15280112624168396, "rewards/margins": 0.10339011996984482, "rewards/rejected": 0.04941100627183914, "step": 720 }, { "epoch": 0.07453480004112265, "grad_norm": 24.25, "learning_rate": 7.451181911613566e-07, "logits/chosen": -0.526086688041687, "logits/rejected": -0.4995068907737732, "logps/chosen": -45.85238265991211, "logps/rejected": -41.46195602416992, "loss": 0.6352, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.16677738726139069, "rewards/margins": 0.13918253779411316, "rewards/rejected": 0.027594860643148422, "step": 725 }, { "epoch": 0.07504883314485453, "grad_norm": 26.375, "learning_rate": 7.50256937307297e-07, "logits/chosen": -0.5618652701377869, "logits/rejected": -0.584435760974884, "logps/chosen": -47.01496124267578, "logps/rejected": -49.0267333984375, "loss": 0.6304, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.1589895784854889, "rewards/margins": 0.12156827747821808, "rewards/rejected": 0.0374213308095932, "step": 730 }, { "epoch": 0.07556286624858641, "grad_norm": 26.875, "learning_rate": 7.553956834532374e-07, "logits/chosen": -0.5104331374168396, "logits/rejected": -0.5405488014221191, "logps/chosen": -47.697200775146484, "logps/rejected": -45.49576187133789, "loss": 0.6351, "rewards/accuracies": 0.875, "rewards/chosen": 0.16554021835327148, "rewards/margins": 0.13836148381233215, "rewards/rejected": 0.027178723365068436, "step": 735 }, { "epoch": 0.07607689935231829, "grad_norm": 24.0, "learning_rate": 7.605344295991778e-07, "logits/chosen": -0.5179128646850586, "logits/rejected": -0.5564489364624023, "logps/chosen": -42.2685432434082, "logps/rejected": -39.608909606933594, "loss": 0.6353, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.1745699644088745, "rewards/margins": 0.12810799479484558, "rewards/rejected": 0.04646197706460953, "step": 740 }, { "epoch": 0.07659093245605017, "grad_norm": 27.375, "learning_rate": 7.656731757451182e-07, "logits/chosen": -0.5294661521911621, "logits/rejected": -0.533990204334259, "logps/chosen": -45.177528381347656, "logps/rejected": -43.729705810546875, "loss": 0.6307, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.17410778999328613, "rewards/margins": 0.13470788300037384, "rewards/rejected": 0.03939991444349289, "step": 745 }, { "epoch": 0.07710496555978205, "grad_norm": 25.0, "learning_rate": 7.708119218910586e-07, "logits/chosen": -0.5507833957672119, "logits/rejected": -0.585070013999939, "logps/chosen": -41.0269660949707, "logps/rejected": -44.56657791137695, "loss": 0.6329, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.16012287139892578, "rewards/margins": 0.13244251906871796, "rewards/rejected": 0.027680357918143272, "step": 750 }, { "epoch": 0.07761899866351393, "grad_norm": 25.5, "learning_rate": 7.75950668036999e-07, "logits/chosen": -0.5234525799751282, "logits/rejected": -0.5436912178993225, "logps/chosen": -42.71730041503906, "logps/rejected": -41.27848434448242, "loss": 0.6297, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.15959532558918, "rewards/margins": 0.12607041001319885, "rewards/rejected": 0.033524904400110245, "step": 755 }, { "epoch": 0.07813303176724581, "grad_norm": 25.0, "learning_rate": 7.810894141829393e-07, "logits/chosen": -0.6267899870872498, "logits/rejected": -0.6411890983581543, "logps/chosen": -45.34892654418945, "logps/rejected": -41.09944534301758, "loss": 0.6343, "rewards/accuracies": 0.875, "rewards/chosen": 0.17738842964172363, "rewards/margins": 0.13232603669166565, "rewards/rejected": 0.04506240040063858, "step": 760 }, { "epoch": 0.0786470648709777, "grad_norm": 25.5, "learning_rate": 7.862281603288798e-07, "logits/chosen": -0.5541279911994934, "logits/rejected": -0.5926000475883484, "logps/chosen": -46.9913330078125, "logps/rejected": -39.91486740112305, "loss": 0.6292, "rewards/accuracies": 0.875, "rewards/chosen": 0.191436305642128, "rewards/margins": 0.12061263620853424, "rewards/rejected": 0.07082368433475494, "step": 765 }, { "epoch": 0.07916109797470958, "grad_norm": 25.5, "learning_rate": 7.913669064748201e-07, "logits/chosen": -0.5731217265129089, "logits/rejected": -0.6460896134376526, "logps/chosen": -43.78594207763672, "logps/rejected": -42.71596145629883, "loss": 0.6284, "rewards/accuracies": 0.875, "rewards/chosen": 0.17373165488243103, "rewards/margins": 0.1430257260799408, "rewards/rejected": 0.030705943703651428, "step": 770 }, { "epoch": 0.07967513107844146, "grad_norm": 25.625, "learning_rate": 7.965056526207606e-07, "logits/chosen": -0.5293170213699341, "logits/rejected": -0.43949565291404724, "logps/chosen": -41.696495056152344, "logps/rejected": -44.26519012451172, "loss": 0.6255, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.17448677122592926, "rewards/margins": 0.1398381143808365, "rewards/rejected": 0.03464866802096367, "step": 775 }, { "epoch": 0.08018916418217333, "grad_norm": 23.25, "learning_rate": 8.016443987667009e-07, "logits/chosen": -0.47517603635787964, "logits/rejected": -0.5334277749061584, "logps/chosen": -41.38899230957031, "logps/rejected": -44.05311584472656, "loss": 0.6219, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.19122488796710968, "rewards/margins": 0.15533722937107086, "rewards/rejected": 0.03588765114545822, "step": 780 }, { "epoch": 0.0807031972859052, "grad_norm": 24.75, "learning_rate": 8.067831449126414e-07, "logits/chosen": -0.4958719313144684, "logits/rejected": -0.5784968137741089, "logps/chosen": -41.6627197265625, "logps/rejected": -41.74710464477539, "loss": 0.6246, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.19721569120883942, "rewards/margins": 0.18843746185302734, "rewards/rejected": 0.008778234012424946, "step": 785 }, { "epoch": 0.08121723038963709, "grad_norm": 24.125, "learning_rate": 8.119218910585817e-07, "logits/chosen": -0.6115940809249878, "logits/rejected": -0.6551922559738159, "logps/chosen": -44.18309783935547, "logps/rejected": -42.7044792175293, "loss": 0.6212, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.19443735480308533, "rewards/margins": 0.14608842134475708, "rewards/rejected": 0.04834895208477974, "step": 790 }, { "epoch": 0.08173126349336897, "grad_norm": 26.75, "learning_rate": 8.170606372045221e-07, "logits/chosen": -0.6191340684890747, "logits/rejected": -0.6283766031265259, "logps/chosen": -49.639869689941406, "logps/rejected": -50.319374084472656, "loss": 0.6138, "rewards/accuracies": 0.875, "rewards/chosen": 0.2012145072221756, "rewards/margins": 0.16686315834522247, "rewards/rejected": 0.03435134142637253, "step": 795 }, { "epoch": 0.08224529659710085, "grad_norm": 27.875, "learning_rate": 8.221993833504625e-07, "logits/chosen": -0.5054925680160522, "logits/rejected": -0.5086299180984497, "logps/chosen": -49.278926849365234, "logps/rejected": -44.5020751953125, "loss": 0.6154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.21480365097522736, "rewards/margins": 0.1588359773159027, "rewards/rejected": 0.05596764013171196, "step": 800 }, { "epoch": 0.08275932970083273, "grad_norm": 25.875, "learning_rate": 8.273381294964028e-07, "logits/chosen": -0.6146968603134155, "logits/rejected": -0.6153755187988281, "logps/chosen": -46.71783447265625, "logps/rejected": -41.253597259521484, "loss": 0.6192, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.19896550476551056, "rewards/margins": 0.13613177835941315, "rewards/rejected": 0.0628337562084198, "step": 805 }, { "epoch": 0.08327336280456461, "grad_norm": 24.125, "learning_rate": 8.324768756423432e-07, "logits/chosen": -0.5464031100273132, "logits/rejected": -0.6091150045394897, "logps/chosen": -42.114505767822266, "logps/rejected": -45.462162017822266, "loss": 0.6129, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.22474953532218933, "rewards/margins": 0.15742707252502441, "rewards/rejected": 0.06732246279716492, "step": 810 }, { "epoch": 0.0837873959082965, "grad_norm": 24.0, "learning_rate": 8.376156217882836e-07, "logits/chosen": -0.48167529702186584, "logits/rejected": -0.43913644552230835, "logps/chosen": -44.059051513671875, "logps/rejected": -42.52682876586914, "loss": 0.6187, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.20822246372699738, "rewards/margins": 0.14555740356445312, "rewards/rejected": 0.06266506016254425, "step": 815 }, { "epoch": 0.08430142901202838, "grad_norm": 24.125, "learning_rate": 8.42754367934224e-07, "logits/chosen": -0.5151184797286987, "logits/rejected": -0.5087379813194275, "logps/chosen": -39.25678253173828, "logps/rejected": -38.175296783447266, "loss": 0.616, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2330058068037033, "rewards/margins": 0.15575110912322998, "rewards/rejected": 0.07725472003221512, "step": 820 }, { "epoch": 0.08481546211576026, "grad_norm": 24.125, "learning_rate": 8.478931140801644e-07, "logits/chosen": -0.562117874622345, "logits/rejected": -0.5475424528121948, "logps/chosen": -44.15347671508789, "logps/rejected": -40.62128829956055, "loss": 0.6114, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.25094062089920044, "rewards/margins": 0.18744292855262756, "rewards/rejected": 0.06349767744541168, "step": 825 }, { "epoch": 0.08532949521949214, "grad_norm": 26.0, "learning_rate": 8.530318602261047e-07, "logits/chosen": -0.564426839351654, "logits/rejected": -0.5748254060745239, "logps/chosen": -39.0877799987793, "logps/rejected": -38.94200134277344, "loss": 0.607, "rewards/accuracies": 0.875, "rewards/chosen": 0.2215259075164795, "rewards/margins": 0.15298059582710266, "rewards/rejected": 0.06854535639286041, "step": 830 }, { "epoch": 0.08584352832322402, "grad_norm": 23.25, "learning_rate": 8.581706063720452e-07, "logits/chosen": -0.5240844488143921, "logits/rejected": -0.5205927491188049, "logps/chosen": -39.638427734375, "logps/rejected": -38.300655364990234, "loss": 0.6103, "rewards/accuracies": 0.875, "rewards/chosen": 0.21714529395103455, "rewards/margins": 0.1626720428466797, "rewards/rejected": 0.05447326973080635, "step": 835 }, { "epoch": 0.0863575614269559, "grad_norm": 26.0, "learning_rate": 8.633093525179855e-07, "logits/chosen": -0.59308922290802, "logits/rejected": -0.5491374135017395, "logps/chosen": -40.43780517578125, "logps/rejected": -42.06497573852539, "loss": 0.6158, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.2348296195268631, "rewards/margins": 0.12793505191802979, "rewards/rejected": 0.10689453780651093, "step": 840 }, { "epoch": 0.08687159453068778, "grad_norm": 24.625, "learning_rate": 8.68448098663926e-07, "logits/chosen": -0.5241016149520874, "logits/rejected": -0.5391318798065186, "logps/chosen": -46.1440315246582, "logps/rejected": -45.53388214111328, "loss": 0.611, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.2650156021118164, "rewards/margins": 0.20876316726207733, "rewards/rejected": 0.056252431124448776, "step": 845 }, { "epoch": 0.08738562763441966, "grad_norm": 23.875, "learning_rate": 8.735868448098663e-07, "logits/chosen": -0.5923094749450684, "logits/rejected": -0.5893076062202454, "logps/chosen": -46.99716567993164, "logps/rejected": -45.95599365234375, "loss": 0.6135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2524680197238922, "rewards/margins": 0.17436857521533966, "rewards/rejected": 0.07809942960739136, "step": 850 }, { "epoch": 0.08789966073815153, "grad_norm": 25.75, "learning_rate": 8.787255909558068e-07, "logits/chosen": -0.4996982514858246, "logits/rejected": -0.5052278637886047, "logps/chosen": -49.46247100830078, "logps/rejected": -41.334197998046875, "loss": 0.5985, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.2742147445678711, "rewards/margins": 0.20412257313728333, "rewards/rejected": 0.07009214162826538, "step": 855 }, { "epoch": 0.08841369384188341, "grad_norm": 25.0, "learning_rate": 8.838643371017471e-07, "logits/chosen": -0.49240589141845703, "logits/rejected": -0.5319634079933167, "logps/chosen": -43.413658142089844, "logps/rejected": -43.180503845214844, "loss": 0.6067, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2450878918170929, "rewards/margins": 0.1728995144367218, "rewards/rejected": 0.0721883550286293, "step": 860 }, { "epoch": 0.08892772694561529, "grad_norm": 24.75, "learning_rate": 8.890030832476875e-07, "logits/chosen": -0.5048456788063049, "logits/rejected": -0.5277706384658813, "logps/chosen": -47.675743103027344, "logps/rejected": -46.74076843261719, "loss": 0.6096, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2430659979581833, "rewards/margins": 0.1771424412727356, "rewards/rejected": 0.06592359393835068, "step": 865 }, { "epoch": 0.08944176004934717, "grad_norm": 23.125, "learning_rate": 8.941418293936279e-07, "logits/chosen": -0.5244823694229126, "logits/rejected": -0.4953669607639313, "logps/chosen": -41.39569854736328, "logps/rejected": -39.12853240966797, "loss": 0.6068, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2699137032032013, "rewards/margins": 0.1879681646823883, "rewards/rejected": 0.08194556087255478, "step": 870 }, { "epoch": 0.08995579315307906, "grad_norm": 24.75, "learning_rate": 8.992805755395683e-07, "logits/chosen": -0.5035258531570435, "logits/rejected": -0.5230237245559692, "logps/chosen": -44.86183547973633, "logps/rejected": -39.31138229370117, "loss": 0.6038, "rewards/accuracies": 0.875, "rewards/chosen": 0.2743573784828186, "rewards/margins": 0.17980891466140747, "rewards/rejected": 0.09454852342605591, "step": 875 }, { "epoch": 0.09046982625681094, "grad_norm": 25.375, "learning_rate": 9.044193216855087e-07, "logits/chosen": -0.4941064417362213, "logits/rejected": -0.5459443926811218, "logps/chosen": -41.35919952392578, "logps/rejected": -44.40766143798828, "loss": 0.6007, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.2823048532009125, "rewards/margins": 0.17241452634334564, "rewards/rejected": 0.10989032685756683, "step": 880 }, { "epoch": 0.09098385936054282, "grad_norm": 25.25, "learning_rate": 9.095580678314491e-07, "logits/chosen": -0.48760485649108887, "logits/rejected": -0.5458135604858398, "logps/chosen": -44.389259338378906, "logps/rejected": -41.011573791503906, "loss": 0.5955, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.3135862946510315, "rewards/margins": 0.27151918411254883, "rewards/rejected": 0.04206707328557968, "step": 885 }, { "epoch": 0.0914978924642747, "grad_norm": 22.0, "learning_rate": 9.146968139773895e-07, "logits/chosen": -0.43619054555892944, "logits/rejected": -0.464957058429718, "logps/chosen": -42.32903289794922, "logps/rejected": -43.94342041015625, "loss": 0.6009, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.29867300391197205, "rewards/margins": 0.17221492528915405, "rewards/rejected": 0.1264580935239792, "step": 890 }, { "epoch": 0.09201192556800658, "grad_norm": 24.25, "learning_rate": 9.198355601233299e-07, "logits/chosen": -0.5680084824562073, "logits/rejected": -0.6193189024925232, "logps/chosen": -46.03605270385742, "logps/rejected": -53.083595275878906, "loss": 0.5903, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.3222612142562866, "rewards/margins": 0.26098957657814026, "rewards/rejected": 0.061271656304597855, "step": 895 }, { "epoch": 0.09252595867173846, "grad_norm": 23.0, "learning_rate": 9.249743062692703e-07, "logits/chosen": -0.5024283528327942, "logits/rejected": -0.513434112071991, "logps/chosen": -39.591060638427734, "logps/rejected": -39.03270721435547, "loss": 0.5965, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2952932119369507, "rewards/margins": 0.17335407435894012, "rewards/rejected": 0.12193915992975235, "step": 900 }, { "epoch": 0.09303999177547034, "grad_norm": 24.5, "learning_rate": 9.301130524152107e-07, "logits/chosen": -0.5175317525863647, "logits/rejected": -0.6094178557395935, "logps/chosen": -40.966758728027344, "logps/rejected": -45.327674865722656, "loss": 0.5888, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.3319776654243469, "rewards/margins": 0.2539907395839691, "rewards/rejected": 0.077986940741539, "step": 905 }, { "epoch": 0.09355402487920222, "grad_norm": 24.875, "learning_rate": 9.35251798561151e-07, "logits/chosen": -0.46008220314979553, "logits/rejected": -0.4727664887905121, "logps/chosen": -43.739356994628906, "logps/rejected": -42.01952362060547, "loss": 0.5875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.30772897601127625, "rewards/margins": 0.22202029824256897, "rewards/rejected": 0.08570870012044907, "step": 910 }, { "epoch": 0.0940680579829341, "grad_norm": 24.625, "learning_rate": 9.403905447070915e-07, "logits/chosen": -0.5525678396224976, "logits/rejected": -0.5400579571723938, "logps/chosen": -39.17808532714844, "logps/rejected": -43.64794158935547, "loss": 0.5935, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.34777578711509705, "rewards/margins": 0.2437710016965866, "rewards/rejected": 0.10400480031967163, "step": 915 }, { "epoch": 0.09458209108666599, "grad_norm": 24.75, "learning_rate": 9.455292908530318e-07, "logits/chosen": -0.535949170589447, "logits/rejected": -0.5588391423225403, "logps/chosen": -39.0285758972168, "logps/rejected": -43.81180953979492, "loss": 0.5882, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2977657616138458, "rewards/margins": 0.19746136665344238, "rewards/rejected": 0.10030442476272583, "step": 920 }, { "epoch": 0.09509612419039787, "grad_norm": 23.875, "learning_rate": 9.506680369989723e-07, "logits/chosen": -0.5817698240280151, "logits/rejected": -0.621291995048523, "logps/chosen": -46.12104797363281, "logps/rejected": -44.2409553527832, "loss": 0.5772, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3760177493095398, "rewards/margins": 0.29808032512664795, "rewards/rejected": 0.07793743163347244, "step": 925 }, { "epoch": 0.09561015729412974, "grad_norm": 25.625, "learning_rate": 9.558067831449127e-07, "logits/chosen": -0.5552026033401489, "logits/rejected": -0.5414708256721497, "logps/chosen": -44.77216339111328, "logps/rejected": -42.54108428955078, "loss": 0.5821, "rewards/accuracies": 0.875, "rewards/chosen": 0.38131922483444214, "rewards/margins": 0.24616150557994843, "rewards/rejected": 0.1351577192544937, "step": 930 }, { "epoch": 0.09612419039786162, "grad_norm": 24.375, "learning_rate": 9.60945529290853e-07, "logits/chosen": -0.5116204023361206, "logits/rejected": -0.49239253997802734, "logps/chosen": -40.12095260620117, "logps/rejected": -45.451812744140625, "loss": 0.5842, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3647356629371643, "rewards/margins": 0.2268202304840088, "rewards/rejected": 0.13791540265083313, "step": 935 }, { "epoch": 0.0966382235015935, "grad_norm": 26.125, "learning_rate": 9.660842754367935e-07, "logits/chosen": -0.5752975344657898, "logits/rejected": -0.6406502723693848, "logps/chosen": -44.05303192138672, "logps/rejected": -41.388526916503906, "loss": 0.5787, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.40914517641067505, "rewards/margins": 0.2593352198600769, "rewards/rejected": 0.14980994164943695, "step": 940 }, { "epoch": 0.09715225660532538, "grad_norm": 24.875, "learning_rate": 9.712230215827338e-07, "logits/chosen": -0.5392632484436035, "logits/rejected": -0.538465678691864, "logps/chosen": -38.91827392578125, "logps/rejected": -45.3657112121582, "loss": 0.5787, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.4064578413963318, "rewards/margins": 0.2623597979545593, "rewards/rejected": 0.14409807324409485, "step": 945 }, { "epoch": 0.09766628970905726, "grad_norm": 25.0, "learning_rate": 9.763617677286743e-07, "logits/chosen": -0.5683516263961792, "logits/rejected": -0.5496304035186768, "logps/chosen": -43.033363342285156, "logps/rejected": -43.614784240722656, "loss": 0.5741, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.39778420329093933, "rewards/margins": 0.24807074666023254, "rewards/rejected": 0.1497134119272232, "step": 950 }, { "epoch": 0.09818032281278914, "grad_norm": 27.625, "learning_rate": 9.815005138746146e-07, "logits/chosen": -0.6039291620254517, "logits/rejected": -0.6193878054618835, "logps/chosen": -44.188499450683594, "logps/rejected": -44.453392028808594, "loss": 0.574, "rewards/accuracies": 1.0, "rewards/chosen": 0.4697563052177429, "rewards/margins": 0.3212011754512787, "rewards/rejected": 0.14855512976646423, "step": 955 }, { "epoch": 0.09869435591652102, "grad_norm": 26.75, "learning_rate": 9.86639260020555e-07, "logits/chosen": -0.5767684578895569, "logits/rejected": -0.5841031670570374, "logps/chosen": -42.790679931640625, "logps/rejected": -39.944557189941406, "loss": 0.582, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.4536202847957611, "rewards/margins": 0.2580909729003906, "rewards/rejected": 0.1955292969942093, "step": 960 }, { "epoch": 0.0992083890202529, "grad_norm": 25.5, "learning_rate": 9.917780061664954e-07, "logits/chosen": -0.5209633111953735, "logits/rejected": -0.5625368356704712, "logps/chosen": -42.93593215942383, "logps/rejected": -41.11357879638672, "loss": 0.578, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.4613956809043884, "rewards/margins": 0.20635242760181427, "rewards/rejected": 0.25504329800605774, "step": 965 }, { "epoch": 0.09972242212398479, "grad_norm": 24.875, "learning_rate": 9.969167523124359e-07, "logits/chosen": -0.5649109482765198, "logits/rejected": -0.5880676507949829, "logps/chosen": -46.86402893066406, "logps/rejected": -43.7769775390625, "loss": 0.5689, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5170598030090332, "rewards/margins": 0.2797464430332184, "rewards/rejected": 0.2373134195804596, "step": 970 }, { "epoch": 0.10003084198622392, "eval_logits/chosen": -0.5180739760398865, "eval_logits/rejected": -0.5676196813583374, "eval_logps/chosen": -80.08999633789062, "eval_logps/rejected": -44.22517013549805, "eval_loss": 0.5726205110549927, "eval_rewards/accuracies": 0.8602941036224365, "eval_rewards/chosen": 0.4818618893623352, "eval_rewards/margins": 0.27013716101646423, "eval_rewards/rejected": 0.21172477304935455, "eval_runtime": 2.2867, "eval_samples_per_second": 467.922, "eval_steps_per_second": 7.434, "step": 973 }, { "epoch": 0.10023645522771667, "grad_norm": 25.125, "learning_rate": 9.997715330134796e-07, "logits/chosen": -0.530249834060669, "logits/rejected": -0.5008015036582947, "logps/chosen": -42.11176300048828, "logps/rejected": -42.3381462097168, "loss": 0.5689, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5287051200866699, "rewards/margins": 0.29625481367111206, "rewards/rejected": 0.23245029151439667, "step": 975 }, { "epoch": 0.10075048833144855, "grad_norm": 24.0, "learning_rate": 9.992003655471783e-07, "logits/chosen": -0.5443663597106934, "logits/rejected": -0.5252584218978882, "logps/chosen": -42.98344421386719, "logps/rejected": -41.4400520324707, "loss": 0.5714, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4839715361595154, "rewards/margins": 0.26456665992736816, "rewards/rejected": 0.21940484642982483, "step": 980 }, { "epoch": 0.10126452143518043, "grad_norm": 24.75, "learning_rate": 9.986291980808773e-07, "logits/chosen": -0.5100902318954468, "logits/rejected": -0.5599586963653564, "logps/chosen": -39.85112762451172, "logps/rejected": -38.79536437988281, "loss": 0.5642, "rewards/accuracies": 0.875, "rewards/chosen": 0.5098745226860046, "rewards/margins": 0.3105296492576599, "rewards/rejected": 0.19934484362602234, "step": 985 }, { "epoch": 0.10177855453891231, "grad_norm": 24.0, "learning_rate": 9.980580306145762e-07, "logits/chosen": -0.5055256485939026, "logits/rejected": -0.5115092992782593, "logps/chosen": -36.888526916503906, "logps/rejected": -37.88523483276367, "loss": 0.5644, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5681403875350952, "rewards/margins": 0.3064644932746887, "rewards/rejected": 0.26167580485343933, "step": 990 }, { "epoch": 0.10229258764264419, "grad_norm": 23.625, "learning_rate": 9.97486863148275e-07, "logits/chosen": -0.6056371927261353, "logits/rejected": -0.589686393737793, "logps/chosen": -39.67005920410156, "logps/rejected": -37.89403533935547, "loss": 0.5674, "rewards/accuracies": 0.875, "rewards/chosen": 0.5500081777572632, "rewards/margins": 0.26420703530311584, "rewards/rejected": 0.2858012318611145, "step": 995 }, { "epoch": 0.10280662074637607, "grad_norm": 24.75, "learning_rate": 9.969156956819738e-07, "logits/chosen": -0.5434216856956482, "logits/rejected": -0.5593206286430359, "logps/chosen": -40.41916275024414, "logps/rejected": -41.03071594238281, "loss": 0.5543, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6262924075126648, "rewards/margins": 0.35116127133369446, "rewards/rejected": 0.2751311659812927, "step": 1000 }, { "epoch": 0.10332065385010794, "grad_norm": 23.875, "learning_rate": 9.963445282156729e-07, "logits/chosen": -0.5755112767219543, "logits/rejected": -0.5655656456947327, "logps/chosen": -41.30464553833008, "logps/rejected": -35.47516632080078, "loss": 0.5572, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6133618354797363, "rewards/margins": 0.29435715079307556, "rewards/rejected": 0.3190046548843384, "step": 1005 }, { "epoch": 0.10383468695383982, "grad_norm": 24.875, "learning_rate": 9.957733607493717e-07, "logits/chosen": -0.5557002425193787, "logits/rejected": -0.554291307926178, "logps/chosen": -35.85614776611328, "logps/rejected": -41.343387603759766, "loss": 0.5452, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6353901624679565, "rewards/margins": 0.3465721011161804, "rewards/rejected": 0.2888180911540985, "step": 1010 }, { "epoch": 0.1043487200575717, "grad_norm": 24.125, "learning_rate": 9.952021932830706e-07, "logits/chosen": -0.5028610825538635, "logits/rejected": -0.5945279598236084, "logps/chosen": -41.178863525390625, "logps/rejected": -36.4996452331543, "loss": 0.5562, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6535662412643433, "rewards/margins": 0.30809885263442993, "rewards/rejected": 0.34546732902526855, "step": 1015 }, { "epoch": 0.10486275316130358, "grad_norm": 25.125, "learning_rate": 9.946310258167695e-07, "logits/chosen": -0.5247623324394226, "logits/rejected": -0.5023131966590881, "logps/chosen": -37.534584045410156, "logps/rejected": -37.53390884399414, "loss": 0.5497, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6968510746955872, "rewards/margins": 0.3517096936702728, "rewards/rejected": 0.34514135122299194, "step": 1020 }, { "epoch": 0.10537678626503547, "grad_norm": 24.5, "learning_rate": 9.940598583504684e-07, "logits/chosen": -0.5970107316970825, "logits/rejected": -0.6423881649971008, "logps/chosen": -40.542625427246094, "logps/rejected": -42.098533630371094, "loss": 0.5432, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6873819828033447, "rewards/margins": 0.3379356861114502, "rewards/rejected": 0.34944623708724976, "step": 1025 }, { "epoch": 0.10589081936876735, "grad_norm": 24.125, "learning_rate": 9.934886908841672e-07, "logits/chosen": -0.6340837478637695, "logits/rejected": -0.6326473355293274, "logps/chosen": -38.317359924316406, "logps/rejected": -37.989540100097656, "loss": 0.5421, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7714845538139343, "rewards/margins": 0.28635719418525696, "rewards/rejected": 0.48512735962867737, "step": 1030 }, { "epoch": 0.10640485247249923, "grad_norm": 24.875, "learning_rate": 9.929175234178661e-07, "logits/chosen": -0.6566641926765442, "logits/rejected": -0.6301606297492981, "logps/chosen": -37.48707962036133, "logps/rejected": -38.410789489746094, "loss": 0.5381, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7650716304779053, "rewards/margins": 0.410480797290802, "rewards/rejected": 0.35459089279174805, "step": 1035 }, { "epoch": 0.10691888557623111, "grad_norm": 25.5, "learning_rate": 9.92346355951565e-07, "logits/chosen": -0.5458164811134338, "logits/rejected": -0.5728727579116821, "logps/chosen": -40.01411819458008, "logps/rejected": -35.760860443115234, "loss": 0.5262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.838398277759552, "rewards/margins": 0.36327868700027466, "rewards/rejected": 0.47511953115463257, "step": 1040 }, { "epoch": 0.10743291867996299, "grad_norm": 26.0, "learning_rate": 9.917751884852639e-07, "logits/chosen": -0.5427117943763733, "logits/rejected": -0.5244967937469482, "logps/chosen": -43.681941986083984, "logps/rejected": -41.80842590332031, "loss": 0.5366, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8763354420661926, "rewards/margins": 0.42986831068992615, "rewards/rejected": 0.4464671015739441, "step": 1045 }, { "epoch": 0.10794695178369487, "grad_norm": 24.5, "learning_rate": 9.912040210189627e-07, "logits/chosen": -0.5519906282424927, "logits/rejected": -0.56514573097229, "logps/chosen": -33.70969772338867, "logps/rejected": -36.49928283691406, "loss": 0.5323, "rewards/accuracies": 0.875, "rewards/chosen": 0.8634076118469238, "rewards/margins": 0.29779934883117676, "rewards/rejected": 0.5656081438064575, "step": 1050 }, { "epoch": 0.10846098488742675, "grad_norm": 25.125, "learning_rate": 9.906328535526616e-07, "logits/chosen": -0.5736539363861084, "logits/rejected": -0.5273616909980774, "logps/chosen": -39.43199920654297, "logps/rejected": -35.37147903442383, "loss": 0.5151, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9503647089004517, "rewards/margins": 0.40115809440612793, "rewards/rejected": 0.549206554889679, "step": 1055 }, { "epoch": 0.10897501799115863, "grad_norm": 26.375, "learning_rate": 9.900616860863605e-07, "logits/chosen": -0.5435464382171631, "logits/rejected": -0.49053335189819336, "logps/chosen": -40.27143478393555, "logps/rejected": -39.121002197265625, "loss": 0.5251, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.953619658946991, "rewards/margins": 0.4472038745880127, "rewards/rejected": 0.5064157247543335, "step": 1060 }, { "epoch": 0.10948905109489052, "grad_norm": 24.125, "learning_rate": 9.894905186200594e-07, "logits/chosen": -0.5957614779472351, "logits/rejected": -0.5923556089401245, "logps/chosen": -32.61050796508789, "logps/rejected": -37.6281852722168, "loss": 0.5238, "rewards/accuracies": 0.875, "rewards/chosen": 1.0458221435546875, "rewards/margins": 0.47959113121032715, "rewards/rejected": 0.5662310719490051, "step": 1065 }, { "epoch": 0.1100030841986224, "grad_norm": 24.5, "learning_rate": 9.889193511537583e-07, "logits/chosen": -0.5527225732803345, "logits/rejected": -0.5349028706550598, "logps/chosen": -32.419761657714844, "logps/rejected": -35.1560173034668, "loss": 0.5189, "rewards/accuracies": 0.875, "rewards/chosen": 1.043513536453247, "rewards/margins": 0.35306641459465027, "rewards/rejected": 0.6904473304748535, "step": 1070 }, { "epoch": 0.11051711730235428, "grad_norm": 26.375, "learning_rate": 9.883481836874571e-07, "logits/chosen": -0.5463324785232544, "logits/rejected": -0.5926105380058289, "logps/chosen": -39.120426177978516, "logps/rejected": -39.05540466308594, "loss": 0.505, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1311018466949463, "rewards/margins": 0.4869552552700043, "rewards/rejected": 0.6441465616226196, "step": 1075 }, { "epoch": 0.11103115040608615, "grad_norm": 26.125, "learning_rate": 9.87777016221156e-07, "logits/chosen": -0.5864880681037903, "logits/rejected": -0.570834755897522, "logps/chosen": -31.659826278686523, "logps/rejected": -35.486907958984375, "loss": 0.5093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0864136219024658, "rewards/margins": 0.45383185148239136, "rewards/rejected": 0.6325815916061401, "step": 1080 }, { "epoch": 0.11154518350981803, "grad_norm": 27.25, "learning_rate": 9.872058487548549e-07, "logits/chosen": -0.6328712701797485, "logits/rejected": -0.6375039219856262, "logps/chosen": -33.215980529785156, "logps/rejected": -39.32414627075195, "loss": 0.5078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1854474544525146, "rewards/margins": 0.5696211457252502, "rewards/rejected": 0.6158263087272644, "step": 1085 }, { "epoch": 0.11205921661354991, "grad_norm": 26.125, "learning_rate": 9.866346812885538e-07, "logits/chosen": -0.6500171422958374, "logits/rejected": -0.6616379022598267, "logps/chosen": -33.216529846191406, "logps/rejected": -33.618316650390625, "loss": 0.4941, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1314723491668701, "rewards/margins": 0.5365325808525085, "rewards/rejected": 0.5949397087097168, "step": 1090 }, { "epoch": 0.11257324971728179, "grad_norm": 26.75, "learning_rate": 9.860635138222526e-07, "logits/chosen": -0.5477662086486816, "logits/rejected": -0.5441513061523438, "logps/chosen": -33.29972839355469, "logps/rejected": -34.78999710083008, "loss": 0.4926, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1715445518493652, "rewards/margins": 0.47312530875205994, "rewards/rejected": 0.6984192132949829, "step": 1095 }, { "epoch": 0.11308728282101367, "grad_norm": 26.75, "learning_rate": 9.854923463559515e-07, "logits/chosen": -0.5545639991760254, "logits/rejected": -0.5233069062232971, "logps/chosen": -37.12775802612305, "logps/rejected": -35.42032241821289, "loss": 0.4621, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2166430950164795, "rewards/margins": 0.5950759649276733, "rewards/rejected": 0.6215670704841614, "step": 1100 }, { "epoch": 0.11360131592474555, "grad_norm": 24.5, "learning_rate": 9.849211788896504e-07, "logits/chosen": -0.6106120347976685, "logits/rejected": -0.5530328154563904, "logps/chosen": -36.96139144897461, "logps/rejected": -35.39409637451172, "loss": 0.4923, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1937626600265503, "rewards/margins": 0.5453779101371765, "rewards/rejected": 0.6483848690986633, "step": 1105 }, { "epoch": 0.11411534902847743, "grad_norm": 24.75, "learning_rate": 9.843500114233493e-07, "logits/chosen": -0.5171526074409485, "logits/rejected": -0.5343550443649292, "logps/chosen": -32.521507263183594, "logps/rejected": -37.619476318359375, "loss": 0.4795, "rewards/accuracies": 0.875, "rewards/chosen": 1.179251790046692, "rewards/margins": 0.4826686382293701, "rewards/rejected": 0.696583092212677, "step": 1110 }, { "epoch": 0.11462938213220931, "grad_norm": 25.75, "learning_rate": 9.837788439570482e-07, "logits/chosen": -0.5424202680587769, "logits/rejected": -0.5309928059577942, "logps/chosen": -36.31576919555664, "logps/rejected": -38.4758186340332, "loss": 0.4673, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1903746128082275, "rewards/margins": 0.6093793511390686, "rewards/rejected": 0.5809952616691589, "step": 1115 }, { "epoch": 0.1151434152359412, "grad_norm": 27.875, "learning_rate": 9.83207676490747e-07, "logits/chosen": -0.5623931884765625, "logits/rejected": -0.5106463432312012, "logps/chosen": -32.70075607299805, "logps/rejected": -39.48289108276367, "loss": 0.4753, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.165183424949646, "rewards/margins": 0.5383630394935608, "rewards/rejected": 0.6268205642700195, "step": 1120 }, { "epoch": 0.11565744833967308, "grad_norm": 24.125, "learning_rate": 9.82636509024446e-07, "logits/chosen": -0.5489836931228638, "logits/rejected": -0.5663599967956543, "logps/chosen": -33.29705047607422, "logps/rejected": -37.42795944213867, "loss": 0.4698, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1578710079193115, "rewards/margins": 0.694525420665741, "rewards/rejected": 0.4633456766605377, "step": 1125 }, { "epoch": 0.11617148144340496, "grad_norm": 25.25, "learning_rate": 9.820653415581448e-07, "logits/chosen": -0.49705252051353455, "logits/rejected": -0.5839281678199768, "logps/chosen": -31.790319442749023, "logps/rejected": -37.85599136352539, "loss": 0.4568, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1818134784698486, "rewards/margins": 0.5704919099807739, "rewards/rejected": 0.6113215684890747, "step": 1130 }, { "epoch": 0.11668551454713684, "grad_norm": 23.875, "learning_rate": 9.814941740918437e-07, "logits/chosen": -0.6382747888565063, "logits/rejected": -0.6609949469566345, "logps/chosen": -30.709720611572266, "logps/rejected": -36.69295120239258, "loss": 0.4488, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1610615253448486, "rewards/margins": 0.6576789617538452, "rewards/rejected": 0.5033824443817139, "step": 1135 }, { "epoch": 0.11719954765086872, "grad_norm": 23.875, "learning_rate": 9.809230066255425e-07, "logits/chosen": -0.6781406402587891, "logits/rejected": -0.6679522395133972, "logps/chosen": -33.703372955322266, "logps/rejected": -36.94292449951172, "loss": 0.4702, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.2397964000701904, "rewards/margins": 0.6344760656356812, "rewards/rejected": 0.6053205728530884, "step": 1140 }, { "epoch": 0.1177135807546006, "grad_norm": 25.75, "learning_rate": 9.803518391592414e-07, "logits/chosen": -0.5545135736465454, "logits/rejected": -0.5636809468269348, "logps/chosen": -38.29312515258789, "logps/rejected": -38.93864059448242, "loss": 0.4545, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.18159818649292, "rewards/margins": 0.7387237548828125, "rewards/rejected": 0.4428744912147522, "step": 1145 }, { "epoch": 0.11822761385833247, "grad_norm": 25.125, "learning_rate": 9.797806716929403e-07, "logits/chosen": -0.507590651512146, "logits/rejected": -0.5135211944580078, "logps/chosen": -32.70012283325195, "logps/rejected": -36.24805450439453, "loss": 0.4516, "rewards/accuracies": 0.875, "rewards/chosen": 1.0949721336364746, "rewards/margins": 0.5551019310951233, "rewards/rejected": 0.5398701429367065, "step": 1150 }, { "epoch": 0.11874164696206435, "grad_norm": 25.75, "learning_rate": 9.792095042266392e-07, "logits/chosen": -0.577866792678833, "logits/rejected": -0.5971522331237793, "logps/chosen": -32.483970642089844, "logps/rejected": -40.29816436767578, "loss": 0.438, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.0861401557922363, "rewards/margins": 0.6411436200141907, "rewards/rejected": 0.44499650597572327, "step": 1155 }, { "epoch": 0.11925568006579623, "grad_norm": 27.625, "learning_rate": 9.78638336760338e-07, "logits/chosen": -0.6167039275169373, "logits/rejected": -0.69914710521698, "logps/chosen": -34.28248596191406, "logps/rejected": -37.41499710083008, "loss": 0.4258, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.2989521026611328, "rewards/margins": 0.840005099773407, "rewards/rejected": 0.45894694328308105, "step": 1160 }, { "epoch": 0.11976971316952811, "grad_norm": 25.25, "learning_rate": 9.78067169294037e-07, "logits/chosen": -0.5773812532424927, "logits/rejected": -0.6072746515274048, "logps/chosen": -30.29741859436035, "logps/rejected": -39.00043487548828, "loss": 0.445, "rewards/accuracies": 0.875, "rewards/chosen": 1.2151817083358765, "rewards/margins": 0.7656243443489075, "rewards/rejected": 0.4495575428009033, "step": 1165 }, { "epoch": 0.12028374627326, "grad_norm": 26.875, "learning_rate": 9.774960018277358e-07, "logits/chosen": -0.5364764928817749, "logits/rejected": -0.5073537826538086, "logps/chosen": -30.726924896240234, "logps/rejected": -39.81100082397461, "loss": 0.4512, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.168264389038086, "rewards/margins": 0.7475508451461792, "rewards/rejected": 0.42071348428726196, "step": 1170 }, { "epoch": 0.12079777937699188, "grad_norm": 27.25, "learning_rate": 9.769248343614347e-07, "logits/chosen": -0.5989760160446167, "logits/rejected": -0.5515146255493164, "logps/chosen": -33.6259880065918, "logps/rejected": -40.4922981262207, "loss": 0.4411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.041962742805481, "rewards/margins": 0.726040244102478, "rewards/rejected": 0.315922349691391, "step": 1175 }, { "epoch": 0.12131181248072376, "grad_norm": 24.25, "learning_rate": 9.763536668951336e-07, "logits/chosen": -0.5802556276321411, "logits/rejected": -0.5526619553565979, "logps/chosen": -31.8721866607666, "logps/rejected": -40.436256408691406, "loss": 0.4415, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0436660051345825, "rewards/margins": 0.8569744229316711, "rewards/rejected": 0.18669165670871735, "step": 1180 }, { "epoch": 0.12182584558445564, "grad_norm": 23.875, "learning_rate": 9.757824994288324e-07, "logits/chosen": -0.6362278461456299, "logits/rejected": -0.49579548835754395, "logps/chosen": -37.989540100097656, "logps/rejected": -47.96315002441406, "loss": 0.4236, "rewards/accuracies": 0.875, "rewards/chosen": 1.0069835186004639, "rewards/margins": 0.7746144533157349, "rewards/rejected": 0.23236910998821259, "step": 1185 }, { "epoch": 0.12233987868818752, "grad_norm": 24.625, "learning_rate": 9.752113319625313e-07, "logits/chosen": -0.6055121421813965, "logits/rejected": -0.6249540448188782, "logps/chosen": -40.194618225097656, "logps/rejected": -44.206661224365234, "loss": 0.4157, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1233078241348267, "rewards/margins": 0.9776338338851929, "rewards/rejected": 0.14567407965660095, "step": 1190 }, { "epoch": 0.1228539117919194, "grad_norm": 26.0, "learning_rate": 9.746401644962302e-07, "logits/chosen": -0.5693105459213257, "logits/rejected": -0.5826455354690552, "logps/chosen": -35.647499084472656, "logps/rejected": -42.159339904785156, "loss": 0.4213, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.055010437965393, "rewards/margins": 0.9486603736877441, "rewards/rejected": 0.1063499003648758, "step": 1195 }, { "epoch": 0.12336794489565128, "grad_norm": 27.875, "learning_rate": 9.740689970299293e-07, "logits/chosen": -0.5071693658828735, "logits/rejected": -0.5193157196044922, "logps/chosen": -34.238319396972656, "logps/rejected": -41.81940841674805, "loss": 0.4094, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1508972644805908, "rewards/margins": 0.8542629480361938, "rewards/rejected": 0.2966344356536865, "step": 1200 }, { "epoch": 0.12388197799938316, "grad_norm": 25.625, "learning_rate": 9.73497829563628e-07, "logits/chosen": -0.551594614982605, "logits/rejected": -0.5454241633415222, "logps/chosen": -34.46183776855469, "logps/rejected": -42.75682830810547, "loss": 0.4001, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0880959033966064, "rewards/margins": 1.0596786737442017, "rewards/rejected": 0.02841705083847046, "step": 1205 }, { "epoch": 0.12439601110311505, "grad_norm": 27.125, "learning_rate": 9.729266620973268e-07, "logits/chosen": -0.4791211485862732, "logits/rejected": -0.5238500833511353, "logps/chosen": -34.6198844909668, "logps/rejected": -41.003021240234375, "loss": 0.4242, "rewards/accuracies": 0.875, "rewards/chosen": 0.9936138987541199, "rewards/margins": 0.7765811085700989, "rewards/rejected": 0.21703281998634338, "step": 1210 }, { "epoch": 0.12491004420684693, "grad_norm": 24.5, "learning_rate": 9.723554946310257e-07, "logits/chosen": -0.6360510587692261, "logits/rejected": -0.6552263498306274, "logps/chosen": -38.470333099365234, "logps/rejected": -37.76917266845703, "loss": 0.433, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0477888584136963, "rewards/margins": 0.9280229806900024, "rewards/rejected": 0.11976580321788788, "step": 1215 }, { "epoch": 0.1254240773105788, "grad_norm": 24.875, "learning_rate": 9.717843271647248e-07, "logits/chosen": -0.5491260886192322, "logits/rejected": -0.53700852394104, "logps/chosen": -32.235050201416016, "logps/rejected": -40.60303497314453, "loss": 0.4124, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.0626546144485474, "rewards/margins": 0.8070840835571289, "rewards/rejected": 0.2555704116821289, "step": 1220 }, { "epoch": 0.1259381104143107, "grad_norm": 26.875, "learning_rate": 9.712131596984234e-07, "logits/chosen": -0.5597547888755798, "logits/rejected": -0.6623493432998657, "logps/chosen": -34.651893615722656, "logps/rejected": -40.72418975830078, "loss": 0.4172, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1699163913726807, "rewards/margins": 0.9309837222099304, "rewards/rejected": 0.2389325648546219, "step": 1225 }, { "epoch": 0.12645214351804257, "grad_norm": 24.25, "learning_rate": 9.706419922321223e-07, "logits/chosen": -0.592757523059845, "logits/rejected": -0.6291838884353638, "logps/chosen": -35.86713409423828, "logps/rejected": -46.61927032470703, "loss": 0.4132, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0918524265289307, "rewards/margins": 1.0920673608779907, "rewards/rejected": -0.00021498351998161525, "step": 1230 }, { "epoch": 0.12696617662177445, "grad_norm": 27.75, "learning_rate": 9.700708247658212e-07, "logits/chosen": -0.5790265202522278, "logits/rejected": -0.5743574500083923, "logps/chosen": -36.709754943847656, "logps/rejected": -44.686187744140625, "loss": 0.4349, "rewards/accuracies": 0.875, "rewards/chosen": 1.0638378858566284, "rewards/margins": 0.9088146090507507, "rewards/rejected": 0.15502327680587769, "step": 1235 }, { "epoch": 0.12748020972550633, "grad_norm": 27.5, "learning_rate": 9.694996572995203e-07, "logits/chosen": -0.6205686330795288, "logits/rejected": -0.5764986276626587, "logps/chosen": -34.36640548706055, "logps/rejected": -42.49652099609375, "loss": 0.4021, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0889451503753662, "rewards/margins": 0.8171971440315247, "rewards/rejected": 0.27174824476242065, "step": 1240 }, { "epoch": 0.12799424282923821, "grad_norm": 30.375, "learning_rate": 9.689284898332192e-07, "logits/chosen": -0.6665223836898804, "logits/rejected": -0.6890705823898315, "logps/chosen": -29.33636474609375, "logps/rejected": -41.305511474609375, "loss": 0.4182, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.1655733585357666, "rewards/margins": 0.9827142953872681, "rewards/rejected": 0.18285900354385376, "step": 1245 }, { "epoch": 0.1285082759329701, "grad_norm": 27.5, "learning_rate": 9.683573223669178e-07, "logits/chosen": -0.5321668386459351, "logits/rejected": -0.5425465106964111, "logps/chosen": -39.13972091674805, "logps/rejected": -45.65961456298828, "loss": 0.3908, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8661860227584839, "rewards/margins": 1.059276819229126, "rewards/rejected": -0.1930907964706421, "step": 1250 }, { "epoch": 0.12902230903670198, "grad_norm": 25.25, "learning_rate": 9.677861549006167e-07, "logits/chosen": -0.5257444977760315, "logits/rejected": -0.5703641176223755, "logps/chosen": -44.51708221435547, "logps/rejected": -41.25822448730469, "loss": 0.4382, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9908590316772461, "rewards/margins": 0.6756852865219116, "rewards/rejected": 0.31517380475997925, "step": 1255 }, { "epoch": 0.12953634214043386, "grad_norm": 27.125, "learning_rate": 9.672149874343158e-07, "logits/chosen": -0.5857222080230713, "logits/rejected": -0.595578670501709, "logps/chosen": -38.014732360839844, "logps/rejected": -46.055274963378906, "loss": 0.4111, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.129724383354187, "rewards/margins": 1.192350149154663, "rewards/rejected": -0.06262576580047607, "step": 1260 }, { "epoch": 0.1300503752441657, "grad_norm": 23.5, "learning_rate": 9.666438199680147e-07, "logits/chosen": -0.5369521975517273, "logits/rejected": -0.5665919184684753, "logps/chosen": -33.40223693847656, "logps/rejected": -42.504669189453125, "loss": 0.3606, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1352444887161255, "rewards/margins": 1.0560858249664307, "rewards/rejected": 0.0791587233543396, "step": 1265 }, { "epoch": 0.1305644083478976, "grad_norm": 26.0, "learning_rate": 9.660726525017133e-07, "logits/chosen": -0.5417221784591675, "logits/rejected": -0.5975457429885864, "logps/chosen": -44.38947296142578, "logps/rejected": -49.75096893310547, "loss": 0.3783, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.120849847793579, "rewards/margins": 1.2262380123138428, "rewards/rejected": -0.10538794845342636, "step": 1270 }, { "epoch": 0.13107844145162947, "grad_norm": 25.625, "learning_rate": 9.655014850354122e-07, "logits/chosen": -0.5396966934204102, "logits/rejected": -0.6626105904579163, "logps/chosen": -31.743194580078125, "logps/rejected": -43.811256408691406, "loss": 0.4067, "rewards/accuracies": 0.875, "rewards/chosen": 1.135941743850708, "rewards/margins": 1.066961646080017, "rewards/rejected": 0.0689801350235939, "step": 1275 }, { "epoch": 0.13159247455536136, "grad_norm": 24.75, "learning_rate": 9.649303175691113e-07, "logits/chosen": -0.5645908713340759, "logits/rejected": -0.5586274862289429, "logps/chosen": -33.78209686279297, "logps/rejected": -40.592079162597656, "loss": 0.3917, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9926704168319702, "rewards/margins": 0.9461511373519897, "rewards/rejected": 0.0465191975235939, "step": 1280 }, { "epoch": 0.13210650765909324, "grad_norm": 25.625, "learning_rate": 9.643591501028102e-07, "logits/chosen": -0.5143414735794067, "logits/rejected": -0.5304186344146729, "logps/chosen": -31.150196075439453, "logps/rejected": -37.015541076660156, "loss": 0.3854, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.9647483825683594, "rewards/margins": 0.9245759844779968, "rewards/rejected": 0.04017244279384613, "step": 1285 }, { "epoch": 0.13262054076282512, "grad_norm": 26.625, "learning_rate": 9.637879826365089e-07, "logits/chosen": -0.5385246276855469, "logits/rejected": -0.5669955015182495, "logps/chosen": -38.71395492553711, "logps/rejected": -41.088863372802734, "loss": 0.3632, "rewards/accuracies": 0.875, "rewards/chosen": 1.1248414516448975, "rewards/margins": 1.1073817014694214, "rewards/rejected": 0.017459595575928688, "step": 1290 }, { "epoch": 0.133134573866557, "grad_norm": 29.25, "learning_rate": 9.632168151702077e-07, "logits/chosen": -0.6932376623153687, "logits/rejected": -0.6863298416137695, "logps/chosen": -33.32396697998047, "logps/rejected": -38.54293441772461, "loss": 0.3954, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1738003492355347, "rewards/margins": 1.124878168106079, "rewards/rejected": 0.04892220348119736, "step": 1295 }, { "epoch": 0.13364860697028888, "grad_norm": 24.75, "learning_rate": 9.626456477039068e-07, "logits/chosen": -0.526185929775238, "logits/rejected": -0.5590692162513733, "logps/chosen": -34.102352142333984, "logps/rejected": -38.95243835449219, "loss": 0.3969, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.0642433166503906, "rewards/margins": 0.8016577959060669, "rewards/rejected": 0.2625856399536133, "step": 1300 }, { "epoch": 0.13416264007402076, "grad_norm": 27.375, "learning_rate": 9.620744802376057e-07, "logits/chosen": -0.47204700112342834, "logits/rejected": -0.5228668451309204, "logps/chosen": -33.2864875793457, "logps/rejected": -47.063758850097656, "loss": 0.3922, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1738137006759644, "rewards/margins": 1.116525650024414, "rewards/rejected": 0.05728799104690552, "step": 1305 }, { "epoch": 0.13467667317775264, "grad_norm": 29.625, "learning_rate": 9.615033127713046e-07, "logits/chosen": -0.6153759956359863, "logits/rejected": -0.5623060464859009, "logps/chosen": -38.96274948120117, "logps/rejected": -52.13995361328125, "loss": 0.3808, "rewards/accuracies": 0.875, "rewards/chosen": 1.19746994972229, "rewards/margins": 1.1374306678771973, "rewards/rejected": 0.06003915145993233, "step": 1310 }, { "epoch": 0.13519070628148452, "grad_norm": 27.0, "learning_rate": 9.609321453050034e-07, "logits/chosen": -0.6661285161972046, "logits/rejected": -0.6552878022193909, "logps/chosen": -37.784400939941406, "logps/rejected": -43.49180221557617, "loss": 0.379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1418242454528809, "rewards/margins": 1.1576061248779297, "rewards/rejected": -0.015781860798597336, "step": 1315 }, { "epoch": 0.1357047393852164, "grad_norm": 25.875, "learning_rate": 9.603609778387023e-07, "logits/chosen": -0.6171391010284424, "logits/rejected": -0.6460161209106445, "logps/chosen": -41.55107879638672, "logps/rejected": -39.03644561767578, "loss": 0.3661, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0615184307098389, "rewards/margins": 0.9245885610580444, "rewards/rejected": 0.13692983984947205, "step": 1320 }, { "epoch": 0.1362187724889483, "grad_norm": 26.0, "learning_rate": 9.597898103724012e-07, "logits/chosen": -0.5404579043388367, "logits/rejected": -0.591010570526123, "logps/chosen": -39.813133239746094, "logps/rejected": -46.30543899536133, "loss": 0.3591, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1119792461395264, "rewards/margins": 1.2971826791763306, "rewards/rejected": -0.18520358204841614, "step": 1325 }, { "epoch": 0.13673280559268017, "grad_norm": 27.25, "learning_rate": 9.592186429061e-07, "logits/chosen": -0.5963798761367798, "logits/rejected": -0.6730097532272339, "logps/chosen": -33.899269104003906, "logps/rejected": -47.00066375732422, "loss": 0.37, "rewards/accuracies": 0.875, "rewards/chosen": 0.9008532762527466, "rewards/margins": 1.1041648387908936, "rewards/rejected": -0.2033115178346634, "step": 1330 }, { "epoch": 0.13724683869641205, "grad_norm": 23.375, "learning_rate": 9.58647475439799e-07, "logits/chosen": -0.607312798500061, "logits/rejected": -0.6064570546150208, "logps/chosen": -36.922515869140625, "logps/rejected": -43.99715805053711, "loss": 0.3717, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0404947996139526, "rewards/margins": 1.082999587059021, "rewards/rejected": -0.04250502586364746, "step": 1335 }, { "epoch": 0.13776087180014393, "grad_norm": 28.125, "learning_rate": 9.580763079734978e-07, "logits/chosen": -0.5586491823196411, "logits/rejected": -0.5563176274299622, "logps/chosen": -39.51609420776367, "logps/rejected": -43.73973846435547, "loss": 0.3911, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1016077995300293, "rewards/margins": 1.2950866222381592, "rewards/rejected": -0.19347873330116272, "step": 1340 }, { "epoch": 0.1382749049038758, "grad_norm": 37.5, "learning_rate": 9.575051405071967e-07, "logits/chosen": -0.5680769085884094, "logits/rejected": -0.5704718828201294, "logps/chosen": -33.1668586730957, "logps/rejected": -42.24604034423828, "loss": 0.3615, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.076220989227295, "rewards/margins": 1.4372539520263672, "rewards/rejected": -0.36103296279907227, "step": 1345 }, { "epoch": 0.1387889380076077, "grad_norm": 28.625, "learning_rate": 9.569339730408956e-07, "logits/chosen": -0.6170913577079773, "logits/rejected": -0.549118161201477, "logps/chosen": -38.63761520385742, "logps/rejected": -41.86328125, "loss": 0.3718, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.9105095863342285, "rewards/margins": 0.9516828656196594, "rewards/rejected": -0.04117327928543091, "step": 1350 }, { "epoch": 0.13930297111133957, "grad_norm": 26.5, "learning_rate": 9.563628055745945e-07, "logits/chosen": -0.5807555913925171, "logits/rejected": -0.5314736366271973, "logps/chosen": -35.94896697998047, "logps/rejected": -45.942466735839844, "loss": 0.3716, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9321867823600769, "rewards/margins": 1.4163792133331299, "rewards/rejected": -0.48419222235679626, "step": 1355 }, { "epoch": 0.13981700421507146, "grad_norm": 25.125, "learning_rate": 9.557916381082933e-07, "logits/chosen": -0.5857502818107605, "logits/rejected": -0.6162732839584351, "logps/chosen": -37.03959274291992, "logps/rejected": -41.971282958984375, "loss": 0.3685, "rewards/accuracies": 0.75, "rewards/chosen": 0.8226981163024902, "rewards/margins": 0.9390947222709656, "rewards/rejected": -0.11639660596847534, "step": 1360 }, { "epoch": 0.14033103731880334, "grad_norm": 26.5, "learning_rate": 9.552204706419922e-07, "logits/chosen": -0.6473585963249207, "logits/rejected": -0.6109882593154907, "logps/chosen": -45.04735565185547, "logps/rejected": -45.16634750366211, "loss": 0.3804, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8300371170043945, "rewards/margins": 1.1152554750442505, "rewards/rejected": -0.28521841764450073, "step": 1365 }, { "epoch": 0.14084507042253522, "grad_norm": 29.875, "learning_rate": 9.54649303175691e-07, "logits/chosen": -0.5988157391548157, "logits/rejected": -0.6534436345100403, "logps/chosen": -34.84793472290039, "logps/rejected": -43.767005920410156, "loss": 0.3648, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9740058779716492, "rewards/margins": 1.3185245990753174, "rewards/rejected": -0.34451884031295776, "step": 1370 }, { "epoch": 0.1413591035262671, "grad_norm": 27.75, "learning_rate": 9.5407813570939e-07, "logits/chosen": -0.6263769268989563, "logits/rejected": -0.5877900123596191, "logps/chosen": -37.43610382080078, "logps/rejected": -47.8087272644043, "loss": 0.3431, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0564360618591309, "rewards/margins": 1.2013373374938965, "rewards/rejected": -0.14490123093128204, "step": 1375 }, { "epoch": 0.14187313662999898, "grad_norm": 28.375, "learning_rate": 9.535069682430889e-07, "logits/chosen": -0.5863901376724243, "logits/rejected": -0.6097357273101807, "logps/chosen": -34.135948181152344, "logps/rejected": -46.25653839111328, "loss": 0.394, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.045481562614441, "rewards/margins": 1.2178642749786377, "rewards/rejected": -0.1723826825618744, "step": 1380 }, { "epoch": 0.14238716973373086, "grad_norm": 34.25, "learning_rate": 9.529358007767877e-07, "logits/chosen": -0.6491494178771973, "logits/rejected": -0.692763090133667, "logps/chosen": -35.301002502441406, "logps/rejected": -46.93037796020508, "loss": 0.3612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9952324032783508, "rewards/margins": 1.1310077905654907, "rewards/rejected": -0.1357753574848175, "step": 1385 }, { "epoch": 0.14290120283746274, "grad_norm": 36.0, "learning_rate": 9.523646333104865e-07, "logits/chosen": -0.5077964067459106, "logits/rejected": -0.5805376768112183, "logps/chosen": -35.24088668823242, "logps/rejected": -44.04827117919922, "loss": 0.3673, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.9618390202522278, "rewards/margins": 1.0732640027999878, "rewards/rejected": -0.11142508685588837, "step": 1390 }, { "epoch": 0.14341523594119462, "grad_norm": 25.125, "learning_rate": 9.517934658441855e-07, "logits/chosen": -0.579708456993103, "logits/rejected": -0.5795426368713379, "logps/chosen": -37.3784294128418, "logps/rejected": -44.51187515258789, "loss": 0.3666, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9625225067138672, "rewards/margins": 0.9345510601997375, "rewards/rejected": 0.027971375733613968, "step": 1395 }, { "epoch": 0.1439292690449265, "grad_norm": 31.875, "learning_rate": 9.512222983778844e-07, "logits/chosen": -0.5555420517921448, "logits/rejected": -0.5371555089950562, "logps/chosen": -31.814319610595703, "logps/rejected": -44.28928756713867, "loss": 0.3564, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0128023624420166, "rewards/margins": 1.396436095237732, "rewards/rejected": -0.38363364338874817, "step": 1400 }, { "epoch": 0.1444433021486584, "grad_norm": 25.125, "learning_rate": 9.506511309115832e-07, "logits/chosen": -0.5305755138397217, "logits/rejected": -0.5713062286376953, "logps/chosen": -38.682823181152344, "logps/rejected": -43.66755676269531, "loss": 0.3759, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.1187782287597656, "rewards/margins": 1.0243339538574219, "rewards/rejected": 0.09444423764944077, "step": 1405 }, { "epoch": 0.14495733525239024, "grad_norm": 24.875, "learning_rate": 9.500799634452821e-07, "logits/chosen": -0.6030920147895813, "logits/rejected": -0.6258928775787354, "logps/chosen": -33.256752014160156, "logps/rejected": -42.61906814575195, "loss": 0.3932, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0679137706756592, "rewards/margins": 1.324344515800476, "rewards/rejected": -0.25643080472946167, "step": 1410 }, { "epoch": 0.14547136835612212, "grad_norm": 35.75, "learning_rate": 9.49508795978981e-07, "logits/chosen": -0.6234217882156372, "logits/rejected": -0.5398747324943542, "logps/chosen": -41.096473693847656, "logps/rejected": -49.444358825683594, "loss": 0.3782, "rewards/accuracies": 0.875, "rewards/chosen": 0.9583643078804016, "rewards/margins": 1.150061011314392, "rewards/rejected": -0.19169682264328003, "step": 1415 }, { "epoch": 0.145985401459854, "grad_norm": 26.25, "learning_rate": 9.489376285126799e-07, "logits/chosen": -0.542667806148529, "logits/rejected": -0.4991677701473236, "logps/chosen": -36.54325485229492, "logps/rejected": -44.59380340576172, "loss": 0.351, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.2381294965744019, "rewards/margins": 1.4808385372161865, "rewards/rejected": -0.24270892143249512, "step": 1420 }, { "epoch": 0.14649943456358588, "grad_norm": 29.125, "learning_rate": 9.483664610463787e-07, "logits/chosen": -0.60767662525177, "logits/rejected": -0.5882222652435303, "logps/chosen": -36.287261962890625, "logps/rejected": -42.0889892578125, "loss": 0.3641, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.085073471069336, "rewards/margins": 1.137346863746643, "rewards/rejected": -0.05227324366569519, "step": 1425 }, { "epoch": 0.14701346766731777, "grad_norm": 28.25, "learning_rate": 9.477952935800776e-07, "logits/chosen": -0.5600114464759827, "logits/rejected": -0.5862640142440796, "logps/chosen": -40.21672058105469, "logps/rejected": -46.35845947265625, "loss": 0.3812, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1154863834381104, "rewards/margins": 1.1855602264404297, "rewards/rejected": -0.07007394731044769, "step": 1430 }, { "epoch": 0.14752750077104965, "grad_norm": 25.375, "learning_rate": 9.472241261137766e-07, "logits/chosen": -0.5318673849105835, "logits/rejected": -0.5981088280677795, "logps/chosen": -35.91817092895508, "logps/rejected": -46.819915771484375, "loss": 0.3396, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9764559864997864, "rewards/margins": 1.3713948726654053, "rewards/rejected": -0.3949388265609741, "step": 1435 }, { "epoch": 0.14804153387478153, "grad_norm": 32.25, "learning_rate": 9.466529586474754e-07, "logits/chosen": -0.575641393661499, "logits/rejected": -0.5533297061920166, "logps/chosen": -35.46027755737305, "logps/rejected": -46.91266632080078, "loss": 0.3916, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0883461236953735, "rewards/margins": 1.333022117614746, "rewards/rejected": -0.2446759045124054, "step": 1440 }, { "epoch": 0.1485555669785134, "grad_norm": 24.125, "learning_rate": 9.460817911811743e-07, "logits/chosen": -0.5501986742019653, "logits/rejected": -0.5647501945495605, "logps/chosen": -37.67755126953125, "logps/rejected": -45.941566467285156, "loss": 0.3561, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.787689745426178, "rewards/margins": 1.2697255611419678, "rewards/rejected": -0.4820357859134674, "step": 1445 }, { "epoch": 0.1490696000822453, "grad_norm": 36.0, "learning_rate": 9.455106237148731e-07, "logits/chosen": -0.6030054092407227, "logits/rejected": -0.6231889128684998, "logps/chosen": -46.25151824951172, "logps/rejected": -51.89415740966797, "loss": 0.3283, "rewards/accuracies": 1.0, "rewards/chosen": 0.8896169662475586, "rewards/margins": 1.4785000085830688, "rewards/rejected": -0.5888828635215759, "step": 1450 }, { "epoch": 0.14958363318597717, "grad_norm": 29.75, "learning_rate": 9.449394562485721e-07, "logits/chosen": -0.6031926274299622, "logits/rejected": -0.6590163111686707, "logps/chosen": -33.42090606689453, "logps/rejected": -43.08900451660156, "loss": 0.3654, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9049280285835266, "rewards/margins": 1.106311559677124, "rewards/rejected": -0.2013835906982422, "step": 1455 }, { "epoch": 0.15009766628970905, "grad_norm": 33.25, "learning_rate": 9.443682887822709e-07, "logits/chosen": -0.6270116567611694, "logits/rejected": -0.6745739579200745, "logps/chosen": -38.40895462036133, "logps/rejected": -46.546695709228516, "loss": 0.3663, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9966541528701782, "rewards/margins": 1.1092488765716553, "rewards/rejected": -0.1125946193933487, "step": 1460 }, { "epoch": 0.15061169939344093, "grad_norm": 25.625, "learning_rate": 9.437971213159698e-07, "logits/chosen": -0.6041979193687439, "logits/rejected": -0.6235709190368652, "logps/chosen": -34.129554748535156, "logps/rejected": -46.976009368896484, "loss": 0.3147, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8534135818481445, "rewards/margins": 1.323385238647461, "rewards/rejected": -0.4699716567993164, "step": 1465 }, { "epoch": 0.15112573249717282, "grad_norm": 28.125, "learning_rate": 9.432259538496686e-07, "logits/chosen": -0.5907290577888489, "logits/rejected": -0.6288011074066162, "logps/chosen": -40.58198547363281, "logps/rejected": -46.66529083251953, "loss": 0.4151, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9403311014175415, "rewards/margins": 1.2740390300750732, "rewards/rejected": -0.33370792865753174, "step": 1470 }, { "epoch": 0.1516397656009047, "grad_norm": 28.625, "learning_rate": 9.426547863833676e-07, "logits/chosen": -0.5679578185081482, "logits/rejected": -0.5709707736968994, "logps/chosen": -38.99807357788086, "logps/rejected": -43.439903259277344, "loss": 0.3926, "rewards/accuracies": 0.875, "rewards/chosen": 0.8586533665657043, "rewards/margins": 1.0244338512420654, "rewards/rejected": -0.16578057408332825, "step": 1475 }, { "epoch": 0.15215379870463658, "grad_norm": 28.75, "learning_rate": 9.420836189170664e-07, "logits/chosen": -0.5597811937332153, "logits/rejected": -0.5616393089294434, "logps/chosen": -33.34391784667969, "logps/rejected": -44.85847854614258, "loss": 0.3886, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.8566901087760925, "rewards/margins": 0.9734767079353333, "rewards/rejected": -0.11678646504878998, "step": 1480 }, { "epoch": 0.15266783180836846, "grad_norm": 27.5, "learning_rate": 9.415124514507653e-07, "logits/chosen": -0.5570103526115417, "logits/rejected": -0.5228220224380493, "logps/chosen": -33.29974365234375, "logps/rejected": -46.793861389160156, "loss": 0.3618, "rewards/accuracies": 0.875, "rewards/chosen": 0.9743233919143677, "rewards/margins": 1.3534283638000488, "rewards/rejected": -0.37910494208335876, "step": 1485 }, { "epoch": 0.15318186491210034, "grad_norm": 28.5, "learning_rate": 9.409412839844642e-07, "logits/chosen": -0.47686314582824707, "logits/rejected": -0.5550869703292847, "logps/chosen": -35.8111686706543, "logps/rejected": -48.00031280517578, "loss": 0.3566, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0368844270706177, "rewards/margins": 1.4789998531341553, "rewards/rejected": -0.44211554527282715, "step": 1490 }, { "epoch": 0.15369589801583222, "grad_norm": 28.875, "learning_rate": 9.403701165181631e-07, "logits/chosen": -0.5914559960365295, "logits/rejected": -0.5757344961166382, "logps/chosen": -36.20911407470703, "logps/rejected": -42.29595947265625, "loss": 0.3625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7366764545440674, "rewards/margins": 0.8050495386123657, "rewards/rejected": -0.06837305426597595, "step": 1495 }, { "epoch": 0.1542099311195641, "grad_norm": 26.25, "learning_rate": 9.39798949051862e-07, "logits/chosen": -0.5922192335128784, "logits/rejected": -0.5931317210197449, "logps/chosen": -40.62456512451172, "logps/rejected": -48.44416046142578, "loss": 0.3506, "rewards/accuracies": 0.875, "rewards/chosen": 0.9827395677566528, "rewards/margins": 1.1429075002670288, "rewards/rejected": -0.16016802191734314, "step": 1500 }, { "epoch": 0.15472396422329598, "grad_norm": 24.875, "learning_rate": 9.392277815855608e-07, "logits/chosen": -0.5423511862754822, "logits/rejected": -0.5967980623245239, "logps/chosen": -35.088340759277344, "logps/rejected": -47.10858917236328, "loss": 0.318, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8123103976249695, "rewards/margins": 1.2158576250076294, "rewards/rejected": -0.4035471975803375, "step": 1505 }, { "epoch": 0.15523799732702787, "grad_norm": 26.25, "learning_rate": 9.386566141192597e-07, "logits/chosen": -0.6271685361862183, "logits/rejected": -0.6033157110214233, "logps/chosen": -37.37553787231445, "logps/rejected": -49.87334060668945, "loss": 0.3567, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8434385061264038, "rewards/margins": 1.2074424028396606, "rewards/rejected": -0.36400386691093445, "step": 1510 }, { "epoch": 0.15575203043075975, "grad_norm": 34.5, "learning_rate": 9.380854466529586e-07, "logits/chosen": -0.6201636791229248, "logits/rejected": -0.6777393817901611, "logps/chosen": -40.047027587890625, "logps/rejected": -47.85330581665039, "loss": 0.3414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.739526093006134, "rewards/margins": 0.9982763528823853, "rewards/rejected": -0.25875014066696167, "step": 1515 }, { "epoch": 0.15626606353449163, "grad_norm": 35.0, "learning_rate": 9.375142791866575e-07, "logits/chosen": -0.5901078581809998, "logits/rejected": -0.5878726243972778, "logps/chosen": -39.492733001708984, "logps/rejected": -46.60566711425781, "loss": 0.3622, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.963487982749939, "rewards/margins": 1.2939823865890503, "rewards/rejected": -0.3304945230484009, "step": 1520 }, { "epoch": 0.1567800966382235, "grad_norm": 27.0, "learning_rate": 9.369431117203563e-07, "logits/chosen": -0.6314666867256165, "logits/rejected": -0.5588493943214417, "logps/chosen": -32.61498260498047, "logps/rejected": -42.43160629272461, "loss": 0.373, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0559009313583374, "rewards/margins": 1.1820871829986572, "rewards/rejected": -0.12618613243103027, "step": 1525 }, { "epoch": 0.1572941297419554, "grad_norm": 27.375, "learning_rate": 9.363719442540553e-07, "logits/chosen": -0.5716749429702759, "logits/rejected": -0.5829801559448242, "logps/chosen": -35.568931579589844, "logps/rejected": -48.20122528076172, "loss": 0.3334, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9897148013114929, "rewards/margins": 1.204551100730896, "rewards/rejected": -0.21483652293682098, "step": 1530 }, { "epoch": 0.15780816284568727, "grad_norm": 24.625, "learning_rate": 9.358007767877542e-07, "logits/chosen": -0.4924170970916748, "logits/rejected": -0.5371814370155334, "logps/chosen": -39.55474853515625, "logps/rejected": -45.96873474121094, "loss": 0.3369, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9321895837783813, "rewards/margins": 1.250876545906067, "rewards/rejected": -0.3186870217323303, "step": 1535 }, { "epoch": 0.15832219594941915, "grad_norm": 26.5, "learning_rate": 9.35229609321453e-07, "logits/chosen": -0.5853351354598999, "logits/rejected": -0.5607547760009766, "logps/chosen": -39.12966537475586, "logps/rejected": -48.40225601196289, "loss": 0.3307, "rewards/accuracies": 0.875, "rewards/chosen": 0.9521196484565735, "rewards/margins": 1.4796549081802368, "rewards/rejected": -0.5275352001190186, "step": 1540 }, { "epoch": 0.15883622905315103, "grad_norm": 32.25, "learning_rate": 9.346584418551519e-07, "logits/chosen": -0.4937865734100342, "logits/rejected": -0.5220987200737, "logps/chosen": -35.6842155456543, "logps/rejected": -40.7150764465332, "loss": 0.3406, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9435647130012512, "rewards/margins": 1.1760072708129883, "rewards/rejected": -0.23244261741638184, "step": 1545 }, { "epoch": 0.15935026215688292, "grad_norm": 22.625, "learning_rate": 9.340872743888508e-07, "logits/chosen": -0.5895169973373413, "logits/rejected": -0.5933536291122437, "logps/chosen": -40.88249588012695, "logps/rejected": -46.960784912109375, "loss": 0.3781, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6926434636116028, "rewards/margins": 1.1002981662750244, "rewards/rejected": -0.40765461325645447, "step": 1550 }, { "epoch": 0.1598642952606148, "grad_norm": 25.25, "learning_rate": 9.335161069225497e-07, "logits/chosen": -0.525857150554657, "logits/rejected": -0.5663772821426392, "logps/chosen": -35.8292350769043, "logps/rejected": -46.87906265258789, "loss": 0.3118, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.9350957870483398, "rewards/margins": 1.1798839569091797, "rewards/rejected": -0.24478812515735626, "step": 1555 }, { "epoch": 0.16037832836434665, "grad_norm": 27.75, "learning_rate": 9.329449394562485e-07, "logits/chosen": -0.4423489570617676, "logits/rejected": -0.5264717936515808, "logps/chosen": -32.49250793457031, "logps/rejected": -43.6275520324707, "loss": 0.3552, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8771027326583862, "rewards/margins": 1.5362975597381592, "rewards/rejected": -0.659194827079773, "step": 1560 }, { "epoch": 0.16089236146807853, "grad_norm": 28.0, "learning_rate": 9.323737719899474e-07, "logits/chosen": -0.5118684768676758, "logits/rejected": -0.5252355337142944, "logps/chosen": -31.634740829467773, "logps/rejected": -44.609718322753906, "loss": 0.3639, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8576010465621948, "rewards/margins": 1.3271108865737915, "rewards/rejected": -0.4695097804069519, "step": 1565 }, { "epoch": 0.1614063945718104, "grad_norm": 23.75, "learning_rate": 9.318026045236463e-07, "logits/chosen": -0.4733484387397766, "logits/rejected": -0.45012950897216797, "logps/chosen": -42.06184768676758, "logps/rejected": -50.617061614990234, "loss": 0.3604, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8331177830696106, "rewards/margins": 1.3329589366912842, "rewards/rejected": -0.4998410642147064, "step": 1570 }, { "epoch": 0.1619204276755423, "grad_norm": 26.125, "learning_rate": 9.312314370573452e-07, "logits/chosen": -0.5228344202041626, "logits/rejected": -0.6128243207931519, "logps/chosen": -43.06123733520508, "logps/rejected": -51.10765838623047, "loss": 0.3211, "rewards/accuracies": 0.875, "rewards/chosen": 0.9386968612670898, "rewards/margins": 1.6207917928695679, "rewards/rejected": -0.682094931602478, "step": 1575 }, { "epoch": 0.16243446077927418, "grad_norm": 26.125, "learning_rate": 9.30660269591044e-07, "logits/chosen": -0.5264501571655273, "logits/rejected": -0.5323842763900757, "logps/chosen": -39.829193115234375, "logps/rejected": -46.071258544921875, "loss": 0.3881, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7928739786148071, "rewards/margins": 1.4999399185180664, "rewards/rejected": -0.7070659399032593, "step": 1580 }, { "epoch": 0.16294849388300606, "grad_norm": 31.625, "learning_rate": 9.300891021247429e-07, "logits/chosen": -0.5565804243087769, "logits/rejected": -0.595677375793457, "logps/chosen": -34.574798583984375, "logps/rejected": -46.94990539550781, "loss": 0.3732, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9490131139755249, "rewards/margins": 1.4082289934158325, "rewards/rejected": -0.4592157304286957, "step": 1585 }, { "epoch": 0.16346252698673794, "grad_norm": 27.25, "learning_rate": 9.295179346584419e-07, "logits/chosen": -0.64186692237854, "logits/rejected": -0.6467679142951965, "logps/chosen": -36.54703903198242, "logps/rejected": -47.2222900390625, "loss": 0.3314, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0333263874053955, "rewards/margins": 1.3895410299301147, "rewards/rejected": -0.35621485114097595, "step": 1590 }, { "epoch": 0.16397656009046982, "grad_norm": 35.25, "learning_rate": 9.289467671921407e-07, "logits/chosen": -0.6021275520324707, "logits/rejected": -0.5866779088973999, "logps/chosen": -38.281761169433594, "logps/rejected": -48.19720458984375, "loss": 0.3581, "rewards/accuracies": 0.875, "rewards/chosen": 0.8525298237800598, "rewards/margins": 1.3657069206237793, "rewards/rejected": -0.5131770372390747, "step": 1595 }, { "epoch": 0.1644905931942017, "grad_norm": 37.0, "learning_rate": 9.283755997258396e-07, "logits/chosen": -0.5931041836738586, "logits/rejected": -0.6428229212760925, "logps/chosen": -36.89219284057617, "logps/rejected": -46.72726821899414, "loss": 0.3848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7808471918106079, "rewards/margins": 1.0728747844696045, "rewards/rejected": -0.29202768206596375, "step": 1600 }, { "epoch": 0.16500462629793358, "grad_norm": 27.0, "learning_rate": 9.278044322595384e-07, "logits/chosen": -0.5861050486564636, "logits/rejected": -0.550404965877533, "logps/chosen": -41.492027282714844, "logps/rejected": -49.71320343017578, "loss": 0.3298, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8761526346206665, "rewards/margins": 1.547926902770996, "rewards/rejected": -0.67177414894104, "step": 1605 }, { "epoch": 0.16551865940166546, "grad_norm": 31.0, "learning_rate": 9.272332647932374e-07, "logits/chosen": -0.555610716342926, "logits/rejected": -0.5396636128425598, "logps/chosen": -38.35833740234375, "logps/rejected": -49.36560821533203, "loss": 0.3525, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.910391628742218, "rewards/margins": 1.5080621242523193, "rewards/rejected": -0.5976706147193909, "step": 1610 }, { "epoch": 0.16603269250539734, "grad_norm": 33.0, "learning_rate": 9.266620973269362e-07, "logits/chosen": -0.5802830457687378, "logits/rejected": -0.5528700351715088, "logps/chosen": -36.526790618896484, "logps/rejected": -50.34374237060547, "loss": 0.3624, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9005420804023743, "rewards/margins": 1.5528391599655151, "rewards/rejected": -0.6522972583770752, "step": 1615 }, { "epoch": 0.16654672560912923, "grad_norm": 41.0, "learning_rate": 9.260909298606351e-07, "logits/chosen": -0.5216872692108154, "logits/rejected": -0.613303542137146, "logps/chosen": -42.73640441894531, "logps/rejected": -52.901336669921875, "loss": 0.3359, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8667834401130676, "rewards/margins": 1.4533493518829346, "rewards/rejected": -0.5865658521652222, "step": 1620 }, { "epoch": 0.1670607587128611, "grad_norm": 33.75, "learning_rate": 9.255197623943339e-07, "logits/chosen": -0.5968568325042725, "logits/rejected": -0.5647949576377869, "logps/chosen": -34.05177688598633, "logps/rejected": -48.65740203857422, "loss": 0.3776, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8313129544258118, "rewards/margins": 1.1912543773651123, "rewards/rejected": -0.35994139313697815, "step": 1625 }, { "epoch": 0.167574791816593, "grad_norm": 28.375, "learning_rate": 9.249485949280329e-07, "logits/chosen": -0.5135396718978882, "logits/rejected": -0.5080360174179077, "logps/chosen": -39.960289001464844, "logps/rejected": -48.26568603515625, "loss": 0.3463, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8418840169906616, "rewards/margins": 1.1426502466201782, "rewards/rejected": -0.30076608061790466, "step": 1630 }, { "epoch": 0.16808882492032487, "grad_norm": 26.0, "learning_rate": 9.243774274617318e-07, "logits/chosen": -0.6107691526412964, "logits/rejected": -0.6132943630218506, "logps/chosen": -36.18310546875, "logps/rejected": -49.48863983154297, "loss": 0.3394, "rewards/accuracies": 1.0, "rewards/chosen": 0.9156566858291626, "rewards/margins": 1.381001353263855, "rewards/rejected": -0.4653448164463043, "step": 1635 }, { "epoch": 0.16860285802405675, "grad_norm": 33.5, "learning_rate": 9.238062599954306e-07, "logits/chosen": -0.5731534957885742, "logits/rejected": -0.5616478323936462, "logps/chosen": -38.12310028076172, "logps/rejected": -47.730804443359375, "loss": 0.338, "rewards/accuracies": 0.875, "rewards/chosen": 0.9960298538208008, "rewards/margins": 1.229662537574768, "rewards/rejected": -0.23363271355628967, "step": 1640 }, { "epoch": 0.16911689112778863, "grad_norm": 26.125, "learning_rate": 9.232350925291294e-07, "logits/chosen": -0.6290663480758667, "logits/rejected": -0.6236658692359924, "logps/chosen": -28.918155670166016, "logps/rejected": -44.66530990600586, "loss": 0.3131, "rewards/accuracies": 0.875, "rewards/chosen": 1.0585908889770508, "rewards/margins": 1.5927239656448364, "rewards/rejected": -0.5341330766677856, "step": 1645 }, { "epoch": 0.1696309242315205, "grad_norm": 44.0, "learning_rate": 9.226639250628284e-07, "logits/chosen": -0.5637052059173584, "logits/rejected": -0.5912858247756958, "logps/chosen": -38.746910095214844, "logps/rejected": -50.323448181152344, "loss": 0.3658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8208960294723511, "rewards/margins": 1.4183322191238403, "rewards/rejected": -0.597436249256134, "step": 1650 }, { "epoch": 0.1701449573352524, "grad_norm": 28.5, "learning_rate": 9.220927575965273e-07, "logits/chosen": -0.5592361688613892, "logits/rejected": -0.5658442974090576, "logps/chosen": -39.94562530517578, "logps/rejected": -49.79155349731445, "loss": 0.3359, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7619194984436035, "rewards/margins": 1.356593132019043, "rewards/rejected": -0.594673752784729, "step": 1655 }, { "epoch": 0.17065899043898428, "grad_norm": 39.5, "learning_rate": 9.215215901302261e-07, "logits/chosen": -0.5109578371047974, "logits/rejected": -0.4874093532562256, "logps/chosen": -38.02873229980469, "logps/rejected": -51.2262077331543, "loss": 0.3433, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7420952320098877, "rewards/margins": 1.363847017288208, "rewards/rejected": -0.6217517852783203, "step": 1660 }, { "epoch": 0.17117302354271616, "grad_norm": 28.125, "learning_rate": 9.20950422663925e-07, "logits/chosen": -0.44042736291885376, "logits/rejected": -0.509255051612854, "logps/chosen": -35.12376022338867, "logps/rejected": -50.78401565551758, "loss": 0.3501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7420121431350708, "rewards/margins": 1.2213551998138428, "rewards/rejected": -0.4793429374694824, "step": 1665 }, { "epoch": 0.17168705664644804, "grad_norm": 43.5, "learning_rate": 9.203792551976239e-07, "logits/chosen": -0.5704957842826843, "logits/rejected": -0.5615702271461487, "logps/chosen": -34.875911712646484, "logps/rejected": -46.93279266357422, "loss": 0.3347, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8903500437736511, "rewards/margins": 1.4277875423431396, "rewards/rejected": -0.5374374389648438, "step": 1670 }, { "epoch": 0.17220108975017992, "grad_norm": 27.125, "learning_rate": 9.198080877313228e-07, "logits/chosen": -0.5341443419456482, "logits/rejected": -0.5420817136764526, "logps/chosen": -41.29304885864258, "logps/rejected": -46.96661376953125, "loss": 0.3759, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8897830247879028, "rewards/margins": 1.4017813205718994, "rewards/rejected": -0.5119984149932861, "step": 1675 }, { "epoch": 0.1727151228539118, "grad_norm": 32.0, "learning_rate": 9.192369202650216e-07, "logits/chosen": -0.6349416375160217, "logits/rejected": -0.6147390604019165, "logps/chosen": -36.48649597167969, "logps/rejected": -53.22981643676758, "loss": 0.3422, "rewards/accuracies": 0.875, "rewards/chosen": 0.8972150683403015, "rewards/margins": 1.5522397756576538, "rewards/rejected": -0.6550248861312866, "step": 1680 }, { "epoch": 0.17322915595764368, "grad_norm": 28.5, "learning_rate": 9.186657527987205e-07, "logits/chosen": -0.5536549091339111, "logits/rejected": -0.6197288036346436, "logps/chosen": -35.02629852294922, "logps/rejected": -46.736473083496094, "loss": 0.3858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8370817303657532, "rewards/margins": 1.0324485301971436, "rewards/rejected": -0.19536668062210083, "step": 1685 }, { "epoch": 0.17374318906137556, "grad_norm": 23.0, "learning_rate": 9.180945853324194e-07, "logits/chosen": -0.537802517414093, "logits/rejected": -0.5996926426887512, "logps/chosen": -36.54448699951172, "logps/rejected": -50.95597457885742, "loss": 0.3191, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1492102146148682, "rewards/margins": 1.609212875366211, "rewards/rejected": -0.4600028097629547, "step": 1690 }, { "epoch": 0.17425722216510744, "grad_norm": 28.125, "learning_rate": 9.175234178661183e-07, "logits/chosen": -0.5580058097839355, "logits/rejected": -0.600119948387146, "logps/chosen": -36.9126091003418, "logps/rejected": -53.536102294921875, "loss": 0.3598, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7872682809829712, "rewards/margins": 1.342641830444336, "rewards/rejected": -0.5553735494613647, "step": 1695 }, { "epoch": 0.17477125526883933, "grad_norm": 26.0, "learning_rate": 9.169522503998172e-07, "logits/chosen": -0.4811463952064514, "logits/rejected": -0.5151039361953735, "logps/chosen": -30.746318817138672, "logps/rejected": -44.27534484863281, "loss": 0.3552, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9314136505126953, "rewards/margins": 1.339023232460022, "rewards/rejected": -0.4076095521450043, "step": 1700 }, { "epoch": 0.1752852883725712, "grad_norm": 28.625, "learning_rate": 9.16381082933516e-07, "logits/chosen": -0.5791860222816467, "logits/rejected": -0.590351939201355, "logps/chosen": -36.346771240234375, "logps/rejected": -45.53864669799805, "loss": 0.3463, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7260745167732239, "rewards/margins": 1.3506381511688232, "rewards/rejected": -0.6245637536048889, "step": 1705 }, { "epoch": 0.17579932147630306, "grad_norm": 31.125, "learning_rate": 9.15809915467215e-07, "logits/chosen": -0.5912117958068848, "logits/rejected": -0.6145442724227905, "logps/chosen": -38.339454650878906, "logps/rejected": -48.76460647583008, "loss": 0.37, "rewards/accuracies": 0.875, "rewards/chosen": 0.7842147946357727, "rewards/margins": 1.0616796016693115, "rewards/rejected": -0.27746471762657166, "step": 1710 }, { "epoch": 0.17631335458003494, "grad_norm": 37.0, "learning_rate": 9.152387480009138e-07, "logits/chosen": -0.5378602147102356, "logits/rejected": -0.5685992240905762, "logps/chosen": -41.60502243041992, "logps/rejected": -53.668907165527344, "loss": 0.312, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6825618147850037, "rewards/margins": 1.3986537456512451, "rewards/rejected": -0.7160921096801758, "step": 1715 }, { "epoch": 0.17682738768376682, "grad_norm": 26.375, "learning_rate": 9.146675805346127e-07, "logits/chosen": -0.5516695976257324, "logits/rejected": -0.5298255085945129, "logps/chosen": -44.02328872680664, "logps/rejected": -45.091827392578125, "loss": 0.3575, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0404813289642334, "rewards/margins": 1.3414922952651978, "rewards/rejected": -0.30101075768470764, "step": 1720 }, { "epoch": 0.1773414207874987, "grad_norm": 30.875, "learning_rate": 9.140964130683115e-07, "logits/chosen": -0.5522476434707642, "logits/rejected": -0.5840319991111755, "logps/chosen": -41.71228790283203, "logps/rejected": -52.55419921875, "loss": 0.3378, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7162148952484131, "rewards/margins": 1.618085265159607, "rewards/rejected": -0.9018704295158386, "step": 1725 }, { "epoch": 0.17785545389123059, "grad_norm": 29.25, "learning_rate": 9.135252456020105e-07, "logits/chosen": -0.5032190084457397, "logits/rejected": -0.5448973178863525, "logps/chosen": -37.97126007080078, "logps/rejected": -45.5742301940918, "loss": 0.3457, "rewards/accuracies": 0.875, "rewards/chosen": 0.832854151725769, "rewards/margins": 1.2363859415054321, "rewards/rejected": -0.40353164076805115, "step": 1730 }, { "epoch": 0.17836948699496247, "grad_norm": 30.875, "learning_rate": 9.129540781357093e-07, "logits/chosen": -0.5898447036743164, "logits/rejected": -0.5702348947525024, "logps/chosen": -38.763916015625, "logps/rejected": -45.034610748291016, "loss": 0.3676, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.8239415287971497, "rewards/margins": 1.1798151731491089, "rewards/rejected": -0.35587364435195923, "step": 1735 }, { "epoch": 0.17888352009869435, "grad_norm": 45.25, "learning_rate": 9.123829106694082e-07, "logits/chosen": -0.6861371397972107, "logits/rejected": -0.6756376028060913, "logps/chosen": -42.32303237915039, "logps/rejected": -53.78196334838867, "loss": 0.3407, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9448530077934265, "rewards/margins": 1.6541986465454102, "rewards/rejected": -0.7093454599380493, "step": 1740 }, { "epoch": 0.17939755320242623, "grad_norm": 26.75, "learning_rate": 9.118117432031072e-07, "logits/chosen": -0.5713189840316772, "logits/rejected": -0.6098989844322205, "logps/chosen": -41.05432891845703, "logps/rejected": -53.79986572265625, "loss": 0.3367, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9782170057296753, "rewards/margins": 1.7790740728378296, "rewards/rejected": -0.8008570671081543, "step": 1745 }, { "epoch": 0.1799115863061581, "grad_norm": 26.0, "learning_rate": 9.11240575736806e-07, "logits/chosen": -0.5303887724876404, "logits/rejected": -0.5520049333572388, "logps/chosen": -36.35606002807617, "logps/rejected": -49.00086975097656, "loss": 0.3365, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.707277238368988, "rewards/margins": 1.4833412170410156, "rewards/rejected": -0.7760639786720276, "step": 1750 }, { "epoch": 0.18042561940989, "grad_norm": 33.25, "learning_rate": 9.106694082705049e-07, "logits/chosen": -0.615024209022522, "logits/rejected": -0.5689089894294739, "logps/chosen": -38.048274993896484, "logps/rejected": -44.77051544189453, "loss": 0.3692, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8296100497245789, "rewards/margins": 1.1342989206314087, "rewards/rejected": -0.30468881130218506, "step": 1755 }, { "epoch": 0.18093965251362187, "grad_norm": 39.75, "learning_rate": 9.100982408042037e-07, "logits/chosen": -0.5907160043716431, "logits/rejected": -0.6618340015411377, "logps/chosen": -35.08910369873047, "logps/rejected": -51.98505783081055, "loss": 0.3266, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0039924383163452, "rewards/margins": 1.7724307775497437, "rewards/rejected": -0.768438458442688, "step": 1760 }, { "epoch": 0.18145368561735375, "grad_norm": 25.5, "learning_rate": 9.095270733379027e-07, "logits/chosen": -0.5819886326789856, "logits/rejected": -0.5960472226142883, "logps/chosen": -35.85673522949219, "logps/rejected": -49.80208206176758, "loss": 0.3074, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0264055728912354, "rewards/margins": 1.810766577720642, "rewards/rejected": -0.7843610048294067, "step": 1765 }, { "epoch": 0.18196771872108564, "grad_norm": 26.75, "learning_rate": 9.089559058716015e-07, "logits/chosen": -0.5979526042938232, "logits/rejected": -0.617988646030426, "logps/chosen": -35.13605499267578, "logps/rejected": -46.23804473876953, "loss": 0.2941, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8984397649765015, "rewards/margins": 1.3603730201721191, "rewards/rejected": -0.461933434009552, "step": 1770 }, { "epoch": 0.18248175182481752, "grad_norm": 29.25, "learning_rate": 9.083847384053004e-07, "logits/chosen": -0.5672441720962524, "logits/rejected": -0.5803080201148987, "logps/chosen": -35.72931671142578, "logps/rejected": -44.9852294921875, "loss": 0.335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8503983616828918, "rewards/margins": 1.0599360466003418, "rewards/rejected": -0.20953771471977234, "step": 1775 }, { "epoch": 0.1829957849285494, "grad_norm": 38.0, "learning_rate": 9.078135709389992e-07, "logits/chosen": -0.48103171586990356, "logits/rejected": -0.4719775319099426, "logps/chosen": -37.937652587890625, "logps/rejected": -44.688011169433594, "loss": 0.3429, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8560273051261902, "rewards/margins": 1.0132917165756226, "rewards/rejected": -0.1572643518447876, "step": 1780 }, { "epoch": 0.18350981803228128, "grad_norm": 25.0, "learning_rate": 9.072424034726982e-07, "logits/chosen": -0.5552318692207336, "logits/rejected": -0.6085149049758911, "logps/chosen": -40.5311279296875, "logps/rejected": -53.843048095703125, "loss": 0.3186, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.818136990070343, "rewards/margins": 1.7109352350234985, "rewards/rejected": -0.892798125743866, "step": 1785 }, { "epoch": 0.18402385113601316, "grad_norm": 38.25, "learning_rate": 9.066712360063971e-07, "logits/chosen": -0.5245305895805359, "logits/rejected": -0.5706087350845337, "logps/chosen": -39.93595886230469, "logps/rejected": -53.5020866394043, "loss": 0.3117, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9663870930671692, "rewards/margins": 1.6548535823822021, "rewards/rejected": -0.6884667277336121, "step": 1790 }, { "epoch": 0.18453788423974504, "grad_norm": 42.75, "learning_rate": 9.061000685400959e-07, "logits/chosen": -0.5434181690216064, "logits/rejected": -0.5424691438674927, "logps/chosen": -46.8280029296875, "logps/rejected": -50.63633346557617, "loss": 0.3475, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5327618718147278, "rewards/margins": 1.0692431926727295, "rewards/rejected": -0.5364813804626465, "step": 1795 }, { "epoch": 0.18505191734347692, "grad_norm": 26.625, "learning_rate": 9.055289010737947e-07, "logits/chosen": -0.5965789556503296, "logits/rejected": -0.6833090782165527, "logps/chosen": -35.97347640991211, "logps/rejected": -50.015830993652344, "loss": 0.3579, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8256972432136536, "rewards/margins": 1.5530413389205933, "rewards/rejected": -0.7273441553115845, "step": 1800 }, { "epoch": 0.1855659504472088, "grad_norm": 29.0, "learning_rate": 9.049577336074937e-07, "logits/chosen": -0.5780996084213257, "logits/rejected": -0.6021654009819031, "logps/chosen": -42.54791259765625, "logps/rejected": -50.76902389526367, "loss": 0.3175, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8480795621871948, "rewards/margins": 1.369022011756897, "rewards/rejected": -0.5209423303604126, "step": 1805 }, { "epoch": 0.1860799835509407, "grad_norm": 29.375, "learning_rate": 9.043865661411926e-07, "logits/chosen": -0.6180293560028076, "logits/rejected": -0.6061933636665344, "logps/chosen": -42.599586486816406, "logps/rejected": -52.82392120361328, "loss": 0.3278, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8630892038345337, "rewards/margins": 1.523090124130249, "rewards/rejected": -0.6600009202957153, "step": 1810 }, { "epoch": 0.18659401665467257, "grad_norm": 52.75, "learning_rate": 9.038153986748914e-07, "logits/chosen": -0.5488831400871277, "logits/rejected": -0.514278769493103, "logps/chosen": -36.33308792114258, "logps/rejected": -46.7570915222168, "loss": 0.3476, "rewards/accuracies": 0.875, "rewards/chosen": 1.0091992616653442, "rewards/margins": 1.5923959016799927, "rewards/rejected": -0.583196759223938, "step": 1815 }, { "epoch": 0.18710804975840445, "grad_norm": 28.375, "learning_rate": 9.032442312085903e-07, "logits/chosen": -0.6047636866569519, "logits/rejected": -0.6591908931732178, "logps/chosen": -37.921905517578125, "logps/rejected": -48.45692825317383, "loss": 0.3377, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.863241970539093, "rewards/margins": 1.2512167692184448, "rewards/rejected": -0.38797491788864136, "step": 1820 }, { "epoch": 0.18762208286213633, "grad_norm": 27.125, "learning_rate": 9.026730637422892e-07, "logits/chosen": -0.6328172087669373, "logits/rejected": -0.5404263138771057, "logps/chosen": -34.27733612060547, "logps/rejected": -52.089630126953125, "loss": 0.3387, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9299665689468384, "rewards/margins": 1.5193182229995728, "rewards/rejected": -0.5893517136573792, "step": 1825 }, { "epoch": 0.1881361159658682, "grad_norm": 33.25, "learning_rate": 9.021018962759881e-07, "logits/chosen": -0.6239026784896851, "logits/rejected": -0.6828736662864685, "logps/chosen": -39.81781768798828, "logps/rejected": -55.92618942260742, "loss": 0.3364, "rewards/accuracies": 0.875, "rewards/chosen": 0.7860864996910095, "rewards/margins": 1.6750195026397705, "rewards/rejected": -0.8889328837394714, "step": 1830 }, { "epoch": 0.1886501490696001, "grad_norm": 30.75, "learning_rate": 9.01530728809687e-07, "logits/chosen": -0.6041110754013062, "logits/rejected": -0.5835949778556824, "logps/chosen": -42.65482711791992, "logps/rejected": -51.35002899169922, "loss": 0.3578, "rewards/accuracies": 0.875, "rewards/chosen": 0.786199688911438, "rewards/margins": 1.5150748491287231, "rewards/rejected": -0.7288751602172852, "step": 1835 }, { "epoch": 0.18916418217333197, "grad_norm": 30.375, "learning_rate": 9.009595613433858e-07, "logits/chosen": -0.5224886536598206, "logits/rejected": -0.5487086176872253, "logps/chosen": -37.708595275878906, "logps/rejected": -52.05681610107422, "loss": 0.3646, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.0314538478851318, "rewards/margins": 1.5140646696090698, "rewards/rejected": -0.4826107919216156, "step": 1840 }, { "epoch": 0.18967821527706386, "grad_norm": 33.75, "learning_rate": 9.003883938770847e-07, "logits/chosen": -0.5281314253807068, "logits/rejected": -0.5230777263641357, "logps/chosen": -40.57289123535156, "logps/rejected": -45.482608795166016, "loss": 0.3549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6483539342880249, "rewards/margins": 1.18730890750885, "rewards/rejected": -0.53895503282547, "step": 1845 }, { "epoch": 0.19019224838079574, "grad_norm": 37.0, "learning_rate": 8.998172264107836e-07, "logits/chosen": -0.6411517262458801, "logits/rejected": -0.6038408875465393, "logps/chosen": -43.88751983642578, "logps/rejected": -56.083961486816406, "loss": 0.3348, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6325811743736267, "rewards/margins": 1.5366418361663818, "rewards/rejected": -0.9040605425834656, "step": 1850 }, { "epoch": 0.1907062814845276, "grad_norm": 39.75, "learning_rate": 8.992460589444825e-07, "logits/chosen": -0.5690240263938904, "logits/rejected": -0.5778707265853882, "logps/chosen": -36.031524658203125, "logps/rejected": -52.27650833129883, "loss": 0.3061, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.043666124343872, "rewards/margins": 1.606652855873108, "rewards/rejected": -0.5629867315292358, "step": 1855 }, { "epoch": 0.19122031458825947, "grad_norm": 32.25, "learning_rate": 8.986748914781813e-07, "logits/chosen": -0.6125805974006653, "logits/rejected": -0.6693569421768188, "logps/chosen": -33.001895904541016, "logps/rejected": -46.982391357421875, "loss": 0.3542, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.830554187297821, "rewards/margins": 1.342931866645813, "rewards/rejected": -0.5123776197433472, "step": 1860 }, { "epoch": 0.19173434769199135, "grad_norm": 26.5, "learning_rate": 8.981037240118803e-07, "logits/chosen": -0.5464804172515869, "logits/rejected": -0.5718744397163391, "logps/chosen": -35.77030944824219, "logps/rejected": -51.76643753051758, "loss": 0.3383, "rewards/accuracies": 0.875, "rewards/chosen": 1.0402343273162842, "rewards/margins": 1.7718255519866943, "rewards/rejected": -0.7315911054611206, "step": 1865 }, { "epoch": 0.19224838079572323, "grad_norm": 27.75, "learning_rate": 8.975325565455791e-07, "logits/chosen": -0.5581719875335693, "logits/rejected": -0.5215951204299927, "logps/chosen": -37.661949157714844, "logps/rejected": -43.298362731933594, "loss": 0.3638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8804548978805542, "rewards/margins": 1.1207942962646484, "rewards/rejected": -0.2403395175933838, "step": 1870 }, { "epoch": 0.19276241389945512, "grad_norm": 31.125, "learning_rate": 8.96961389079278e-07, "logits/chosen": -0.5676814913749695, "logits/rejected": -0.5624275207519531, "logps/chosen": -50.19966506958008, "logps/rejected": -57.56801223754883, "loss": 0.3632, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7437518239021301, "rewards/margins": 1.4345632791519165, "rewards/rejected": -0.6908115148544312, "step": 1875 }, { "epoch": 0.193276447003187, "grad_norm": 27.375, "learning_rate": 8.963902216129769e-07, "logits/chosen": -0.5048921704292297, "logits/rejected": -0.548738956451416, "logps/chosen": -38.95627975463867, "logps/rejected": -45.97825241088867, "loss": 0.3628, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.9801608920097351, "rewards/margins": 1.3087043762207031, "rewards/rejected": -0.32854336500167847, "step": 1880 }, { "epoch": 0.19379048010691888, "grad_norm": 32.25, "learning_rate": 8.958190541466758e-07, "logits/chosen": -0.5884486436843872, "logits/rejected": -0.6537419557571411, "logps/chosen": -32.58468246459961, "logps/rejected": -46.827430725097656, "loss": 0.3443, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9379048347473145, "rewards/margins": 1.3408966064453125, "rewards/rejected": -0.4029918313026428, "step": 1885 }, { "epoch": 0.19430451321065076, "grad_norm": 36.75, "learning_rate": 8.952478866803746e-07, "logits/chosen": -0.5107083916664124, "logits/rejected": -0.5470095872879028, "logps/chosen": -36.849609375, "logps/rejected": -49.48329544067383, "loss": 0.3337, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6995590329170227, "rewards/margins": 1.509117841720581, "rewards/rejected": -0.8095590472221375, "step": 1890 }, { "epoch": 0.19481854631438264, "grad_norm": 27.625, "learning_rate": 8.946767192140735e-07, "logits/chosen": -0.5373606085777283, "logits/rejected": -0.5812641978263855, "logps/chosen": -33.45539474487305, "logps/rejected": -46.77941131591797, "loss": 0.3299, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6222339868545532, "rewards/margins": 1.2657737731933594, "rewards/rejected": -0.6435397863388062, "step": 1895 }, { "epoch": 0.19533257941811452, "grad_norm": 30.125, "learning_rate": 8.941055517477724e-07, "logits/chosen": -0.5417529344558716, "logits/rejected": -0.5732064843177795, "logps/chosen": -36.014381408691406, "logps/rejected": -49.80429458618164, "loss": 0.3438, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.705402135848999, "rewards/margins": 1.4369323253631592, "rewards/rejected": -0.7315303087234497, "step": 1900 }, { "epoch": 0.1958466125218464, "grad_norm": 44.0, "learning_rate": 8.935343842814713e-07, "logits/chosen": -0.6297652721405029, "logits/rejected": -0.6687008738517761, "logps/chosen": -36.51008605957031, "logps/rejected": -47.23175048828125, "loss": 0.3383, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9303016662597656, "rewards/margins": 1.5488035678863525, "rewards/rejected": -0.6185019612312317, "step": 1905 }, { "epoch": 0.19636064562557828, "grad_norm": 28.125, "learning_rate": 8.929632168151702e-07, "logits/chosen": -0.5359702706336975, "logits/rejected": -0.6159013509750366, "logps/chosen": -32.45431137084961, "logps/rejected": -49.49165725708008, "loss": 0.3378, "rewards/accuracies": 0.875, "rewards/chosen": 0.9603782892227173, "rewards/margins": 1.5031906366348267, "rewards/rejected": -0.5428124070167542, "step": 1910 }, { "epoch": 0.19687467872931017, "grad_norm": 28.875, "learning_rate": 8.92392049348869e-07, "logits/chosen": -0.511168360710144, "logits/rejected": -0.5103895664215088, "logps/chosen": -41.45120620727539, "logps/rejected": -47.4807243347168, "loss": 0.3283, "rewards/accuracies": 0.75, "rewards/chosen": 0.4590584635734558, "rewards/margins": 1.0442761182785034, "rewards/rejected": -0.5852176547050476, "step": 1915 }, { "epoch": 0.19738871183304205, "grad_norm": 24.625, "learning_rate": 8.918208818825679e-07, "logits/chosen": -0.49592891335487366, "logits/rejected": -0.5282418131828308, "logps/chosen": -32.11997985839844, "logps/rejected": -47.47538375854492, "loss": 0.3388, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.622148871421814, "rewards/margins": 1.1340491771697998, "rewards/rejected": -0.5119003653526306, "step": 1920 }, { "epoch": 0.19790274493677393, "grad_norm": 32.75, "learning_rate": 8.912497144162669e-07, "logits/chosen": -0.5184054970741272, "logits/rejected": -0.5586624145507812, "logps/chosen": -39.903099060058594, "logps/rejected": -46.16015625, "loss": 0.3631, "rewards/accuracies": 0.875, "rewards/chosen": 0.9139362573623657, "rewards/margins": 1.1447007656097412, "rewards/rejected": -0.23076441884040833, "step": 1925 }, { "epoch": 0.1984167780405058, "grad_norm": 55.5, "learning_rate": 8.906785469499657e-07, "logits/chosen": -0.5260945558547974, "logits/rejected": -0.5812472105026245, "logps/chosen": -37.25315475463867, "logps/rejected": -48.24432373046875, "loss": 0.3696, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.956146240234375, "rewards/margins": 1.6043908596038818, "rewards/rejected": -0.6482447385787964, "step": 1930 }, { "epoch": 0.1989308111442377, "grad_norm": 37.0, "learning_rate": 8.901073794836645e-07, "logits/chosen": -0.5587001442909241, "logits/rejected": -0.6009715795516968, "logps/chosen": -30.762317657470703, "logps/rejected": -48.37239456176758, "loss": 0.3549, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7940875291824341, "rewards/margins": 1.5988450050354004, "rewards/rejected": -0.8047574162483215, "step": 1935 }, { "epoch": 0.19944484424796957, "grad_norm": 26.375, "learning_rate": 8.895362120173634e-07, "logits/chosen": -0.5881232023239136, "logits/rejected": -0.5268701314926147, "logps/chosen": -35.554901123046875, "logps/rejected": -46.340476989746094, "loss": 0.2941, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0404150485992432, "rewards/margins": 1.558104157447815, "rewards/rejected": -0.5176891088485718, "step": 1940 }, { "epoch": 0.19995887735170145, "grad_norm": 29.5, "learning_rate": 8.889650445510624e-07, "logits/chosen": -0.6195683479309082, "logits/rejected": -0.6380826234817505, "logps/chosen": -36.408241271972656, "logps/rejected": -52.56690216064453, "loss": 0.3481, "rewards/accuracies": 0.875, "rewards/chosen": 0.8807379603385925, "rewards/margins": 1.607008934020996, "rewards/rejected": -0.726270854473114, "step": 1945 }, { "epoch": 0.20006168397244783, "eval_logits/chosen": -0.5404154658317566, "eval_logits/rejected": -0.6004202961921692, "eval_logps/chosen": -76.58434295654297, "eval_logps/rejected": -52.747100830078125, "eval_loss": 0.3315838575363159, "eval_rewards/accuracies": 0.8602941036224365, "eval_rewards/chosen": 0.8324273228645325, "eval_rewards/margins": 1.4728955030441284, "eval_rewards/rejected": -0.6404681205749512, "eval_runtime": 2.1164, "eval_samples_per_second": 505.58, "eval_steps_per_second": 8.033, "step": 1946 }, { "epoch": 0.20047291045543333, "grad_norm": 31.125, "learning_rate": 8.883938770847612e-07, "logits/chosen": -0.5983020067214966, "logits/rejected": -0.6114877462387085, "logps/chosen": -40.2936897277832, "logps/rejected": -47.35614013671875, "loss": 0.3548, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8672000169754028, "rewards/margins": 1.317804217338562, "rewards/rejected": -0.45060425996780396, "step": 1950 }, { "epoch": 0.20098694355916522, "grad_norm": 34.0, "learning_rate": 8.8782270961846e-07, "logits/chosen": -0.604744553565979, "logits/rejected": -0.5747587084770203, "logps/chosen": -41.29680633544922, "logps/rejected": -43.99660873413086, "loss": 0.3465, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0799241065979004, "rewards/margins": 1.1947104930877686, "rewards/rejected": -0.11478634923696518, "step": 1955 }, { "epoch": 0.2015009766628971, "grad_norm": 26.0, "learning_rate": 8.87251542152159e-07, "logits/chosen": -0.5971983671188354, "logits/rejected": -0.5714359879493713, "logps/chosen": -38.234893798828125, "logps/rejected": -56.33788299560547, "loss": 0.3143, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9877635836601257, "rewards/margins": 1.789055585861206, "rewards/rejected": -0.8012921214103699, "step": 1960 }, { "epoch": 0.20201500976662898, "grad_norm": 26.25, "learning_rate": 8.866803746858579e-07, "logits/chosen": -0.6366058588027954, "logits/rejected": -0.6597836017608643, "logps/chosen": -42.7608642578125, "logps/rejected": -53.91908645629883, "loss": 0.3382, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9538263082504272, "rewards/margins": 1.6347293853759766, "rewards/rejected": -0.6809029579162598, "step": 1965 }, { "epoch": 0.20252904287036086, "grad_norm": 27.0, "learning_rate": 8.861092072195568e-07, "logits/chosen": -0.5828176736831665, "logits/rejected": -0.6280630826950073, "logps/chosen": -36.149906158447266, "logps/rejected": -50.00937271118164, "loss": 0.295, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7439976930618286, "rewards/margins": 1.4574577808380127, "rewards/rejected": -0.7134600877761841, "step": 1970 }, { "epoch": 0.20304307597409274, "grad_norm": 32.5, "learning_rate": 8.855380397532556e-07, "logits/chosen": -0.6574116945266724, "logits/rejected": -0.6517921686172485, "logps/chosen": -40.4972038269043, "logps/rejected": -50.256004333496094, "loss": 0.3565, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6435269713401794, "rewards/margins": 1.2824435234069824, "rewards/rejected": -0.6389164924621582, "step": 1975 }, { "epoch": 0.20355710907782462, "grad_norm": 25.875, "learning_rate": 8.849668722869545e-07, "logits/chosen": -0.5455323457717896, "logits/rejected": -0.5669711828231812, "logps/chosen": -42.24761199951172, "logps/rejected": -54.0086784362793, "loss": 0.3672, "rewards/accuracies": 0.875, "rewards/chosen": 0.7258812189102173, "rewards/margins": 1.3645894527435303, "rewards/rejected": -0.638708233833313, "step": 1980 }, { "epoch": 0.2040711421815565, "grad_norm": 32.0, "learning_rate": 8.843957048206534e-07, "logits/chosen": -0.5784268379211426, "logits/rejected": -0.5925766229629517, "logps/chosen": -40.496334075927734, "logps/rejected": -51.255767822265625, "loss": 0.3547, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5165726542472839, "rewards/margins": 1.437197208404541, "rewards/rejected": -0.9206244349479675, "step": 1985 }, { "epoch": 0.20458517528528838, "grad_norm": 34.75, "learning_rate": 8.838245373543523e-07, "logits/chosen": -0.5861397981643677, "logits/rejected": -0.6300731897354126, "logps/chosen": -44.67594528198242, "logps/rejected": -55.106651306152344, "loss": 0.3236, "rewards/accuracies": 0.875, "rewards/chosen": 0.7358596920967102, "rewards/margins": 1.3760707378387451, "rewards/rejected": -0.6402109861373901, "step": 1990 }, { "epoch": 0.20509920838902027, "grad_norm": 35.5, "learning_rate": 8.832533698880511e-07, "logits/chosen": -0.5037344694137573, "logits/rejected": -0.5023512840270996, "logps/chosen": -41.93202590942383, "logps/rejected": -51.49782180786133, "loss": 0.3035, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5351628065109253, "rewards/margins": 1.3771991729736328, "rewards/rejected": -0.8420364260673523, "step": 1995 }, { "epoch": 0.20561324149275215, "grad_norm": 31.5, "learning_rate": 8.8268220242175e-07, "logits/chosen": -0.5670769214630127, "logits/rejected": -0.5613880157470703, "logps/chosen": -34.00499725341797, "logps/rejected": -48.178863525390625, "loss": 0.3558, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7887181043624878, "rewards/margins": 1.4164315462112427, "rewards/rejected": -0.6277132034301758, "step": 2000 }, { "epoch": 0.206127274596484, "grad_norm": 34.75, "learning_rate": 8.821110349554489e-07, "logits/chosen": -0.6005203723907471, "logits/rejected": -0.5942217111587524, "logps/chosen": -39.770755767822266, "logps/rejected": -52.22999954223633, "loss": 0.342, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7252010703086853, "rewards/margins": 1.4553858041763306, "rewards/rejected": -0.7301846742630005, "step": 2005 }, { "epoch": 0.20664130770021588, "grad_norm": 27.25, "learning_rate": 8.815398674891478e-07, "logits/chosen": -0.5849184989929199, "logits/rejected": -0.6153854131698608, "logps/chosen": -37.80316925048828, "logps/rejected": -50.84638214111328, "loss": 0.3481, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8740198016166687, "rewards/margins": 1.6260604858398438, "rewards/rejected": -0.7520407438278198, "step": 2010 }, { "epoch": 0.20715534080394776, "grad_norm": 24.5, "learning_rate": 8.809687000228466e-07, "logits/chosen": -0.5860157608985901, "logits/rejected": -0.6471272706985474, "logps/chosen": -38.755714416503906, "logps/rejected": -56.280845642089844, "loss": 0.2872, "rewards/accuracies": 0.875, "rewards/chosen": 0.6698523163795471, "rewards/margins": 1.45920729637146, "rewards/rejected": -0.7893549799919128, "step": 2015 }, { "epoch": 0.20766937390767964, "grad_norm": 28.0, "learning_rate": 8.803975325565456e-07, "logits/chosen": -0.6383806467056274, "logits/rejected": -0.6675608158111572, "logps/chosen": -38.409603118896484, "logps/rejected": -50.86361312866211, "loss": 0.3141, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8550891876220703, "rewards/margins": 1.646091103553772, "rewards/rejected": -0.7910016179084778, "step": 2020 }, { "epoch": 0.20818340701141153, "grad_norm": 32.25, "learning_rate": 8.798263650902444e-07, "logits/chosen": -0.6374236345291138, "logits/rejected": -0.7030409574508667, "logps/chosen": -41.94251251220703, "logps/rejected": -55.22880935668945, "loss": 0.3313, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8891535997390747, "rewards/margins": 1.501090407371521, "rewards/rejected": -0.6119370460510254, "step": 2025 }, { "epoch": 0.2086974401151434, "grad_norm": 31.5, "learning_rate": 8.792551976239433e-07, "logits/chosen": -0.5161077976226807, "logits/rejected": -0.6035955548286438, "logps/chosen": -37.340545654296875, "logps/rejected": -46.48689270019531, "loss": 0.329, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6884075403213501, "rewards/margins": 1.3213766813278198, "rewards/rejected": -0.6329692006111145, "step": 2030 }, { "epoch": 0.2092114732188753, "grad_norm": 26.5, "learning_rate": 8.786840301576422e-07, "logits/chosen": -0.45035797357559204, "logits/rejected": -0.46677881479263306, "logps/chosen": -34.68207931518555, "logps/rejected": -53.56829071044922, "loss": 0.3319, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8526372909545898, "rewards/margins": 1.3890628814697266, "rewards/rejected": -0.5364255905151367, "step": 2035 }, { "epoch": 0.20972550632260717, "grad_norm": 43.0, "learning_rate": 8.781128626913411e-07, "logits/chosen": -0.5909737348556519, "logits/rejected": -0.5649340748786926, "logps/chosen": -36.16325759887695, "logps/rejected": -44.000938415527344, "loss": 0.3546, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.605525553226471, "rewards/margins": 0.9225884675979614, "rewards/rejected": -0.31706276535987854, "step": 2040 }, { "epoch": 0.21023953942633905, "grad_norm": 29.375, "learning_rate": 8.775416952250399e-07, "logits/chosen": -0.6155343055725098, "logits/rejected": -0.6454334855079651, "logps/chosen": -40.09392547607422, "logps/rejected": -54.750633239746094, "loss": 0.3251, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5929685831069946, "rewards/margins": 1.527928352355957, "rewards/rejected": -0.934959888458252, "step": 2045 }, { "epoch": 0.21075357253007093, "grad_norm": 37.0, "learning_rate": 8.769705277587388e-07, "logits/chosen": -0.6052112579345703, "logits/rejected": -0.555386483669281, "logps/chosen": -39.303466796875, "logps/rejected": -48.33509826660156, "loss": 0.3483, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5539143085479736, "rewards/margins": 1.2916033267974854, "rewards/rejected": -0.7376888990402222, "step": 2050 }, { "epoch": 0.2112676056338028, "grad_norm": 45.25, "learning_rate": 8.763993602924377e-07, "logits/chosen": -0.5161058902740479, "logits/rejected": -0.5638940930366516, "logps/chosen": -35.76390838623047, "logps/rejected": -50.4088020324707, "loss": 0.3128, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8044037818908691, "rewards/margins": 1.5767505168914795, "rewards/rejected": -0.7723469734191895, "step": 2055 }, { "epoch": 0.2117816387375347, "grad_norm": 32.5, "learning_rate": 8.758281928261366e-07, "logits/chosen": -0.6316355466842651, "logits/rejected": -0.5960062742233276, "logps/chosen": -41.15311813354492, "logps/rejected": -44.527976989746094, "loss": 0.3343, "rewards/accuracies": 0.875, "rewards/chosen": 0.5771728754043579, "rewards/margins": 0.96763676404953, "rewards/rejected": -0.3904639780521393, "step": 2060 }, { "epoch": 0.21229567184126658, "grad_norm": 34.75, "learning_rate": 8.752570253598354e-07, "logits/chosen": -0.3941386342048645, "logits/rejected": -0.4942146837711334, "logps/chosen": -38.71873474121094, "logps/rejected": -49.96076202392578, "loss": 0.3161, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7693501114845276, "rewards/margins": 1.395344853401184, "rewards/rejected": -0.6259948015213013, "step": 2065 }, { "epoch": 0.21280970494499846, "grad_norm": 27.875, "learning_rate": 8.746858578935343e-07, "logits/chosen": -0.5463622808456421, "logits/rejected": -0.5913022756576538, "logps/chosen": -37.951908111572266, "logps/rejected": -49.27983856201172, "loss": 0.3533, "rewards/accuracies": 0.875, "rewards/chosen": 0.8814668655395508, "rewards/margins": 1.4162929058074951, "rewards/rejected": -0.5348259210586548, "step": 2070 }, { "epoch": 0.21332373804873034, "grad_norm": 29.125, "learning_rate": 8.741146904272332e-07, "logits/chosen": -0.545819878578186, "logits/rejected": -0.5567599534988403, "logps/chosen": -36.9376335144043, "logps/rejected": -52.268829345703125, "loss": 0.3331, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.534757137298584, "rewards/margins": 1.436633825302124, "rewards/rejected": -0.9018768072128296, "step": 2075 }, { "epoch": 0.21383777115246222, "grad_norm": 26.875, "learning_rate": 8.735435229609322e-07, "logits/chosen": -0.5184996724128723, "logits/rejected": -0.4859018325805664, "logps/chosen": -43.663856506347656, "logps/rejected": -52.574066162109375, "loss": 0.3264, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8596780896186829, "rewards/margins": 1.7633726596832275, "rewards/rejected": -0.9036946296691895, "step": 2080 }, { "epoch": 0.2143518042561941, "grad_norm": 34.5, "learning_rate": 8.72972355494631e-07, "logits/chosen": -0.6564141511917114, "logits/rejected": -0.6040593385696411, "logps/chosen": -41.890846252441406, "logps/rejected": -54.4328727722168, "loss": 0.3108, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7068260312080383, "rewards/margins": 1.5656723976135254, "rewards/rejected": -0.8588463068008423, "step": 2085 }, { "epoch": 0.21486583735992598, "grad_norm": 25.25, "learning_rate": 8.724011880283298e-07, "logits/chosen": -0.5341938734054565, "logits/rejected": -0.6116842031478882, "logps/chosen": -36.90131378173828, "logps/rejected": -54.70105743408203, "loss": 0.3327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5909588932991028, "rewards/margins": 1.356183648109436, "rewards/rejected": -0.7652247548103333, "step": 2090 }, { "epoch": 0.21537987046365786, "grad_norm": 33.25, "learning_rate": 8.718300205620287e-07, "logits/chosen": -0.526642918586731, "logits/rejected": -0.640835165977478, "logps/chosen": -37.802162170410156, "logps/rejected": -51.020362854003906, "loss": 0.3474, "rewards/accuracies": 0.875, "rewards/chosen": 0.6428853273391724, "rewards/margins": 1.4157631397247314, "rewards/rejected": -0.7728778123855591, "step": 2095 }, { "epoch": 0.21589390356738974, "grad_norm": 50.0, "learning_rate": 8.712588530957277e-07, "logits/chosen": -0.5584341287612915, "logits/rejected": -0.510897159576416, "logps/chosen": -38.01245880126953, "logps/rejected": -48.239139556884766, "loss": 0.3627, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8578101992607117, "rewards/margins": 1.4175866842269897, "rewards/rejected": -0.559776782989502, "step": 2100 }, { "epoch": 0.21640793667112163, "grad_norm": 32.0, "learning_rate": 8.706876856294265e-07, "logits/chosen": -0.5524247288703918, "logits/rejected": -0.5592859983444214, "logps/chosen": -36.89834976196289, "logps/rejected": -51.73820877075195, "loss": 0.3499, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7433491349220276, "rewards/margins": 1.5424131155014038, "rewards/rejected": -0.799064040184021, "step": 2105 }, { "epoch": 0.2169219697748535, "grad_norm": 31.0, "learning_rate": 8.701165181631253e-07, "logits/chosen": -0.6700602173805237, "logits/rejected": -0.6858260035514832, "logps/chosen": -38.84659194946289, "logps/rejected": -50.92143249511719, "loss": 0.3386, "rewards/accuracies": 0.875, "rewards/chosen": 0.6944657564163208, "rewards/margins": 1.5608885288238525, "rewards/rejected": -0.8664228320121765, "step": 2110 }, { "epoch": 0.2174360028785854, "grad_norm": 32.75, "learning_rate": 8.695453506968242e-07, "logits/chosen": -0.5031174421310425, "logits/rejected": -0.5518320798873901, "logps/chosen": -34.944091796875, "logps/rejected": -42.19989013671875, "loss": 0.3546, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7230604887008667, "rewards/margins": 1.1480903625488281, "rewards/rejected": -0.4250297546386719, "step": 2115 }, { "epoch": 0.21795003598231727, "grad_norm": 30.75, "learning_rate": 8.689741832305232e-07, "logits/chosen": -0.5788718461990356, "logits/rejected": -0.6296242475509644, "logps/chosen": -40.2994270324707, "logps/rejected": -53.244300842285156, "loss": 0.3424, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7540902495384216, "rewards/margins": 1.5647027492523193, "rewards/rejected": -0.8106124997138977, "step": 2120 }, { "epoch": 0.21846406908604915, "grad_norm": 35.0, "learning_rate": 8.684030157642221e-07, "logits/chosen": -0.5672677159309387, "logits/rejected": -0.5938138961791992, "logps/chosen": -37.84978485107422, "logps/rejected": -43.13164520263672, "loss": 0.3619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5909175872802734, "rewards/margins": 0.8562394976615906, "rewards/rejected": -0.26532191038131714, "step": 2125 }, { "epoch": 0.21897810218978103, "grad_norm": 39.0, "learning_rate": 8.678318482979209e-07, "logits/chosen": -0.521304726600647, "logits/rejected": -0.5749825239181519, "logps/chosen": -40.334678649902344, "logps/rejected": -55.22173309326172, "loss": 0.321, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1454753875732422, "rewards/margins": 2.039086103439331, "rewards/rejected": -0.8936104774475098, "step": 2130 }, { "epoch": 0.2194921352935129, "grad_norm": 45.0, "learning_rate": 8.672606808316197e-07, "logits/chosen": -0.5459715723991394, "logits/rejected": -0.5749081373214722, "logps/chosen": -39.445655822753906, "logps/rejected": -54.75017166137695, "loss": 0.3336, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8698694109916687, "rewards/margins": 1.8334176540374756, "rewards/rejected": -0.9635483026504517, "step": 2135 }, { "epoch": 0.2200061683972448, "grad_norm": 53.5, "learning_rate": 8.666895133653187e-07, "logits/chosen": -0.569520115852356, "logits/rejected": -0.6522771716117859, "logps/chosen": -35.11920928955078, "logps/rejected": -50.898651123046875, "loss": 0.2832, "rewards/accuracies": 0.875, "rewards/chosen": 0.941665768623352, "rewards/margins": 1.5785391330718994, "rewards/rejected": -0.6368734240531921, "step": 2140 }, { "epoch": 0.22052020150097668, "grad_norm": 37.0, "learning_rate": 8.661183458990176e-07, "logits/chosen": -0.4897998869419098, "logits/rejected": -0.5094395875930786, "logps/chosen": -41.87484359741211, "logps/rejected": -52.617652893066406, "loss": 0.3408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5773676633834839, "rewards/margins": 1.2493952512741089, "rewards/rejected": -0.6720275282859802, "step": 2145 }, { "epoch": 0.22103423460470856, "grad_norm": 29.5, "learning_rate": 8.655471784327164e-07, "logits/chosen": -0.5471209287643433, "logits/rejected": -0.6153703927993774, "logps/chosen": -31.626256942749023, "logps/rejected": -51.809486389160156, "loss": 0.3248, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9900878071784973, "rewards/margins": 1.9462766647338867, "rewards/rejected": -0.956188976764679, "step": 2150 }, { "epoch": 0.2215482677084404, "grad_norm": 48.0, "learning_rate": 8.649760109664152e-07, "logits/chosen": -0.6439474821090698, "logits/rejected": -0.6887057423591614, "logps/chosen": -36.66521453857422, "logps/rejected": -51.069358825683594, "loss": 0.3425, "rewards/accuracies": 0.875, "rewards/chosen": 0.6757761240005493, "rewards/margins": 1.3430808782577515, "rewards/rejected": -0.6673048734664917, "step": 2155 }, { "epoch": 0.2220623008121723, "grad_norm": 45.0, "learning_rate": 8.644048435001142e-07, "logits/chosen": -0.559620201587677, "logits/rejected": -0.54682457447052, "logps/chosen": -37.37910842895508, "logps/rejected": -46.13095474243164, "loss": 0.3425, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9137174487113953, "rewards/margins": 1.3019927740097046, "rewards/rejected": -0.3882753252983093, "step": 2160 }, { "epoch": 0.22257633391590417, "grad_norm": 27.375, "learning_rate": 8.638336760338131e-07, "logits/chosen": -0.5336966514587402, "logits/rejected": -0.552636444568634, "logps/chosen": -39.702293395996094, "logps/rejected": -48.91362380981445, "loss": 0.3401, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6229864954948425, "rewards/margins": 1.0995585918426514, "rewards/rejected": -0.4765722155570984, "step": 2165 }, { "epoch": 0.22309036701963605, "grad_norm": 25.0, "learning_rate": 8.63262508567512e-07, "logits/chosen": -0.606689453125, "logits/rejected": -0.6259554624557495, "logps/chosen": -39.77228927612305, "logps/rejected": -57.586341857910156, "loss": 0.3405, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.952769935131073, "rewards/margins": 1.6020015478134155, "rewards/rejected": -0.6492316126823425, "step": 2170 }, { "epoch": 0.22360440012336794, "grad_norm": 42.75, "learning_rate": 8.626913411012109e-07, "logits/chosen": -0.540320634841919, "logits/rejected": -0.6080446839332581, "logps/chosen": -32.26276779174805, "logps/rejected": -43.4725341796875, "loss": 0.3141, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9628424644470215, "rewards/margins": 1.3051837682724, "rewards/rejected": -0.3423413634300232, "step": 2175 }, { "epoch": 0.22411843322709982, "grad_norm": 26.375, "learning_rate": 8.621201736349097e-07, "logits/chosen": -0.5665949583053589, "logits/rejected": -0.5681470632553101, "logps/chosen": -40.6776008605957, "logps/rejected": -49.226219177246094, "loss": 0.3172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.950779914855957, "rewards/margins": 1.6940672397613525, "rewards/rejected": -0.7432873249053955, "step": 2180 }, { "epoch": 0.2246324663308317, "grad_norm": 26.125, "learning_rate": 8.615490061686086e-07, "logits/chosen": -0.4977108836174011, "logits/rejected": -0.5197646617889404, "logps/chosen": -47.351810455322266, "logps/rejected": -56.1024055480957, "loss": 0.3074, "rewards/accuracies": 0.875, "rewards/chosen": 0.8470796346664429, "rewards/margins": 1.6521892547607422, "rewards/rejected": -0.8051095008850098, "step": 2185 }, { "epoch": 0.22514649943456358, "grad_norm": 29.125, "learning_rate": 8.609778387023075e-07, "logits/chosen": -0.6172149777412415, "logits/rejected": -0.6145997643470764, "logps/chosen": -42.938716888427734, "logps/rejected": -50.833953857421875, "loss": 0.3348, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9010616540908813, "rewards/margins": 1.4396494626998901, "rewards/rejected": -0.5385879278182983, "step": 2190 }, { "epoch": 0.22566053253829546, "grad_norm": 23.5, "learning_rate": 8.604066712360064e-07, "logits/chosen": -0.5263647437095642, "logits/rejected": -0.5668430924415588, "logps/chosen": -42.16041946411133, "logps/rejected": -52.2848014831543, "loss": 0.3486, "rewards/accuracies": 0.875, "rewards/chosen": 0.9179126620292664, "rewards/margins": 1.7427747249603271, "rewards/rejected": -0.8248621821403503, "step": 2195 }, { "epoch": 0.22617456564202734, "grad_norm": 25.25, "learning_rate": 8.598355037697052e-07, "logits/chosen": -0.6893194913864136, "logits/rejected": -0.6729424595832825, "logps/chosen": -40.40544128417969, "logps/rejected": -50.85029220581055, "loss": 0.332, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8512359857559204, "rewards/margins": 1.4765164852142334, "rewards/rejected": -0.625280499458313, "step": 2200 }, { "epoch": 0.22668859874575922, "grad_norm": 36.5, "learning_rate": 8.592643363034041e-07, "logits/chosen": -0.5024547576904297, "logits/rejected": -0.5763053894042969, "logps/chosen": -32.14833450317383, "logps/rejected": -49.220924377441406, "loss": 0.3553, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6856701374053955, "rewards/margins": 1.3061200380325317, "rewards/rejected": -0.6204499006271362, "step": 2205 }, { "epoch": 0.2272026318494911, "grad_norm": 30.125, "learning_rate": 8.58693168837103e-07, "logits/chosen": -0.5234354138374329, "logits/rejected": -0.5467531085014343, "logps/chosen": -34.894466400146484, "logps/rejected": -55.08393478393555, "loss": 0.2918, "rewards/accuracies": 1.0, "rewards/chosen": 1.0002260208129883, "rewards/margins": 1.953669548034668, "rewards/rejected": -0.953443706035614, "step": 2210 }, { "epoch": 0.22771666495322299, "grad_norm": 37.25, "learning_rate": 8.58122001370802e-07, "logits/chosen": -0.5884762406349182, "logits/rejected": -0.5524431467056274, "logps/chosen": -45.480567932128906, "logps/rejected": -45.55812454223633, "loss": 0.3458, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6582952737808228, "rewards/margins": 1.0627399682998657, "rewards/rejected": -0.40444475412368774, "step": 2215 }, { "epoch": 0.22823069805695487, "grad_norm": 28.75, "learning_rate": 8.575508339045007e-07, "logits/chosen": -0.5714191794395447, "logits/rejected": -0.6399219632148743, "logps/chosen": -37.71223831176758, "logps/rejected": -52.86958694458008, "loss": 0.2889, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8518350720405579, "rewards/margins": 1.877733588218689, "rewards/rejected": -1.0258984565734863, "step": 2220 }, { "epoch": 0.22874473116068675, "grad_norm": 35.75, "learning_rate": 8.569796664381996e-07, "logits/chosen": -0.5450949668884277, "logits/rejected": -0.5717419385910034, "logps/chosen": -36.654205322265625, "logps/rejected": -50.776161193847656, "loss": 0.3413, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9073578119277954, "rewards/margins": 1.4119106531143188, "rewards/rejected": -0.5045528411865234, "step": 2225 }, { "epoch": 0.22925876426441863, "grad_norm": 30.0, "learning_rate": 8.564084989718985e-07, "logits/chosen": -0.5817470550537109, "logits/rejected": -0.6052084565162659, "logps/chosen": -39.72934341430664, "logps/rejected": -49.18779754638672, "loss": 0.3458, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7632990479469299, "rewards/margins": 1.4083693027496338, "rewards/rejected": -0.6450701355934143, "step": 2230 }, { "epoch": 0.2297727973681505, "grad_norm": 26.75, "learning_rate": 8.558373315055975e-07, "logits/chosen": -0.5726627707481384, "logits/rejected": -0.6328667402267456, "logps/chosen": -38.003639221191406, "logps/rejected": -52.39570236206055, "loss": 0.3438, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9286128878593445, "rewards/margins": 1.6032359600067139, "rewards/rejected": -0.6746233105659485, "step": 2235 }, { "epoch": 0.2302868304718824, "grad_norm": 24.0, "learning_rate": 8.552661640392963e-07, "logits/chosen": -0.5891412496566772, "logits/rejected": -0.6151705980300903, "logps/chosen": -44.8524169921875, "logps/rejected": -54.03007125854492, "loss": 0.3261, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7909253835678101, "rewards/margins": 1.5324552059173584, "rewards/rejected": -0.7415299415588379, "step": 2240 }, { "epoch": 0.23080086357561427, "grad_norm": 35.5, "learning_rate": 8.546949965729951e-07, "logits/chosen": -0.5273049473762512, "logits/rejected": -0.5532500147819519, "logps/chosen": -39.423160552978516, "logps/rejected": -48.4839973449707, "loss": 0.3581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6817730665206909, "rewards/margins": 1.306992530822754, "rewards/rejected": -0.6252195239067078, "step": 2245 }, { "epoch": 0.23131489667934615, "grad_norm": 32.5, "learning_rate": 8.54123829106694e-07, "logits/chosen": -0.6033645868301392, "logits/rejected": -0.6726464033126831, "logps/chosen": -38.27296447753906, "logps/rejected": -55.532867431640625, "loss": 0.3565, "rewards/accuracies": 0.875, "rewards/chosen": 0.6369024515151978, "rewards/margins": 1.3927379846572876, "rewards/rejected": -0.7558354139328003, "step": 2250 }, { "epoch": 0.23182892978307804, "grad_norm": 47.5, "learning_rate": 8.53552661640393e-07, "logits/chosen": -0.6289905309677124, "logits/rejected": -0.6478694677352905, "logps/chosen": -41.56806945800781, "logps/rejected": -48.48179244995117, "loss": 0.3262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.682607889175415, "rewards/margins": 1.3985705375671387, "rewards/rejected": -0.7159627079963684, "step": 2255 }, { "epoch": 0.23234296288680992, "grad_norm": 25.5, "learning_rate": 8.529814941740919e-07, "logits/chosen": -0.6017194986343384, "logits/rejected": -0.6313401460647583, "logps/chosen": -41.67848205566406, "logps/rejected": -52.83408737182617, "loss": 0.3242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6907979249954224, "rewards/margins": 1.5059791803359985, "rewards/rejected": -0.8151813745498657, "step": 2260 }, { "epoch": 0.2328569959905418, "grad_norm": 26.625, "learning_rate": 8.524103267077906e-07, "logits/chosen": -0.5780819654464722, "logits/rejected": -0.6262399554252625, "logps/chosen": -38.40315628051758, "logps/rejected": -48.6972541809082, "loss": 0.3298, "rewards/accuracies": 0.875, "rewards/chosen": 0.5725816488265991, "rewards/margins": 1.323000192642212, "rewards/rejected": -0.7504185438156128, "step": 2265 }, { "epoch": 0.23337102909427368, "grad_norm": 34.75, "learning_rate": 8.518391592414895e-07, "logits/chosen": -0.5010443925857544, "logits/rejected": -0.6208441257476807, "logps/chosen": -39.567283630371094, "logps/rejected": -52.994598388671875, "loss": 0.3249, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5391669273376465, "rewards/margins": 1.1944314241409302, "rewards/rejected": -0.6552644968032837, "step": 2270 }, { "epoch": 0.23388506219800556, "grad_norm": 36.25, "learning_rate": 8.512679917751885e-07, "logits/chosen": -0.5399574637413025, "logits/rejected": -0.5711160898208618, "logps/chosen": -36.050819396972656, "logps/rejected": -46.44729232788086, "loss": 0.3343, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5615106821060181, "rewards/margins": 1.1716985702514648, "rewards/rejected": -0.6101880669593811, "step": 2275 }, { "epoch": 0.23439909530173744, "grad_norm": 34.0, "learning_rate": 8.506968243088874e-07, "logits/chosen": -0.5191384553909302, "logits/rejected": -0.5209972262382507, "logps/chosen": -37.74274444580078, "logps/rejected": -52.5051383972168, "loss": 0.3207, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7643178701400757, "rewards/margins": 1.7509746551513672, "rewards/rejected": -0.9866567850112915, "step": 2280 }, { "epoch": 0.23491312840546932, "grad_norm": 48.0, "learning_rate": 8.501256568425862e-07, "logits/chosen": -0.6018669605255127, "logits/rejected": -0.6170397996902466, "logps/chosen": -41.71763229370117, "logps/rejected": -56.610931396484375, "loss": 0.3026, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5909707546234131, "rewards/margins": 1.446590781211853, "rewards/rejected": -0.8556200861930847, "step": 2285 }, { "epoch": 0.2354271615092012, "grad_norm": 25.75, "learning_rate": 8.49554489376285e-07, "logits/chosen": -0.6264339685440063, "logits/rejected": -0.6174625158309937, "logps/chosen": -37.695838928222656, "logps/rejected": -46.39299011230469, "loss": 0.3076, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6843377351760864, "rewards/margins": 1.186598777770996, "rewards/rejected": -0.5022611021995544, "step": 2290 }, { "epoch": 0.2359411946129331, "grad_norm": 32.25, "learning_rate": 8.48983321909984e-07, "logits/chosen": -0.6229345798492432, "logits/rejected": -0.6901284456253052, "logps/chosen": -38.86873245239258, "logps/rejected": -51.545082092285156, "loss": 0.3464, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8561866879463196, "rewards/margins": 1.6159948110580444, "rewards/rejected": -0.7598081827163696, "step": 2295 }, { "epoch": 0.23645522771666494, "grad_norm": 30.0, "learning_rate": 8.484121544436829e-07, "logits/chosen": -0.6248496770858765, "logits/rejected": -0.5989278554916382, "logps/chosen": -36.075279235839844, "logps/rejected": -49.7542610168457, "loss": 0.3154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8464832305908203, "rewards/margins": 1.4895519018173218, "rewards/rejected": -0.6430689096450806, "step": 2300 }, { "epoch": 0.23696926082039682, "grad_norm": 28.375, "learning_rate": 8.478409869773817e-07, "logits/chosen": -0.5638717412948608, "logits/rejected": -0.6369618773460388, "logps/chosen": -40.511329650878906, "logps/rejected": -51.904090881347656, "loss": 0.3622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8165721893310547, "rewards/margins": 1.4115118980407715, "rewards/rejected": -0.5949397683143616, "step": 2305 }, { "epoch": 0.2374832939241287, "grad_norm": 36.75, "learning_rate": 8.472698195110805e-07, "logits/chosen": -0.6287067532539368, "logits/rejected": -0.639886200428009, "logps/chosen": -36.944549560546875, "logps/rejected": -51.51033401489258, "loss": 0.3375, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8344141244888306, "rewards/margins": 1.4715083837509155, "rewards/rejected": -0.6370943784713745, "step": 2310 }, { "epoch": 0.23799732702786058, "grad_norm": 26.0, "learning_rate": 8.466986520447795e-07, "logits/chosen": -0.5523120164871216, "logits/rejected": -0.5117930173873901, "logps/chosen": -32.350990295410156, "logps/rejected": -55.518638610839844, "loss": 0.3442, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0215754508972168, "rewards/margins": 1.8127784729003906, "rewards/rejected": -0.7912030816078186, "step": 2315 }, { "epoch": 0.23851136013159246, "grad_norm": 31.625, "learning_rate": 8.461274845784784e-07, "logits/chosen": -0.4955109655857086, "logits/rejected": -0.554134726524353, "logps/chosen": -37.822696685791016, "logps/rejected": -54.902137756347656, "loss": 0.3234, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0883750915527344, "rewards/margins": 1.8375533819198608, "rewards/rejected": -0.7491782903671265, "step": 2320 }, { "epoch": 0.23902539323532435, "grad_norm": 42.75, "learning_rate": 8.455563171121773e-07, "logits/chosen": -0.5122092962265015, "logits/rejected": -0.535693347454071, "logps/chosen": -43.82991027832031, "logps/rejected": -55.75189971923828, "loss": 0.3421, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5961688756942749, "rewards/margins": 1.40804922580719, "rewards/rejected": -0.8118804097175598, "step": 2325 }, { "epoch": 0.23953942633905623, "grad_norm": 31.625, "learning_rate": 8.44985149645876e-07, "logits/chosen": -0.5932412147521973, "logits/rejected": -0.5497549772262573, "logps/chosen": -38.808467864990234, "logps/rejected": -56.844696044921875, "loss": 0.3151, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8156789541244507, "rewards/margins": 1.8755197525024414, "rewards/rejected": -1.0598407983779907, "step": 2330 }, { "epoch": 0.2400534594427881, "grad_norm": 34.0, "learning_rate": 8.44413982179575e-07, "logits/chosen": -0.6414599418640137, "logits/rejected": -0.6357199549674988, "logps/chosen": -33.841400146484375, "logps/rejected": -44.58306121826172, "loss": 0.3062, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8550594449043274, "rewards/margins": 1.176527738571167, "rewards/rejected": -0.3214685320854187, "step": 2335 }, { "epoch": 0.24056749254652, "grad_norm": 29.75, "learning_rate": 8.438428147132739e-07, "logits/chosen": -0.6079205274581909, "logits/rejected": -0.6252256631851196, "logps/chosen": -40.08894348144531, "logps/rejected": -55.3144416809082, "loss": 0.3111, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9512923955917358, "rewards/margins": 1.848876953125, "rewards/rejected": -0.8975847363471985, "step": 2340 }, { "epoch": 0.24108152565025187, "grad_norm": 31.75, "learning_rate": 8.432716472469728e-07, "logits/chosen": -0.45789051055908203, "logits/rejected": -0.45395761728286743, "logps/chosen": -37.73030471801758, "logps/rejected": -53.05809783935547, "loss": 0.3345, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5786162614822388, "rewards/margins": 1.569893479347229, "rewards/rejected": -0.9912770986557007, "step": 2345 }, { "epoch": 0.24159555875398375, "grad_norm": 48.75, "learning_rate": 8.427004797806716e-07, "logits/chosen": -0.5674115419387817, "logits/rejected": -0.5480192303657532, "logps/chosen": -35.35588836669922, "logps/rejected": -48.82938003540039, "loss": 0.3112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7567355036735535, "rewards/margins": 1.8056310415267944, "rewards/rejected": -1.0488954782485962, "step": 2350 }, { "epoch": 0.24210959185771563, "grad_norm": 56.0, "learning_rate": 8.421293123143705e-07, "logits/chosen": -0.6570814847946167, "logits/rejected": -0.6591002345085144, "logps/chosen": -43.90917205810547, "logps/rejected": -48.576698303222656, "loss": 0.3146, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8327959179878235, "rewards/margins": 1.40406334400177, "rewards/rejected": -0.5712674260139465, "step": 2355 }, { "epoch": 0.24262362496144751, "grad_norm": 31.0, "learning_rate": 8.415581448480694e-07, "logits/chosen": -0.6122063994407654, "logits/rejected": -0.6174297332763672, "logps/chosen": -31.908710479736328, "logps/rejected": -47.162940979003906, "loss": 0.3081, "rewards/accuracies": 0.875, "rewards/chosen": 0.7232493162155151, "rewards/margins": 1.5812965631484985, "rewards/rejected": -0.8580471277236938, "step": 2360 }, { "epoch": 0.2431376580651794, "grad_norm": 37.5, "learning_rate": 8.409869773817683e-07, "logits/chosen": -0.522334635257721, "logits/rejected": -0.5771368145942688, "logps/chosen": -35.014320373535156, "logps/rejected": -45.319759368896484, "loss": 0.3339, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6296733617782593, "rewards/margins": 0.9839666485786438, "rewards/rejected": -0.3542934060096741, "step": 2365 }, { "epoch": 0.24365169116891128, "grad_norm": 31.125, "learning_rate": 8.404158099154672e-07, "logits/chosen": -0.5866087675094604, "logits/rejected": -0.608900785446167, "logps/chosen": -37.706275939941406, "logps/rejected": -46.91361999511719, "loss": 0.2879, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8505100011825562, "rewards/margins": 1.371140718460083, "rewards/rejected": -0.5206307172775269, "step": 2370 }, { "epoch": 0.24416572427264316, "grad_norm": 43.75, "learning_rate": 8.39844642449166e-07, "logits/chosen": -0.4907437860965729, "logits/rejected": -0.5423033237457275, "logps/chosen": -36.283470153808594, "logps/rejected": -52.52995681762695, "loss": 0.3073, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9007803201675415, "rewards/margins": 1.7192020416259766, "rewards/rejected": -0.8184216618537903, "step": 2375 }, { "epoch": 0.24467975737637504, "grad_norm": 30.625, "learning_rate": 8.392734749828649e-07, "logits/chosen": -0.6226853132247925, "logits/rejected": -0.6305990815162659, "logps/chosen": -41.22824478149414, "logps/rejected": -55.56984329223633, "loss": 0.3307, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.746027410030365, "rewards/margins": 1.5808656215667725, "rewards/rejected": -0.8348382115364075, "step": 2380 }, { "epoch": 0.24519379048010692, "grad_norm": 29.875, "learning_rate": 8.387023075165638e-07, "logits/chosen": -0.5354931950569153, "logits/rejected": -0.5442846417427063, "logps/chosen": -31.870925903320312, "logps/rejected": -51.153656005859375, "loss": 0.3039, "rewards/accuracies": 0.875, "rewards/chosen": 1.0299677848815918, "rewards/margins": 1.7394416332244873, "rewards/rejected": -0.7094741463661194, "step": 2385 }, { "epoch": 0.2457078235838388, "grad_norm": 31.5, "learning_rate": 8.381311400502628e-07, "logits/chosen": -0.4823378920555115, "logits/rejected": -0.5669240355491638, "logps/chosen": -34.077388763427734, "logps/rejected": -50.02438735961914, "loss": 0.3412, "rewards/accuracies": 0.875, "rewards/chosen": 0.8011423349380493, "rewards/margins": 1.3205386400222778, "rewards/rejected": -0.5193962454795837, "step": 2390 }, { "epoch": 0.24622185668757068, "grad_norm": 36.5, "learning_rate": 8.375599725839616e-07, "logits/chosen": -0.5760425329208374, "logits/rejected": -0.6132414937019348, "logps/chosen": -38.415855407714844, "logps/rejected": -53.56256103515625, "loss": 0.3388, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6926613450050354, "rewards/margins": 1.5159966945648193, "rewards/rejected": -0.8233353495597839, "step": 2395 }, { "epoch": 0.24673588979130257, "grad_norm": 24.125, "learning_rate": 8.369888051176604e-07, "logits/chosen": -0.6097526550292969, "logits/rejected": -0.6711469888687134, "logps/chosen": -37.960533142089844, "logps/rejected": -56.525794982910156, "loss": 0.3323, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6215432286262512, "rewards/margins": 1.6412789821624756, "rewards/rejected": -1.0197356939315796, "step": 2400 }, { "epoch": 0.24724992289503445, "grad_norm": 32.75, "learning_rate": 8.364176376513593e-07, "logits/chosen": -0.5324459671974182, "logits/rejected": -0.5147036910057068, "logps/chosen": -38.79023742675781, "logps/rejected": -50.02273178100586, "loss": 0.3234, "rewards/accuracies": 0.875, "rewards/chosen": 0.9835893511772156, "rewards/margins": 1.4796888828277588, "rewards/rejected": -0.4960996210575104, "step": 2405 }, { "epoch": 0.24776395599876633, "grad_norm": 27.5, "learning_rate": 8.358464701850583e-07, "logits/chosen": -0.5852396488189697, "logits/rejected": -0.6570113897323608, "logps/chosen": -38.03417205810547, "logps/rejected": -51.87772750854492, "loss": 0.3373, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8252083659172058, "rewards/margins": 1.39286470413208, "rewards/rejected": -0.567656397819519, "step": 2410 }, { "epoch": 0.2482779891024982, "grad_norm": 34.5, "learning_rate": 8.352753027187572e-07, "logits/chosen": -0.6527113914489746, "logits/rejected": -0.6847392916679382, "logps/chosen": -39.34019088745117, "logps/rejected": -50.707252502441406, "loss": 0.3276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7146943807601929, "rewards/margins": 1.4533154964447021, "rewards/rejected": -0.7386212348937988, "step": 2415 }, { "epoch": 0.2487920222062301, "grad_norm": 40.75, "learning_rate": 8.347041352524559e-07, "logits/chosen": -0.5701441764831543, "logits/rejected": -0.6190380454063416, "logps/chosen": -36.96477127075195, "logps/rejected": -49.35224151611328, "loss": 0.3349, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9505037069320679, "rewards/margins": 1.7022435665130615, "rewards/rejected": -0.7517396211624146, "step": 2420 }, { "epoch": 0.24930605530996197, "grad_norm": 36.5, "learning_rate": 8.341329677861548e-07, "logits/chosen": -0.5772402286529541, "logits/rejected": -0.6143247485160828, "logps/chosen": -37.78667068481445, "logps/rejected": -51.996315002441406, "loss": 0.3294, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8970044255256653, "rewards/margins": 1.3671543598175049, "rewards/rejected": -0.4701499044895172, "step": 2425 }, { "epoch": 0.24982008841369385, "grad_norm": 25.25, "learning_rate": 8.335618003198538e-07, "logits/chosen": -0.586074709892273, "logits/rejected": -0.5747097730636597, "logps/chosen": -41.098167419433594, "logps/rejected": -58.87479782104492, "loss": 0.3298, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6556660532951355, "rewards/margins": 1.669313669204712, "rewards/rejected": -1.0136476755142212, "step": 2430 }, { "epoch": 0.25033412151742573, "grad_norm": 31.25, "learning_rate": 8.329906328535527e-07, "logits/chosen": -0.610697329044342, "logits/rejected": -0.6139100193977356, "logps/chosen": -39.25822448730469, "logps/rejected": -54.895713806152344, "loss": 0.3541, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7778449654579163, "rewards/margins": 1.7350788116455078, "rewards/rejected": -0.9572337865829468, "step": 2435 }, { "epoch": 0.2508481546211576, "grad_norm": 29.875, "learning_rate": 8.324194653872514e-07, "logits/chosen": -0.5684868097305298, "logits/rejected": -0.6213175058364868, "logps/chosen": -39.210174560546875, "logps/rejected": -50.68478775024414, "loss": 0.3318, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8261879682540894, "rewards/margins": 1.5892630815505981, "rewards/rejected": -0.763075053691864, "step": 2440 }, { "epoch": 0.2513621877248895, "grad_norm": 33.75, "learning_rate": 8.318482979209503e-07, "logits/chosen": -0.5671430230140686, "logits/rejected": -0.6013943552970886, "logps/chosen": -39.53706741333008, "logps/rejected": -46.26061248779297, "loss": 0.3511, "rewards/accuracies": 0.875, "rewards/chosen": 0.6032713651657104, "rewards/margins": 1.3498393297195435, "rewards/rejected": -0.7465680241584778, "step": 2445 }, { "epoch": 0.2518762208286214, "grad_norm": 26.375, "learning_rate": 8.312771304546493e-07, "logits/chosen": -0.5537019371986389, "logits/rejected": -0.5718465447425842, "logps/chosen": -34.32490539550781, "logps/rejected": -47.323768615722656, "loss": 0.296, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7764578461647034, "rewards/margins": 1.6604171991348267, "rewards/rejected": -0.8839591145515442, "step": 2450 }, { "epoch": 0.25239025393235326, "grad_norm": 27.125, "learning_rate": 8.307059629883482e-07, "logits/chosen": -0.6252461671829224, "logits/rejected": -0.6206279397010803, "logps/chosen": -43.743690490722656, "logps/rejected": -57.917579650878906, "loss": 0.3617, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.822170615196228, "rewards/margins": 1.902807593345642, "rewards/rejected": -1.0806372165679932, "step": 2455 }, { "epoch": 0.25290428703608514, "grad_norm": 32.75, "learning_rate": 8.301347955220471e-07, "logits/chosen": -0.606519341468811, "logits/rejected": -0.5495678186416626, "logps/chosen": -37.20199203491211, "logps/rejected": -52.25756072998047, "loss": 0.3201, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7953210473060608, "rewards/margins": 1.7392747402191162, "rewards/rejected": -0.943953812122345, "step": 2460 }, { "epoch": 0.253418320139817, "grad_norm": 43.25, "learning_rate": 8.295636280557458e-07, "logits/chosen": -0.6121213436126709, "logits/rejected": -0.6034160852432251, "logps/chosen": -37.77757263183594, "logps/rejected": -52.867286682128906, "loss": 0.32, "rewards/accuracies": 1.0, "rewards/chosen": 0.947515606880188, "rewards/margins": 1.7630611658096313, "rewards/rejected": -0.8155455589294434, "step": 2465 }, { "epoch": 0.2539323532435489, "grad_norm": 45.5, "learning_rate": 8.289924605894448e-07, "logits/chosen": -0.5580183267593384, "logits/rejected": -0.5150309801101685, "logps/chosen": -30.397411346435547, "logps/rejected": -44.70641326904297, "loss": 0.3734, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7613798975944519, "rewards/margins": 0.9943809509277344, "rewards/rejected": -0.23300111293792725, "step": 2470 }, { "epoch": 0.2544463863472808, "grad_norm": 60.75, "learning_rate": 8.284212931231437e-07, "logits/chosen": -0.5340873003005981, "logits/rejected": -0.5792728066444397, "logps/chosen": -37.57379913330078, "logps/rejected": -49.914222717285156, "loss": 0.3206, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9331499338150024, "rewards/margins": 1.3993866443634033, "rewards/rejected": -0.46623677015304565, "step": 2475 }, { "epoch": 0.25496041945101267, "grad_norm": 27.875, "learning_rate": 8.278501256568426e-07, "logits/chosen": -0.5560107827186584, "logits/rejected": -0.6224827766418457, "logps/chosen": -39.36342239379883, "logps/rejected": -55.598609924316406, "loss": 0.3361, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6587998270988464, "rewards/margins": 1.3865835666656494, "rewards/rejected": -0.7277837991714478, "step": 2480 }, { "epoch": 0.25547445255474455, "grad_norm": 40.0, "learning_rate": 8.272789581905413e-07, "logits/chosen": -0.6276751756668091, "logits/rejected": -0.5769188404083252, "logps/chosen": -38.77619934082031, "logps/rejected": -51.55413818359375, "loss": 0.3122, "rewards/accuracies": 0.875, "rewards/chosen": 0.6747103929519653, "rewards/margins": 1.5070953369140625, "rewards/rejected": -0.8323848843574524, "step": 2485 }, { "epoch": 0.25598848565847643, "grad_norm": 32.75, "learning_rate": 8.267077907242403e-07, "logits/chosen": -0.5309317708015442, "logits/rejected": -0.654255747795105, "logps/chosen": -41.43610382080078, "logps/rejected": -51.84813690185547, "loss": 0.2868, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9959470629692078, "rewards/margins": 1.9144554138183594, "rewards/rejected": -0.9185082316398621, "step": 2490 }, { "epoch": 0.2565025187622083, "grad_norm": 28.5, "learning_rate": 8.261366232579392e-07, "logits/chosen": -0.5676356554031372, "logits/rejected": -0.5691910982131958, "logps/chosen": -38.91252136230469, "logps/rejected": -51.90314483642578, "loss": 0.3339, "rewards/accuracies": 1.0, "rewards/chosen": 1.1306413412094116, "rewards/margins": 2.059508800506592, "rewards/rejected": -0.928867518901825, "step": 2495 }, { "epoch": 0.2570165518659402, "grad_norm": 34.5, "learning_rate": 8.255654557916381e-07, "logits/chosen": -0.5539349913597107, "logits/rejected": -0.5599786043167114, "logps/chosen": -41.418514251708984, "logps/rejected": -50.95996856689453, "loss": 0.3335, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8292455673217773, "rewards/margins": 1.7275283336639404, "rewards/rejected": -0.8982828259468079, "step": 2500 }, { "epoch": 0.25753058496967207, "grad_norm": 24.125, "learning_rate": 8.24994288325337e-07, "logits/chosen": -0.544836163520813, "logits/rejected": -0.5849722027778625, "logps/chosen": -39.00090026855469, "logps/rejected": -50.527137756347656, "loss": 0.3471, "rewards/accuracies": 0.875, "rewards/chosen": 0.8358081579208374, "rewards/margins": 1.341496229171753, "rewards/rejected": -0.505687952041626, "step": 2505 }, { "epoch": 0.25804461807340395, "grad_norm": 27.25, "learning_rate": 8.244231208590358e-07, "logits/chosen": -0.5479040145874023, "logits/rejected": -0.586516261100769, "logps/chosen": -37.29068374633789, "logps/rejected": -52.432777404785156, "loss": 0.3483, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8986017107963562, "rewards/margins": 1.436348557472229, "rewards/rejected": -0.5377467274665833, "step": 2510 }, { "epoch": 0.25855865117713583, "grad_norm": 41.5, "learning_rate": 8.238519533927347e-07, "logits/chosen": -0.533218502998352, "logits/rejected": -0.5613458752632141, "logps/chosen": -33.23505401611328, "logps/rejected": -47.821449279785156, "loss": 0.3343, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8412440419197083, "rewards/margins": 1.5999653339385986, "rewards/rejected": -0.7587212324142456, "step": 2515 }, { "epoch": 0.2590726842808677, "grad_norm": 31.0, "learning_rate": 8.232807859264336e-07, "logits/chosen": -0.5901399850845337, "logits/rejected": -0.5737950801849365, "logps/chosen": -38.137664794921875, "logps/rejected": -50.84968566894531, "loss": 0.3387, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7579399347305298, "rewards/margins": 1.5162951946258545, "rewards/rejected": -0.7583553194999695, "step": 2520 }, { "epoch": 0.25958671738459954, "grad_norm": 26.375, "learning_rate": 8.227096184601325e-07, "logits/chosen": -0.556483805179596, "logits/rejected": -0.6330370903015137, "logps/chosen": -34.883583068847656, "logps/rejected": -50.44098663330078, "loss": 0.3318, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8654283285140991, "rewards/margins": 1.6854321956634521, "rewards/rejected": -0.8200038075447083, "step": 2525 }, { "epoch": 0.2601007504883314, "grad_norm": 30.25, "learning_rate": 8.221384509938313e-07, "logits/chosen": -0.5933849811553955, "logits/rejected": -0.6062746644020081, "logps/chosen": -36.566978454589844, "logps/rejected": -52.498626708984375, "loss": 0.2954, "rewards/accuracies": 0.875, "rewards/chosen": 0.7794309854507446, "rewards/margins": 1.3937060832977295, "rewards/rejected": -0.6142752170562744, "step": 2530 }, { "epoch": 0.2606147835920633, "grad_norm": 42.75, "learning_rate": 8.215672835275302e-07, "logits/chosen": -0.5589181780815125, "logits/rejected": -0.5454494953155518, "logps/chosen": -46.60298538208008, "logps/rejected": -54.02357864379883, "loss": 0.3523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.894694447517395, "rewards/margins": 1.4741030931472778, "rewards/rejected": -0.5794087648391724, "step": 2535 }, { "epoch": 0.2611288166957952, "grad_norm": 34.0, "learning_rate": 8.209961160612291e-07, "logits/chosen": -0.5049071907997131, "logits/rejected": -0.589408278465271, "logps/chosen": -37.716773986816406, "logps/rejected": -51.049171447753906, "loss": 0.3085, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.601995587348938, "rewards/margins": 1.68996262550354, "rewards/rejected": -1.0879671573638916, "step": 2540 }, { "epoch": 0.26164284979952707, "grad_norm": 33.0, "learning_rate": 8.20424948594928e-07, "logits/chosen": -0.6891703009605408, "logits/rejected": -0.6552414298057556, "logps/chosen": -44.94964599609375, "logps/rejected": -52.5418815612793, "loss": 0.3343, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.627403736114502, "rewards/margins": 1.3622678518295288, "rewards/rejected": -0.7348642349243164, "step": 2545 }, { "epoch": 0.26215688290325895, "grad_norm": 27.75, "learning_rate": 8.19853781128627e-07, "logits/chosen": -0.42545390129089355, "logits/rejected": -0.45015591382980347, "logps/chosen": -40.2750358581543, "logps/rejected": -58.93817901611328, "loss": 0.3375, "rewards/accuracies": 0.875, "rewards/chosen": 0.7640242576599121, "rewards/margins": 1.6386816501617432, "rewards/rejected": -0.8746572732925415, "step": 2550 }, { "epoch": 0.26267091600699083, "grad_norm": 47.25, "learning_rate": 8.192826136623257e-07, "logits/chosen": -0.5523547530174255, "logits/rejected": -0.5699556469917297, "logps/chosen": -32.90167999267578, "logps/rejected": -51.282470703125, "loss": 0.3252, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8501798510551453, "rewards/margins": 1.5933705568313599, "rewards/rejected": -0.7431905269622803, "step": 2555 }, { "epoch": 0.2631849491107227, "grad_norm": 32.25, "learning_rate": 8.187114461960246e-07, "logits/chosen": -0.5408596992492676, "logits/rejected": -0.5678967833518982, "logps/chosen": -40.18189239501953, "logps/rejected": -49.22533416748047, "loss": 0.3496, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.543592631816864, "rewards/margins": 1.3306907415390015, "rewards/rejected": -0.7870982885360718, "step": 2560 }, { "epoch": 0.2636989822144546, "grad_norm": 29.75, "learning_rate": 8.181402787297235e-07, "logits/chosen": -0.5547072291374207, "logits/rejected": -0.5828058123588562, "logps/chosen": -37.347049713134766, "logps/rejected": -50.31292724609375, "loss": 0.3124, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6852031946182251, "rewards/margins": 1.57588529586792, "rewards/rejected": -0.8906822204589844, "step": 2565 }, { "epoch": 0.2642130153181865, "grad_norm": 33.5, "learning_rate": 8.175691112634225e-07, "logits/chosen": -0.5595728158950806, "logits/rejected": -0.6326349377632141, "logps/chosen": -36.790313720703125, "logps/rejected": -49.53343963623047, "loss": 0.343, "rewards/accuracies": 0.875, "rewards/chosen": 0.6468476057052612, "rewards/margins": 1.1680185794830322, "rewards/rejected": -0.5211710929870605, "step": 2570 }, { "epoch": 0.26472704842191835, "grad_norm": 29.375, "learning_rate": 8.169979437971212e-07, "logits/chosen": -0.5108506083488464, "logits/rejected": -0.5596425533294678, "logps/chosen": -37.745399475097656, "logps/rejected": -48.368919372558594, "loss": 0.3542, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7989941835403442, "rewards/margins": 1.1005011796951294, "rewards/rejected": -0.3015069365501404, "step": 2575 }, { "epoch": 0.26524108152565024, "grad_norm": 26.875, "learning_rate": 8.164267763308201e-07, "logits/chosen": -0.5720106363296509, "logits/rejected": -0.5780494809150696, "logps/chosen": -38.04465866088867, "logps/rejected": -48.160240173339844, "loss": 0.3344, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6913898587226868, "rewards/margins": 1.5069063901901245, "rewards/rejected": -0.815516471862793, "step": 2580 }, { "epoch": 0.2657551146293821, "grad_norm": 28.875, "learning_rate": 8.15855608864519e-07, "logits/chosen": -0.5518124103546143, "logits/rejected": -0.5865200757980347, "logps/chosen": -30.93931007385254, "logps/rejected": -46.8905143737793, "loss": 0.3229, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.814688503742218, "rewards/margins": 1.5711781978607178, "rewards/rejected": -0.756489634513855, "step": 2585 }, { "epoch": 0.266269147733114, "grad_norm": 26.125, "learning_rate": 8.15284441398218e-07, "logits/chosen": -0.41506463289260864, "logits/rejected": -0.449720561504364, "logps/chosen": -41.071510314941406, "logps/rejected": -54.53052520751953, "loss": 0.3287, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5473376512527466, "rewards/margins": 1.4212305545806885, "rewards/rejected": -0.8738929033279419, "step": 2590 }, { "epoch": 0.2667831808368459, "grad_norm": 27.0, "learning_rate": 8.147132739319167e-07, "logits/chosen": -0.5830138921737671, "logits/rejected": -0.6391711235046387, "logps/chosen": -38.42620849609375, "logps/rejected": -55.5728874206543, "loss": 0.3081, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8622525930404663, "rewards/margins": 1.82339608669281, "rewards/rejected": -0.9611434936523438, "step": 2595 }, { "epoch": 0.26729721394057776, "grad_norm": 33.75, "learning_rate": 8.141421064656156e-07, "logits/chosen": -0.5715519189834595, "logits/rejected": -0.7177749276161194, "logps/chosen": -40.219627380371094, "logps/rejected": -51.36127853393555, "loss": 0.3371, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7928406000137329, "rewards/margins": 1.4801390171051025, "rewards/rejected": -0.687298595905304, "step": 2600 }, { "epoch": 0.26781124704430964, "grad_norm": 54.0, "learning_rate": 8.135709389993146e-07, "logits/chosen": -0.5773388743400574, "logits/rejected": -0.5609177947044373, "logps/chosen": -39.13777542114258, "logps/rejected": -52.4000129699707, "loss": 0.3501, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5036259293556213, "rewards/margins": 1.4135462045669556, "rewards/rejected": -0.9099200963973999, "step": 2605 }, { "epoch": 0.2683252801480415, "grad_norm": 39.5, "learning_rate": 8.129997715330135e-07, "logits/chosen": -0.4972134530544281, "logits/rejected": -0.5395873785018921, "logps/chosen": -36.056480407714844, "logps/rejected": -46.372737884521484, "loss": 0.3255, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7175301313400269, "rewards/margins": 1.4679456949234009, "rewards/rejected": -0.7504154443740845, "step": 2610 }, { "epoch": 0.2688393132517734, "grad_norm": 33.5, "learning_rate": 8.124286040667124e-07, "logits/chosen": -0.556797444820404, "logits/rejected": -0.582920253276825, "logps/chosen": -37.28467559814453, "logps/rejected": -48.69483184814453, "loss": 0.35, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7106724977493286, "rewards/margins": 1.5370115041732788, "rewards/rejected": -0.8263387680053711, "step": 2615 }, { "epoch": 0.2693533463555053, "grad_norm": 27.25, "learning_rate": 8.118574366004111e-07, "logits/chosen": -0.6083003878593445, "logits/rejected": -0.5987656116485596, "logps/chosen": -36.334068298339844, "logps/rejected": -54.12360763549805, "loss": 0.306, "rewards/accuracies": 0.875, "rewards/chosen": 0.8716567158699036, "rewards/margins": 1.8070560693740845, "rewards/rejected": -0.9353994131088257, "step": 2620 }, { "epoch": 0.26986737945923717, "grad_norm": 31.25, "learning_rate": 8.112862691341101e-07, "logits/chosen": -0.565024197101593, "logits/rejected": -0.5650396347045898, "logps/chosen": -37.43252182006836, "logps/rejected": -54.5045166015625, "loss": 0.3366, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8193279504776001, "rewards/margins": 1.7015384435653687, "rewards/rejected": -0.8822104334831238, "step": 2625 }, { "epoch": 0.27038141256296905, "grad_norm": 36.25, "learning_rate": 8.10715101667809e-07, "logits/chosen": -0.6296054124832153, "logits/rejected": -0.601709246635437, "logps/chosen": -40.42405700683594, "logps/rejected": -51.58613204956055, "loss": 0.3012, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9447135925292969, "rewards/margins": 1.5467709302902222, "rewards/rejected": -0.6020573377609253, "step": 2630 }, { "epoch": 0.27089544566670093, "grad_norm": 31.625, "learning_rate": 8.101439342015079e-07, "logits/chosen": -0.6279605031013489, "logits/rejected": -0.6460429430007935, "logps/chosen": -41.076210021972656, "logps/rejected": -51.4727783203125, "loss": 0.3853, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8535226583480835, "rewards/margins": 1.5410677194595337, "rewards/rejected": -0.6875449419021606, "step": 2635 }, { "epoch": 0.2714094787704328, "grad_norm": 43.25, "learning_rate": 8.095727667352066e-07, "logits/chosen": -0.6000229120254517, "logits/rejected": -0.6271520256996155, "logps/chosen": -36.1815185546875, "logps/rejected": -46.01683807373047, "loss": 0.3306, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8568794131278992, "rewards/margins": 1.5293705463409424, "rewards/rejected": -0.672491192817688, "step": 2640 }, { "epoch": 0.2719235118741647, "grad_norm": 42.25, "learning_rate": 8.090015992689056e-07, "logits/chosen": -0.5202063322067261, "logits/rejected": -0.5808990597724915, "logps/chosen": -38.52047348022461, "logps/rejected": -50.732730865478516, "loss": 0.307, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6776046752929688, "rewards/margins": 1.4748519659042358, "rewards/rejected": -0.7972472906112671, "step": 2645 }, { "epoch": 0.2724375449778966, "grad_norm": 37.5, "learning_rate": 8.084304318026045e-07, "logits/chosen": -0.45740780234336853, "logits/rejected": -0.558831512928009, "logps/chosen": -35.184974670410156, "logps/rejected": -48.28584289550781, "loss": 0.313, "rewards/accuracies": 0.875, "rewards/chosen": 0.6477391719818115, "rewards/margins": 1.4011237621307373, "rewards/rejected": -0.7533845901489258, "step": 2650 }, { "epoch": 0.27295157808162845, "grad_norm": 44.25, "learning_rate": 8.078592643363034e-07, "logits/chosen": -0.5540612936019897, "logits/rejected": -0.5380195379257202, "logps/chosen": -48.20555877685547, "logps/rejected": -57.02117919921875, "loss": 0.3001, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8224876523017883, "rewards/margins": 1.928381323814392, "rewards/rejected": -1.1058937311172485, "step": 2655 }, { "epoch": 0.27346561118536034, "grad_norm": 29.625, "learning_rate": 8.072880968700023e-07, "logits/chosen": -0.5973197817802429, "logits/rejected": -0.5963981747627258, "logps/chosen": -38.93446731567383, "logps/rejected": -49.95897674560547, "loss": 0.3515, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7645783424377441, "rewards/margins": 1.4363292455673218, "rewards/rejected": -0.6717509031295776, "step": 2660 }, { "epoch": 0.2739796442890922, "grad_norm": 31.625, "learning_rate": 8.067169294037011e-07, "logits/chosen": -0.6118860244750977, "logits/rejected": -0.6059748530387878, "logps/chosen": -39.40489959716797, "logps/rejected": -53.81787872314453, "loss": 0.3318, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.846260666847229, "rewards/margins": 1.7310192584991455, "rewards/rejected": -0.8847583532333374, "step": 2665 }, { "epoch": 0.2744936773928241, "grad_norm": 43.5, "learning_rate": 8.061457619374e-07, "logits/chosen": -0.5892786383628845, "logits/rejected": -0.6523498892784119, "logps/chosen": -44.20829772949219, "logps/rejected": -48.53339767456055, "loss": 0.3435, "rewards/accuracies": 0.875, "rewards/chosen": 0.786115288734436, "rewards/margins": 1.2078816890716553, "rewards/rejected": -0.4217664301395416, "step": 2670 }, { "epoch": 0.275007710496556, "grad_norm": 50.5, "learning_rate": 8.055745944710989e-07, "logits/chosen": -0.4891696870326996, "logits/rejected": -0.5408421754837036, "logps/chosen": -35.53083038330078, "logps/rejected": -50.99610900878906, "loss": 0.3202, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9761897921562195, "rewards/margins": 1.8018112182617188, "rewards/rejected": -0.8256214261054993, "step": 2675 }, { "epoch": 0.27552174360028786, "grad_norm": 26.125, "learning_rate": 8.050034270047978e-07, "logits/chosen": -0.5370458364486694, "logits/rejected": -0.512549638748169, "logps/chosen": -41.92572784423828, "logps/rejected": -52.40850830078125, "loss": 0.3, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7289192080497742, "rewards/margins": 1.579735279083252, "rewards/rejected": -0.8508160710334778, "step": 2680 }, { "epoch": 0.27603577670401974, "grad_norm": 37.5, "learning_rate": 8.044322595384966e-07, "logits/chosen": -0.6152782440185547, "logits/rejected": -0.6609753370285034, "logps/chosen": -43.19878387451172, "logps/rejected": -50.677459716796875, "loss": 0.3148, "rewards/accuracies": 0.875, "rewards/chosen": 0.8183810114860535, "rewards/margins": 1.5711876153945923, "rewards/rejected": -0.7528066635131836, "step": 2685 }, { "epoch": 0.2765498098077516, "grad_norm": 28.0, "learning_rate": 8.038610920721955e-07, "logits/chosen": -0.5986356735229492, "logits/rejected": -0.6448944807052612, "logps/chosen": -41.701438903808594, "logps/rejected": -54.01934051513672, "loss": 0.3235, "rewards/accuracies": 0.875, "rewards/chosen": 0.5900524854660034, "rewards/margins": 1.530051827430725, "rewards/rejected": -0.9399992823600769, "step": 2690 }, { "epoch": 0.2770638429114835, "grad_norm": 28.125, "learning_rate": 8.032899246058944e-07, "logits/chosen": -0.5560696125030518, "logits/rejected": -0.5365002155303955, "logps/chosen": -41.13336181640625, "logps/rejected": -50.325382232666016, "loss": 0.3534, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6695707440376282, "rewards/margins": 1.3802134990692139, "rewards/rejected": -0.7106426954269409, "step": 2695 }, { "epoch": 0.2775778760152154, "grad_norm": 29.5, "learning_rate": 8.027187571395933e-07, "logits/chosen": -0.6531693935394287, "logits/rejected": -0.655467689037323, "logps/chosen": -37.27671813964844, "logps/rejected": -52.17753219604492, "loss": 0.3266, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7505525946617126, "rewards/margins": 1.794227957725525, "rewards/rejected": -1.0436756610870361, "step": 2700 }, { "epoch": 0.27809190911894727, "grad_norm": 75.5, "learning_rate": 8.021475896732923e-07, "logits/chosen": -0.5463694930076599, "logits/rejected": -0.5457005500793457, "logps/chosen": -40.31636428833008, "logps/rejected": -50.86219024658203, "loss": 0.3142, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.75978022813797, "rewards/margins": 1.6092884540557861, "rewards/rejected": -0.8495081663131714, "step": 2705 }, { "epoch": 0.27860594222267915, "grad_norm": 29.375, "learning_rate": 8.01576422206991e-07, "logits/chosen": -0.5057317018508911, "logits/rejected": -0.5163851976394653, "logps/chosen": -39.22124099731445, "logps/rejected": -47.41815948486328, "loss": 0.3181, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6806716322898865, "rewards/margins": 1.0450575351715088, "rewards/rejected": -0.36438578367233276, "step": 2710 }, { "epoch": 0.27911997532641103, "grad_norm": 49.25, "learning_rate": 8.010052547406899e-07, "logits/chosen": -0.6168071031570435, "logits/rejected": -0.6198270916938782, "logps/chosen": -34.5545654296875, "logps/rejected": -52.644386291503906, "loss": 0.2874, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5474293828010559, "rewards/margins": 1.5009405612945557, "rewards/rejected": -0.9535113573074341, "step": 2715 }, { "epoch": 0.2796340084301429, "grad_norm": 34.75, "learning_rate": 8.004340872743888e-07, "logits/chosen": -0.6791720390319824, "logits/rejected": -0.6941150426864624, "logps/chosen": -41.28388595581055, "logps/rejected": -54.35636520385742, "loss": 0.336, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8671029806137085, "rewards/margins": 1.7142982482910156, "rewards/rejected": -0.8471953272819519, "step": 2720 }, { "epoch": 0.2801480415338748, "grad_norm": 30.625, "learning_rate": 7.998629198080878e-07, "logits/chosen": -0.6395012140274048, "logits/rejected": -0.6585319638252258, "logps/chosen": -39.93240737915039, "logps/rejected": -48.5073127746582, "loss": 0.3149, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6066262125968933, "rewards/margins": 1.3643603324890137, "rewards/rejected": -0.7577340006828308, "step": 2725 }, { "epoch": 0.2806620746376067, "grad_norm": 25.0, "learning_rate": 7.992917523417865e-07, "logits/chosen": -0.5619273781776428, "logits/rejected": -0.651478111743927, "logps/chosen": -38.12032699584961, "logps/rejected": -51.218482971191406, "loss": 0.3415, "rewards/accuracies": 0.875, "rewards/chosen": 0.52886962890625, "rewards/margins": 1.6256355047225952, "rewards/rejected": -1.0967657566070557, "step": 2730 }, { "epoch": 0.28117610774133855, "grad_norm": 30.25, "learning_rate": 7.987205848754854e-07, "logits/chosen": -0.6156484484672546, "logits/rejected": -0.643109142780304, "logps/chosen": -38.44417190551758, "logps/rejected": -54.91167449951172, "loss": 0.3277, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7776897549629211, "rewards/margins": 1.7829749584197998, "rewards/rejected": -1.0052852630615234, "step": 2735 }, { "epoch": 0.28169014084507044, "grad_norm": 35.5, "learning_rate": 7.981494174091843e-07, "logits/chosen": -0.5015630125999451, "logits/rejected": -0.5097275972366333, "logps/chosen": -42.26335906982422, "logps/rejected": -47.423343658447266, "loss": 0.322, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6644924879074097, "rewards/margins": 1.1846481561660767, "rewards/rejected": -0.5201555490493774, "step": 2740 }, { "epoch": 0.2822041739488023, "grad_norm": 28.25, "learning_rate": 7.975782499428833e-07, "logits/chosen": -0.5102794766426086, "logits/rejected": -0.5601925849914551, "logps/chosen": -38.11832046508789, "logps/rejected": -50.34503936767578, "loss": 0.3342, "rewards/accuracies": 0.875, "rewards/chosen": 0.8043190240859985, "rewards/margins": 1.626518964767456, "rewards/rejected": -0.822199821472168, "step": 2745 }, { "epoch": 0.2827182070525342, "grad_norm": 24.5, "learning_rate": 7.970070824765822e-07, "logits/chosen": -0.6255351305007935, "logits/rejected": -0.5950052738189697, "logps/chosen": -38.21377182006836, "logps/rejected": -55.45896530151367, "loss": 0.3144, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.865623950958252, "rewards/margins": 2.031430721282959, "rewards/rejected": -1.1658064126968384, "step": 2750 }, { "epoch": 0.2832322401562661, "grad_norm": 29.375, "learning_rate": 7.964359150102809e-07, "logits/chosen": -0.5556917190551758, "logits/rejected": -0.6364370584487915, "logps/chosen": -37.98551559448242, "logps/rejected": -51.4348030090332, "loss": 0.3311, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9023094177246094, "rewards/margins": 1.792612075805664, "rewards/rejected": -0.8903026580810547, "step": 2755 }, { "epoch": 0.28374627325999796, "grad_norm": 25.625, "learning_rate": 7.958647475439798e-07, "logits/chosen": -0.6264057159423828, "logits/rejected": -0.6303829550743103, "logps/chosen": -36.876197814941406, "logps/rejected": -55.11478805541992, "loss": 0.3129, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7299001812934875, "rewards/margins": 1.6665176153182983, "rewards/rejected": -0.9366175532341003, "step": 2760 }, { "epoch": 0.28426030636372984, "grad_norm": 34.75, "learning_rate": 7.952935800776788e-07, "logits/chosen": -0.5764901638031006, "logits/rejected": -0.6148121953010559, "logps/chosen": -38.388153076171875, "logps/rejected": -51.450782775878906, "loss": 0.311, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.020770788192749, "rewards/margins": 1.8746757507324219, "rewards/rejected": -0.8539049029350281, "step": 2765 }, { "epoch": 0.2847743394674617, "grad_norm": 32.5, "learning_rate": 7.947224126113777e-07, "logits/chosen": -0.5868847966194153, "logits/rejected": -0.6352534294128418, "logps/chosen": -42.50141143798828, "logps/rejected": -50.22951126098633, "loss": 0.3484, "rewards/accuracies": 0.75, "rewards/chosen": 0.693642795085907, "rewards/margins": 1.1238865852355957, "rewards/rejected": -0.4302436411380768, "step": 2770 }, { "epoch": 0.2852883725711936, "grad_norm": 28.5, "learning_rate": 7.941512451450764e-07, "logits/chosen": -0.5802504420280457, "logits/rejected": -0.6199682950973511, "logps/chosen": -41.97921371459961, "logps/rejected": -52.42344284057617, "loss": 0.3176, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6132591962814331, "rewards/margins": 1.5867505073547363, "rewards/rejected": -0.973491370677948, "step": 2775 }, { "epoch": 0.2858024056749255, "grad_norm": 32.5, "learning_rate": 7.935800776787753e-07, "logits/chosen": -0.5894335508346558, "logits/rejected": -0.5253283977508545, "logps/chosen": -39.5084342956543, "logps/rejected": -50.95637130737305, "loss": 0.3147, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7894219756126404, "rewards/margins": 1.60183846950531, "rewards/rejected": -0.8124163746833801, "step": 2780 }, { "epoch": 0.28631643877865737, "grad_norm": 32.75, "learning_rate": 7.930089102124743e-07, "logits/chosen": -0.6230098009109497, "logits/rejected": -0.6103426814079285, "logps/chosen": -41.36117935180664, "logps/rejected": -46.64020538330078, "loss": 0.3185, "rewards/accuracies": 0.875, "rewards/chosen": 0.9115855097770691, "rewards/margins": 1.32338547706604, "rewards/rejected": -0.41179990768432617, "step": 2785 }, { "epoch": 0.28683047188238925, "grad_norm": 39.5, "learning_rate": 7.924377427461732e-07, "logits/chosen": -0.5788400173187256, "logits/rejected": -0.6223078370094299, "logps/chosen": -41.689361572265625, "logps/rejected": -52.362022399902344, "loss": 0.3193, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7765151262283325, "rewards/margins": 1.585984230041504, "rewards/rejected": -0.8094690442085266, "step": 2790 }, { "epoch": 0.28734450498612113, "grad_norm": 28.5, "learning_rate": 7.91866575279872e-07, "logits/chosen": -0.4896772503852844, "logits/rejected": -0.5467736124992371, "logps/chosen": -40.36580276489258, "logps/rejected": -52.81828689575195, "loss": 0.319, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7770923376083374, "rewards/margins": 1.5965068340301514, "rewards/rejected": -0.8194146156311035, "step": 2795 }, { "epoch": 0.287858538089853, "grad_norm": 35.5, "learning_rate": 7.912954078135708e-07, "logits/chosen": -0.48675990104675293, "logits/rejected": -0.5747967958450317, "logps/chosen": -31.286739349365234, "logps/rejected": -52.25028610229492, "loss": 0.3183, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9186381101608276, "rewards/margins": 1.9105141162872314, "rewards/rejected": -0.991875946521759, "step": 2800 }, { "epoch": 0.2883725711935849, "grad_norm": 31.625, "learning_rate": 7.907242403472698e-07, "logits/chosen": -0.5314956903457642, "logits/rejected": -0.5037826299667358, "logps/chosen": -38.24446487426758, "logps/rejected": -50.572059631347656, "loss": 0.3233, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9135986566543579, "rewards/margins": 1.730847716331482, "rewards/rejected": -0.8172491788864136, "step": 2805 }, { "epoch": 0.2888866042973168, "grad_norm": 33.75, "learning_rate": 7.901530728809687e-07, "logits/chosen": -0.6081064939498901, "logits/rejected": -0.5640048384666443, "logps/chosen": -39.97286605834961, "logps/rejected": -49.895835876464844, "loss": 0.3309, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7328807711601257, "rewards/margins": 1.1848268508911133, "rewards/rejected": -0.4519461989402771, "step": 2810 }, { "epoch": 0.28940063740104865, "grad_norm": 28.625, "learning_rate": 7.895819054146676e-07, "logits/chosen": -0.5018969774246216, "logits/rejected": -0.5233650207519531, "logps/chosen": -37.96908187866211, "logps/rejected": -52.6068229675293, "loss": 0.3469, "rewards/accuracies": 0.875, "rewards/chosen": 0.7964318990707397, "rewards/margins": 1.4728972911834717, "rewards/rejected": -0.6764655113220215, "step": 2815 }, { "epoch": 0.2899146705047805, "grad_norm": 34.0, "learning_rate": 7.890107379483664e-07, "logits/chosen": -0.5232565999031067, "logits/rejected": -0.5189487338066101, "logps/chosen": -39.706703186035156, "logps/rejected": -47.306732177734375, "loss": 0.3743, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7025637626647949, "rewards/margins": 1.27166748046875, "rewards/rejected": -0.5691037178039551, "step": 2820 }, { "epoch": 0.29042870360851236, "grad_norm": 30.125, "learning_rate": 7.884395704820653e-07, "logits/chosen": -0.580420970916748, "logits/rejected": -0.597321093082428, "logps/chosen": -39.980743408203125, "logps/rejected": -48.996604919433594, "loss": 0.3579, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.4959433078765869, "rewards/margins": 1.1604650020599365, "rewards/rejected": -0.6645216345787048, "step": 2825 }, { "epoch": 0.29094273671224424, "grad_norm": 52.5, "learning_rate": 7.878684030157642e-07, "logits/chosen": -0.5829691886901855, "logits/rejected": -0.600680947303772, "logps/chosen": -38.030242919921875, "logps/rejected": -47.41745376586914, "loss": 0.3287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7585018873214722, "rewards/margins": 1.496957540512085, "rewards/rejected": -0.7384557723999023, "step": 2830 }, { "epoch": 0.2914567698159761, "grad_norm": 39.75, "learning_rate": 7.872972355494631e-07, "logits/chosen": -0.47838473320007324, "logits/rejected": -0.4978574216365814, "logps/chosen": -40.6726188659668, "logps/rejected": -50.62644577026367, "loss": 0.3286, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6016615629196167, "rewards/margins": 1.354872226715088, "rewards/rejected": -0.7532106637954712, "step": 2835 }, { "epoch": 0.291970802919708, "grad_norm": 27.5, "learning_rate": 7.86726068083162e-07, "logits/chosen": -0.5507707595825195, "logits/rejected": -0.6474792957305908, "logps/chosen": -36.957801818847656, "logps/rejected": -53.323699951171875, "loss": 0.3013, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8331316709518433, "rewards/margins": 1.5970426797866821, "rewards/rejected": -0.7639108896255493, "step": 2840 }, { "epoch": 0.2924848360234399, "grad_norm": 27.625, "learning_rate": 7.861549006168608e-07, "logits/chosen": -0.527405321598053, "logits/rejected": -0.5063089728355408, "logps/chosen": -37.726097106933594, "logps/rejected": -50.51045608520508, "loss": 0.2984, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7443009614944458, "rewards/margins": 1.3620805740356445, "rewards/rejected": -0.617779552936554, "step": 2845 }, { "epoch": 0.29299886912717177, "grad_norm": 44.75, "learning_rate": 7.855837331505597e-07, "logits/chosen": -0.5319653153419495, "logits/rejected": -0.5250851511955261, "logps/chosen": -36.13595199584961, "logps/rejected": -52.084022521972656, "loss": 0.3331, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7030356526374817, "rewards/margins": 1.2938852310180664, "rewards/rejected": -0.5908496975898743, "step": 2850 }, { "epoch": 0.29351290223090365, "grad_norm": 30.0, "learning_rate": 7.850125656842586e-07, "logits/chosen": -0.4976530075073242, "logits/rejected": -0.5193990468978882, "logps/chosen": -39.44697189331055, "logps/rejected": -53.611358642578125, "loss": 0.3268, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7338736057281494, "rewards/margins": 1.58577299118042, "rewards/rejected": -0.851899266242981, "step": 2855 }, { "epoch": 0.29402693533463553, "grad_norm": 34.75, "learning_rate": 7.844413982179576e-07, "logits/chosen": -0.4936915338039398, "logits/rejected": -0.5671813488006592, "logps/chosen": -41.091373443603516, "logps/rejected": -53.112754821777344, "loss": 0.3024, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7006824612617493, "rewards/margins": 1.449549913406372, "rewards/rejected": -0.7488673329353333, "step": 2860 }, { "epoch": 0.2945409684383674, "grad_norm": 28.5, "learning_rate": 7.838702307516563e-07, "logits/chosen": -0.5492366552352905, "logits/rejected": -0.5514737963676453, "logps/chosen": -42.392601013183594, "logps/rejected": -51.710052490234375, "loss": 0.2935, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8023785352706909, "rewards/margins": 1.541131854057312, "rewards/rejected": -0.7387533187866211, "step": 2865 }, { "epoch": 0.2950550015420993, "grad_norm": 25.0, "learning_rate": 7.832990632853552e-07, "logits/chosen": -0.5668608546257019, "logits/rejected": -0.5478943586349487, "logps/chosen": -33.097408294677734, "logps/rejected": -50.13191604614258, "loss": 0.3472, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8493663668632507, "rewards/margins": 1.9473533630371094, "rewards/rejected": -1.0979869365692139, "step": 2870 }, { "epoch": 0.2955690346458312, "grad_norm": 33.5, "learning_rate": 7.827278958190541e-07, "logits/chosen": -0.6412750482559204, "logits/rejected": -0.6356221437454224, "logps/chosen": -35.047176361083984, "logps/rejected": -49.67909240722656, "loss": 0.337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8270595669746399, "rewards/margins": 1.3279955387115479, "rewards/rejected": -0.5009359121322632, "step": 2875 }, { "epoch": 0.29608306774956306, "grad_norm": 32.5, "learning_rate": 7.821567283527531e-07, "logits/chosen": -0.6392526626586914, "logits/rejected": -0.6894222497940063, "logps/chosen": -40.6172981262207, "logps/rejected": -60.69456100463867, "loss": 0.3189, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.690325915813446, "rewards/margins": 1.6393060684204102, "rewards/rejected": -0.9489800333976746, "step": 2880 }, { "epoch": 0.29659710085329494, "grad_norm": 27.0, "learning_rate": 7.815855608864519e-07, "logits/chosen": -0.5585804581642151, "logits/rejected": -0.5581811666488647, "logps/chosen": -36.36436462402344, "logps/rejected": -47.69687271118164, "loss": 0.3206, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8932547569274902, "rewards/margins": 1.6562995910644531, "rewards/rejected": -0.7630447149276733, "step": 2885 }, { "epoch": 0.2971111339570268, "grad_norm": 28.0, "learning_rate": 7.810143934201507e-07, "logits/chosen": -0.5142213106155396, "logits/rejected": -0.5880078673362732, "logps/chosen": -35.39085388183594, "logps/rejected": -57.158477783203125, "loss": 0.3175, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0097005367279053, "rewards/margins": 1.715166687965393, "rewards/rejected": -0.705466091632843, "step": 2890 }, { "epoch": 0.2976251670607587, "grad_norm": 23.0, "learning_rate": 7.804432259538496e-07, "logits/chosen": -0.47852247953414917, "logits/rejected": -0.6048077344894409, "logps/chosen": -40.649879455566406, "logps/rejected": -48.9451904296875, "loss": 0.2915, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9587495923042297, "rewards/margins": 1.5323927402496338, "rewards/rejected": -0.5736433267593384, "step": 2895 }, { "epoch": 0.2981392001644906, "grad_norm": 27.25, "learning_rate": 7.798720584875486e-07, "logits/chosen": -0.5752219557762146, "logits/rejected": -0.5965328812599182, "logps/chosen": -32.410980224609375, "logps/rejected": -48.50654220581055, "loss": 0.3275, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8733240962028503, "rewards/margins": 1.6390893459320068, "rewards/rejected": -0.7657654285430908, "step": 2900 }, { "epoch": 0.29865323326822246, "grad_norm": 26.75, "learning_rate": 7.793008910212475e-07, "logits/chosen": -0.5923627614974976, "logits/rejected": -0.600249171257019, "logps/chosen": -38.932167053222656, "logps/rejected": -55.134986877441406, "loss": 0.3199, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7377450466156006, "rewards/margins": 1.7619025707244873, "rewards/rejected": -1.0241577625274658, "step": 2905 }, { "epoch": 0.29916726637195434, "grad_norm": 28.875, "learning_rate": 7.787297235549462e-07, "logits/chosen": -0.6332677602767944, "logits/rejected": -0.6492992043495178, "logps/chosen": -40.84601593017578, "logps/rejected": -52.169960021972656, "loss": 0.352, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5198203325271606, "rewards/margins": 1.1531544923782349, "rewards/rejected": -0.6333340406417847, "step": 2910 }, { "epoch": 0.2996812994756862, "grad_norm": 29.875, "learning_rate": 7.781585560886451e-07, "logits/chosen": -0.6043485403060913, "logits/rejected": -0.5452898740768433, "logps/chosen": -38.535003662109375, "logps/rejected": -50.80746841430664, "loss": 0.316, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5890969038009644, "rewards/margins": 1.7265207767486572, "rewards/rejected": -1.1374242305755615, "step": 2915 }, { "epoch": 0.30009252595867175, "eval_logits/chosen": -0.5386792421340942, "eval_logits/rejected": -0.5987717509269714, "eval_logps/chosen": -77.57208251953125, "eval_logps/rejected": -54.76309585571289, "eval_loss": 0.31749245524406433, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.73365318775177, "eval_rewards/margins": 1.5757219791412354, "eval_rewards/rejected": -0.8420688509941101, "eval_runtime": 2.1081, "eval_samples_per_second": 507.577, "eval_steps_per_second": 8.064, "step": 2919 }, { "epoch": 0.3001953325794181, "grad_norm": 23.625, "learning_rate": 7.775873886223441e-07, "logits/chosen": -0.5797132253646851, "logits/rejected": -0.5928038954734802, "logps/chosen": -37.97409439086914, "logps/rejected": -56.41802978515625, "loss": 0.2756, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9339389801025391, "rewards/margins": 1.910517692565918, "rewards/rejected": -0.9765788316726685, "step": 2920 }, { "epoch": 0.30070936568315, "grad_norm": 43.0, "learning_rate": 7.77016221156043e-07, "logits/chosen": -0.5500937700271606, "logits/rejected": -0.5827196836471558, "logps/chosen": -37.575416564941406, "logps/rejected": -57.097434997558594, "loss": 0.3094, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8119398355484009, "rewards/margins": 1.9190095663070679, "rewards/rejected": -1.107069969177246, "step": 2925 }, { "epoch": 0.30122339878688187, "grad_norm": 26.875, "learning_rate": 7.764450536897417e-07, "logits/chosen": -0.5498543381690979, "logits/rejected": -0.5437840223312378, "logps/chosen": -36.46363067626953, "logps/rejected": -48.51121520996094, "loss": 0.3183, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9067064523696899, "rewards/margins": 1.341650366783142, "rewards/rejected": -0.4349438548088074, "step": 2930 }, { "epoch": 0.30173743189061375, "grad_norm": 40.25, "learning_rate": 7.758738862234406e-07, "logits/chosen": -0.5090315341949463, "logits/rejected": -0.50466388463974, "logps/chosen": -37.37971878051758, "logps/rejected": -50.99030685424805, "loss": 0.3315, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6877740025520325, "rewards/margins": 1.0885086059570312, "rewards/rejected": -0.40073466300964355, "step": 2935 }, { "epoch": 0.30225146499434563, "grad_norm": 28.5, "learning_rate": 7.753027187571396e-07, "logits/chosen": -0.5525422096252441, "logits/rejected": -0.5840590000152588, "logps/chosen": -42.94794464111328, "logps/rejected": -52.775840759277344, "loss": 0.3205, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8211679458618164, "rewards/margins": 1.7452062368392944, "rewards/rejected": -0.9240385293960571, "step": 2940 }, { "epoch": 0.3027654980980775, "grad_norm": 42.75, "learning_rate": 7.747315512908385e-07, "logits/chosen": -0.5997062921524048, "logits/rejected": -0.5743883848190308, "logps/chosen": -37.42679977416992, "logps/rejected": -58.694671630859375, "loss": 0.3358, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8176084756851196, "rewards/margins": 1.6229045391082764, "rewards/rejected": -0.8052962422370911, "step": 2945 }, { "epoch": 0.3032795312018094, "grad_norm": 26.375, "learning_rate": 7.741603838245373e-07, "logits/chosen": -0.5691391825675964, "logits/rejected": -0.5723081827163696, "logps/chosen": -37.19514465332031, "logps/rejected": -46.65428924560547, "loss": 0.3433, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.8514429330825806, "rewards/margins": 1.433421015739441, "rewards/rejected": -0.5819781422615051, "step": 2950 }, { "epoch": 0.3037935643055413, "grad_norm": 29.0, "learning_rate": 7.735892163582361e-07, "logits/chosen": -0.6420547962188721, "logits/rejected": -0.6513731479644775, "logps/chosen": -46.0274658203125, "logps/rejected": -53.33039474487305, "loss": 0.2959, "rewards/accuracies": 0.875, "rewards/chosen": 0.8344631195068359, "rewards/margins": 1.8240280151367188, "rewards/rejected": -0.9895647764205933, "step": 2955 }, { "epoch": 0.30430759740927316, "grad_norm": 31.375, "learning_rate": 7.730180488919351e-07, "logits/chosen": -0.5946555733680725, "logits/rejected": -0.5529307126998901, "logps/chosen": -48.54204177856445, "logps/rejected": -54.3989372253418, "loss": 0.3604, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8121358752250671, "rewards/margins": 1.6319761276245117, "rewards/rejected": -0.8198402523994446, "step": 2960 }, { "epoch": 0.30482163051300504, "grad_norm": 29.25, "learning_rate": 7.72446881425634e-07, "logits/chosen": -0.559175431728363, "logits/rejected": -0.6162104606628418, "logps/chosen": -38.21702194213867, "logps/rejected": -54.75004959106445, "loss": 0.3331, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8181084394454956, "rewards/margins": 1.812313437461853, "rewards/rejected": -0.994205117225647, "step": 2965 }, { "epoch": 0.3053356636167369, "grad_norm": 27.0, "learning_rate": 7.718757139593329e-07, "logits/chosen": -0.5106409788131714, "logits/rejected": -0.5549336671829224, "logps/chosen": -41.16735076904297, "logps/rejected": -52.969566345214844, "loss": 0.3663, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8586363792419434, "rewards/margins": 1.4744458198547363, "rewards/rejected": -0.6158093810081482, "step": 2970 }, { "epoch": 0.3058496967204688, "grad_norm": 34.25, "learning_rate": 7.713045464930316e-07, "logits/chosen": -0.6239701509475708, "logits/rejected": -0.5881041288375854, "logps/chosen": -34.86396408081055, "logps/rejected": -49.013145446777344, "loss": 0.2968, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7032966613769531, "rewards/margins": 1.3707959651947021, "rewards/rejected": -0.667499303817749, "step": 2975 }, { "epoch": 0.3063637298242007, "grad_norm": 34.25, "learning_rate": 7.707333790267306e-07, "logits/chosen": -0.5760880708694458, "logits/rejected": -0.6224786043167114, "logps/chosen": -41.14115524291992, "logps/rejected": -55.886627197265625, "loss": 0.3062, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7745804190635681, "rewards/margins": 1.759320616722107, "rewards/rejected": -0.9847403764724731, "step": 2980 }, { "epoch": 0.30687776292793256, "grad_norm": 30.75, "learning_rate": 7.701622115604295e-07, "logits/chosen": -0.6509965658187866, "logits/rejected": -0.6257980465888977, "logps/chosen": -38.58134078979492, "logps/rejected": -52.99036407470703, "loss": 0.3451, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6383540630340576, "rewards/margins": 1.184044599533081, "rewards/rejected": -0.5456904768943787, "step": 2985 }, { "epoch": 0.30739179603166444, "grad_norm": 63.0, "learning_rate": 7.695910440941284e-07, "logits/chosen": -0.5317556858062744, "logits/rejected": -0.5111108422279358, "logps/chosen": -41.72612380981445, "logps/rejected": -55.103721618652344, "loss": 0.3517, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.675636351108551, "rewards/margins": 1.6185426712036133, "rewards/rejected": -0.9429062008857727, "step": 2990 }, { "epoch": 0.3079058291353963, "grad_norm": 31.5, "learning_rate": 7.690198766278272e-07, "logits/chosen": -0.5717836022377014, "logits/rejected": -0.5830844044685364, "logps/chosen": -38.348724365234375, "logps/rejected": -56.482460021972656, "loss": 0.3027, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6441038250923157, "rewards/margins": 1.8909555673599243, "rewards/rejected": -1.2468518018722534, "step": 2995 }, { "epoch": 0.3084198622391282, "grad_norm": 30.25, "learning_rate": 7.684487091615261e-07, "logits/chosen": -0.5451354384422302, "logits/rejected": -0.5278294682502747, "logps/chosen": -36.04201126098633, "logps/rejected": -48.8140869140625, "loss": 0.3111, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5250680446624756, "rewards/margins": 1.146518349647522, "rewards/rejected": -0.6214501261711121, "step": 3000 }, { "epoch": 0.3089338953428601, "grad_norm": 31.125, "learning_rate": 7.67877541695225e-07, "logits/chosen": -0.560802161693573, "logits/rejected": -0.5806612372398376, "logps/chosen": -43.0438117980957, "logps/rejected": -56.244911193847656, "loss": 0.306, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5948652625083923, "rewards/margins": 1.385058879852295, "rewards/rejected": -0.7901936769485474, "step": 3005 }, { "epoch": 0.30944792844659197, "grad_norm": 35.0, "learning_rate": 7.673063742289239e-07, "logits/chosen": -0.6126992702484131, "logits/rejected": -0.6396545171737671, "logps/chosen": -43.697837829589844, "logps/rejected": -53.76910400390625, "loss": 0.295, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6114951372146606, "rewards/margins": 1.4322755336761475, "rewards/rejected": -0.8207803964614868, "step": 3010 }, { "epoch": 0.30996196155032385, "grad_norm": 25.125, "learning_rate": 7.667352067626229e-07, "logits/chosen": -0.7492235898971558, "logits/rejected": -0.6955565214157104, "logps/chosen": -44.53866958618164, "logps/rejected": -56.16865158081055, "loss": 0.3152, "rewards/accuracies": 1.0, "rewards/chosen": 0.6949045062065125, "rewards/margins": 1.7406940460205078, "rewards/rejected": -1.0457894802093506, "step": 3015 }, { "epoch": 0.31047599465405573, "grad_norm": 52.0, "learning_rate": 7.661640392963216e-07, "logits/chosen": -0.6057881116867065, "logits/rejected": -0.6500394940376282, "logps/chosen": -41.23649597167969, "logps/rejected": -55.3447265625, "loss": 0.341, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8732160329818726, "rewards/margins": 1.7431766986846924, "rewards/rejected": -0.8699606657028198, "step": 3020 }, { "epoch": 0.3109900277577876, "grad_norm": 28.875, "learning_rate": 7.655928718300205e-07, "logits/chosen": -0.5782452821731567, "logits/rejected": -0.6065753698348999, "logps/chosen": -45.540130615234375, "logps/rejected": -53.4218864440918, "loss": 0.3451, "rewards/accuracies": 0.875, "rewards/chosen": 0.6806749105453491, "rewards/margins": 1.1791822910308838, "rewards/rejected": -0.49850741028785706, "step": 3025 }, { "epoch": 0.3115040608615195, "grad_norm": 30.375, "learning_rate": 7.650217043637194e-07, "logits/chosen": -0.5148284435272217, "logits/rejected": -0.5090001821517944, "logps/chosen": -41.59611129760742, "logps/rejected": -51.330955505371094, "loss": 0.3226, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.701398491859436, "rewards/margins": 1.4019076824188232, "rewards/rejected": -0.700509250164032, "step": 3030 }, { "epoch": 0.3120180939652514, "grad_norm": 29.25, "learning_rate": 7.644505368974184e-07, "logits/chosen": -0.5361277461051941, "logits/rejected": -0.554371178150177, "logps/chosen": -36.412078857421875, "logps/rejected": -47.570552825927734, "loss": 0.3112, "rewards/accuracies": 0.875, "rewards/chosen": 0.8925937414169312, "rewards/margins": 1.5810306072235107, "rewards/rejected": -0.6884368658065796, "step": 3035 }, { "epoch": 0.31253212706898326, "grad_norm": 44.25, "learning_rate": 7.638793694311172e-07, "logits/chosen": -0.5927631855010986, "logits/rejected": -0.6096982955932617, "logps/chosen": -39.635772705078125, "logps/rejected": -51.25396728515625, "loss": 0.3402, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7695282697677612, "rewards/margins": 1.6313116550445557, "rewards/rejected": -0.8617833852767944, "step": 3040 }, { "epoch": 0.31304616017271514, "grad_norm": 36.25, "learning_rate": 7.63308201964816e-07, "logits/chosen": -0.5604921579360962, "logits/rejected": -0.569855809211731, "logps/chosen": -36.16693878173828, "logps/rejected": -51.58256912231445, "loss": 0.3304, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8295314908027649, "rewards/margins": 1.312610387802124, "rewards/rejected": -0.48307910561561584, "step": 3045 }, { "epoch": 0.313560193276447, "grad_norm": 27.125, "learning_rate": 7.627370344985149e-07, "logits/chosen": -0.5501684546470642, "logits/rejected": -0.5031177401542664, "logps/chosen": -37.233970642089844, "logps/rejected": -57.59173583984375, "loss": 0.3009, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8952769041061401, "rewards/margins": 1.779581069946289, "rewards/rejected": -0.8843041658401489, "step": 3050 }, { "epoch": 0.3140742263801789, "grad_norm": 39.0, "learning_rate": 7.621658670322139e-07, "logits/chosen": -0.6029247641563416, "logits/rejected": -0.5994477272033691, "logps/chosen": -38.276885986328125, "logps/rejected": -53.146705627441406, "loss": 0.3139, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7975435256958008, "rewards/margins": 1.5540329217910767, "rewards/rejected": -0.7564893960952759, "step": 3055 }, { "epoch": 0.3145882594839108, "grad_norm": 32.75, "learning_rate": 7.615946995659127e-07, "logits/chosen": -0.6078521609306335, "logits/rejected": -0.5684828758239746, "logps/chosen": -45.725467681884766, "logps/rejected": -50.54976272583008, "loss": 0.3142, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8809221386909485, "rewards/margins": 1.5691697597503662, "rewards/rejected": -0.688247799873352, "step": 3060 }, { "epoch": 0.31510229258764266, "grad_norm": 42.5, "learning_rate": 7.610235320996115e-07, "logits/chosen": -0.5949805974960327, "logits/rejected": -0.5318517684936523, "logps/chosen": -41.2944221496582, "logps/rejected": -48.21776580810547, "loss": 0.3635, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6785162687301636, "rewards/margins": 1.2672199010849, "rewards/rejected": -0.5887037515640259, "step": 3065 }, { "epoch": 0.31561632569137454, "grad_norm": 25.125, "learning_rate": 7.604523646333104e-07, "logits/chosen": -0.5823272466659546, "logits/rejected": -0.5759316682815552, "logps/chosen": -39.916839599609375, "logps/rejected": -57.072166442871094, "loss": 0.3239, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7376953363418579, "rewards/margins": 1.8322477340698242, "rewards/rejected": -1.0945523977279663, "step": 3070 }, { "epoch": 0.3161303587951064, "grad_norm": 38.5, "learning_rate": 7.598811971670094e-07, "logits/chosen": -0.5579496622085571, "logits/rejected": -0.5736753940582275, "logps/chosen": -37.18047332763672, "logps/rejected": -51.52405548095703, "loss": 0.3471, "rewards/accuracies": 0.875, "rewards/chosen": 0.6771619915962219, "rewards/margins": 1.6057630777359009, "rewards/rejected": -0.9286011457443237, "step": 3075 }, { "epoch": 0.3166443918988383, "grad_norm": 30.375, "learning_rate": 7.593100297007083e-07, "logits/chosen": -0.5705540776252747, "logits/rejected": -0.5173262357711792, "logps/chosen": -36.812747955322266, "logps/rejected": -52.3991813659668, "loss": 0.3173, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.834936797618866, "rewards/margins": 1.541101098060608, "rewards/rejected": -0.7061641812324524, "step": 3080 }, { "epoch": 0.3171584250025702, "grad_norm": 28.875, "learning_rate": 7.587388622344071e-07, "logits/chosen": -0.5932108163833618, "logits/rejected": -0.5749810934066772, "logps/chosen": -35.80099105834961, "logps/rejected": -56.99809646606445, "loss": 0.3274, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9302994608879089, "rewards/margins": 1.625526785850525, "rewards/rejected": -0.6952272057533264, "step": 3085 }, { "epoch": 0.31767245810630207, "grad_norm": 30.875, "learning_rate": 7.581676947681059e-07, "logits/chosen": -0.599201500415802, "logits/rejected": -0.6382046937942505, "logps/chosen": -44.78556442260742, "logps/rejected": -51.749229431152344, "loss": 0.3401, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5203807950019836, "rewards/margins": 1.2099593877792358, "rewards/rejected": -0.689578652381897, "step": 3090 }, { "epoch": 0.31818649121003395, "grad_norm": 35.75, "learning_rate": 7.575965273018049e-07, "logits/chosen": -0.5985896587371826, "logits/rejected": -0.5557582974433899, "logps/chosen": -40.67613983154297, "logps/rejected": -49.57311248779297, "loss": 0.343, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.673755943775177, "rewards/margins": 1.3739683628082275, "rewards/rejected": -0.700212299823761, "step": 3095 }, { "epoch": 0.31870052431376583, "grad_norm": 28.875, "learning_rate": 7.570253598355038e-07, "logits/chosen": -0.5571392774581909, "logits/rejected": -0.5214211344718933, "logps/chosen": -32.778076171875, "logps/rejected": -45.68296432495117, "loss": 0.3083, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8073792457580566, "rewards/margins": 1.318839430809021, "rewards/rejected": -0.5114601850509644, "step": 3100 }, { "epoch": 0.3192145574174977, "grad_norm": 42.0, "learning_rate": 7.564541923692026e-07, "logits/chosen": -0.6195815801620483, "logits/rejected": -0.6537154912948608, "logps/chosen": -39.55904769897461, "logps/rejected": -50.36970520019531, "loss": 0.3223, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8047407269477844, "rewards/margins": 1.6076921224594116, "rewards/rejected": -0.8029515147209167, "step": 3105 }, { "epoch": 0.3197285905212296, "grad_norm": 30.0, "learning_rate": 7.558830249029014e-07, "logits/chosen": -0.5269190669059753, "logits/rejected": -0.6161175966262817, "logps/chosen": -42.828086853027344, "logps/rejected": -54.656776428222656, "loss": 0.305, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7171423435211182, "rewards/margins": 1.5920143127441406, "rewards/rejected": -0.8748719096183777, "step": 3110 }, { "epoch": 0.3202426236249614, "grad_norm": 25.625, "learning_rate": 7.553118574366004e-07, "logits/chosen": -0.5426310300827026, "logits/rejected": -0.5756717324256897, "logps/chosen": -36.65655517578125, "logps/rejected": -44.88041305541992, "loss": 0.3249, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9138415455818176, "rewards/margins": 1.6661407947540283, "rewards/rejected": -0.7522993683815002, "step": 3115 }, { "epoch": 0.3207566567286933, "grad_norm": 28.25, "learning_rate": 7.547406899702993e-07, "logits/chosen": -0.5598804950714111, "logits/rejected": -0.6145951151847839, "logps/chosen": -46.17415237426758, "logps/rejected": -51.65135955810547, "loss": 0.3468, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7190077304840088, "rewards/margins": 1.5030966997146606, "rewards/rejected": -0.7840889692306519, "step": 3120 }, { "epoch": 0.3212706898324252, "grad_norm": 30.125, "learning_rate": 7.541695225039982e-07, "logits/chosen": -0.5229955911636353, "logits/rejected": -0.5480608344078064, "logps/chosen": -40.41106414794922, "logps/rejected": -55.505645751953125, "loss": 0.3183, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7099063992500305, "rewards/margins": 1.9398164749145508, "rewards/rejected": -1.229909896850586, "step": 3125 }, { "epoch": 0.32178472293615706, "grad_norm": 29.875, "learning_rate": 7.535983550376969e-07, "logits/chosen": -0.5536885857582092, "logits/rejected": -0.5846058130264282, "logps/chosen": -44.4316291809082, "logps/rejected": -54.16328048706055, "loss": 0.287, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7779724597930908, "rewards/margins": 1.7291860580444336, "rewards/rejected": -0.9512137174606323, "step": 3130 }, { "epoch": 0.32229875603988895, "grad_norm": 30.0, "learning_rate": 7.530271875713959e-07, "logits/chosen": -0.5445115566253662, "logits/rejected": -0.5840675234794617, "logps/chosen": -39.534889221191406, "logps/rejected": -49.56596374511719, "loss": 0.3087, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6075369715690613, "rewards/margins": 1.421291470527649, "rewards/rejected": -0.8137543797492981, "step": 3135 }, { "epoch": 0.3228127891436208, "grad_norm": 27.25, "learning_rate": 7.524560201050948e-07, "logits/chosen": -0.5415239334106445, "logits/rejected": -0.543041467666626, "logps/chosen": -36.38002014160156, "logps/rejected": -50.262298583984375, "loss": 0.3169, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7362571954727173, "rewards/margins": 1.6571712493896484, "rewards/rejected": -0.9209141731262207, "step": 3140 }, { "epoch": 0.3233268222473527, "grad_norm": 30.5, "learning_rate": 7.518848526387937e-07, "logits/chosen": -0.5760734677314758, "logits/rejected": -0.5902703404426575, "logps/chosen": -36.75959396362305, "logps/rejected": -49.631591796875, "loss": 0.3237, "rewards/accuracies": 0.875, "rewards/chosen": 0.6997939348220825, "rewards/margins": 1.6873706579208374, "rewards/rejected": -0.9875768423080444, "step": 3145 }, { "epoch": 0.3238408553510846, "grad_norm": 35.25, "learning_rate": 7.513136851724925e-07, "logits/chosen": -0.5173889994621277, "logits/rejected": -0.572870671749115, "logps/chosen": -46.0563850402832, "logps/rejected": -58.9186897277832, "loss": 0.3602, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5198113322257996, "rewards/margins": 1.440482258796692, "rewards/rejected": -0.9206709861755371, "step": 3150 }, { "epoch": 0.32435488845481647, "grad_norm": 32.75, "learning_rate": 7.507425177061914e-07, "logits/chosen": -0.5757821202278137, "logits/rejected": -0.5653301477432251, "logps/chosen": -36.39452362060547, "logps/rejected": -51.9075813293457, "loss": 0.3116, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8580197095870972, "rewards/margins": 1.725600242614746, "rewards/rejected": -0.8675804138183594, "step": 3155 }, { "epoch": 0.32486892155854835, "grad_norm": 34.0, "learning_rate": 7.501713502398903e-07, "logits/chosen": -0.6129119396209717, "logits/rejected": -0.6027839779853821, "logps/chosen": -38.70296096801758, "logps/rejected": -48.441009521484375, "loss": 0.3226, "rewards/accuracies": 0.875, "rewards/chosen": 0.7974683046340942, "rewards/margins": 1.2406498193740845, "rewards/rejected": -0.4431813657283783, "step": 3160 }, { "epoch": 0.32538295466228023, "grad_norm": 34.5, "learning_rate": 7.496001827735892e-07, "logits/chosen": -0.49716219305992126, "logits/rejected": -0.5421132445335388, "logps/chosen": -41.15399932861328, "logps/rejected": -52.79241943359375, "loss": 0.353, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5685975551605225, "rewards/margins": 1.316451072692871, "rewards/rejected": -0.7478536367416382, "step": 3165 }, { "epoch": 0.3258969877660121, "grad_norm": 30.625, "learning_rate": 7.49029015307288e-07, "logits/chosen": -0.5548164248466492, "logits/rejected": -0.5249150395393372, "logps/chosen": -37.531917572021484, "logps/rejected": -53.166778564453125, "loss": 0.3572, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7411186695098877, "rewards/margins": 1.56855046749115, "rewards/rejected": -0.8274319767951965, "step": 3170 }, { "epoch": 0.326411020869744, "grad_norm": 32.5, "learning_rate": 7.48457847840987e-07, "logits/chosen": -0.5459908246994019, "logits/rejected": -0.6094061136245728, "logps/chosen": -37.56303787231445, "logps/rejected": -56.0590934753418, "loss": 0.3162, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6287157535552979, "rewards/margins": 1.7639007568359375, "rewards/rejected": -1.1351850032806396, "step": 3175 }, { "epoch": 0.3269250539734759, "grad_norm": 24.875, "learning_rate": 7.478866803746858e-07, "logits/chosen": -0.5942214727401733, "logits/rejected": -0.5992611646652222, "logps/chosen": -32.073387145996094, "logps/rejected": -48.37733459472656, "loss": 0.3059, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8995519876480103, "rewards/margins": 1.418178915977478, "rewards/rejected": -0.5186268091201782, "step": 3180 }, { "epoch": 0.32743908707720776, "grad_norm": 31.5, "learning_rate": 7.473155129083847e-07, "logits/chosen": -0.5398157835006714, "logits/rejected": -0.6208677291870117, "logps/chosen": -38.14966583251953, "logps/rejected": -49.617210388183594, "loss": 0.343, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6747261881828308, "rewards/margins": 1.0401569604873657, "rewards/rejected": -0.36543071269989014, "step": 3185 }, { "epoch": 0.32795312018093964, "grad_norm": 33.0, "learning_rate": 7.467443454420836e-07, "logits/chosen": -0.5303190350532532, "logits/rejected": -0.6532015204429626, "logps/chosen": -35.78411865234375, "logps/rejected": -52.012481689453125, "loss": 0.2864, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.825641930103302, "rewards/margins": 1.8966586589813232, "rewards/rejected": -1.0710169076919556, "step": 3190 }, { "epoch": 0.3284671532846715, "grad_norm": 29.5, "learning_rate": 7.461731779757825e-07, "logits/chosen": -0.5641353726387024, "logits/rejected": -0.5477244853973389, "logps/chosen": -42.060115814208984, "logps/rejected": -56.24275588989258, "loss": 0.2937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6274124383926392, "rewards/margins": 1.7237615585327148, "rewards/rejected": -1.0963490009307861, "step": 3195 }, { "epoch": 0.3289811863884034, "grad_norm": 28.75, "learning_rate": 7.456020105094813e-07, "logits/chosen": -0.536596417427063, "logits/rejected": -0.5495226979255676, "logps/chosen": -44.87983322143555, "logps/rejected": -48.03911590576172, "loss": 0.357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5679740905761719, "rewards/margins": 1.0960280895233154, "rewards/rejected": -0.5280539393424988, "step": 3200 }, { "epoch": 0.3294952194921353, "grad_norm": 44.0, "learning_rate": 7.450308430431802e-07, "logits/chosen": -0.6006880402565002, "logits/rejected": -0.5917578935623169, "logps/chosen": -38.602237701416016, "logps/rejected": -53.68071746826172, "loss": 0.3307, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7763473987579346, "rewards/margins": 1.812355637550354, "rewards/rejected": -1.0360082387924194, "step": 3205 }, { "epoch": 0.33000925259586716, "grad_norm": 29.875, "learning_rate": 7.444596755768791e-07, "logits/chosen": -0.63639897108078, "logits/rejected": -0.650357723236084, "logps/chosen": -38.819576263427734, "logps/rejected": -50.347381591796875, "loss": 0.3046, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8874145746231079, "rewards/margins": 1.7507625818252563, "rewards/rejected": -0.8633478879928589, "step": 3210 }, { "epoch": 0.33052328569959905, "grad_norm": 52.0, "learning_rate": 7.43888508110578e-07, "logits/chosen": -0.6057561635971069, "logits/rejected": -0.5695822834968567, "logps/chosen": -41.16838836669922, "logps/rejected": -49.966888427734375, "loss": 0.374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6271792650222778, "rewards/margins": 1.014478325843811, "rewards/rejected": -0.38729915022850037, "step": 3215 }, { "epoch": 0.3310373188033309, "grad_norm": 38.75, "learning_rate": 7.433173406442768e-07, "logits/chosen": -0.5466042757034302, "logits/rejected": -0.5699619054794312, "logps/chosen": -35.16822052001953, "logps/rejected": -48.94297409057617, "loss": 0.3542, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7957583069801331, "rewards/margins": 1.5555442571640015, "rewards/rejected": -0.7597859501838684, "step": 3220 }, { "epoch": 0.3315513519070628, "grad_norm": 25.125, "learning_rate": 7.427461731779757e-07, "logits/chosen": -0.6669623255729675, "logits/rejected": -0.6790940761566162, "logps/chosen": -38.012046813964844, "logps/rejected": -53.752052307128906, "loss": 0.3346, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8167978525161743, "rewards/margins": 1.7845232486724854, "rewards/rejected": -0.9677254557609558, "step": 3225 }, { "epoch": 0.3320653850107947, "grad_norm": 25.5, "learning_rate": 7.421750057116747e-07, "logits/chosen": -0.6140472292900085, "logits/rejected": -0.6161102652549744, "logps/chosen": -41.11570358276367, "logps/rejected": -59.86993408203125, "loss": 0.2986, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6044076681137085, "rewards/margins": 1.8399293422698975, "rewards/rejected": -1.235521674156189, "step": 3230 }, { "epoch": 0.33257941811452657, "grad_norm": 39.25, "learning_rate": 7.416038382453736e-07, "logits/chosen": -0.4876881539821625, "logits/rejected": -0.5749961733818054, "logps/chosen": -39.61144256591797, "logps/rejected": -50.99229431152344, "loss": 0.3054, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5821043848991394, "rewards/margins": 1.4596067667007446, "rewards/rejected": -0.8775023221969604, "step": 3235 }, { "epoch": 0.33309345121825845, "grad_norm": 35.0, "learning_rate": 7.410326707790724e-07, "logits/chosen": -0.5218977928161621, "logits/rejected": -0.5127808451652527, "logps/chosen": -40.38215637207031, "logps/rejected": -48.96603012084961, "loss": 0.3001, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8639349937438965, "rewards/margins": 1.4322223663330078, "rewards/rejected": -0.5682876110076904, "step": 3240 }, { "epoch": 0.33360748432199033, "grad_norm": 31.125, "learning_rate": 7.404615033127712e-07, "logits/chosen": -0.4929841160774231, "logits/rejected": -0.5657715797424316, "logps/chosen": -36.08307647705078, "logps/rejected": -46.71179962158203, "loss": 0.3216, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8721952438354492, "rewards/margins": 1.2887378931045532, "rewards/rejected": -0.41654253005981445, "step": 3245 }, { "epoch": 0.3341215174257222, "grad_norm": 46.5, "learning_rate": 7.398903358464702e-07, "logits/chosen": -0.592995285987854, "logits/rejected": -0.5543553233146667, "logps/chosen": -38.836090087890625, "logps/rejected": -53.56346893310547, "loss": 0.3321, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8473955392837524, "rewards/margins": 1.8094682693481445, "rewards/rejected": -0.9620726704597473, "step": 3250 }, { "epoch": 0.3346355505294541, "grad_norm": 43.25, "learning_rate": 7.393191683801691e-07, "logits/chosen": -0.5473984479904175, "logits/rejected": -0.5295090675354004, "logps/chosen": -37.68865966796875, "logps/rejected": -53.09112548828125, "loss": 0.345, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6952961683273315, "rewards/margins": 1.699679970741272, "rewards/rejected": -1.0043838024139404, "step": 3255 }, { "epoch": 0.335149583633186, "grad_norm": 33.5, "learning_rate": 7.387480009138679e-07, "logits/chosen": -0.5350745916366577, "logits/rejected": -0.5631982684135437, "logps/chosen": -35.67806625366211, "logps/rejected": -52.12560272216797, "loss": 0.3048, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9121628999710083, "rewards/margins": 1.6221179962158203, "rewards/rejected": -0.7099552154541016, "step": 3260 }, { "epoch": 0.33566361673691786, "grad_norm": 34.5, "learning_rate": 7.381768334475667e-07, "logits/chosen": -0.7053366899490356, "logits/rejected": -0.660443127155304, "logps/chosen": -42.79478073120117, "logps/rejected": -51.82883834838867, "loss": 0.3285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.681643545627594, "rewards/margins": 1.3155789375305176, "rewards/rejected": -0.6339353919029236, "step": 3265 }, { "epoch": 0.33617764984064974, "grad_norm": 30.25, "learning_rate": 7.376056659812657e-07, "logits/chosen": -0.5517573356628418, "logits/rejected": -0.6266417503356934, "logps/chosen": -39.126808166503906, "logps/rejected": -52.63947296142578, "loss": 0.3255, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7312706112861633, "rewards/margins": 1.6836555004119873, "rewards/rejected": -0.9523848295211792, "step": 3270 }, { "epoch": 0.3366916829443816, "grad_norm": 35.25, "learning_rate": 7.370344985149646e-07, "logits/chosen": -0.632914662361145, "logits/rejected": -0.6281504034996033, "logps/chosen": -39.14437484741211, "logps/rejected": -47.02012252807617, "loss": 0.3157, "rewards/accuracies": 0.875, "rewards/chosen": 0.8454147577285767, "rewards/margins": 1.2597154378890991, "rewards/rejected": -0.4143008589744568, "step": 3275 }, { "epoch": 0.3372057160481135, "grad_norm": 32.75, "learning_rate": 7.364633310486635e-07, "logits/chosen": -0.5432999134063721, "logits/rejected": -0.48349007964134216, "logps/chosen": -38.29658889770508, "logps/rejected": -54.2403564453125, "loss": 0.3048, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.609063982963562, "rewards/margins": 1.5539969205856323, "rewards/rejected": -0.9449328184127808, "step": 3280 }, { "epoch": 0.3377197491518454, "grad_norm": 41.0, "learning_rate": 7.358921635823623e-07, "logits/chosen": -0.5357798337936401, "logits/rejected": -0.5681656002998352, "logps/chosen": -39.09517288208008, "logps/rejected": -46.98961639404297, "loss": 0.3235, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6887152791023254, "rewards/margins": 1.3235318660736084, "rewards/rejected": -0.6348164677619934, "step": 3285 }, { "epoch": 0.33823378225557726, "grad_norm": 28.5, "learning_rate": 7.353209961160612e-07, "logits/chosen": -0.5542798638343811, "logits/rejected": -0.5968409180641174, "logps/chosen": -36.14155960083008, "logps/rejected": -49.01643371582031, "loss": 0.2973, "rewards/accuracies": 0.875, "rewards/chosen": 0.7740663290023804, "rewards/margins": 1.7420494556427002, "rewards/rejected": -0.9679828882217407, "step": 3290 }, { "epoch": 0.33874781535930915, "grad_norm": 26.0, "learning_rate": 7.347498286497601e-07, "logits/chosen": -0.5030015707015991, "logits/rejected": -0.5307884216308594, "logps/chosen": -37.93645095825195, "logps/rejected": -53.0576171875, "loss": 0.3251, "rewards/accuracies": 0.875, "rewards/chosen": 0.7453385591506958, "rewards/margins": 1.56119704246521, "rewards/rejected": -0.8158584833145142, "step": 3295 }, { "epoch": 0.339261848463041, "grad_norm": 39.75, "learning_rate": 7.34178661183459e-07, "logits/chosen": -0.5979939699172974, "logits/rejected": -0.6058388948440552, "logps/chosen": -38.83005905151367, "logps/rejected": -54.53696823120117, "loss": 0.3007, "rewards/accuracies": 0.875, "rewards/chosen": 0.7146542072296143, "rewards/margins": 1.76400625705719, "rewards/rejected": -1.0493520498275757, "step": 3300 }, { "epoch": 0.3397758815667729, "grad_norm": 30.625, "learning_rate": 7.336074937171578e-07, "logits/chosen": -0.5600487589836121, "logits/rejected": -0.5570458769798279, "logps/chosen": -40.354122161865234, "logps/rejected": -46.274478912353516, "loss": 0.3453, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8502486348152161, "rewards/margins": 1.4252756834030151, "rewards/rejected": -0.5750270485877991, "step": 3305 }, { "epoch": 0.3402899146705048, "grad_norm": 33.75, "learning_rate": 7.330363262508567e-07, "logits/chosen": -0.5931159853935242, "logits/rejected": -0.6549820303916931, "logps/chosen": -36.55560302734375, "logps/rejected": -49.155006408691406, "loss": 0.3234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8062360882759094, "rewards/margins": 1.1858373880386353, "rewards/rejected": -0.3796014189720154, "step": 3310 }, { "epoch": 0.34080394777423667, "grad_norm": 32.25, "learning_rate": 7.324651587845556e-07, "logits/chosen": -0.5642812252044678, "logits/rejected": -0.6039407849311829, "logps/chosen": -38.051143646240234, "logps/rejected": -52.939361572265625, "loss": 0.3467, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7728269696235657, "rewards/margins": 1.502469778060913, "rewards/rejected": -0.7296426892280579, "step": 3315 }, { "epoch": 0.34131798087796855, "grad_norm": 32.25, "learning_rate": 7.318939913182545e-07, "logits/chosen": -0.5546906590461731, "logits/rejected": -0.5679140686988831, "logps/chosen": -41.698646545410156, "logps/rejected": -49.97942352294922, "loss": 0.326, "rewards/accuracies": 0.875, "rewards/chosen": 0.5257952809333801, "rewards/margins": 1.1629226207733154, "rewards/rejected": -0.6371272802352905, "step": 3320 }, { "epoch": 0.34183201398170043, "grad_norm": 34.0, "learning_rate": 7.313228238519533e-07, "logits/chosen": -0.4689987301826477, "logits/rejected": -0.46158868074417114, "logps/chosen": -40.30472183227539, "logps/rejected": -47.723148345947266, "loss": 0.342, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5228188037872314, "rewards/margins": 1.2080979347229004, "rewards/rejected": -0.6852791905403137, "step": 3325 }, { "epoch": 0.3423460470854323, "grad_norm": 34.0, "learning_rate": 7.307516563856523e-07, "logits/chosen": -0.5616940259933472, "logits/rejected": -0.5689252614974976, "logps/chosen": -36.75053787231445, "logps/rejected": -55.14136505126953, "loss": 0.3154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9461946487426758, "rewards/margins": 1.7853710651397705, "rewards/rejected": -0.8391765356063843, "step": 3330 }, { "epoch": 0.3428600801891642, "grad_norm": 38.5, "learning_rate": 7.301804889193511e-07, "logits/chosen": -0.6345259547233582, "logits/rejected": -0.636085569858551, "logps/chosen": -46.377418518066406, "logps/rejected": -51.843788146972656, "loss": 0.3567, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7265440225601196, "rewards/margins": 1.1574203968048096, "rewards/rejected": -0.4308764338493347, "step": 3335 }, { "epoch": 0.3433741132928961, "grad_norm": 29.125, "learning_rate": 7.2960932145305e-07, "logits/chosen": -0.5774432420730591, "logits/rejected": -0.5297644138336182, "logps/chosen": -35.094947814941406, "logps/rejected": -49.39681625366211, "loss": 0.2872, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8650074005126953, "rewards/margins": 1.7493892908096313, "rewards/rejected": -0.8843819499015808, "step": 3340 }, { "epoch": 0.34388814639662796, "grad_norm": 27.0, "learning_rate": 7.290381539867489e-07, "logits/chosen": -0.597658634185791, "logits/rejected": -0.5695503950119019, "logps/chosen": -37.02014923095703, "logps/rejected": -46.67082977294922, "loss": 0.3173, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8400412797927856, "rewards/margins": 1.4940986633300781, "rewards/rejected": -0.6540575623512268, "step": 3345 }, { "epoch": 0.34440217950035984, "grad_norm": 37.25, "learning_rate": 7.284669865204478e-07, "logits/chosen": -0.5741784572601318, "logits/rejected": -0.5591640472412109, "logps/chosen": -38.4921875, "logps/rejected": -52.26581573486328, "loss": 0.338, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9293106198310852, "rewards/margins": 1.6960471868515015, "rewards/rejected": -0.7667365074157715, "step": 3350 }, { "epoch": 0.3449162126040917, "grad_norm": 36.5, "learning_rate": 7.278958190541466e-07, "logits/chosen": -0.5450439453125, "logits/rejected": -0.593110203742981, "logps/chosen": -40.238609313964844, "logps/rejected": -51.455474853515625, "loss": 0.3386, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9055455923080444, "rewards/margins": 1.4330828189849854, "rewards/rejected": -0.5275372266769409, "step": 3355 }, { "epoch": 0.3454302457078236, "grad_norm": 37.0, "learning_rate": 7.273246515878455e-07, "logits/chosen": -0.5704784393310547, "logits/rejected": -0.6236244440078735, "logps/chosen": -38.972877502441406, "logps/rejected": -55.1514778137207, "loss": 0.3548, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.764843225479126, "rewards/margins": 1.6168311834335327, "rewards/rejected": -0.8519880175590515, "step": 3360 }, { "epoch": 0.3459442788115555, "grad_norm": 48.25, "learning_rate": 7.267534841215444e-07, "logits/chosen": -0.4711848199367523, "logits/rejected": -0.4965634346008301, "logps/chosen": -31.41057777404785, "logps/rejected": -44.530067443847656, "loss": 0.3729, "rewards/accuracies": 0.875, "rewards/chosen": 0.9571086764335632, "rewards/margins": 1.2504069805145264, "rewards/rejected": -0.29329827427864075, "step": 3365 }, { "epoch": 0.34645831191528736, "grad_norm": 32.0, "learning_rate": 7.261823166552433e-07, "logits/chosen": -0.5781447291374207, "logits/rejected": -0.576653003692627, "logps/chosen": -39.861610412597656, "logps/rejected": -54.47832489013672, "loss": 0.3572, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8584511876106262, "rewards/margins": 1.770948052406311, "rewards/rejected": -0.9124968647956848, "step": 3370 }, { "epoch": 0.34697234501901925, "grad_norm": 29.875, "learning_rate": 7.256111491889422e-07, "logits/chosen": -0.5405364036560059, "logits/rejected": -0.5229585766792297, "logps/chosen": -38.558990478515625, "logps/rejected": -47.059913635253906, "loss": 0.3391, "rewards/accuracies": 0.875, "rewards/chosen": 0.7146512269973755, "rewards/margins": 1.3920856714248657, "rewards/rejected": -0.6774344444274902, "step": 3375 }, { "epoch": 0.3474863781227511, "grad_norm": 29.375, "learning_rate": 7.25039981722641e-07, "logits/chosen": -0.5410059690475464, "logits/rejected": -0.5864638686180115, "logps/chosen": -39.58498001098633, "logps/rejected": -59.148529052734375, "loss": 0.3081, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.835858166217804, "rewards/margins": 1.8096920251846313, "rewards/rejected": -0.9738338589668274, "step": 3380 }, { "epoch": 0.348000411226483, "grad_norm": 44.5, "learning_rate": 7.244688142563399e-07, "logits/chosen": -0.6175568699836731, "logits/rejected": -0.5899081230163574, "logps/chosen": -43.91431427001953, "logps/rejected": -50.72077560424805, "loss": 0.3101, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9280716776847839, "rewards/margins": 1.8531357049942017, "rewards/rejected": -0.9250639081001282, "step": 3385 }, { "epoch": 0.3485144443302149, "grad_norm": 33.5, "learning_rate": 7.238976467900389e-07, "logits/chosen": -0.6202272176742554, "logits/rejected": -0.5664983987808228, "logps/chosen": -37.18269348144531, "logps/rejected": -50.82207489013672, "loss": 0.3464, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7686204314231873, "rewards/margins": 1.4195868968963623, "rewards/rejected": -0.6509664058685303, "step": 3390 }, { "epoch": 0.34902847743394677, "grad_norm": 25.375, "learning_rate": 7.233264793237377e-07, "logits/chosen": -0.6409226655960083, "logits/rejected": -0.5986409187316895, "logps/chosen": -32.462501525878906, "logps/rejected": -46.18360137939453, "loss": 0.3068, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8505361676216125, "rewards/margins": 1.3039238452911377, "rewards/rejected": -0.45338767766952515, "step": 3395 }, { "epoch": 0.34954251053767865, "grad_norm": 31.25, "learning_rate": 7.227553118574365e-07, "logits/chosen": -0.599456787109375, "logits/rejected": -0.6217780709266663, "logps/chosen": -37.19830322265625, "logps/rejected": -52.39397430419922, "loss": 0.3304, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7425116300582886, "rewards/margins": 1.560888409614563, "rewards/rejected": -0.8183764219284058, "step": 3400 }, { "epoch": 0.35005654364141053, "grad_norm": 24.5, "learning_rate": 7.221841443911354e-07, "logits/chosen": -0.6220895648002625, "logits/rejected": -0.6240030527114868, "logps/chosen": -38.535518646240234, "logps/rejected": -52.68434524536133, "loss": 0.3023, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8453322649002075, "rewards/margins": 1.9403644800186157, "rewards/rejected": -1.095031976699829, "step": 3405 }, { "epoch": 0.3505705767451424, "grad_norm": 29.0, "learning_rate": 7.216129769248344e-07, "logits/chosen": -0.7091005444526672, "logits/rejected": -0.7829904556274414, "logps/chosen": -34.36308288574219, "logps/rejected": -50.54589080810547, "loss": 0.324, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8093011975288391, "rewards/margins": 1.8958028554916382, "rewards/rejected": -1.0865017175674438, "step": 3410 }, { "epoch": 0.35108460984887424, "grad_norm": 25.125, "learning_rate": 7.210418094585332e-07, "logits/chosen": -0.587559163570404, "logits/rejected": -0.5947346091270447, "logps/chosen": -38.17960739135742, "logps/rejected": -49.012447357177734, "loss": 0.3313, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6999338269233704, "rewards/margins": 1.3434841632843018, "rewards/rejected": -0.6435503363609314, "step": 3415 }, { "epoch": 0.3515986429526061, "grad_norm": 28.375, "learning_rate": 7.204706419922321e-07, "logits/chosen": -0.5910961031913757, "logits/rejected": -0.6221176981925964, "logps/chosen": -39.694984436035156, "logps/rejected": -54.67927169799805, "loss": 0.3023, "rewards/accuracies": 0.875, "rewards/chosen": 0.6094285249710083, "rewards/margins": 1.662879228591919, "rewards/rejected": -1.053450584411621, "step": 3420 }, { "epoch": 0.352112676056338, "grad_norm": 28.0, "learning_rate": 7.198994745259309e-07, "logits/chosen": -0.6394851207733154, "logits/rejected": -0.6390541791915894, "logps/chosen": -35.61968231201172, "logps/rejected": -51.12517166137695, "loss": 0.3345, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7156451344490051, "rewards/margins": 1.2447593212127686, "rewards/rejected": -0.5291141271591187, "step": 3425 }, { "epoch": 0.3526267091600699, "grad_norm": 26.375, "learning_rate": 7.193283070596299e-07, "logits/chosen": -0.5652199983596802, "logits/rejected": -0.5573745965957642, "logps/chosen": -36.19501495361328, "logps/rejected": -46.967811584472656, "loss": 0.3167, "rewards/accuracies": 0.875, "rewards/chosen": 0.8410699963569641, "rewards/margins": 1.2650136947631836, "rewards/rejected": -0.42394399642944336, "step": 3430 }, { "epoch": 0.35314074226380177, "grad_norm": 27.875, "learning_rate": 7.187571395933287e-07, "logits/chosen": -0.47164472937583923, "logits/rejected": -0.4548265337944031, "logps/chosen": -35.988563537597656, "logps/rejected": -47.80148696899414, "loss": 0.321, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9334548115730286, "rewards/margins": 1.370137095451355, "rewards/rejected": -0.43668240308761597, "step": 3435 }, { "epoch": 0.35365477536753365, "grad_norm": 23.75, "learning_rate": 7.181859721270276e-07, "logits/chosen": -0.5501211881637573, "logits/rejected": -0.5520118474960327, "logps/chosen": -37.51675033569336, "logps/rejected": -50.98607635498047, "loss": 0.3077, "rewards/accuracies": 0.875, "rewards/chosen": 0.8525314331054688, "rewards/margins": 1.6658241748809814, "rewards/rejected": -0.8132928609848022, "step": 3440 }, { "epoch": 0.35416880847126553, "grad_norm": 31.75, "learning_rate": 7.176148046607265e-07, "logits/chosen": -0.40928179025650024, "logits/rejected": -0.4723386764526367, "logps/chosen": -35.668731689453125, "logps/rejected": -46.33372116088867, "loss": 0.33, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7485083937644958, "rewards/margins": 1.5204882621765137, "rewards/rejected": -0.7719799280166626, "step": 3445 }, { "epoch": 0.3546828415749974, "grad_norm": 34.0, "learning_rate": 7.170436371944254e-07, "logits/chosen": -0.570106565952301, "logits/rejected": -0.5321751236915588, "logps/chosen": -39.410526275634766, "logps/rejected": -52.76273727416992, "loss": 0.3247, "rewards/accuracies": 0.875, "rewards/chosen": 0.709184467792511, "rewards/margins": 1.5267951488494873, "rewards/rejected": -0.8176107406616211, "step": 3450 }, { "epoch": 0.3551968746787293, "grad_norm": 27.5, "learning_rate": 7.164724697281243e-07, "logits/chosen": -0.4855722486972809, "logits/rejected": -0.5552786588668823, "logps/chosen": -37.593143463134766, "logps/rejected": -54.327301025390625, "loss": 0.2959, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8686443567276001, "rewards/margins": 1.7878921031951904, "rewards/rejected": -0.9192478060722351, "step": 3455 }, { "epoch": 0.35571090778246117, "grad_norm": 36.0, "learning_rate": 7.159013022618231e-07, "logits/chosen": -0.6122106313705444, "logits/rejected": -0.5660534501075745, "logps/chosen": -41.03999710083008, "logps/rejected": -52.69401168823242, "loss": 0.3445, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.564156174659729, "rewards/margins": 1.1932499408721924, "rewards/rejected": -0.6290937662124634, "step": 3460 }, { "epoch": 0.35622494088619305, "grad_norm": 26.0, "learning_rate": 7.153301347955221e-07, "logits/chosen": -0.6290430426597595, "logits/rejected": -0.6069819331169128, "logps/chosen": -44.95490264892578, "logps/rejected": -48.43047332763672, "loss": 0.3035, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7059026956558228, "rewards/margins": 1.2980694770812988, "rewards/rejected": -0.5921666622161865, "step": 3465 }, { "epoch": 0.35673897398992493, "grad_norm": 48.5, "learning_rate": 7.147589673292209e-07, "logits/chosen": -0.5915549993515015, "logits/rejected": -0.6585012674331665, "logps/chosen": -38.507347106933594, "logps/rejected": -51.01322555541992, "loss": 0.3167, "rewards/accuracies": 0.875, "rewards/chosen": 0.6491963267326355, "rewards/margins": 1.427750587463379, "rewards/rejected": -0.7785543203353882, "step": 3470 }, { "epoch": 0.3572530070936568, "grad_norm": 34.75, "learning_rate": 7.141877998629198e-07, "logits/chosen": -0.5303919911384583, "logits/rejected": -0.5798749327659607, "logps/chosen": -45.72422790527344, "logps/rejected": -48.5920524597168, "loss": 0.3297, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5682054162025452, "rewards/margins": 1.2312633991241455, "rewards/rejected": -0.6630579233169556, "step": 3475 }, { "epoch": 0.3577670401973887, "grad_norm": 59.25, "learning_rate": 7.136166323966186e-07, "logits/chosen": -0.5497788190841675, "logits/rejected": -0.5397573709487915, "logps/chosen": -40.68859100341797, "logps/rejected": -44.40703582763672, "loss": 0.3176, "rewards/accuracies": 0.875, "rewards/chosen": 0.7607947587966919, "rewards/margins": 1.3467830419540405, "rewards/rejected": -0.5859884023666382, "step": 3480 }, { "epoch": 0.3582810733011206, "grad_norm": 34.5, "learning_rate": 7.130454649303176e-07, "logits/chosen": -0.5507196187973022, "logits/rejected": -0.5715481042861938, "logps/chosen": -35.421607971191406, "logps/rejected": -49.25006103515625, "loss": 0.3426, "rewards/accuracies": 0.875, "rewards/chosen": 0.7315815091133118, "rewards/margins": 1.3244549036026, "rewards/rejected": -0.5928734540939331, "step": 3485 }, { "epoch": 0.35879510640485246, "grad_norm": 27.75, "learning_rate": 7.124742974640164e-07, "logits/chosen": -0.5343772172927856, "logits/rejected": -0.544786810874939, "logps/chosen": -35.63601303100586, "logps/rejected": -51.467247009277344, "loss": 0.3466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6694114804267883, "rewards/margins": 1.3688457012176514, "rewards/rejected": -0.6994343996047974, "step": 3490 }, { "epoch": 0.35930913950858434, "grad_norm": 48.75, "learning_rate": 7.119031299977153e-07, "logits/chosen": -0.6168071627616882, "logits/rejected": -0.6936765313148499, "logps/chosen": -41.26195526123047, "logps/rejected": -52.6456413269043, "loss": 0.3443, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3635775148868561, "rewards/margins": 1.2270973920822144, "rewards/rejected": -0.8635198473930359, "step": 3495 }, { "epoch": 0.3598231726123162, "grad_norm": 29.75, "learning_rate": 7.113319625314142e-07, "logits/chosen": -0.6414613127708435, "logits/rejected": -0.6735979318618774, "logps/chosen": -35.88904571533203, "logps/rejected": -57.00139617919922, "loss": 0.3234, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8273776173591614, "rewards/margins": 1.8353887796401978, "rewards/rejected": -1.0080111026763916, "step": 3500 }, { "epoch": 0.3603372057160481, "grad_norm": 30.25, "learning_rate": 7.107607950651131e-07, "logits/chosen": -0.5176626443862915, "logits/rejected": -0.5233656764030457, "logps/chosen": -35.32733917236328, "logps/rejected": -49.59646224975586, "loss": 0.3143, "rewards/accuracies": 0.75, "rewards/chosen": 0.6295152902603149, "rewards/margins": 1.2498133182525635, "rewards/rejected": -0.6202980279922485, "step": 3505 }, { "epoch": 0.36085123881978, "grad_norm": 62.5, "learning_rate": 7.101896275988119e-07, "logits/chosen": -0.5474525094032288, "logits/rejected": -0.6453357934951782, "logps/chosen": -36.550357818603516, "logps/rejected": -49.4011116027832, "loss": 0.3339, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6733068823814392, "rewards/margins": 1.4440046548843384, "rewards/rejected": -0.7706977128982544, "step": 3510 }, { "epoch": 0.36136527192351187, "grad_norm": 45.5, "learning_rate": 7.096184601325108e-07, "logits/chosen": -0.5730225443840027, "logits/rejected": -0.5777338743209839, "logps/chosen": -43.621097564697266, "logps/rejected": -51.487159729003906, "loss": 0.3622, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3897479176521301, "rewards/margins": 1.156671166419983, "rewards/rejected": -0.7669232487678528, "step": 3515 }, { "epoch": 0.36187930502724375, "grad_norm": 30.5, "learning_rate": 7.090472926662097e-07, "logits/chosen": -0.49201256036758423, "logits/rejected": -0.5156179666519165, "logps/chosen": -34.870994567871094, "logps/rejected": -51.66609573364258, "loss": 0.3341, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9641334414482117, "rewards/margins": 1.8606784343719482, "rewards/rejected": -0.8965450525283813, "step": 3520 }, { "epoch": 0.36239333813097563, "grad_norm": 55.0, "learning_rate": 7.084761251999086e-07, "logits/chosen": -0.5873358845710754, "logits/rejected": -0.489006370306015, "logps/chosen": -44.275428771972656, "logps/rejected": -50.360252380371094, "loss": 0.3679, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8045288920402527, "rewards/margins": 1.5537445545196533, "rewards/rejected": -0.7492155432701111, "step": 3525 }, { "epoch": 0.3629073712347075, "grad_norm": 43.25, "learning_rate": 7.079049577336075e-07, "logits/chosen": -0.6193622350692749, "logits/rejected": -0.6407527327537537, "logps/chosen": -38.1121826171875, "logps/rejected": -51.720306396484375, "loss": 0.3271, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0366613864898682, "rewards/margins": 1.7102550268173218, "rewards/rejected": -0.6735935211181641, "step": 3530 }, { "epoch": 0.3634214043384394, "grad_norm": 30.375, "learning_rate": 7.073337902673063e-07, "logits/chosen": -0.6371443271636963, "logits/rejected": -0.6351538896560669, "logps/chosen": -40.4697380065918, "logps/rejected": -54.205665588378906, "loss": 0.335, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5328940749168396, "rewards/margins": 1.479006052017212, "rewards/rejected": -0.9461119771003723, "step": 3535 }, { "epoch": 0.3639354374421713, "grad_norm": 35.5, "learning_rate": 7.067626228010052e-07, "logits/chosen": -0.6025488972663879, "logits/rejected": -0.5332881212234497, "logps/chosen": -39.35700607299805, "logps/rejected": -50.341453552246094, "loss": 0.3314, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8723841905593872, "rewards/margins": 1.5188400745391846, "rewards/rejected": -0.6464558839797974, "step": 3540 }, { "epoch": 0.36444947054590315, "grad_norm": 39.25, "learning_rate": 7.061914553347042e-07, "logits/chosen": -0.6325520277023315, "logits/rejected": -0.6576908230781555, "logps/chosen": -41.9554328918457, "logps/rejected": -57.65367889404297, "loss": 0.3177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9660075306892395, "rewards/margins": 2.204272508621216, "rewards/rejected": -1.238264799118042, "step": 3545 }, { "epoch": 0.36496350364963503, "grad_norm": 28.375, "learning_rate": 7.05620287868403e-07, "logits/chosen": -0.5673293471336365, "logits/rejected": -0.5939796566963196, "logps/chosen": -38.132144927978516, "logps/rejected": -52.492088317871094, "loss": 0.3204, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8496047258377075, "rewards/margins": 1.4285619258880615, "rewards/rejected": -0.5789572596549988, "step": 3550 }, { "epoch": 0.3654775367533669, "grad_norm": 34.75, "learning_rate": 7.050491204021018e-07, "logits/chosen": -0.4686315655708313, "logits/rejected": -0.5626825094223022, "logps/chosen": -36.60552215576172, "logps/rejected": -57.28788375854492, "loss": 0.2836, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6988927125930786, "rewards/margins": 1.7549102306365967, "rewards/rejected": -1.0560176372528076, "step": 3555 }, { "epoch": 0.3659915698570988, "grad_norm": 32.75, "learning_rate": 7.044779529358007e-07, "logits/chosen": -0.5071344971656799, "logits/rejected": -0.6004863381385803, "logps/chosen": -36.99374008178711, "logps/rejected": -47.18567657470703, "loss": 0.3269, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6946179270744324, "rewards/margins": 1.4519504308700562, "rewards/rejected": -0.757332444190979, "step": 3560 }, { "epoch": 0.3665056029608307, "grad_norm": 29.375, "learning_rate": 7.039067854694997e-07, "logits/chosen": -0.5403866171836853, "logits/rejected": -0.5958620309829712, "logps/chosen": -37.85493850708008, "logps/rejected": -55.95240020751953, "loss": 0.3322, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6826072335243225, "rewards/margins": 1.7526594400405884, "rewards/rejected": -1.0700523853302002, "step": 3565 }, { "epoch": 0.36701963606456256, "grad_norm": 41.5, "learning_rate": 7.033356180031985e-07, "logits/chosen": -0.5068909525871277, "logits/rejected": -0.5675694346427917, "logps/chosen": -43.6905403137207, "logps/rejected": -51.154170989990234, "loss": 0.3261, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9373227953910828, "rewards/margins": 1.6122217178344727, "rewards/rejected": -0.6748987436294556, "step": 3570 }, { "epoch": 0.36753366916829444, "grad_norm": 29.75, "learning_rate": 7.027644505368974e-07, "logits/chosen": -0.5040323138237, "logits/rejected": -0.5348215103149414, "logps/chosen": -43.04582595825195, "logps/rejected": -51.29609298706055, "loss": 0.3, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6678676009178162, "rewards/margins": 1.5391099452972412, "rewards/rejected": -0.8712421655654907, "step": 3575 }, { "epoch": 0.3680477022720263, "grad_norm": 41.5, "learning_rate": 7.021932830705962e-07, "logits/chosen": -0.611174464225769, "logits/rejected": -0.6235651969909668, "logps/chosen": -39.25431442260742, "logps/rejected": -51.46540451049805, "loss": 0.3333, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7047244310379028, "rewards/margins": 1.5524146556854248, "rewards/rejected": -0.8476904034614563, "step": 3580 }, { "epoch": 0.3685617353757582, "grad_norm": 38.25, "learning_rate": 7.016221156042952e-07, "logits/chosen": -0.5850902795791626, "logits/rejected": -0.632335901260376, "logps/chosen": -37.81932830810547, "logps/rejected": -49.0044059753418, "loss": 0.3593, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6960914731025696, "rewards/margins": 1.483731985092163, "rewards/rejected": -0.7876405715942383, "step": 3585 }, { "epoch": 0.3690757684794901, "grad_norm": 33.0, "learning_rate": 7.01050948137994e-07, "logits/chosen": -0.6578747630119324, "logits/rejected": -0.5918439626693726, "logps/chosen": -36.329490661621094, "logps/rejected": -52.37046432495117, "loss": 0.3216, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.711270272731781, "rewards/margins": 1.4870309829711914, "rewards/rejected": -0.7757606506347656, "step": 3590 }, { "epoch": 0.36958980158322197, "grad_norm": 47.0, "learning_rate": 7.004797806716929e-07, "logits/chosen": -0.5698155760765076, "logits/rejected": -0.6016528606414795, "logps/chosen": -40.98363494873047, "logps/rejected": -49.55942153930664, "loss": 0.3337, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5246931314468384, "rewards/margins": 1.4716966152191162, "rewards/rejected": -0.9470037221908569, "step": 3595 }, { "epoch": 0.37010383468695385, "grad_norm": 42.75, "learning_rate": 6.999086132053917e-07, "logits/chosen": -0.6431044936180115, "logits/rejected": -0.5817379355430603, "logps/chosen": -39.607994079589844, "logps/rejected": -51.59087371826172, "loss": 0.3098, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.662255048751831, "rewards/margins": 1.4688671827316284, "rewards/rejected": -0.8066123127937317, "step": 3600 }, { "epoch": 0.37061786779068573, "grad_norm": 38.25, "learning_rate": 6.993374457390907e-07, "logits/chosen": -0.5676343441009521, "logits/rejected": -0.533745527267456, "logps/chosen": -43.40663528442383, "logps/rejected": -52.235877990722656, "loss": 0.3192, "rewards/accuracies": 0.875, "rewards/chosen": 0.9266263842582703, "rewards/margins": 1.785108208656311, "rewards/rejected": -0.8584818840026855, "step": 3605 }, { "epoch": 0.3711319008944176, "grad_norm": 24.875, "learning_rate": 6.987662782727896e-07, "logits/chosen": -0.5073542594909668, "logits/rejected": -0.6058710217475891, "logps/chosen": -38.449974060058594, "logps/rejected": -50.16093826293945, "loss": 0.322, "rewards/accuracies": 0.875, "rewards/chosen": 0.8031617999076843, "rewards/margins": 1.608984351158142, "rewards/rejected": -0.8058225512504578, "step": 3610 }, { "epoch": 0.3716459339981495, "grad_norm": 28.5, "learning_rate": 6.981951108064884e-07, "logits/chosen": -0.5287200212478638, "logits/rejected": -0.5643317699432373, "logps/chosen": -46.62472915649414, "logps/rejected": -55.80927276611328, "loss": 0.2921, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7059494256973267, "rewards/margins": 1.8339446783065796, "rewards/rejected": -1.1279948949813843, "step": 3615 }, { "epoch": 0.3721599671018814, "grad_norm": 34.25, "learning_rate": 6.976239433401873e-07, "logits/chosen": -0.5863275527954102, "logits/rejected": -0.5398775935173035, "logps/chosen": -40.304649353027344, "logps/rejected": -50.423301696777344, "loss": 0.3094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7439092397689819, "rewards/margins": 1.5910792350769043, "rewards/rejected": -0.8471697568893433, "step": 3620 }, { "epoch": 0.37267400020561325, "grad_norm": 37.5, "learning_rate": 6.970527758738862e-07, "logits/chosen": -0.6361891031265259, "logits/rejected": -0.6291788816452026, "logps/chosen": -41.39995574951172, "logps/rejected": -54.00714111328125, "loss": 0.3406, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.515776515007019, "rewards/margins": 1.5344045162200928, "rewards/rejected": -1.0186278820037842, "step": 3625 }, { "epoch": 0.37318803330934514, "grad_norm": 32.5, "learning_rate": 6.964816084075851e-07, "logits/chosen": -0.6201636791229248, "logits/rejected": -0.6412386894226074, "logps/chosen": -40.7051887512207, "logps/rejected": -53.14509963989258, "loss": 0.2982, "rewards/accuracies": 0.875, "rewards/chosen": 0.8851588368415833, "rewards/margins": 1.5923824310302734, "rewards/rejected": -0.7072235941886902, "step": 3630 }, { "epoch": 0.373702066413077, "grad_norm": 29.25, "learning_rate": 6.959104409412839e-07, "logits/chosen": -0.4928058683872223, "logits/rejected": -0.5261781811714172, "logps/chosen": -42.543190002441406, "logps/rejected": -48.61647415161133, "loss": 0.3093, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6932367086410522, "rewards/margins": 1.2723982334136963, "rewards/rejected": -0.5791612863540649, "step": 3635 }, { "epoch": 0.3742160995168089, "grad_norm": 34.25, "learning_rate": 6.953392734749828e-07, "logits/chosen": -0.5725463032722473, "logits/rejected": -0.6078193187713623, "logps/chosen": -38.256229400634766, "logps/rejected": -53.823081970214844, "loss": 0.3172, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6329218149185181, "rewards/margins": 1.183932900428772, "rewards/rejected": -0.5510110855102539, "step": 3640 }, { "epoch": 0.3747301326205408, "grad_norm": 34.5, "learning_rate": 6.947681060086817e-07, "logits/chosen": -0.5118530988693237, "logits/rejected": -0.5088781714439392, "logps/chosen": -33.596778869628906, "logps/rejected": -47.322811126708984, "loss": 0.2871, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8655368089675903, "rewards/margins": 1.4605424404144287, "rewards/rejected": -0.5950055718421936, "step": 3645 }, { "epoch": 0.37524416572427266, "grad_norm": 38.75, "learning_rate": 6.941969385423806e-07, "logits/chosen": -0.6088607311248779, "logits/rejected": -0.5499165058135986, "logps/chosen": -40.96807098388672, "logps/rejected": -50.66059875488281, "loss": 0.3532, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6060559749603271, "rewards/margins": 1.1914570331573486, "rewards/rejected": -0.5854011178016663, "step": 3650 }, { "epoch": 0.37575819882800454, "grad_norm": 24.625, "learning_rate": 6.936257710760795e-07, "logits/chosen": -0.5568885207176208, "logits/rejected": -0.5676753520965576, "logps/chosen": -37.84768295288086, "logps/rejected": -48.27003479003906, "loss": 0.3156, "rewards/accuracies": 0.875, "rewards/chosen": 0.9281630516052246, "rewards/margins": 1.7744543552398682, "rewards/rejected": -0.8462912440299988, "step": 3655 }, { "epoch": 0.3762722319317364, "grad_norm": 38.5, "learning_rate": 6.930546036097784e-07, "logits/chosen": -0.5624004006385803, "logits/rejected": -0.6085886359214783, "logps/chosen": -35.93313217163086, "logps/rejected": -52.177833557128906, "loss": 0.3098, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7805162668228149, "rewards/margins": 1.7248340845108032, "rewards/rejected": -0.9443178176879883, "step": 3660 }, { "epoch": 0.3767862650354683, "grad_norm": 26.875, "learning_rate": 6.924834361434773e-07, "logits/chosen": -0.5969532132148743, "logits/rejected": -0.619363009929657, "logps/chosen": -38.55716323852539, "logps/rejected": -52.593421936035156, "loss": 0.3066, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0105082988739014, "rewards/margins": 1.6580030918121338, "rewards/rejected": -0.6474946737289429, "step": 3665 }, { "epoch": 0.3773002981392002, "grad_norm": 70.5, "learning_rate": 6.919122686771761e-07, "logits/chosen": -0.5895209312438965, "logits/rejected": -0.6115767955780029, "logps/chosen": -38.540252685546875, "logps/rejected": -50.26665496826172, "loss": 0.3269, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9182813763618469, "rewards/margins": 1.3658335208892822, "rewards/rejected": -0.4475522041320801, "step": 3670 }, { "epoch": 0.37781433124293207, "grad_norm": 38.5, "learning_rate": 6.91341101210875e-07, "logits/chosen": -0.6434580087661743, "logits/rejected": -0.649929404258728, "logps/chosen": -37.049495697021484, "logps/rejected": -54.376976013183594, "loss": 0.3109, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8891986012458801, "rewards/margins": 1.685976266860962, "rewards/rejected": -0.7967775464057922, "step": 3675 }, { "epoch": 0.37832836434666395, "grad_norm": 26.25, "learning_rate": 6.907699337445739e-07, "logits/chosen": -0.614848256111145, "logits/rejected": -0.607009768486023, "logps/chosen": -39.47943115234375, "logps/rejected": -56.25123977661133, "loss": 0.3097, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9027194976806641, "rewards/margins": 1.7333409786224365, "rewards/rejected": -0.8306214213371277, "step": 3680 }, { "epoch": 0.37884239745039583, "grad_norm": 32.25, "learning_rate": 6.901987662782728e-07, "logits/chosen": -0.6086642146110535, "logits/rejected": -0.6148747205734253, "logps/chosen": -39.152244567871094, "logps/rejected": -50.527008056640625, "loss": 0.3407, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.751498281955719, "rewards/margins": 1.5281652212142944, "rewards/rejected": -0.7766671180725098, "step": 3685 }, { "epoch": 0.3793564305541277, "grad_norm": 27.125, "learning_rate": 6.896275988119716e-07, "logits/chosen": -0.6389743089675903, "logits/rejected": -0.6581498384475708, "logps/chosen": -37.43562698364258, "logps/rejected": -47.13097381591797, "loss": 0.3328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.725746750831604, "rewards/margins": 1.2788095474243164, "rewards/rejected": -0.5530627965927124, "step": 3690 }, { "epoch": 0.3798704636578596, "grad_norm": 27.125, "learning_rate": 6.890564313456705e-07, "logits/chosen": -0.6208550333976746, "logits/rejected": -0.6287524104118347, "logps/chosen": -41.71710968017578, "logps/rejected": -54.244422912597656, "loss": 0.3226, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8605626821517944, "rewards/margins": 1.542914628982544, "rewards/rejected": -0.6823519468307495, "step": 3695 }, { "epoch": 0.3803844967615915, "grad_norm": 49.25, "learning_rate": 6.884852638793695e-07, "logits/chosen": -0.6007488369941711, "logits/rejected": -0.6824541091918945, "logps/chosen": -35.12276840209961, "logps/rejected": -51.5181770324707, "loss": 0.3252, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7081014513969421, "rewards/margins": 1.6156460046768188, "rewards/rejected": -0.9075444936752319, "step": 3700 }, { "epoch": 0.38089852986532335, "grad_norm": 28.125, "learning_rate": 6.879140964130683e-07, "logits/chosen": -0.6116541624069214, "logits/rejected": -0.5716336965560913, "logps/chosen": -38.38764953613281, "logps/rejected": -50.76457214355469, "loss": 0.3207, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.680195689201355, "rewards/margins": 1.427450180053711, "rewards/rejected": -0.7472543716430664, "step": 3705 }, { "epoch": 0.3814125629690552, "grad_norm": 47.0, "learning_rate": 6.873429289467672e-07, "logits/chosen": -0.5360977053642273, "logits/rejected": -0.5442615151405334, "logps/chosen": -36.09077453613281, "logps/rejected": -48.15795135498047, "loss": 0.3278, "rewards/accuracies": 0.875, "rewards/chosen": 0.811798095703125, "rewards/margins": 1.4158116579055786, "rewards/rejected": -0.6040134429931641, "step": 3710 }, { "epoch": 0.38192659607278706, "grad_norm": 28.125, "learning_rate": 6.86771761480466e-07, "logits/chosen": -0.5375937223434448, "logits/rejected": -0.5937368869781494, "logps/chosen": -35.9422607421875, "logps/rejected": -46.3986701965332, "loss": 0.3239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6876906752586365, "rewards/margins": 0.9680450558662415, "rewards/rejected": -0.2803543210029602, "step": 3715 }, { "epoch": 0.38244062917651894, "grad_norm": 24.0, "learning_rate": 6.86200594014165e-07, "logits/chosen": -0.5974697470664978, "logits/rejected": -0.5736969709396362, "logps/chosen": -40.081459045410156, "logps/rejected": -50.10702133178711, "loss": 0.3024, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6412360072135925, "rewards/margins": 1.4524320363998413, "rewards/rejected": -0.811195969581604, "step": 3720 }, { "epoch": 0.3829546622802508, "grad_norm": 48.5, "learning_rate": 6.856294265478638e-07, "logits/chosen": -0.5655560493469238, "logits/rejected": -0.573050856590271, "logps/chosen": -40.60520553588867, "logps/rejected": -55.2675666809082, "loss": 0.3036, "rewards/accuracies": 0.875, "rewards/chosen": 0.6370188593864441, "rewards/margins": 1.8416087627410889, "rewards/rejected": -1.204590082168579, "step": 3725 }, { "epoch": 0.3834686953839827, "grad_norm": 31.75, "learning_rate": 6.850582590815627e-07, "logits/chosen": -0.5942426323890686, "logits/rejected": -0.5741270184516907, "logps/chosen": -48.48710250854492, "logps/rejected": -53.029754638671875, "loss": 0.3392, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5422229170799255, "rewards/margins": 1.430229902267456, "rewards/rejected": -0.8880070447921753, "step": 3730 }, { "epoch": 0.3839827284877146, "grad_norm": 26.875, "learning_rate": 6.844870916152615e-07, "logits/chosen": -0.520951509475708, "logits/rejected": -0.5421727895736694, "logps/chosen": -40.7710075378418, "logps/rejected": -53.19267654418945, "loss": 0.3385, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6331368088722229, "rewards/margins": 1.3939930200576782, "rewards/rejected": -0.760856032371521, "step": 3735 }, { "epoch": 0.38449676159144647, "grad_norm": 27.375, "learning_rate": 6.839159241489605e-07, "logits/chosen": -0.5372895002365112, "logits/rejected": -0.6141210794448853, "logps/chosen": -40.620697021484375, "logps/rejected": -53.90655517578125, "loss": 0.3317, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9040212631225586, "rewards/margins": 1.9155563116073608, "rewards/rejected": -1.0115349292755127, "step": 3740 }, { "epoch": 0.38501079469517835, "grad_norm": 25.875, "learning_rate": 6.833447566826593e-07, "logits/chosen": -0.6089102625846863, "logits/rejected": -0.6565605401992798, "logps/chosen": -35.459922790527344, "logps/rejected": -53.48392868041992, "loss": 0.2919, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8518473505973816, "rewards/margins": 1.7468748092651367, "rewards/rejected": -0.8950274586677551, "step": 3745 }, { "epoch": 0.38552482779891023, "grad_norm": 33.75, "learning_rate": 6.827735892163582e-07, "logits/chosen": -0.541422426700592, "logits/rejected": -0.5274873971939087, "logps/chosen": -35.571006774902344, "logps/rejected": -47.814918518066406, "loss": 0.3327, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.847851574420929, "rewards/margins": 1.4687906503677368, "rewards/rejected": -0.6209390163421631, "step": 3750 }, { "epoch": 0.3860388609026421, "grad_norm": 27.375, "learning_rate": 6.82202421750057e-07, "logits/chosen": -0.5195842981338501, "logits/rejected": -0.6182478666305542, "logps/chosen": -38.10958480834961, "logps/rejected": -49.29795837402344, "loss": 0.2938, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8648975491523743, "rewards/margins": 1.8655683994293213, "rewards/rejected": -1.0006706714630127, "step": 3755 }, { "epoch": 0.386552894006374, "grad_norm": 31.0, "learning_rate": 6.81631254283756e-07, "logits/chosen": -0.5606130361557007, "logits/rejected": -0.5885604619979858, "logps/chosen": -41.11568069458008, "logps/rejected": -50.83388137817383, "loss": 0.2847, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6400185227394104, "rewards/margins": 1.5127249956130981, "rewards/rejected": -0.872706413269043, "step": 3760 }, { "epoch": 0.3870669271101059, "grad_norm": 33.5, "learning_rate": 6.810600868174549e-07, "logits/chosen": -0.569942057132721, "logits/rejected": -0.6468198299407959, "logps/chosen": -41.240882873535156, "logps/rejected": -55.048736572265625, "loss": 0.3015, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7313901782035828, "rewards/margins": 1.478645920753479, "rewards/rejected": -0.7472556233406067, "step": 3765 }, { "epoch": 0.38758096021383776, "grad_norm": 34.0, "learning_rate": 6.804889193511537e-07, "logits/chosen": -0.5646135807037354, "logits/rejected": -0.5985299348831177, "logps/chosen": -31.72481346130371, "logps/rejected": -51.132225036621094, "loss": 0.3446, "rewards/accuracies": 0.875, "rewards/chosen": 0.6953091621398926, "rewards/margins": 1.3943465948104858, "rewards/rejected": -0.699037492275238, "step": 3770 }, { "epoch": 0.38809499331756964, "grad_norm": 56.0, "learning_rate": 6.799177518848526e-07, "logits/chosen": -0.5695184469223022, "logits/rejected": -0.5740774869918823, "logps/chosen": -37.13074493408203, "logps/rejected": -48.265113830566406, "loss": 0.3471, "rewards/accuracies": 0.875, "rewards/chosen": 0.787369430065155, "rewards/margins": 1.172380805015564, "rewards/rejected": -0.3850114941596985, "step": 3775 }, { "epoch": 0.3886090264213015, "grad_norm": 24.875, "learning_rate": 6.793465844185515e-07, "logits/chosen": -0.5422323942184448, "logits/rejected": -0.5435086488723755, "logps/chosen": -43.08670425415039, "logps/rejected": -61.48421096801758, "loss": 0.319, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7936128377914429, "rewards/margins": 2.007051467895508, "rewards/rejected": -1.2134383916854858, "step": 3780 }, { "epoch": 0.3891230595250334, "grad_norm": 32.25, "learning_rate": 6.787754169522504e-07, "logits/chosen": -0.6088265776634216, "logits/rejected": -0.6531612277030945, "logps/chosen": -47.21862030029297, "logps/rejected": -53.27509689331055, "loss": 0.2998, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7748712301254272, "rewards/margins": 1.5687774419784546, "rewards/rejected": -0.793906033039093, "step": 3785 }, { "epoch": 0.3896370926287653, "grad_norm": 28.125, "learning_rate": 6.782042494859492e-07, "logits/chosen": -0.5906190872192383, "logits/rejected": -0.6400560140609741, "logps/chosen": -38.59661102294922, "logps/rejected": -55.173240661621094, "loss": 0.2848, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7201715707778931, "rewards/margins": 1.591187834739685, "rewards/rejected": -0.8710163235664368, "step": 3790 }, { "epoch": 0.39015112573249716, "grad_norm": 41.25, "learning_rate": 6.776330820196481e-07, "logits/chosen": -0.5849075317382812, "logits/rejected": -0.6218509674072266, "logps/chosen": -37.2797737121582, "logps/rejected": -56.26276397705078, "loss": 0.3074, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7415962219238281, "rewards/margins": 1.5585745573043823, "rewards/rejected": -0.8169782757759094, "step": 3795 }, { "epoch": 0.39066515883622904, "grad_norm": 28.625, "learning_rate": 6.77061914553347e-07, "logits/chosen": -0.5094751119613647, "logits/rejected": -0.5440534353256226, "logps/chosen": -36.72590255737305, "logps/rejected": -49.19585037231445, "loss": 0.33, "rewards/accuracies": 0.875, "rewards/chosen": 0.7476205229759216, "rewards/margins": 1.5585834980010986, "rewards/rejected": -0.8109628558158875, "step": 3800 }, { "epoch": 0.3911791919399609, "grad_norm": 27.5, "learning_rate": 6.764907470870459e-07, "logits/chosen": -0.5894065499305725, "logits/rejected": -0.5303009748458862, "logps/chosen": -37.54151153564453, "logps/rejected": -52.6578369140625, "loss": 0.2906, "rewards/accuracies": 0.875, "rewards/chosen": 0.7712985277175903, "rewards/margins": 1.5238457918167114, "rewards/rejected": -0.7525473833084106, "step": 3805 }, { "epoch": 0.3916932250436928, "grad_norm": 30.5, "learning_rate": 6.759195796207447e-07, "logits/chosen": -0.5406637787818909, "logits/rejected": -0.5327161550521851, "logps/chosen": -37.58359146118164, "logps/rejected": -46.509681701660156, "loss": 0.3292, "rewards/accuracies": 0.875, "rewards/chosen": 0.8465048670768738, "rewards/margins": 1.6078218221664429, "rewards/rejected": -0.7613170742988586, "step": 3810 }, { "epoch": 0.3922072581474247, "grad_norm": 27.25, "learning_rate": 6.753484121544436e-07, "logits/chosen": -0.508861780166626, "logits/rejected": -0.5515080094337463, "logps/chosen": -41.73897933959961, "logps/rejected": -58.67720413208008, "loss": 0.3025, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8529539108276367, "rewards/margins": 1.8337466716766357, "rewards/rejected": -0.9807928204536438, "step": 3815 }, { "epoch": 0.39272129125115657, "grad_norm": 40.5, "learning_rate": 6.747772446881426e-07, "logits/chosen": -0.5515906810760498, "logits/rejected": -0.6134246587753296, "logps/chosen": -38.19966506958008, "logps/rejected": -51.937782287597656, "loss": 0.3379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.628212571144104, "rewards/margins": 1.4283196926116943, "rewards/rejected": -0.8001070022583008, "step": 3820 }, { "epoch": 0.39323532435488845, "grad_norm": 51.0, "learning_rate": 6.742060772218414e-07, "logits/chosen": -0.5490572452545166, "logits/rejected": -0.5788778066635132, "logps/chosen": -36.32170867919922, "logps/rejected": -46.477149963378906, "loss": 0.319, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9572094678878784, "rewards/margins": 1.8713901042938232, "rewards/rejected": -0.9141804575920105, "step": 3825 }, { "epoch": 0.39374935745862033, "grad_norm": 46.5, "learning_rate": 6.736349097555403e-07, "logits/chosen": -0.5218265652656555, "logits/rejected": -0.6100238561630249, "logps/chosen": -33.91142272949219, "logps/rejected": -49.98798370361328, "loss": 0.3042, "rewards/accuracies": 0.875, "rewards/chosen": 0.8589922189712524, "rewards/margins": 1.7652637958526611, "rewards/rejected": -0.9062715768814087, "step": 3830 }, { "epoch": 0.3942633905623522, "grad_norm": 30.75, "learning_rate": 6.730637422892391e-07, "logits/chosen": -0.5204633474349976, "logits/rejected": -0.5734152793884277, "logps/chosen": -38.1455192565918, "logps/rejected": -50.641807556152344, "loss": 0.3471, "rewards/accuracies": 0.875, "rewards/chosen": 0.6981874704360962, "rewards/margins": 1.4825787544250488, "rewards/rejected": -0.7843913435935974, "step": 3835 }, { "epoch": 0.3947774236660841, "grad_norm": 27.375, "learning_rate": 6.724925748229381e-07, "logits/chosen": -0.4964517056941986, "logits/rejected": -0.508147120475769, "logps/chosen": -37.307098388671875, "logps/rejected": -56.03923416137695, "loss": 0.3204, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7221745848655701, "rewards/margins": 1.6809594631195068, "rewards/rejected": -0.9587850570678711, "step": 3840 }, { "epoch": 0.395291456769816, "grad_norm": 29.25, "learning_rate": 6.719214073566369e-07, "logits/chosen": -0.5907387733459473, "logits/rejected": -0.6438643336296082, "logps/chosen": -35.53130340576172, "logps/rejected": -50.999481201171875, "loss": 0.2781, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.851956844329834, "rewards/margins": 1.7122758626937866, "rewards/rejected": -0.8603190183639526, "step": 3845 }, { "epoch": 0.39580548987354786, "grad_norm": 31.375, "learning_rate": 6.713502398903358e-07, "logits/chosen": -0.6525614857673645, "logits/rejected": -0.6862962245941162, "logps/chosen": -35.80632019042969, "logps/rejected": -52.406219482421875, "loss": 0.2781, "rewards/accuracies": 1.0, "rewards/chosen": 0.9481002688407898, "rewards/margins": 1.8560281991958618, "rewards/rejected": -0.9079278111457825, "step": 3850 }, { "epoch": 0.39631952297727974, "grad_norm": 35.25, "learning_rate": 6.707790724240346e-07, "logits/chosen": -0.5990287065505981, "logits/rejected": -0.5894515514373779, "logps/chosen": -34.1295166015625, "logps/rejected": -50.194976806640625, "loss": 0.2994, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9025076031684875, "rewards/margins": 1.607786774635315, "rewards/rejected": -0.7052791714668274, "step": 3855 }, { "epoch": 0.3968335560810116, "grad_norm": 46.75, "learning_rate": 6.702079049577336e-07, "logits/chosen": -0.5729220509529114, "logits/rejected": -0.6069069504737854, "logps/chosen": -37.110469818115234, "logps/rejected": -55.67292404174805, "loss": 0.3112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7680020928382874, "rewards/margins": 1.5349245071411133, "rewards/rejected": -0.7669224143028259, "step": 3860 }, { "epoch": 0.3973475891847435, "grad_norm": 35.0, "learning_rate": 6.696367374914325e-07, "logits/chosen": -0.63447505235672, "logits/rejected": -0.6499849557876587, "logps/chosen": -47.84022903442383, "logps/rejected": -50.383949279785156, "loss": 0.3341, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.568153440952301, "rewards/margins": 1.5081225633621216, "rewards/rejected": -0.9399691820144653, "step": 3865 }, { "epoch": 0.3978616222884754, "grad_norm": 27.625, "learning_rate": 6.690655700251313e-07, "logits/chosen": -0.5504629015922546, "logits/rejected": -0.5923797488212585, "logps/chosen": -33.4171142578125, "logps/rejected": -50.29580307006836, "loss": 0.3342, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7554270029067993, "rewards/margins": 1.6401760578155518, "rewards/rejected": -0.8847490549087524, "step": 3870 }, { "epoch": 0.39837565539220726, "grad_norm": 35.5, "learning_rate": 6.684944025588303e-07, "logits/chosen": -0.6619628667831421, "logits/rejected": -0.6172064542770386, "logps/chosen": -37.96855163574219, "logps/rejected": -48.24193572998047, "loss": 0.3172, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7541897892951965, "rewards/margins": 1.3936498165130615, "rewards/rejected": -0.6394602060317993, "step": 3875 }, { "epoch": 0.39888968849593914, "grad_norm": 29.75, "learning_rate": 6.679232350925291e-07, "logits/chosen": -0.5011416673660278, "logits/rejected": -0.5159046053886414, "logps/chosen": -35.942562103271484, "logps/rejected": -53.39418411254883, "loss": 0.3309, "rewards/accuracies": 0.875, "rewards/chosen": 0.7382075786590576, "rewards/margins": 1.5405361652374268, "rewards/rejected": -0.8023285865783691, "step": 3880 }, { "epoch": 0.399403721599671, "grad_norm": 27.875, "learning_rate": 6.67352067626228e-07, "logits/chosen": -0.6570360660552979, "logits/rejected": -0.6519888639450073, "logps/chosen": -43.13930130004883, "logps/rejected": -52.61237716674805, "loss": 0.317, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5376989841461182, "rewards/margins": 1.2724535465240479, "rewards/rejected": -0.7347546815872192, "step": 3885 }, { "epoch": 0.3999177547034029, "grad_norm": 31.875, "learning_rate": 6.667809001599268e-07, "logits/chosen": -0.6241434812545776, "logits/rejected": -0.6059014797210693, "logps/chosen": -36.187889099121094, "logps/rejected": -50.71163558959961, "loss": 0.311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7148731350898743, "rewards/margins": 1.48646879196167, "rewards/rejected": -0.7715956568717957, "step": 3890 }, { "epoch": 0.40012336794489567, "eval_logits/chosen": -0.537955641746521, "eval_logits/rejected": -0.5980899930000305, "eval_logps/chosen": -77.69308471679688, "eval_logps/rejected": -54.979000091552734, "eval_loss": 0.3157811760902405, "eval_rewards/accuracies": 0.8602941036224365, "eval_rewards/chosen": 0.7215525507926941, "eval_rewards/margins": 1.5852106809616089, "eval_rewards/rejected": -0.8636581301689148, "eval_runtime": 2.1106, "eval_samples_per_second": 506.959, "eval_steps_per_second": 8.054, "step": 3892 }, { "epoch": 0.4004317878071348, "grad_norm": 40.5, "learning_rate": 6.662097326936258e-07, "logits/chosen": -0.5587760210037231, "logits/rejected": -0.5293804407119751, "logps/chosen": -41.32257843017578, "logps/rejected": -48.955867767333984, "loss": 0.3012, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7952820062637329, "rewards/margins": 1.3429040908813477, "rewards/rejected": -0.5476219654083252, "step": 3895 }, { "epoch": 0.40094582091086667, "grad_norm": 27.0, "learning_rate": 6.656385652273246e-07, "logits/chosen": -0.530278742313385, "logits/rejected": -0.5579034090042114, "logps/chosen": -40.85106658935547, "logps/rejected": -50.432315826416016, "loss": 0.3221, "rewards/accuracies": 0.875, "rewards/chosen": 0.8539039492607117, "rewards/margins": 1.4394770860671997, "rewards/rejected": -0.5855730772018433, "step": 3900 }, { "epoch": 0.40145985401459855, "grad_norm": 41.25, "learning_rate": 6.650673977610235e-07, "logits/chosen": -0.4578339159488678, "logits/rejected": -0.5094814896583557, "logps/chosen": -37.315696716308594, "logps/rejected": -51.76609420776367, "loss": 0.2799, "rewards/accuracies": 0.875, "rewards/chosen": 0.8265563249588013, "rewards/margins": 1.588948130607605, "rewards/rejected": -0.7623916864395142, "step": 3905 }, { "epoch": 0.40197388711833043, "grad_norm": 29.75, "learning_rate": 6.644962302947224e-07, "logits/chosen": -0.5532804727554321, "logits/rejected": -0.5650850534439087, "logps/chosen": -37.11750030517578, "logps/rejected": -50.195091247558594, "loss": 0.315, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6973693370819092, "rewards/margins": 1.4878032207489014, "rewards/rejected": -0.7904337644577026, "step": 3910 }, { "epoch": 0.4024879202220623, "grad_norm": 34.75, "learning_rate": 6.639250628284213e-07, "logits/chosen": -0.5269370675086975, "logits/rejected": -0.5630624890327454, "logps/chosen": -37.71720504760742, "logps/rejected": -49.186180114746094, "loss": 0.3237, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7180874943733215, "rewards/margins": 1.5952821969985962, "rewards/rejected": -0.8771945238113403, "step": 3915 }, { "epoch": 0.4030019533257942, "grad_norm": 31.125, "learning_rate": 6.633538953621202e-07, "logits/chosen": -0.5488249063491821, "logits/rejected": -0.5716289281845093, "logps/chosen": -36.63420867919922, "logps/rejected": -53.6694221496582, "loss": 0.3606, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7239721417427063, "rewards/margins": 1.5195205211639404, "rewards/rejected": -0.7955485582351685, "step": 3920 }, { "epoch": 0.4035159864295261, "grad_norm": 30.875, "learning_rate": 6.62782727895819e-07, "logits/chosen": -0.5648072957992554, "logits/rejected": -0.6180993318557739, "logps/chosen": -40.84321212768555, "logps/rejected": -54.06669235229492, "loss": 0.3261, "rewards/accuracies": 0.875, "rewards/chosen": 0.8042644262313843, "rewards/margins": 1.6104762554168701, "rewards/rejected": -0.8062116503715515, "step": 3925 }, { "epoch": 0.40403001953325796, "grad_norm": 38.25, "learning_rate": 6.622115604295179e-07, "logits/chosen": -0.6081843376159668, "logits/rejected": -0.6634387969970703, "logps/chosen": -39.69136428833008, "logps/rejected": -54.04021453857422, "loss": 0.319, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7454832792282104, "rewards/margins": 1.4622193574905396, "rewards/rejected": -0.7167359590530396, "step": 3930 }, { "epoch": 0.40454405263698984, "grad_norm": 27.375, "learning_rate": 6.616403929632168e-07, "logits/chosen": -0.5768893957138062, "logits/rejected": -0.665838360786438, "logps/chosen": -40.31504440307617, "logps/rejected": -54.15155029296875, "loss": 0.2734, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7495623826980591, "rewards/margins": 1.5919448137283325, "rewards/rejected": -0.8423823118209839, "step": 3935 }, { "epoch": 0.4050580857407217, "grad_norm": 38.75, "learning_rate": 6.610692254969157e-07, "logits/chosen": -0.5327197313308716, "logits/rejected": -0.632145881652832, "logps/chosen": -43.543155670166016, "logps/rejected": -50.86686325073242, "loss": 0.2539, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7611904740333557, "rewards/margins": 1.7944183349609375, "rewards/rejected": -1.0332280397415161, "step": 3940 }, { "epoch": 0.4055721188444536, "grad_norm": 41.25, "learning_rate": 6.604980580306145e-07, "logits/chosen": -0.5723280310630798, "logits/rejected": -0.5978960990905762, "logps/chosen": -44.46403503417969, "logps/rejected": -51.2523078918457, "loss": 0.3018, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7931991219520569, "rewards/margins": 1.6476885080337524, "rewards/rejected": -0.8544895052909851, "step": 3945 }, { "epoch": 0.4060861519481855, "grad_norm": 42.25, "learning_rate": 6.599268905643134e-07, "logits/chosen": -0.5795473456382751, "logits/rejected": -0.590922474861145, "logps/chosen": -39.86922073364258, "logps/rejected": -49.525291442871094, "loss": 0.3598, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5880581736564636, "rewards/margins": 1.2042748928070068, "rewards/rejected": -0.616216778755188, "step": 3950 }, { "epoch": 0.40660018505191736, "grad_norm": 38.5, "learning_rate": 6.593557230980124e-07, "logits/chosen": -0.5347694158554077, "logits/rejected": -0.5348566770553589, "logps/chosen": -36.69217300415039, "logps/rejected": -55.16130828857422, "loss": 0.3091, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5974810719490051, "rewards/margins": 1.5421581268310547, "rewards/rejected": -0.9446772336959839, "step": 3955 }, { "epoch": 0.40711421815564924, "grad_norm": 50.25, "learning_rate": 6.587845556317112e-07, "logits/chosen": -0.5797332525253296, "logits/rejected": -0.5533524751663208, "logps/chosen": -34.65717697143555, "logps/rejected": -43.095909118652344, "loss": 0.3366, "rewards/accuracies": 0.875, "rewards/chosen": 0.8109577894210815, "rewards/margins": 1.2674777507781982, "rewards/rejected": -0.4565199017524719, "step": 3960 }, { "epoch": 0.4076282512593811, "grad_norm": 37.25, "learning_rate": 6.5821338816541e-07, "logits/chosen": -0.5678290128707886, "logits/rejected": -0.5921781063079834, "logps/chosen": -38.422462463378906, "logps/rejected": -51.2310791015625, "loss": 0.3468, "rewards/accuracies": 0.875, "rewards/chosen": 0.6790429353713989, "rewards/margins": 1.4862419366836548, "rewards/rejected": -0.8071990013122559, "step": 3965 }, { "epoch": 0.408142284363113, "grad_norm": 25.5, "learning_rate": 6.576422206991089e-07, "logits/chosen": -0.5112152695655823, "logits/rejected": -0.5910502672195435, "logps/chosen": -36.93535614013672, "logps/rejected": -50.157955169677734, "loss": 0.3454, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5536453723907471, "rewards/margins": 1.004591941833496, "rewards/rejected": -0.4509466290473938, "step": 3970 }, { "epoch": 0.4086563174668449, "grad_norm": 40.75, "learning_rate": 6.570710532328079e-07, "logits/chosen": -0.5469862222671509, "logits/rejected": -0.5801187753677368, "logps/chosen": -39.644195556640625, "logps/rejected": -54.52245330810547, "loss": 0.3041, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8250892758369446, "rewards/margins": 1.696414589881897, "rewards/rejected": -0.8713253736495972, "step": 3975 }, { "epoch": 0.40917035057057677, "grad_norm": 25.25, "learning_rate": 6.564998857665067e-07, "logits/chosen": -0.5613435506820679, "logits/rejected": -0.5737231969833374, "logps/chosen": -39.16984558105469, "logps/rejected": -49.89495086669922, "loss": 0.3199, "rewards/accuracies": 0.875, "rewards/chosen": 0.6222730875015259, "rewards/margins": 1.5238515138626099, "rewards/rejected": -0.9015785455703735, "step": 3980 }, { "epoch": 0.40968438367430865, "grad_norm": 26.5, "learning_rate": 6.559287183002056e-07, "logits/chosen": -0.6825336217880249, "logits/rejected": -0.5815760493278503, "logps/chosen": -39.09775924682617, "logps/rejected": -45.12593460083008, "loss": 0.3393, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6143892407417297, "rewards/margins": 1.1342589855194092, "rewards/rejected": -0.5198698043823242, "step": 3985 }, { "epoch": 0.41019841677804053, "grad_norm": 31.125, "learning_rate": 6.553575508339044e-07, "logits/chosen": -0.5004857778549194, "logits/rejected": -0.544262170791626, "logps/chosen": -37.962867736816406, "logps/rejected": -54.096595764160156, "loss": 0.3342, "rewards/accuracies": 0.875, "rewards/chosen": 0.48833245038986206, "rewards/margins": 1.3884387016296387, "rewards/rejected": -0.9001063108444214, "step": 3990 }, { "epoch": 0.4107124498817724, "grad_norm": 24.125, "learning_rate": 6.547863833676034e-07, "logits/chosen": -0.5497697591781616, "logits/rejected": -0.6054044961929321, "logps/chosen": -40.72673416137695, "logps/rejected": -52.961822509765625, "loss": 0.3086, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7511335611343384, "rewards/margins": 1.4893357753753662, "rewards/rejected": -0.7382022738456726, "step": 3995 }, { "epoch": 0.4112264829855043, "grad_norm": 25.125, "learning_rate": 6.542152159013023e-07, "logits/chosen": -0.5832334160804749, "logits/rejected": -0.6072311401367188, "logps/chosen": -36.23939514160156, "logps/rejected": -52.687400817871094, "loss": 0.3318, "rewards/accuracies": 0.875, "rewards/chosen": 0.7224670648574829, "rewards/margins": 1.3516141176223755, "rewards/rejected": -0.6291468739509583, "step": 4000 }, { "epoch": 0.4117405160892361, "grad_norm": 36.25, "learning_rate": 6.536440484350011e-07, "logits/chosen": -0.6105223894119263, "logits/rejected": -0.6111496686935425, "logps/chosen": -36.368282318115234, "logps/rejected": -50.11288070678711, "loss": 0.3287, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9470676183700562, "rewards/margins": 1.5977791547775269, "rewards/rejected": -0.6507115364074707, "step": 4005 }, { "epoch": 0.412254549192968, "grad_norm": 28.125, "learning_rate": 6.530728809686999e-07, "logits/chosen": -0.592108964920044, "logits/rejected": -0.6157539486885071, "logps/chosen": -42.17633819580078, "logps/rejected": -55.99248504638672, "loss": 0.3285, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7636129260063171, "rewards/margins": 1.76627516746521, "rewards/rejected": -1.0026623010635376, "step": 4010 }, { "epoch": 0.4127685822966999, "grad_norm": 42.5, "learning_rate": 6.525017135023989e-07, "logits/chosen": -0.5502595901489258, "logits/rejected": -0.5944892168045044, "logps/chosen": -36.78754425048828, "logps/rejected": -51.785369873046875, "loss": 0.2988, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9009912610054016, "rewards/margins": 1.456518292427063, "rewards/rejected": -0.5555270910263062, "step": 4015 }, { "epoch": 0.41328261540043176, "grad_norm": 38.5, "learning_rate": 6.519305460360978e-07, "logits/chosen": -0.572262167930603, "logits/rejected": -0.6431592702865601, "logps/chosen": -35.386356353759766, "logps/rejected": -59.32763671875, "loss": 0.3268, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8420934677124023, "rewards/margins": 1.7931764125823975, "rewards/rejected": -0.9510828852653503, "step": 4020 }, { "epoch": 0.41379664850416364, "grad_norm": 37.25, "learning_rate": 6.513593785697966e-07, "logits/chosen": -0.5182234048843384, "logits/rejected": -0.5716851353645325, "logps/chosen": -37.74205017089844, "logps/rejected": -52.99250411987305, "loss": 0.3094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5957642197608948, "rewards/margins": 1.4760491847991943, "rewards/rejected": -0.8802850842475891, "step": 4025 }, { "epoch": 0.4143106816078955, "grad_norm": 30.875, "learning_rate": 6.507882111034955e-07, "logits/chosen": -0.5438496470451355, "logits/rejected": -0.5525898933410645, "logps/chosen": -38.65959930419922, "logps/rejected": -53.35285186767578, "loss": 0.3734, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7082618474960327, "rewards/margins": 1.6104243993759155, "rewards/rejected": -0.9021625518798828, "step": 4030 }, { "epoch": 0.4148247147116274, "grad_norm": 28.875, "learning_rate": 6.502170436371944e-07, "logits/chosen": -0.4983298182487488, "logits/rejected": -0.48884883522987366, "logps/chosen": -38.403499603271484, "logps/rejected": -53.8835334777832, "loss": 0.3102, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8437392115592957, "rewards/margins": 1.540212869644165, "rewards/rejected": -0.6964737176895142, "step": 4035 }, { "epoch": 0.4153387478153593, "grad_norm": 50.0, "learning_rate": 6.496458761708933e-07, "logits/chosen": -0.5492400527000427, "logits/rejected": -0.5661754012107849, "logps/chosen": -34.093482971191406, "logps/rejected": -47.06731414794922, "loss": 0.3301, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5780161619186401, "rewards/margins": 1.403842568397522, "rewards/rejected": -0.8258262872695923, "step": 4040 }, { "epoch": 0.41585278091909117, "grad_norm": 47.25, "learning_rate": 6.490747087045921e-07, "logits/chosen": -0.44847172498703003, "logits/rejected": -0.5147194862365723, "logps/chosen": -39.85862731933594, "logps/rejected": -51.76060104370117, "loss": 0.3749, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6770291328430176, "rewards/margins": 1.3802095651626587, "rewards/rejected": -0.7031804323196411, "step": 4045 }, { "epoch": 0.41636681402282305, "grad_norm": 36.25, "learning_rate": 6.48503541238291e-07, "logits/chosen": -0.659838080406189, "logits/rejected": -0.6904193758964539, "logps/chosen": -39.11096954345703, "logps/rejected": -46.955963134765625, "loss": 0.3448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7562324404716492, "rewards/margins": 1.1054553985595703, "rewards/rejected": -0.3492230474948883, "step": 4050 }, { "epoch": 0.41688084712655493, "grad_norm": 54.75, "learning_rate": 6.479323737719899e-07, "logits/chosen": -0.6043277382850647, "logits/rejected": -0.5704463720321655, "logps/chosen": -44.56955337524414, "logps/rejected": -53.17522048950195, "loss": 0.3941, "rewards/accuracies": 0.875, "rewards/chosen": 0.6659468412399292, "rewards/margins": 1.3336408138275146, "rewards/rejected": -0.6676939725875854, "step": 4055 }, { "epoch": 0.4173948802302868, "grad_norm": 51.0, "learning_rate": 6.473612063056888e-07, "logits/chosen": -0.6101676225662231, "logits/rejected": -0.6002903580665588, "logps/chosen": -43.196372985839844, "logps/rejected": -53.59593963623047, "loss": 0.3389, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5456997752189636, "rewards/margins": 1.5366483926773071, "rewards/rejected": -0.9909487962722778, "step": 4060 }, { "epoch": 0.4179089133340187, "grad_norm": 33.0, "learning_rate": 6.467900388393877e-07, "logits/chosen": -0.6242765188217163, "logits/rejected": -0.5632833242416382, "logps/chosen": -38.721839904785156, "logps/rejected": -54.49033737182617, "loss": 0.3101, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7098124623298645, "rewards/margins": 1.594382643699646, "rewards/rejected": -0.8845701217651367, "step": 4065 }, { "epoch": 0.4184229464377506, "grad_norm": 29.0, "learning_rate": 6.462188713730865e-07, "logits/chosen": -0.57290118932724, "logits/rejected": -0.5052729845046997, "logps/chosen": -47.95751953125, "logps/rejected": -52.389671325683594, "loss": 0.3457, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6843419075012207, "rewards/margins": 1.2624640464782715, "rewards/rejected": -0.5781222581863403, "step": 4070 }, { "epoch": 0.41893697954148246, "grad_norm": 30.25, "learning_rate": 6.456477039067855e-07, "logits/chosen": -0.6057463884353638, "logits/rejected": -0.5924266576766968, "logps/chosen": -35.90540313720703, "logps/rejected": -48.39791488647461, "loss": 0.3235, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8927410244941711, "rewards/margins": 1.6189066171646118, "rewards/rejected": -0.7261654138565063, "step": 4075 }, { "epoch": 0.41945101264521434, "grad_norm": 27.875, "learning_rate": 6.450765364404843e-07, "logits/chosen": -0.558242678642273, "logits/rejected": -0.6142635941505432, "logps/chosen": -44.79996109008789, "logps/rejected": -56.55199432373047, "loss": 0.3253, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6268162131309509, "rewards/margins": 1.6002193689346313, "rewards/rejected": -0.9734031558036804, "step": 4080 }, { "epoch": 0.4199650457489462, "grad_norm": 25.625, "learning_rate": 6.445053689741832e-07, "logits/chosen": -0.6075018644332886, "logits/rejected": -0.6232036352157593, "logps/chosen": -40.85198211669922, "logps/rejected": -56.321876525878906, "loss": 0.3198, "rewards/accuracies": 0.875, "rewards/chosen": 0.9545613527297974, "rewards/margins": 1.8287633657455444, "rewards/rejected": -0.8742021322250366, "step": 4085 }, { "epoch": 0.4204790788526781, "grad_norm": 27.875, "learning_rate": 6.439342015078822e-07, "logits/chosen": -0.49638763070106506, "logits/rejected": -0.5298558473587036, "logps/chosen": -48.6041145324707, "logps/rejected": -48.79772186279297, "loss": 0.3213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7441240549087524, "rewards/margins": 1.4064223766326904, "rewards/rejected": -0.662298321723938, "step": 4090 }, { "epoch": 0.42099311195641, "grad_norm": 31.25, "learning_rate": 6.43363034041581e-07, "logits/chosen": -0.5922037363052368, "logits/rejected": -0.5228636264801025, "logps/chosen": -41.87727737426758, "logps/rejected": -46.802093505859375, "loss": 0.3165, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7242056131362915, "rewards/margins": 1.0580661296844482, "rewards/rejected": -0.3338605761528015, "step": 4095 }, { "epoch": 0.42150714506014186, "grad_norm": 26.625, "learning_rate": 6.427918665752798e-07, "logits/chosen": -0.5300833582878113, "logits/rejected": -0.5341532826423645, "logps/chosen": -36.6077766418457, "logps/rejected": -52.440696716308594, "loss": 0.2758, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.932401180267334, "rewards/margins": 1.9652656316757202, "rewards/rejected": -1.0328646898269653, "step": 4100 }, { "epoch": 0.42202117816387374, "grad_norm": 50.75, "learning_rate": 6.422206991089787e-07, "logits/chosen": -0.5984728932380676, "logits/rejected": -0.626305341720581, "logps/chosen": -44.79246520996094, "logps/rejected": -51.33040237426758, "loss": 0.3619, "rewards/accuracies": 0.875, "rewards/chosen": 0.507942795753479, "rewards/margins": 1.1865431070327759, "rewards/rejected": -0.6786003708839417, "step": 4105 }, { "epoch": 0.4225352112676056, "grad_norm": 27.375, "learning_rate": 6.416495316426777e-07, "logits/chosen": -0.6211115121841431, "logits/rejected": -0.6464820504188538, "logps/chosen": -36.9209098815918, "logps/rejected": -47.933406829833984, "loss": 0.3354, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7639728784561157, "rewards/margins": 1.40701425075531, "rewards/rejected": -0.6430415511131287, "step": 4110 }, { "epoch": 0.4230492443713375, "grad_norm": 28.0, "learning_rate": 6.410783641763765e-07, "logits/chosen": -0.49926823377609253, "logits/rejected": -0.4990383982658386, "logps/chosen": -37.47130584716797, "logps/rejected": -47.89086151123047, "loss": 0.3289, "rewards/accuracies": 0.875, "rewards/chosen": 0.6116153001785278, "rewards/margins": 1.4427645206451416, "rewards/rejected": -0.8311492800712585, "step": 4115 }, { "epoch": 0.4235632774750694, "grad_norm": 27.375, "learning_rate": 6.405071967100753e-07, "logits/chosen": -0.5483392477035522, "logits/rejected": -0.5541409254074097, "logps/chosen": -34.51565170288086, "logps/rejected": -51.3707389831543, "loss": 0.3359, "rewards/accuracies": 0.875, "rewards/chosen": 0.7399113774299622, "rewards/margins": 1.3431190252304077, "rewards/rejected": -0.6032077074050903, "step": 4120 }, { "epoch": 0.42407731057880127, "grad_norm": 40.5, "learning_rate": 6.399360292437742e-07, "logits/chosen": -0.6230254769325256, "logits/rejected": -0.6450968980789185, "logps/chosen": -39.40861129760742, "logps/rejected": -53.057960510253906, "loss": 0.3116, "rewards/accuracies": 0.875, "rewards/chosen": 0.7735278010368347, "rewards/margins": 1.520806074142456, "rewards/rejected": -0.7472783327102661, "step": 4125 }, { "epoch": 0.42459134368253315, "grad_norm": 32.0, "learning_rate": 6.393648617774732e-07, "logits/chosen": -0.5479233264923096, "logits/rejected": -0.5546449422836304, "logps/chosen": -35.52483367919922, "logps/rejected": -51.997047424316406, "loss": 0.3339, "rewards/accuracies": 0.875, "rewards/chosen": 0.9620732069015503, "rewards/margins": 1.9058462381362915, "rewards/rejected": -0.943773090839386, "step": 4130 }, { "epoch": 0.42510537678626503, "grad_norm": 35.75, "learning_rate": 6.38793694311172e-07, "logits/chosen": -0.6292346715927124, "logits/rejected": -0.6010660529136658, "logps/chosen": -40.91687774658203, "logps/rejected": -47.69294357299805, "loss": 0.304, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6064459085464478, "rewards/margins": 1.3806567192077637, "rewards/rejected": -0.7742108106613159, "step": 4135 }, { "epoch": 0.4256194098899969, "grad_norm": 46.75, "learning_rate": 6.382225268448709e-07, "logits/chosen": -0.5507264733314514, "logits/rejected": -0.5572287440299988, "logps/chosen": -35.31096649169922, "logps/rejected": -49.31682205200195, "loss": 0.3177, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8404159545898438, "rewards/margins": 1.6224418878555298, "rewards/rejected": -0.782025933265686, "step": 4140 }, { "epoch": 0.4261334429937288, "grad_norm": 36.5, "learning_rate": 6.376513593785697e-07, "logits/chosen": -0.5528365969657898, "logits/rejected": -0.5766684412956238, "logps/chosen": -40.324378967285156, "logps/rejected": -54.004234313964844, "loss": 0.308, "rewards/accuracies": 0.875, "rewards/chosen": 0.8833945393562317, "rewards/margins": 1.7839847803115845, "rewards/rejected": -0.9005904197692871, "step": 4145 }, { "epoch": 0.4266474760974607, "grad_norm": 40.0, "learning_rate": 6.370801919122687e-07, "logits/chosen": -0.6005185842514038, "logits/rejected": -0.55133056640625, "logps/chosen": -43.840919494628906, "logps/rejected": -51.023101806640625, "loss": 0.2831, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8229673504829407, "rewards/margins": 1.754066824913025, "rewards/rejected": -0.931099534034729, "step": 4150 }, { "epoch": 0.42716150920119256, "grad_norm": 27.0, "learning_rate": 6.365090244459676e-07, "logits/chosen": -0.5944263935089111, "logits/rejected": -0.6089659929275513, "logps/chosen": -39.966766357421875, "logps/rejected": -54.07178497314453, "loss": 0.331, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.682317852973938, "rewards/margins": 1.554168701171875, "rewards/rejected": -0.8718507885932922, "step": 4155 }, { "epoch": 0.42767554230492444, "grad_norm": 41.0, "learning_rate": 6.359378569796664e-07, "logits/chosen": -0.5410281419754028, "logits/rejected": -0.5432273149490356, "logps/chosen": -36.906715393066406, "logps/rejected": -51.791015625, "loss": 0.2994, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6838253140449524, "rewards/margins": 1.6690304279327393, "rewards/rejected": -0.985205352306366, "step": 4160 }, { "epoch": 0.4281895754086563, "grad_norm": 41.5, "learning_rate": 6.353666895133652e-07, "logits/chosen": -0.6690976023674011, "logits/rejected": -0.6791282892227173, "logps/chosen": -33.51356506347656, "logps/rejected": -50.01358413696289, "loss": 0.325, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9615632891654968, "rewards/margins": 1.5199391841888428, "rewards/rejected": -0.5583759546279907, "step": 4165 }, { "epoch": 0.4287036085123882, "grad_norm": 26.625, "learning_rate": 6.347955220470642e-07, "logits/chosen": -0.5813798308372498, "logits/rejected": -0.5349598526954651, "logps/chosen": -39.517478942871094, "logps/rejected": -49.22647476196289, "loss": 0.3148, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8062032461166382, "rewards/margins": 1.6387361288070679, "rewards/rejected": -0.8325327634811401, "step": 4170 }, { "epoch": 0.4292176416161201, "grad_norm": 32.25, "learning_rate": 6.342243545807631e-07, "logits/chosen": -0.5356937646865845, "logits/rejected": -0.6134769916534424, "logps/chosen": -35.60602569580078, "logps/rejected": -47.85105514526367, "loss": 0.3195, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7530622482299805, "rewards/margins": 1.4984327554702759, "rewards/rejected": -0.7453703880310059, "step": 4175 }, { "epoch": 0.42973167471985196, "grad_norm": 28.5, "learning_rate": 6.336531871144619e-07, "logits/chosen": -0.5855588912963867, "logits/rejected": -0.5813583135604858, "logps/chosen": -40.57073211669922, "logps/rejected": -45.5249137878418, "loss": 0.3229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0053871870040894, "rewards/margins": 1.6431388854980469, "rewards/rejected": -0.6377516984939575, "step": 4180 }, { "epoch": 0.43024570782358385, "grad_norm": 23.0, "learning_rate": 6.330820196481607e-07, "logits/chosen": -0.5963640213012695, "logits/rejected": -0.6416696310043335, "logps/chosen": -37.76520919799805, "logps/rejected": -51.97883987426758, "loss": 0.3121, "rewards/accuracies": 0.875, "rewards/chosen": 0.8772425651550293, "rewards/margins": 1.7683414220809937, "rewards/rejected": -0.8910988569259644, "step": 4185 }, { "epoch": 0.4307597409273157, "grad_norm": 32.0, "learning_rate": 6.325108521818597e-07, "logits/chosen": -0.6274175047874451, "logits/rejected": -0.6123367547988892, "logps/chosen": -38.78569793701172, "logps/rejected": -49.89281463623047, "loss": 0.2751, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9576712846755981, "rewards/margins": 1.486742615699768, "rewards/rejected": -0.5290713906288147, "step": 4190 }, { "epoch": 0.4312737740310476, "grad_norm": 51.25, "learning_rate": 6.319396847155586e-07, "logits/chosen": -0.6284064054489136, "logits/rejected": -0.6900848746299744, "logps/chosen": -43.69279861450195, "logps/rejected": -60.3199462890625, "loss": 0.3281, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7498931288719177, "rewards/margins": 1.8167883157730103, "rewards/rejected": -1.0668952465057373, "step": 4195 }, { "epoch": 0.4317878071347795, "grad_norm": 31.0, "learning_rate": 6.313685172492575e-07, "logits/chosen": -0.5739017724990845, "logits/rejected": -0.5481579899787903, "logps/chosen": -37.211204528808594, "logps/rejected": -52.966941833496094, "loss": 0.3152, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8481265306472778, "rewards/margins": 1.5164109468460083, "rewards/rejected": -0.66828453540802, "step": 4200 }, { "epoch": 0.43230184023851137, "grad_norm": 35.0, "learning_rate": 6.307973497829563e-07, "logits/chosen": -0.5948741436004639, "logits/rejected": -0.6886753439903259, "logps/chosen": -38.603050231933594, "logps/rejected": -51.532814025878906, "loss": 0.2844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1988178491592407, "rewards/margins": 1.988332986831665, "rewards/rejected": -0.7895151376724243, "step": 4205 }, { "epoch": 0.43281587334224325, "grad_norm": 47.5, "learning_rate": 6.302261823166552e-07, "logits/chosen": -0.5564013123512268, "logits/rejected": -0.576689600944519, "logps/chosen": -40.05927658081055, "logps/rejected": -52.786582946777344, "loss": 0.3097, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8646873235702515, "rewards/margins": 1.7842931747436523, "rewards/rejected": -0.9196059107780457, "step": 4210 }, { "epoch": 0.43332990644597513, "grad_norm": 57.75, "learning_rate": 6.296550148503541e-07, "logits/chosen": -0.5547322630882263, "logits/rejected": -0.5810917615890503, "logps/chosen": -38.34539031982422, "logps/rejected": -47.720436096191406, "loss": 0.3141, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9357932209968567, "rewards/margins": 1.498422384262085, "rewards/rejected": -0.5626292824745178, "step": 4215 }, { "epoch": 0.433843939549707, "grad_norm": 40.5, "learning_rate": 6.29083847384053e-07, "logits/chosen": -0.6182348132133484, "logits/rejected": -0.5820120573043823, "logps/chosen": -39.45058059692383, "logps/rejected": -51.59892654418945, "loss": 0.3406, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5689059495925903, "rewards/margins": 1.4275586605072021, "rewards/rejected": -0.8586528897285461, "step": 4220 }, { "epoch": 0.4343579726534389, "grad_norm": 31.625, "learning_rate": 6.285126799177518e-07, "logits/chosen": -0.6029633283615112, "logits/rejected": -0.6050738096237183, "logps/chosen": -39.139564514160156, "logps/rejected": -54.6425895690918, "loss": 0.3228, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.45053282380104065, "rewards/margins": 1.5247523784637451, "rewards/rejected": -1.0742194652557373, "step": 4225 }, { "epoch": 0.4348720057571708, "grad_norm": 30.25, "learning_rate": 6.279415124514507e-07, "logits/chosen": -0.5949763059616089, "logits/rejected": -0.6116283535957336, "logps/chosen": -37.637245178222656, "logps/rejected": -51.003971099853516, "loss": 0.2997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8639335632324219, "rewards/margins": 1.6007654666900635, "rewards/rejected": -0.7368318438529968, "step": 4230 }, { "epoch": 0.43538603886090266, "grad_norm": 31.375, "learning_rate": 6.273703449851496e-07, "logits/chosen": -0.5687511563301086, "logits/rejected": -0.5432795286178589, "logps/chosen": -37.986820220947266, "logps/rejected": -49.99372863769531, "loss": 0.2972, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8014497756958008, "rewards/margins": 1.4348411560058594, "rewards/rejected": -0.6333913207054138, "step": 4235 }, { "epoch": 0.43590007196463454, "grad_norm": 34.25, "learning_rate": 6.267991775188485e-07, "logits/chosen": -0.5774094462394714, "logits/rejected": -0.5949773788452148, "logps/chosen": -35.71200942993164, "logps/rejected": -49.53398895263672, "loss": 0.3208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7976647615432739, "rewards/margins": 1.4390883445739746, "rewards/rejected": -0.6414235830307007, "step": 4240 }, { "epoch": 0.4364141050683664, "grad_norm": 33.5, "learning_rate": 6.262280100525474e-07, "logits/chosen": -0.5747865438461304, "logits/rejected": -0.6333924531936646, "logps/chosen": -43.0257453918457, "logps/rejected": -51.07326126098633, "loss": 0.2959, "rewards/accuracies": 1.0, "rewards/chosen": 0.6045019030570984, "rewards/margins": 1.6025158166885376, "rewards/rejected": -0.9980138540267944, "step": 4245 }, { "epoch": 0.4369281381720983, "grad_norm": 30.375, "learning_rate": 6.256568425862463e-07, "logits/chosen": -0.6819504499435425, "logits/rejected": -0.7005230188369751, "logps/chosen": -39.55046844482422, "logps/rejected": -49.22693634033203, "loss": 0.3051, "rewards/accuracies": 0.875, "rewards/chosen": 0.6774265766143799, "rewards/margins": 1.3206288814544678, "rewards/rejected": -0.6432023644447327, "step": 4250 }, { "epoch": 0.4374421712758302, "grad_norm": 32.25, "learning_rate": 6.250856751199451e-07, "logits/chosen": -0.5431455969810486, "logits/rejected": -0.5573427081108093, "logps/chosen": -46.2855224609375, "logps/rejected": -52.61237335205078, "loss": 0.3168, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4646541178226471, "rewards/margins": 1.5150654315948486, "rewards/rejected": -1.0504114627838135, "step": 4255 }, { "epoch": 0.43795620437956206, "grad_norm": 37.25, "learning_rate": 6.24514507653644e-07, "logits/chosen": -0.5984776020050049, "logits/rejected": -0.6159597039222717, "logps/chosen": -40.282188415527344, "logps/rejected": -53.35393142700195, "loss": 0.2995, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7974872589111328, "rewards/margins": 1.6586227416992188, "rewards/rejected": -0.8611353039741516, "step": 4260 }, { "epoch": 0.43847023748329395, "grad_norm": 38.25, "learning_rate": 6.239433401873429e-07, "logits/chosen": -0.4907870888710022, "logits/rejected": -0.5029420256614685, "logps/chosen": -38.23505783081055, "logps/rejected": -52.042808532714844, "loss": 0.302, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7645503282546997, "rewards/margins": 1.3857781887054443, "rewards/rejected": -0.6212278604507446, "step": 4265 }, { "epoch": 0.4389842705870258, "grad_norm": 38.25, "learning_rate": 6.233721727210418e-07, "logits/chosen": -0.5302327275276184, "logits/rejected": -0.5879431962966919, "logps/chosen": -35.130699157714844, "logps/rejected": -49.79016876220703, "loss": 0.3084, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8364653587341309, "rewards/margins": 1.4961373805999756, "rewards/rejected": -0.6596721410751343, "step": 4270 }, { "epoch": 0.4394983036907577, "grad_norm": 28.875, "learning_rate": 6.228010052547406e-07, "logits/chosen": -0.5436688661575317, "logits/rejected": -0.6283541321754456, "logps/chosen": -36.87068557739258, "logps/rejected": -48.722129821777344, "loss": 0.3087, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7174199819564819, "rewards/margins": 1.337345838546753, "rewards/rejected": -0.6199259757995605, "step": 4275 }, { "epoch": 0.4400123367944896, "grad_norm": 35.5, "learning_rate": 6.222298377884395e-07, "logits/chosen": -0.5802863240242004, "logits/rejected": -0.5699064135551453, "logps/chosen": -44.7894172668457, "logps/rejected": -52.71779251098633, "loss": 0.3203, "rewards/accuracies": 0.875, "rewards/chosen": 0.7726964950561523, "rewards/margins": 1.759415626525879, "rewards/rejected": -0.986719012260437, "step": 4280 }, { "epoch": 0.44052636989822147, "grad_norm": 31.875, "learning_rate": 6.216586703221384e-07, "logits/chosen": -0.5916663408279419, "logits/rejected": -0.6324084401130676, "logps/chosen": -41.62701416015625, "logps/rejected": -51.90961837768555, "loss": 0.3238, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7323563694953918, "rewards/margins": 1.4809930324554443, "rewards/rejected": -0.7486365437507629, "step": 4285 }, { "epoch": 0.44104040300195335, "grad_norm": 40.0, "learning_rate": 6.210875028558374e-07, "logits/chosen": -0.4893699288368225, "logits/rejected": -0.5797719955444336, "logps/chosen": -40.0073356628418, "logps/rejected": -52.25025177001953, "loss": 0.3031, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6185725331306458, "rewards/margins": 1.6288459300994873, "rewards/rejected": -1.0102732181549072, "step": 4290 }, { "epoch": 0.44155443610568523, "grad_norm": 28.75, "learning_rate": 6.205163353895362e-07, "logits/chosen": -0.5726200342178345, "logits/rejected": -0.5598729848861694, "logps/chosen": -39.45686721801758, "logps/rejected": -54.521278381347656, "loss": 0.3063, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6723884344100952, "rewards/margins": 1.7309774160385132, "rewards/rejected": -1.0585887432098389, "step": 4295 }, { "epoch": 0.4420684692094171, "grad_norm": 26.375, "learning_rate": 6.19945167923235e-07, "logits/chosen": -0.5431586503982544, "logits/rejected": -0.5916115045547485, "logps/chosen": -38.61572265625, "logps/rejected": -51.70732498168945, "loss": 0.3627, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7275944948196411, "rewards/margins": 1.4592931270599365, "rewards/rejected": -0.7316985130310059, "step": 4300 }, { "epoch": 0.44258250231314894, "grad_norm": 28.625, "learning_rate": 6.19374000456934e-07, "logits/chosen": -0.6812648773193359, "logits/rejected": -0.6599335074424744, "logps/chosen": -41.85625457763672, "logps/rejected": -51.42687225341797, "loss": 0.3435, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7580803632736206, "rewards/margins": 1.3852115869522095, "rewards/rejected": -0.6271312236785889, "step": 4305 }, { "epoch": 0.4430965354168808, "grad_norm": 29.0, "learning_rate": 6.188028329906329e-07, "logits/chosen": -0.5884179472923279, "logits/rejected": -0.6010836362838745, "logps/chosen": -41.9610481262207, "logps/rejected": -48.56173324584961, "loss": 0.3087, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7528852820396423, "rewards/margins": 1.4971665143966675, "rewards/rejected": -0.7442812919616699, "step": 4310 }, { "epoch": 0.4436105685206127, "grad_norm": 27.75, "learning_rate": 6.182316655243317e-07, "logits/chosen": -0.5773581266403198, "logits/rejected": -0.5845789909362793, "logps/chosen": -36.985572814941406, "logps/rejected": -52.853424072265625, "loss": 0.3183, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9071577191352844, "rewards/margins": 1.7963144779205322, "rewards/rejected": -0.889156699180603, "step": 4315 }, { "epoch": 0.4441246016243446, "grad_norm": 44.25, "learning_rate": 6.176604980580305e-07, "logits/chosen": -0.5967915058135986, "logits/rejected": -0.6519853472709656, "logps/chosen": -39.15655517578125, "logps/rejected": -46.98552703857422, "loss": 0.3627, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7407762408256531, "rewards/margins": 1.0794458389282227, "rewards/rejected": -0.3386695384979248, "step": 4320 }, { "epoch": 0.44463863472807647, "grad_norm": 37.0, "learning_rate": 6.170893305917295e-07, "logits/chosen": -0.45147794485092163, "logits/rejected": -0.5315023064613342, "logps/chosen": -42.19622039794922, "logps/rejected": -54.096275329589844, "loss": 0.305, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8002623319625854, "rewards/margins": 1.7118326425552368, "rewards/rejected": -0.9115701913833618, "step": 4325 }, { "epoch": 0.44515266783180835, "grad_norm": 51.75, "learning_rate": 6.165181631254284e-07, "logits/chosen": -0.5910664200782776, "logits/rejected": -0.5251246094703674, "logps/chosen": -35.765289306640625, "logps/rejected": -47.121673583984375, "loss": 0.3394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0424869060516357, "rewards/margins": 1.7112785577774048, "rewards/rejected": -0.668791651725769, "step": 4330 }, { "epoch": 0.4456667009355402, "grad_norm": 31.125, "learning_rate": 6.159469956591272e-07, "logits/chosen": -0.5384339094161987, "logits/rejected": -0.5183535814285278, "logps/chosen": -39.822853088378906, "logps/rejected": -54.54465866088867, "loss": 0.318, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.67317134141922, "rewards/margins": 1.3303461074829102, "rewards/rejected": -0.6571746468544006, "step": 4335 }, { "epoch": 0.4461807340392721, "grad_norm": 40.5, "learning_rate": 6.15375828192826e-07, "logits/chosen": -0.5386044979095459, "logits/rejected": -0.5659867525100708, "logps/chosen": -41.15480041503906, "logps/rejected": -50.74098587036133, "loss": 0.3143, "rewards/accuracies": 0.875, "rewards/chosen": 0.7864366173744202, "rewards/margins": 1.4057320356369019, "rewards/rejected": -0.6192954778671265, "step": 4340 }, { "epoch": 0.446694767143004, "grad_norm": 29.0, "learning_rate": 6.14804660726525e-07, "logits/chosen": -0.6056324243545532, "logits/rejected": -0.6636101603507996, "logps/chosen": -39.781898498535156, "logps/rejected": -51.34355545043945, "loss": 0.3392, "rewards/accuracies": 0.875, "rewards/chosen": 0.794486403465271, "rewards/margins": 1.5017377138137817, "rewards/rejected": -0.7072511911392212, "step": 4345 }, { "epoch": 0.44720880024673587, "grad_norm": 29.75, "learning_rate": 6.142334932602239e-07, "logits/chosen": -0.5536537170410156, "logits/rejected": -0.5730202198028564, "logps/chosen": -36.68979263305664, "logps/rejected": -54.91718673706055, "loss": 0.3378, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6960805058479309, "rewards/margins": 1.5957015752792358, "rewards/rejected": -0.8996210098266602, "step": 4350 }, { "epoch": 0.44772283335046775, "grad_norm": 33.0, "learning_rate": 6.136623257939228e-07, "logits/chosen": -0.5815585851669312, "logits/rejected": -0.5744980573654175, "logps/chosen": -38.41724395751953, "logps/rejected": -47.91899871826172, "loss": 0.3057, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0051120519638062, "rewards/margins": 1.5785270929336548, "rewards/rejected": -0.5734151005744934, "step": 4355 }, { "epoch": 0.44823686645419963, "grad_norm": 28.875, "learning_rate": 6.130911583276216e-07, "logits/chosen": -0.5419459939002991, "logits/rejected": -0.5467687845230103, "logps/chosen": -33.697364807128906, "logps/rejected": -50.05740737915039, "loss": 0.329, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8008682131767273, "rewards/margins": 1.5886179208755493, "rewards/rejected": -0.787749707698822, "step": 4360 }, { "epoch": 0.4487508995579315, "grad_norm": 30.375, "learning_rate": 6.125199908613205e-07, "logits/chosen": -0.5456799268722534, "logits/rejected": -0.5810462236404419, "logps/chosen": -35.2459831237793, "logps/rejected": -48.55827713012695, "loss": 0.2878, "rewards/accuracies": 0.875, "rewards/chosen": 0.8501240611076355, "rewards/margins": 1.7096967697143555, "rewards/rejected": -0.8595725893974304, "step": 4365 }, { "epoch": 0.4492649326616634, "grad_norm": 28.125, "learning_rate": 6.119488233950194e-07, "logits/chosen": -0.5362230539321899, "logits/rejected": -0.535419225692749, "logps/chosen": -40.28564453125, "logps/rejected": -51.51750564575195, "loss": 0.3339, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8633099794387817, "rewards/margins": 1.6793978214263916, "rewards/rejected": -0.8160877227783203, "step": 4370 }, { "epoch": 0.4497789657653953, "grad_norm": 35.25, "learning_rate": 6.113776559287183e-07, "logits/chosen": -0.6351960897445679, "logits/rejected": -0.6153637170791626, "logps/chosen": -37.918174743652344, "logps/rejected": -51.07325744628906, "loss": 0.3344, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.4626513421535492, "rewards/margins": 1.2797753810882568, "rewards/rejected": -0.81712406873703, "step": 4375 }, { "epoch": 0.45029299886912716, "grad_norm": 35.75, "learning_rate": 6.108064884624171e-07, "logits/chosen": -0.515992283821106, "logits/rejected": -0.5592582821846008, "logps/chosen": -41.9954948425293, "logps/rejected": -55.90415573120117, "loss": 0.3214, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7995100617408752, "rewards/margins": 1.704737901687622, "rewards/rejected": -0.9052278399467468, "step": 4380 }, { "epoch": 0.45080703197285904, "grad_norm": 38.5, "learning_rate": 6.10235320996116e-07, "logits/chosen": -0.5426300764083862, "logits/rejected": -0.5580174326896667, "logps/chosen": -39.321205139160156, "logps/rejected": -50.97123336791992, "loss": 0.3221, "rewards/accuracies": 0.875, "rewards/chosen": 0.8802615404129028, "rewards/margins": 1.4870779514312744, "rewards/rejected": -0.6068164110183716, "step": 4385 }, { "epoch": 0.4513210650765909, "grad_norm": 30.75, "learning_rate": 6.096641535298149e-07, "logits/chosen": -0.561479926109314, "logits/rejected": -0.5484353303909302, "logps/chosen": -36.61927032470703, "logps/rejected": -48.82826614379883, "loss": 0.3049, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8902942538261414, "rewards/margins": 1.7594152688980103, "rewards/rejected": -0.8691209554672241, "step": 4390 }, { "epoch": 0.4518350981803228, "grad_norm": 28.125, "learning_rate": 6.090929860635138e-07, "logits/chosen": -0.548780620098114, "logits/rejected": -0.6149802207946777, "logps/chosen": -32.32288360595703, "logps/rejected": -46.199581146240234, "loss": 0.3251, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9166557192802429, "rewards/margins": 1.784996747970581, "rewards/rejected": -0.8683410882949829, "step": 4395 }, { "epoch": 0.4523491312840547, "grad_norm": 29.625, "learning_rate": 6.085218185972127e-07, "logits/chosen": -0.6166483759880066, "logits/rejected": -0.568450927734375, "logps/chosen": -38.06099319458008, "logps/rejected": -51.15806579589844, "loss": 0.297, "rewards/accuracies": 1.0, "rewards/chosen": 0.7907224893569946, "rewards/margins": 1.8739427328109741, "rewards/rejected": -1.08322012424469, "step": 4400 }, { "epoch": 0.45286316438778657, "grad_norm": 34.5, "learning_rate": 6.079506511309116e-07, "logits/chosen": -0.5794674754142761, "logits/rejected": -0.580804705619812, "logps/chosen": -47.28245162963867, "logps/rejected": -55.64631271362305, "loss": 0.3086, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8893535733222961, "rewards/margins": 1.732100248336792, "rewards/rejected": -0.842746913433075, "step": 4405 }, { "epoch": 0.45337719749151845, "grad_norm": 25.5, "learning_rate": 6.073794836646104e-07, "logits/chosen": -0.4965120255947113, "logits/rejected": -0.5088602304458618, "logps/chosen": -37.66535949707031, "logps/rejected": -51.257347106933594, "loss": 0.3384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8331076502799988, "rewards/margins": 1.4644721746444702, "rewards/rejected": -0.6313644647598267, "step": 4410 }, { "epoch": 0.45389123059525033, "grad_norm": 29.625, "learning_rate": 6.068083161983093e-07, "logits/chosen": -0.5149792432785034, "logits/rejected": -0.5786292552947998, "logps/chosen": -41.2970085144043, "logps/rejected": -50.4326286315918, "loss": 0.2745, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9665635228157043, "rewards/margins": 1.7013708353042603, "rewards/rejected": -0.7348071932792664, "step": 4415 }, { "epoch": 0.4544052636989822, "grad_norm": 25.125, "learning_rate": 6.062371487320082e-07, "logits/chosen": -0.611458957195282, "logits/rejected": -0.6570644378662109, "logps/chosen": -40.691558837890625, "logps/rejected": -49.86210250854492, "loss": 0.338, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6466978192329407, "rewards/margins": 1.4538214206695557, "rewards/rejected": -0.8071235418319702, "step": 4420 }, { "epoch": 0.4549192968027141, "grad_norm": 31.125, "learning_rate": 6.056659812657071e-07, "logits/chosen": -0.5300582647323608, "logits/rejected": -0.5980950593948364, "logps/chosen": -36.7254753112793, "logps/rejected": -49.04033279418945, "loss": 0.3171, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.813275158405304, "rewards/margins": 1.7810676097869873, "rewards/rejected": -0.9677923917770386, "step": 4425 }, { "epoch": 0.45543332990644597, "grad_norm": 30.0, "learning_rate": 6.050948137994059e-07, "logits/chosen": -0.5444976091384888, "logits/rejected": -0.6119160056114197, "logps/chosen": -33.16553497314453, "logps/rejected": -47.133628845214844, "loss": 0.3476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7186273336410522, "rewards/margins": 1.436830759048462, "rewards/rejected": -0.7182033658027649, "step": 4430 }, { "epoch": 0.45594736301017785, "grad_norm": 35.5, "learning_rate": 6.045236463331048e-07, "logits/chosen": -0.6378283500671387, "logits/rejected": -0.7222896814346313, "logps/chosen": -39.226898193359375, "logps/rejected": -53.554840087890625, "loss": 0.3202, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7277973890304565, "rewards/margins": 1.5653009414672852, "rewards/rejected": -0.8375035524368286, "step": 4435 }, { "epoch": 0.45646139611390973, "grad_norm": 46.25, "learning_rate": 6.039524788668037e-07, "logits/chosen": -0.5240603685379028, "logits/rejected": -0.5911895036697388, "logps/chosen": -42.222286224365234, "logps/rejected": -53.223243713378906, "loss": 0.3553, "rewards/accuracies": 0.875, "rewards/chosen": 0.526236355304718, "rewards/margins": 1.36164391040802, "rewards/rejected": -0.8354076147079468, "step": 4440 }, { "epoch": 0.4569754292176416, "grad_norm": 25.625, "learning_rate": 6.033813114005027e-07, "logits/chosen": -0.53724205493927, "logits/rejected": -0.5402536988258362, "logps/chosen": -37.243377685546875, "logps/rejected": -50.93682098388672, "loss": 0.3135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0381003618240356, "rewards/margins": 1.6548645496368408, "rewards/rejected": -0.6167644262313843, "step": 4445 }, { "epoch": 0.4574894623213735, "grad_norm": 31.375, "learning_rate": 6.028101439342015e-07, "logits/chosen": -0.5730106830596924, "logits/rejected": -0.5959600210189819, "logps/chosen": -38.034339904785156, "logps/rejected": -51.36229705810547, "loss": 0.3321, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9880134463310242, "rewards/margins": 1.6573445796966553, "rewards/rejected": -0.6693311929702759, "step": 4450 }, { "epoch": 0.4580034954251054, "grad_norm": 95.0, "learning_rate": 6.022389764679003e-07, "logits/chosen": -0.5551769733428955, "logits/rejected": -0.5826773643493652, "logps/chosen": -40.222007751464844, "logps/rejected": -53.82122039794922, "loss": 0.3305, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7561420202255249, "rewards/margins": 1.7238740921020508, "rewards/rejected": -0.9677318334579468, "step": 4455 }, { "epoch": 0.45851752852883726, "grad_norm": 35.0, "learning_rate": 6.016678090015992e-07, "logits/chosen": -0.5456276535987854, "logits/rejected": -0.5883886814117432, "logps/chosen": -37.810935974121094, "logps/rejected": -53.5661735534668, "loss": 0.3314, "rewards/accuracies": 0.875, "rewards/chosen": 0.7197543978691101, "rewards/margins": 1.2966618537902832, "rewards/rejected": -0.5769074559211731, "step": 4460 }, { "epoch": 0.45903156163256914, "grad_norm": 26.625, "learning_rate": 6.010966415352982e-07, "logits/chosen": -0.5144246816635132, "logits/rejected": -0.500751256942749, "logps/chosen": -39.54693603515625, "logps/rejected": -55.740623474121094, "loss": 0.3276, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7781096696853638, "rewards/margins": 1.7080281972885132, "rewards/rejected": -0.9299185872077942, "step": 4465 }, { "epoch": 0.459545594736301, "grad_norm": 26.75, "learning_rate": 6.00525474068997e-07, "logits/chosen": -0.5655888319015503, "logits/rejected": -0.5991637110710144, "logps/chosen": -39.188568115234375, "logps/rejected": -54.15327072143555, "loss": 0.3416, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6279494762420654, "rewards/margins": 1.560179591178894, "rewards/rejected": -0.9322302937507629, "step": 4470 }, { "epoch": 0.4600596278400329, "grad_norm": 26.375, "learning_rate": 5.999543066026958e-07, "logits/chosen": -0.5459197759628296, "logits/rejected": -0.5908251404762268, "logps/chosen": -39.82365798950195, "logps/rejected": -53.40513229370117, "loss": 0.2946, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6100439429283142, "rewards/margins": 1.5645025968551636, "rewards/rejected": -0.9544585943222046, "step": 4475 }, { "epoch": 0.4605736609437648, "grad_norm": 32.0, "learning_rate": 5.993831391363947e-07, "logits/chosen": -0.6087994575500488, "logits/rejected": -0.6050242185592651, "logps/chosen": -36.08081817626953, "logps/rejected": -52.36199188232422, "loss": 0.3083, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7163437008857727, "rewards/margins": 1.330278754234314, "rewards/rejected": -0.613935112953186, "step": 4480 }, { "epoch": 0.46108769404749667, "grad_norm": 40.0, "learning_rate": 5.988119716700937e-07, "logits/chosen": -0.5704541802406311, "logits/rejected": -0.5654691457748413, "logps/chosen": -36.44591522216797, "logps/rejected": -49.95158767700195, "loss": 0.323, "rewards/accuracies": 1.0, "rewards/chosen": 0.9402214288711548, "rewards/margins": 1.7786500453948975, "rewards/rejected": -0.8384286761283875, "step": 4485 }, { "epoch": 0.46160172715122855, "grad_norm": 35.25, "learning_rate": 5.982408042037926e-07, "logits/chosen": -0.6740047931671143, "logits/rejected": -0.6998353600502014, "logps/chosen": -37.918182373046875, "logps/rejected": -50.45881652832031, "loss": 0.3189, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0121105909347534, "rewards/margins": 1.4823033809661865, "rewards/rejected": -0.47019290924072266, "step": 4490 }, { "epoch": 0.46211576025496043, "grad_norm": 28.875, "learning_rate": 5.976696367374913e-07, "logits/chosen": -0.4983064532279968, "logits/rejected": -0.4911865293979645, "logps/chosen": -38.42607879638672, "logps/rejected": -47.016029357910156, "loss": 0.3389, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7297395467758179, "rewards/margins": 1.5009777545928955, "rewards/rejected": -0.7712382078170776, "step": 4495 }, { "epoch": 0.4626297933586923, "grad_norm": 28.375, "learning_rate": 5.970984692711902e-07, "logits/chosen": -0.5989484786987305, "logits/rejected": -0.6282272338867188, "logps/chosen": -39.24018478393555, "logps/rejected": -50.19068908691406, "loss": 0.3293, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8064171671867371, "rewards/margins": 1.2744481563568115, "rewards/rejected": -0.46803098917007446, "step": 4500 }, { "epoch": 0.4631438264624242, "grad_norm": 35.0, "learning_rate": 5.965273018048892e-07, "logits/chosen": -0.5552816390991211, "logits/rejected": -0.6147178411483765, "logps/chosen": -36.50630187988281, "logps/rejected": -51.19809341430664, "loss": 0.2976, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9618179202079773, "rewards/margins": 1.8785728216171265, "rewards/rejected": -0.9167549014091492, "step": 4505 }, { "epoch": 0.46365785956615607, "grad_norm": 29.0, "learning_rate": 5.959561343385881e-07, "logits/chosen": -0.6239017248153687, "logits/rejected": -0.6134893894195557, "logps/chosen": -36.52698516845703, "logps/rejected": -48.226173400878906, "loss": 0.3032, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2810461521148682, "rewards/margins": 1.6989961862564087, "rewards/rejected": -0.4179500639438629, "step": 4510 }, { "epoch": 0.46417189266988795, "grad_norm": 26.0, "learning_rate": 5.953849668722869e-07, "logits/chosen": -0.5924614667892456, "logits/rejected": -0.6706196069717407, "logps/chosen": -37.17783737182617, "logps/rejected": -53.412200927734375, "loss": 0.3305, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7840844988822937, "rewards/margins": 1.44635009765625, "rewards/rejected": -0.6622655987739563, "step": 4515 }, { "epoch": 0.46468592577361983, "grad_norm": 31.875, "learning_rate": 5.948137994059858e-07, "logits/chosen": -0.5540779232978821, "logits/rejected": -0.6209492087364197, "logps/chosen": -38.11979675292969, "logps/rejected": -48.20746994018555, "loss": 0.3385, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8069054484367371, "rewards/margins": 1.600377082824707, "rewards/rejected": -0.7934715151786804, "step": 4520 }, { "epoch": 0.4651999588773517, "grad_norm": 29.5, "learning_rate": 5.942426319396847e-07, "logits/chosen": -0.49029144644737244, "logits/rejected": -0.5308659076690674, "logps/chosen": -35.489524841308594, "logps/rejected": -49.41027069091797, "loss": 0.3656, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.704103946685791, "rewards/margins": 1.582463264465332, "rewards/rejected": -0.8783591985702515, "step": 4525 }, { "epoch": 0.4657139919810836, "grad_norm": 47.25, "learning_rate": 5.936714644733836e-07, "logits/chosen": -0.4993600845336914, "logits/rejected": -0.5521739721298218, "logps/chosen": -33.77183151245117, "logps/rejected": -53.44072723388672, "loss": 0.2909, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.901269793510437, "rewards/margins": 1.9131431579589844, "rewards/rejected": -1.011873483657837, "step": 4530 }, { "epoch": 0.4662280250848155, "grad_norm": 47.25, "learning_rate": 5.931002970070825e-07, "logits/chosen": -0.5721731185913086, "logits/rejected": -0.596156895160675, "logps/chosen": -41.596641540527344, "logps/rejected": -54.039703369140625, "loss": 0.3148, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7460943460464478, "rewards/margins": 1.752498984336853, "rewards/rejected": -1.0064046382904053, "step": 4535 }, { "epoch": 0.46674205818854736, "grad_norm": 38.0, "learning_rate": 5.925291295407813e-07, "logits/chosen": -0.5506832599639893, "logits/rejected": -0.49562016129493713, "logps/chosen": -40.38154983520508, "logps/rejected": -52.68302536010742, "loss": 0.3517, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5161409378051758, "rewards/margins": 1.320976734161377, "rewards/rejected": -0.8048356771469116, "step": 4540 }, { "epoch": 0.46725609129227924, "grad_norm": 40.75, "learning_rate": 5.919579620744802e-07, "logits/chosen": -0.6318832635879517, "logits/rejected": -0.6521024703979492, "logps/chosen": -39.17869186401367, "logps/rejected": -56.17116165161133, "loss": 0.3265, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8128241300582886, "rewards/margins": 1.6120554208755493, "rewards/rejected": -0.799231231212616, "step": 4545 }, { "epoch": 0.4677701243960111, "grad_norm": 27.625, "learning_rate": 5.913867946081791e-07, "logits/chosen": -0.4979531168937683, "logits/rejected": -0.5953036546707153, "logps/chosen": -29.397159576416016, "logps/rejected": -49.61827850341797, "loss": 0.3314, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9125570058822632, "rewards/margins": 1.5365467071533203, "rewards/rejected": -0.6239895224571228, "step": 4550 }, { "epoch": 0.468284157499743, "grad_norm": 51.5, "learning_rate": 5.90815627141878e-07, "logits/chosen": -0.5236162543296814, "logits/rejected": -0.5803195238113403, "logps/chosen": -36.490333557128906, "logps/rejected": -48.768043518066406, "loss": 0.3746, "rewards/accuracies": 0.875, "rewards/chosen": 0.7090279459953308, "rewards/margins": 1.3941229581832886, "rewards/rejected": -0.6850950717926025, "step": 4555 }, { "epoch": 0.4687981906034749, "grad_norm": 35.5, "learning_rate": 5.902444596755769e-07, "logits/chosen": -0.5992729663848877, "logits/rejected": -0.5629161596298218, "logps/chosen": -32.34125518798828, "logps/rejected": -48.99491500854492, "loss": 0.3322, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0916731357574463, "rewards/margins": 1.9375555515289307, "rewards/rejected": -0.8458824157714844, "step": 4560 }, { "epoch": 0.46931222370720677, "grad_norm": 32.5, "learning_rate": 5.896732922092757e-07, "logits/chosen": -0.3898838460445404, "logits/rejected": -0.47744789719581604, "logps/chosen": -34.869728088378906, "logps/rejected": -44.12540817260742, "loss": 0.3413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4919314980506897, "rewards/margins": 1.0336525440216064, "rewards/rejected": -0.5417210459709167, "step": 4565 }, { "epoch": 0.46982625681093865, "grad_norm": 54.0, "learning_rate": 5.891021247429746e-07, "logits/chosen": -0.5758165121078491, "logits/rejected": -0.6388617753982544, "logps/chosen": -32.978782653808594, "logps/rejected": -52.0555534362793, "loss": 0.3327, "rewards/accuracies": 0.875, "rewards/chosen": 0.8214523196220398, "rewards/margins": 1.4690953493118286, "rewards/rejected": -0.6476432085037231, "step": 4570 }, { "epoch": 0.47034028991467053, "grad_norm": 43.75, "learning_rate": 5.885309572766735e-07, "logits/chosen": -0.5082734823226929, "logits/rejected": -0.5370305776596069, "logps/chosen": -41.891822814941406, "logps/rejected": -49.898921966552734, "loss": 0.3718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5700399875640869, "rewards/margins": 1.0430757999420166, "rewards/rejected": -0.4730357229709625, "step": 4575 }, { "epoch": 0.4708543230184024, "grad_norm": 25.125, "learning_rate": 5.879597898103725e-07, "logits/chosen": -0.5876695513725281, "logits/rejected": -0.5919374227523804, "logps/chosen": -42.18628692626953, "logps/rejected": -52.259788513183594, "loss": 0.2981, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8672122955322266, "rewards/margins": 1.7026252746582031, "rewards/rejected": -0.835412859916687, "step": 4580 }, { "epoch": 0.4713683561221343, "grad_norm": 38.0, "learning_rate": 5.873886223440712e-07, "logits/chosen": -0.5568527579307556, "logits/rejected": -0.5627206563949585, "logps/chosen": -39.943931579589844, "logps/rejected": -53.9622688293457, "loss": 0.3373, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5087813138961792, "rewards/margins": 1.4587815999984741, "rewards/rejected": -0.9500001668930054, "step": 4585 }, { "epoch": 0.4718823892258662, "grad_norm": 34.75, "learning_rate": 5.868174548777701e-07, "logits/chosen": -0.557476818561554, "logits/rejected": -0.6024687886238098, "logps/chosen": -36.707149505615234, "logps/rejected": -51.808197021484375, "loss": 0.3231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9242738485336304, "rewards/margins": 1.5034087896347046, "rewards/rejected": -0.5791350603103638, "step": 4590 }, { "epoch": 0.47239642232959805, "grad_norm": 31.75, "learning_rate": 5.86246287411469e-07, "logits/chosen": -0.6312499046325684, "logits/rejected": -0.6035939455032349, "logps/chosen": -35.32865524291992, "logps/rejected": -45.635005950927734, "loss": 0.3075, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.933014988899231, "rewards/margins": 1.7173633575439453, "rewards/rejected": -0.7843484282493591, "step": 4595 }, { "epoch": 0.4729104554333299, "grad_norm": 54.75, "learning_rate": 5.85675119945168e-07, "logits/chosen": -0.5503040552139282, "logits/rejected": -0.6474310159683228, "logps/chosen": -36.04220199584961, "logps/rejected": -53.93280029296875, "loss": 0.3379, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8580256700515747, "rewards/margins": 1.8926680088043213, "rewards/rejected": -1.0346423387527466, "step": 4600 }, { "epoch": 0.47342448853706176, "grad_norm": 30.0, "learning_rate": 5.851039524788667e-07, "logits/chosen": -0.5747354030609131, "logits/rejected": -0.5674922466278076, "logps/chosen": -43.01982879638672, "logps/rejected": -56.539588928222656, "loss": 0.3265, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8249856233596802, "rewards/margins": 1.5940641164779663, "rewards/rejected": -0.7690785527229309, "step": 4605 }, { "epoch": 0.47393852164079364, "grad_norm": 25.875, "learning_rate": 5.845327850125656e-07, "logits/chosen": -0.56260085105896, "logits/rejected": -0.6130474805831909, "logps/chosen": -36.19697952270508, "logps/rejected": -51.33974075317383, "loss": 0.3544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.690065860748291, "rewards/margins": 1.4306981563568115, "rewards/rejected": -0.7406323552131653, "step": 4610 }, { "epoch": 0.4744525547445255, "grad_norm": 30.75, "learning_rate": 5.839616175462645e-07, "logits/chosen": -0.5234600305557251, "logits/rejected": -0.5046413540840149, "logps/chosen": -41.39293670654297, "logps/rejected": -57.034576416015625, "loss": 0.3024, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9435309171676636, "rewards/margins": 2.0572776794433594, "rewards/rejected": -1.1137467622756958, "step": 4615 }, { "epoch": 0.4749665878482574, "grad_norm": 23.5, "learning_rate": 5.833904500799635e-07, "logits/chosen": -0.607464075088501, "logits/rejected": -0.6052636504173279, "logps/chosen": -36.778587341308594, "logps/rejected": -54.605438232421875, "loss": 0.279, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8875362277030945, "rewards/margins": 1.6256376504898071, "rewards/rejected": -0.7381013035774231, "step": 4620 }, { "epoch": 0.4754806209519893, "grad_norm": 32.75, "learning_rate": 5.828192826136624e-07, "logits/chosen": -0.484381765127182, "logits/rejected": -0.4879641532897949, "logps/chosen": -39.54080581665039, "logps/rejected": -47.31566619873047, "loss": 0.2795, "rewards/accuracies": 0.875, "rewards/chosen": 0.7553143501281738, "rewards/margins": 1.5005743503570557, "rewards/rejected": -0.7452598810195923, "step": 4625 }, { "epoch": 0.47599465405572117, "grad_norm": 34.5, "learning_rate": 5.822481151473611e-07, "logits/chosen": -0.5947538614273071, "logits/rejected": -0.5962611436843872, "logps/chosen": -37.76666259765625, "logps/rejected": -55.156578063964844, "loss": 0.302, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8412095904350281, "rewards/margins": 1.6991266012191772, "rewards/rejected": -0.8579171299934387, "step": 4630 }, { "epoch": 0.47650868715945305, "grad_norm": 24.625, "learning_rate": 5.8167694768106e-07, "logits/chosen": -0.5962530970573425, "logits/rejected": -0.5543729066848755, "logps/chosen": -34.96152114868164, "logps/rejected": -49.98963928222656, "loss": 0.3283, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9239694476127625, "rewards/margins": 1.9131975173950195, "rewards/rejected": -0.9892279505729675, "step": 4635 }, { "epoch": 0.47702272026318493, "grad_norm": 42.25, "learning_rate": 5.81105780214759e-07, "logits/chosen": -0.5448412299156189, "logits/rejected": -0.522483229637146, "logps/chosen": -38.179283142089844, "logps/rejected": -50.01634979248047, "loss": 0.3474, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9991272687911987, "rewards/margins": 1.935241460800171, "rewards/rejected": -0.9361141920089722, "step": 4640 }, { "epoch": 0.4775367533669168, "grad_norm": 46.0, "learning_rate": 5.805346127484579e-07, "logits/chosen": -0.5021577477455139, "logits/rejected": -0.5308799147605896, "logps/chosen": -37.18758773803711, "logps/rejected": -50.65886306762695, "loss": 0.3319, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6450022459030151, "rewards/margins": 1.56234872341156, "rewards/rejected": -0.9173463582992554, "step": 4645 }, { "epoch": 0.4780507864706487, "grad_norm": 25.75, "learning_rate": 5.799634452821566e-07, "logits/chosen": -0.5593081116676331, "logits/rejected": -0.5442793965339661, "logps/chosen": -39.67371368408203, "logps/rejected": -50.4030647277832, "loss": 0.303, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7234148383140564, "rewards/margins": 1.3729357719421387, "rewards/rejected": -0.6495209336280823, "step": 4650 }, { "epoch": 0.4785648195743806, "grad_norm": 37.25, "learning_rate": 5.793922778158555e-07, "logits/chosen": -0.5677477121353149, "logits/rejected": -0.5993643999099731, "logps/chosen": -35.897621154785156, "logps/rejected": -50.5791130065918, "loss": 0.3303, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7615023255348206, "rewards/margins": 1.22488272190094, "rewards/rejected": -0.4633803963661194, "step": 4655 }, { "epoch": 0.47907885267811245, "grad_norm": 30.0, "learning_rate": 5.788211103495545e-07, "logits/chosen": -0.544178307056427, "logits/rejected": -0.5799872279167175, "logps/chosen": -39.8201789855957, "logps/rejected": -51.3090705871582, "loss": 0.2948, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8174932599067688, "rewards/margins": 1.740387201309204, "rewards/rejected": -0.9228940010070801, "step": 4660 }, { "epoch": 0.47959288578184434, "grad_norm": 31.875, "learning_rate": 5.782499428832534e-07, "logits/chosen": -0.5736032128334045, "logits/rejected": -0.6252381801605225, "logps/chosen": -44.08926773071289, "logps/rejected": -56.0831413269043, "loss": 0.3287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9170526266098022, "rewards/margins": 1.9167019128799438, "rewards/rejected": -0.9996496438980103, "step": 4665 }, { "epoch": 0.4801069188855762, "grad_norm": 33.5, "learning_rate": 5.776787754169522e-07, "logits/chosen": -0.5540226101875305, "logits/rejected": -0.5882223844528198, "logps/chosen": -37.2564697265625, "logps/rejected": -55.53739547729492, "loss": 0.3251, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6797583699226379, "rewards/margins": 1.4128779172897339, "rewards/rejected": -0.7331196069717407, "step": 4670 }, { "epoch": 0.4806209519893081, "grad_norm": 30.5, "learning_rate": 5.77107607950651e-07, "logits/chosen": -0.5412539839744568, "logits/rejected": -0.4840214252471924, "logps/chosen": -34.3412971496582, "logps/rejected": -54.8447151184082, "loss": 0.3149, "rewards/accuracies": 0.875, "rewards/chosen": 0.4158998429775238, "rewards/margins": 1.5511486530303955, "rewards/rejected": -1.1352488994598389, "step": 4675 }, { "epoch": 0.48113498509304, "grad_norm": 40.25, "learning_rate": 5.7653644048435e-07, "logits/chosen": -0.578429102897644, "logits/rejected": -0.5521516799926758, "logps/chosen": -39.98967742919922, "logps/rejected": -50.9090576171875, "loss": 0.3162, "rewards/accuracies": 0.875, "rewards/chosen": 0.6619094610214233, "rewards/margins": 1.6575496196746826, "rewards/rejected": -0.9956402778625488, "step": 4680 }, { "epoch": 0.48164901819677186, "grad_norm": 40.25, "learning_rate": 5.759652730180489e-07, "logits/chosen": -0.5545605421066284, "logits/rejected": -0.5381421446800232, "logps/chosen": -34.005470275878906, "logps/rejected": -47.77116012573242, "loss": 0.3196, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7512675523757935, "rewards/margins": 1.6371123790740967, "rewards/rejected": -0.8858447074890137, "step": 4685 }, { "epoch": 0.48216305130050374, "grad_norm": 30.125, "learning_rate": 5.753941055517478e-07, "logits/chosen": -0.49456268548965454, "logits/rejected": -0.5506514310836792, "logps/chosen": -42.416316986083984, "logps/rejected": -47.546356201171875, "loss": 0.3012, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5817815065383911, "rewards/margins": 1.2385194301605225, "rewards/rejected": -0.6567379236221313, "step": 4690 }, { "epoch": 0.4826770844042356, "grad_norm": 33.75, "learning_rate": 5.748229380854465e-07, "logits/chosen": -0.49861210584640503, "logits/rejected": -0.5335687398910522, "logps/chosen": -31.335363388061523, "logps/rejected": -48.64970397949219, "loss": 0.315, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8525902032852173, "rewards/margins": 1.725285291671753, "rewards/rejected": -0.8726948499679565, "step": 4695 }, { "epoch": 0.4831911175079675, "grad_norm": 24.25, "learning_rate": 5.742517706191455e-07, "logits/chosen": -0.613906741142273, "logits/rejected": -0.5871154069900513, "logps/chosen": -40.606422424316406, "logps/rejected": -53.32373809814453, "loss": 0.2911, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8667446374893188, "rewards/margins": 1.8685849905014038, "rewards/rejected": -1.001840591430664, "step": 4700 }, { "epoch": 0.4837051506116994, "grad_norm": 28.625, "learning_rate": 5.736806031528444e-07, "logits/chosen": -0.6162980794906616, "logits/rejected": -0.626346230506897, "logps/chosen": -43.69805908203125, "logps/rejected": -53.38178253173828, "loss": 0.342, "rewards/accuracies": 0.875, "rewards/chosen": 0.7523333430290222, "rewards/margins": 1.5076649188995361, "rewards/rejected": -0.7553316354751587, "step": 4705 }, { "epoch": 0.48421918371543127, "grad_norm": 39.25, "learning_rate": 5.731094356865433e-07, "logits/chosen": -0.5191125869750977, "logits/rejected": -0.5637722015380859, "logps/chosen": -41.48810958862305, "logps/rejected": -47.290435791015625, "loss": 0.3423, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6817604303359985, "rewards/margins": 1.1398839950561523, "rewards/rejected": -0.4581235945224762, "step": 4710 }, { "epoch": 0.48473321681916315, "grad_norm": 33.75, "learning_rate": 5.725382682202422e-07, "logits/chosen": -0.5849486589431763, "logits/rejected": -0.589453935623169, "logps/chosen": -43.414180755615234, "logps/rejected": -52.489776611328125, "loss": 0.3082, "rewards/accuracies": 0.875, "rewards/chosen": 0.7238918542861938, "rewards/margins": 1.5159670114517212, "rewards/rejected": -0.7920752167701721, "step": 4715 }, { "epoch": 0.48524724992289503, "grad_norm": 24.75, "learning_rate": 5.71967100753941e-07, "logits/chosen": -0.5569621324539185, "logits/rejected": -0.567907989025116, "logps/chosen": -41.78049850463867, "logps/rejected": -52.872581481933594, "loss": 0.3142, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6386259198188782, "rewards/margins": 1.4928687810897827, "rewards/rejected": -0.8542429208755493, "step": 4720 }, { "epoch": 0.4857612830266269, "grad_norm": 31.625, "learning_rate": 5.713959332876399e-07, "logits/chosen": -0.601215660572052, "logits/rejected": -0.6247516870498657, "logps/chosen": -35.363441467285156, "logps/rejected": -50.784820556640625, "loss": 0.3039, "rewards/accuracies": 0.875, "rewards/chosen": 0.6312280893325806, "rewards/margins": 1.4005558490753174, "rewards/rejected": -0.7693275213241577, "step": 4725 }, { "epoch": 0.4862753161303588, "grad_norm": 28.125, "learning_rate": 5.708247658213388e-07, "logits/chosen": -0.4945266842842102, "logits/rejected": -0.5469409227371216, "logps/chosen": -37.32204818725586, "logps/rejected": -49.414344787597656, "loss": 0.3176, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8604732751846313, "rewards/margins": 1.4764786958694458, "rewards/rejected": -0.616005539894104, "step": 4730 }, { "epoch": 0.4867893492340907, "grad_norm": 36.0, "learning_rate": 5.702535983550378e-07, "logits/chosen": -0.5800634622573853, "logits/rejected": -0.5180038213729858, "logps/chosen": -38.96940231323242, "logps/rejected": -49.08179473876953, "loss": 0.3448, "rewards/accuracies": 0.875, "rewards/chosen": 0.8150889277458191, "rewards/margins": 1.465802550315857, "rewards/rejected": -0.6507137417793274, "step": 4735 }, { "epoch": 0.48730338233782255, "grad_norm": 52.75, "learning_rate": 5.696824308887365e-07, "logits/chosen": -0.5264400243759155, "logits/rejected": -0.5974029302597046, "logps/chosen": -35.152565002441406, "logps/rejected": -46.77347946166992, "loss": 0.353, "rewards/accuracies": 0.875, "rewards/chosen": 0.7015331387519836, "rewards/margins": 1.1712067127227783, "rewards/rejected": -0.4696734547615051, "step": 4740 }, { "epoch": 0.48781741544155444, "grad_norm": 29.0, "learning_rate": 5.691112634224354e-07, "logits/chosen": -0.5171574950218201, "logits/rejected": -0.5627057552337646, "logps/chosen": -34.33367156982422, "logps/rejected": -47.1736946105957, "loss": 0.3014, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6627338528633118, "rewards/margins": 1.6277039051055908, "rewards/rejected": -0.9649699926376343, "step": 4745 }, { "epoch": 0.4883314485452863, "grad_norm": 45.75, "learning_rate": 5.685400959561343e-07, "logits/chosen": -0.42184367775917053, "logits/rejected": -0.5207546353340149, "logps/chosen": -37.866172790527344, "logps/rejected": -55.77153396606445, "loss": 0.3387, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7408025860786438, "rewards/margins": 1.6200870275497437, "rewards/rejected": -0.8792842626571655, "step": 4750 }, { "epoch": 0.4888454816490182, "grad_norm": 35.75, "learning_rate": 5.679689284898333e-07, "logits/chosen": -0.4467403292655945, "logits/rejected": -0.4584035873413086, "logps/chosen": -35.39793395996094, "logps/rejected": -46.02361297607422, "loss": 0.3531, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6507924795150757, "rewards/margins": 1.2570159435272217, "rewards/rejected": -0.6062236428260803, "step": 4755 }, { "epoch": 0.4893595147527501, "grad_norm": 46.0, "learning_rate": 5.67397761023532e-07, "logits/chosen": -0.5951410531997681, "logits/rejected": -0.5586241483688354, "logps/chosen": -39.91360855102539, "logps/rejected": -53.00099563598633, "loss": 0.3273, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8419879674911499, "rewards/margins": 1.7835266590118408, "rewards/rejected": -0.9415383338928223, "step": 4760 }, { "epoch": 0.48987354785648196, "grad_norm": 38.5, "learning_rate": 5.668265935572309e-07, "logits/chosen": -0.6337852478027344, "logits/rejected": -0.6892332434654236, "logps/chosen": -38.72999954223633, "logps/rejected": -50.89537811279297, "loss": 0.3506, "rewards/accuracies": 0.875, "rewards/chosen": 0.6992975473403931, "rewards/margins": 1.3221741914749146, "rewards/rejected": -0.6228765845298767, "step": 4765 }, { "epoch": 0.49038758096021384, "grad_norm": 35.75, "learning_rate": 5.662554260909298e-07, "logits/chosen": -0.5896392464637756, "logits/rejected": -0.5676780939102173, "logps/chosen": -44.095420837402344, "logps/rejected": -48.8907356262207, "loss": 0.2877, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8373373746871948, "rewards/margins": 1.387505292892456, "rewards/rejected": -0.550167977809906, "step": 4770 }, { "epoch": 0.4909016140639457, "grad_norm": 37.75, "learning_rate": 5.656842586246288e-07, "logits/chosen": -0.5861873030662537, "logits/rejected": -0.6306906938552856, "logps/chosen": -33.85993194580078, "logps/rejected": -55.95145797729492, "loss": 0.3075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9378029108047485, "rewards/margins": 1.7384586334228516, "rewards/rejected": -0.8006555438041687, "step": 4775 }, { "epoch": 0.4914156471676776, "grad_norm": 42.75, "learning_rate": 5.651130911583277e-07, "logits/chosen": -0.6089029908180237, "logits/rejected": -0.6127184629440308, "logps/chosen": -40.039634704589844, "logps/rejected": -57.58583450317383, "loss": 0.3126, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5218740701675415, "rewards/margins": 1.6777759790420532, "rewards/rejected": -1.1559019088745117, "step": 4780 }, { "epoch": 0.4919296802714095, "grad_norm": 35.0, "learning_rate": 5.645419236920264e-07, "logits/chosen": -0.5943218469619751, "logits/rejected": -0.5998703241348267, "logps/chosen": -43.112205505371094, "logps/rejected": -50.77952194213867, "loss": 0.3192, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6919920444488525, "rewards/margins": 1.4231504201889038, "rewards/rejected": -0.7311582565307617, "step": 4785 }, { "epoch": 0.49244371337514137, "grad_norm": 46.25, "learning_rate": 5.639707562257253e-07, "logits/chosen": -0.7150375843048096, "logits/rejected": -0.6897162795066833, "logps/chosen": -44.284183502197266, "logps/rejected": -51.70219802856445, "loss": 0.3469, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7800029516220093, "rewards/margins": 1.2520604133605957, "rewards/rejected": -0.47205743193626404, "step": 4790 }, { "epoch": 0.49295774647887325, "grad_norm": 39.5, "learning_rate": 5.633995887594243e-07, "logits/chosen": -0.5688036680221558, "logits/rejected": -0.5695117712020874, "logps/chosen": -34.17479705810547, "logps/rejected": -48.711082458496094, "loss": 0.3369, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8089313507080078, "rewards/margins": 1.4809333086013794, "rewards/rejected": -0.6720020174980164, "step": 4795 }, { "epoch": 0.49347177958260513, "grad_norm": 28.625, "learning_rate": 5.628284212931232e-07, "logits/chosen": -0.4842119812965393, "logits/rejected": -0.48684564232826233, "logps/chosen": -45.03450012207031, "logps/rejected": -51.087501525878906, "loss": 0.3325, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5002101063728333, "rewards/margins": 1.3668622970581055, "rewards/rejected": -0.8666520118713379, "step": 4800 }, { "epoch": 0.493985812686337, "grad_norm": 37.25, "learning_rate": 5.622572538268219e-07, "logits/chosen": -0.5725229978561401, "logits/rejected": -0.6085864305496216, "logps/chosen": -40.385704040527344, "logps/rejected": -53.581260681152344, "loss": 0.3353, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.623784601688385, "rewards/margins": 1.3192545175552368, "rewards/rejected": -0.695469856262207, "step": 4805 }, { "epoch": 0.4944998457900689, "grad_norm": 34.5, "learning_rate": 5.616860863605208e-07, "logits/chosen": -0.5812987089157104, "logits/rejected": -0.5256167650222778, "logps/chosen": -38.46223068237305, "logps/rejected": -50.46474838256836, "loss": 0.3517, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5619007349014282, "rewards/margins": 1.3096346855163574, "rewards/rejected": -0.747734010219574, "step": 4810 }, { "epoch": 0.4950138788938008, "grad_norm": 29.0, "learning_rate": 5.611149188942198e-07, "logits/chosen": -0.5365436673164368, "logits/rejected": -0.552767813205719, "logps/chosen": -40.04716110229492, "logps/rejected": -52.516502380371094, "loss": 0.3143, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8030856847763062, "rewards/margins": 1.5510057210922241, "rewards/rejected": -0.7479199171066284, "step": 4815 }, { "epoch": 0.49552791199753266, "grad_norm": 43.0, "learning_rate": 5.605437514279187e-07, "logits/chosen": -0.5958029627799988, "logits/rejected": -0.6178441047668457, "logps/chosen": -41.44306182861328, "logps/rejected": -52.85321044921875, "loss": 0.3316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9008132815361023, "rewards/margins": 1.7031729221343994, "rewards/rejected": -0.8023597002029419, "step": 4820 }, { "epoch": 0.49604194510126454, "grad_norm": 34.75, "learning_rate": 5.599725839616176e-07, "logits/chosen": -0.6411710977554321, "logits/rejected": -0.5881396532058716, "logps/chosen": -34.487064361572266, "logps/rejected": -53.1145133972168, "loss": 0.3364, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7934239506721497, "rewards/margins": 1.5505048036575317, "rewards/rejected": -0.7570807337760925, "step": 4825 }, { "epoch": 0.4965559782049964, "grad_norm": 36.75, "learning_rate": 5.594014164953163e-07, "logits/chosen": -0.5443509817123413, "logits/rejected": -0.5937417149543762, "logps/chosen": -44.64929962158203, "logps/rejected": -50.52254104614258, "loss": 0.3269, "rewards/accuracies": 0.875, "rewards/chosen": 0.9246469736099243, "rewards/margins": 1.7707418203353882, "rewards/rejected": -0.8460949063301086, "step": 4830 }, { "epoch": 0.4970700113087283, "grad_norm": 28.375, "learning_rate": 5.588302490290153e-07, "logits/chosen": -0.6128142476081848, "logits/rejected": -0.5809807777404785, "logps/chosen": -44.90441131591797, "logps/rejected": -55.867271423339844, "loss": 0.3043, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9047134518623352, "rewards/margins": 1.5885823965072632, "rewards/rejected": -0.6838690042495728, "step": 4835 }, { "epoch": 0.4975840444124602, "grad_norm": 40.25, "learning_rate": 5.582590815627142e-07, "logits/chosen": -0.5881766080856323, "logits/rejected": -0.6196789145469666, "logps/chosen": -38.283180236816406, "logps/rejected": -51.63951873779297, "loss": 0.3252, "rewards/accuracies": 0.875, "rewards/chosen": 0.8552149534225464, "rewards/margins": 1.4336585998535156, "rewards/rejected": -0.578443706035614, "step": 4840 }, { "epoch": 0.49809807751619206, "grad_norm": 30.375, "learning_rate": 5.576879140964131e-07, "logits/chosen": -0.5482479929924011, "logits/rejected": -0.569968581199646, "logps/chosen": -32.28260803222656, "logps/rejected": -53.01898956298828, "loss": 0.3259, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8880845308303833, "rewards/margins": 1.8492000102996826, "rewards/rejected": -0.9611154794692993, "step": 4845 }, { "epoch": 0.49861211061992394, "grad_norm": 27.0, "learning_rate": 5.571167466301118e-07, "logits/chosen": -0.49894794821739197, "logits/rejected": -0.5148775577545166, "logps/chosen": -40.83159255981445, "logps/rejected": -51.62507247924805, "loss": 0.3341, "rewards/accuracies": 0.875, "rewards/chosen": 0.5866645574569702, "rewards/margins": 1.585915207862854, "rewards/rejected": -0.999250590801239, "step": 4850 }, { "epoch": 0.4991261437236558, "grad_norm": 30.375, "learning_rate": 5.565455791638108e-07, "logits/chosen": -0.5270397067070007, "logits/rejected": -0.5913654565811157, "logps/chosen": -34.88630294799805, "logps/rejected": -50.467262268066406, "loss": 0.302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0514395236968994, "rewards/margins": 1.7119457721710205, "rewards/rejected": -0.6605063080787659, "step": 4855 }, { "epoch": 0.4996401768273877, "grad_norm": 31.375, "learning_rate": 5.559744116975097e-07, "logits/chosen": -0.5853409767150879, "logits/rejected": -0.627204418182373, "logps/chosen": -39.77196502685547, "logps/rejected": -54.779014587402344, "loss": 0.3325, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7394281625747681, "rewards/margins": 1.7073583602905273, "rewards/rejected": -0.9679301381111145, "step": 4860 }, { "epoch": 0.5001542099311196, "grad_norm": 29.875, "learning_rate": 5.554032442312086e-07, "logits/chosen": -0.5556851029396057, "logits/rejected": -0.5576784610748291, "logps/chosen": -36.60089111328125, "logps/rejected": -51.52105712890625, "loss": 0.3477, "rewards/accuracies": 0.875, "rewards/chosen": 0.7230200171470642, "rewards/margins": 1.3983409404754639, "rewards/rejected": -0.6753207445144653, "step": 4865 }, { "epoch": 0.5001542099311196, "eval_logits/chosen": -0.5377503633499146, "eval_logits/rejected": -0.5979553461074829, "eval_logps/chosen": -77.80587005615234, "eval_logps/rejected": -55.0855827331543, "eval_loss": 0.3149397671222687, "eval_rewards/accuracies": 0.8676470518112183, "eval_rewards/chosen": 0.7102736234664917, "eval_rewards/margins": 1.5845900774002075, "eval_rewards/rejected": -0.8743165135383606, "eval_runtime": 2.099, "eval_samples_per_second": 509.772, "eval_steps_per_second": 8.099, "step": 4865 }, { "epoch": 0.5006682430348515, "grad_norm": 33.5, "learning_rate": 5.548320767649073e-07, "logits/chosen": -0.5664616227149963, "logits/rejected": -0.5496427416801453, "logps/chosen": -45.199012756347656, "logps/rejected": -54.99872970581055, "loss": 0.3502, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5503534078598022, "rewards/margins": 1.6305277347564697, "rewards/rejected": -1.080174207687378, "step": 4870 }, { "epoch": 0.5011822761385833, "grad_norm": 35.75, "learning_rate": 5.542609092986063e-07, "logits/chosen": -0.5706403851509094, "logits/rejected": -0.6225937604904175, "logps/chosen": -35.89201736450195, "logps/rejected": -52.834983825683594, "loss": 0.2972, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9750784039497375, "rewards/margins": 1.8686325550079346, "rewards/rejected": -0.8935542106628418, "step": 4875 }, { "epoch": 0.5016963092423152, "grad_norm": 46.0, "learning_rate": 5.536897418323052e-07, "logits/chosen": -0.5135378837585449, "logits/rejected": -0.5515276789665222, "logps/chosen": -40.75596618652344, "logps/rejected": -48.96894454956055, "loss": 0.3114, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8017946481704712, "rewards/margins": 1.7699756622314453, "rewards/rejected": -0.9681810140609741, "step": 4880 }, { "epoch": 0.5022103423460471, "grad_norm": 40.0, "learning_rate": 5.531185743660041e-07, "logits/chosen": -0.6079093813896179, "logits/rejected": -0.6110457181930542, "logps/chosen": -48.590003967285156, "logps/rejected": -49.65904998779297, "loss": 0.3481, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6521503329277039, "rewards/margins": 1.1803478002548218, "rewards/rejected": -0.5281975865364075, "step": 4885 }, { "epoch": 0.502724375449779, "grad_norm": 30.75, "learning_rate": 5.52547406899703e-07, "logits/chosen": -0.5084449648857117, "logits/rejected": -0.5670529007911682, "logps/chosen": -39.32038879394531, "logps/rejected": -51.280616760253906, "loss": 0.3061, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.810552716255188, "rewards/margins": 1.5618717670440674, "rewards/rejected": -0.7513190507888794, "step": 4890 }, { "epoch": 0.5032384085535109, "grad_norm": 30.5, "learning_rate": 5.519762394334018e-07, "logits/chosen": -0.5664307475090027, "logits/rejected": -0.5430883169174194, "logps/chosen": -39.13208770751953, "logps/rejected": -48.77949905395508, "loss": 0.3122, "rewards/accuracies": 0.875, "rewards/chosen": 0.6788533329963684, "rewards/margins": 1.248708724975586, "rewards/rejected": -0.5698553919792175, "step": 4895 }, { "epoch": 0.5037524416572428, "grad_norm": 34.5, "learning_rate": 5.514050719671007e-07, "logits/chosen": -0.525402843952179, "logits/rejected": -0.5125089883804321, "logps/chosen": -38.560340881347656, "logps/rejected": -50.363922119140625, "loss": 0.3024, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6374646425247192, "rewards/margins": 1.4692871570587158, "rewards/rejected": -0.831822395324707, "step": 4900 }, { "epoch": 0.5042664747609746, "grad_norm": 25.875, "learning_rate": 5.508339045007996e-07, "logits/chosen": -0.639041543006897, "logits/rejected": -0.6090801358222961, "logps/chosen": -43.65624237060547, "logps/rejected": -52.446205139160156, "loss": 0.3342, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.44408243894577026, "rewards/margins": 1.2114629745483398, "rewards/rejected": -0.7673805952072144, "step": 4905 }, { "epoch": 0.5047805078647065, "grad_norm": 30.25, "learning_rate": 5.502627370344985e-07, "logits/chosen": -0.6050786972045898, "logits/rejected": -0.5616417527198792, "logps/chosen": -37.88313293457031, "logps/rejected": -49.755313873291016, "loss": 0.3691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5123801231384277, "rewards/margins": 1.1985766887664795, "rewards/rejected": -0.686196506023407, "step": 4910 }, { "epoch": 0.5052945409684384, "grad_norm": 27.0, "learning_rate": 5.496915695681975e-07, "logits/chosen": -0.5510180592536926, "logits/rejected": -0.6031302809715271, "logps/chosen": -42.9136848449707, "logps/rejected": -55.013710021972656, "loss": 0.3208, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7834815979003906, "rewards/margins": 1.6698487997055054, "rewards/rejected": -0.88636714220047, "step": 4915 }, { "epoch": 0.5058085740721703, "grad_norm": 26.375, "learning_rate": 5.491204021018962e-07, "logits/chosen": -0.6488019824028015, "logits/rejected": -0.6316566467285156, "logps/chosen": -36.6456413269043, "logps/rejected": -50.53575897216797, "loss": 0.3259, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7610824704170227, "rewards/margins": 1.4162614345550537, "rewards/rejected": -0.6551790833473206, "step": 4920 }, { "epoch": 0.5063226071759022, "grad_norm": 29.0, "learning_rate": 5.485492346355951e-07, "logits/chosen": -0.6023762822151184, "logits/rejected": -0.7005181908607483, "logps/chosen": -36.72539520263672, "logps/rejected": -51.057037353515625, "loss": 0.3082, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8469164967536926, "rewards/margins": 1.554534912109375, "rewards/rejected": -0.7076184153556824, "step": 4925 }, { "epoch": 0.506836640279634, "grad_norm": 27.75, "learning_rate": 5.479780671692941e-07, "logits/chosen": -0.5391978025436401, "logits/rejected": -0.5353658199310303, "logps/chosen": -39.231971740722656, "logps/rejected": -47.88032913208008, "loss": 0.3516, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6164783239364624, "rewards/margins": 1.4491136074066162, "rewards/rejected": -0.8326352834701538, "step": 4930 }, { "epoch": 0.5073506733833659, "grad_norm": 39.25, "learning_rate": 5.47406899702993e-07, "logits/chosen": -0.6192559003829956, "logits/rejected": -0.5737003684043884, "logps/chosen": -37.59008026123047, "logps/rejected": -46.944007873535156, "loss": 0.3632, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7911713123321533, "rewards/margins": 1.2001466751098633, "rewards/rejected": -0.4089753031730652, "step": 4935 }, { "epoch": 0.5078647064870978, "grad_norm": 34.25, "learning_rate": 5.468357322366917e-07, "logits/chosen": -0.559942364692688, "logits/rejected": -0.5667109489440918, "logps/chosen": -38.969749450683594, "logps/rejected": -54.406776428222656, "loss": 0.3354, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5318917036056519, "rewards/margins": 1.4133880138397217, "rewards/rejected": -0.8814963102340698, "step": 4940 }, { "epoch": 0.5083787395908297, "grad_norm": 24.875, "learning_rate": 5.462645647703906e-07, "logits/chosen": -0.5535880923271179, "logits/rejected": -0.5792238712310791, "logps/chosen": -34.43412780761719, "logps/rejected": -50.8483772277832, "loss": 0.3021, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7874888181686401, "rewards/margins": 1.5718994140625, "rewards/rejected": -0.7844106554985046, "step": 4945 }, { "epoch": 0.5088927726945616, "grad_norm": 35.0, "learning_rate": 5.456933973040896e-07, "logits/chosen": -0.5693756937980652, "logits/rejected": -0.5923997163772583, "logps/chosen": -32.34191131591797, "logps/rejected": -56.093780517578125, "loss": 0.3437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8256816864013672, "rewards/margins": 1.6109319925308228, "rewards/rejected": -0.785250186920166, "step": 4950 }, { "epoch": 0.5094068057982934, "grad_norm": 31.875, "learning_rate": 5.451222298377885e-07, "logits/chosen": -0.6042063236236572, "logits/rejected": -0.6262322068214417, "logps/chosen": -35.27622985839844, "logps/rejected": -47.03837585449219, "loss": 0.3245, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.134322166442871, "rewards/margins": 1.6463851928710938, "rewards/rejected": -0.5120629072189331, "step": 4955 }, { "epoch": 0.5099208389020253, "grad_norm": 29.5, "learning_rate": 5.445510623714872e-07, "logits/chosen": -0.5530611872673035, "logits/rejected": -0.5283544063568115, "logps/chosen": -42.00584030151367, "logps/rejected": -46.63080596923828, "loss": 0.3686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5548064112663269, "rewards/margins": 1.2939367294311523, "rewards/rejected": -0.7391301989555359, "step": 4960 }, { "epoch": 0.5104348720057572, "grad_norm": 28.875, "learning_rate": 5.439798949051861e-07, "logits/chosen": -0.4753776490688324, "logits/rejected": -0.4770779609680176, "logps/chosen": -39.37459182739258, "logps/rejected": -52.98329544067383, "loss": 0.326, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5595160722732544, "rewards/margins": 1.3518084287643433, "rewards/rejected": -0.7922923564910889, "step": 4965 }, { "epoch": 0.5109489051094891, "grad_norm": 42.5, "learning_rate": 5.434087274388851e-07, "logits/chosen": -0.6318701505661011, "logits/rejected": -0.64483243227005, "logps/chosen": -36.51751708984375, "logps/rejected": -52.30110549926758, "loss": 0.3314, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6926358938217163, "rewards/margins": 1.5340576171875, "rewards/rejected": -0.8414216041564941, "step": 4970 }, { "epoch": 0.511462938213221, "grad_norm": 31.375, "learning_rate": 5.42837559972584e-07, "logits/chosen": -0.6369327902793884, "logits/rejected": -0.6201096773147583, "logps/chosen": -35.328880310058594, "logps/rejected": -53.06018829345703, "loss": 0.3303, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.718467652797699, "rewards/margins": 1.5017999410629272, "rewards/rejected": -0.7833324670791626, "step": 4975 }, { "epoch": 0.5119769713169529, "grad_norm": 25.5, "learning_rate": 5.422663925062829e-07, "logits/chosen": -0.5552483797073364, "logits/rejected": -0.5381861925125122, "logps/chosen": -39.83024978637695, "logps/rejected": -53.6484489440918, "loss": 0.3375, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7679446935653687, "rewards/margins": 1.3022881746292114, "rewards/rejected": -0.5343435406684875, "step": 4980 }, { "epoch": 0.5124910044206847, "grad_norm": 26.0, "learning_rate": 5.416952250399816e-07, "logits/chosen": -0.5518376231193542, "logits/rejected": -0.5545941591262817, "logps/chosen": -34.455528259277344, "logps/rejected": -50.76951217651367, "loss": 0.3385, "rewards/accuracies": 0.875, "rewards/chosen": 0.6814141273498535, "rewards/margins": 1.6293361186981201, "rewards/rejected": -0.9479220509529114, "step": 4985 }, { "epoch": 0.5130050375244166, "grad_norm": 74.5, "learning_rate": 5.411240575736806e-07, "logits/chosen": -0.53502357006073, "logits/rejected": -0.5838879346847534, "logps/chosen": -38.63257598876953, "logps/rejected": -54.44710159301758, "loss": 0.3626, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.675186812877655, "rewards/margins": 1.7175910472869873, "rewards/rejected": -1.0424044132232666, "step": 4990 }, { "epoch": 0.5135190706281485, "grad_norm": 29.75, "learning_rate": 5.405528901073795e-07, "logits/chosen": -0.4722229838371277, "logits/rejected": -0.493552029132843, "logps/chosen": -33.340782165527344, "logps/rejected": -45.77729034423828, "loss": 0.3107, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7805301547050476, "rewards/margins": 1.5339231491088867, "rewards/rejected": -0.7533929944038391, "step": 4995 }, { "epoch": 0.5140331037318804, "grad_norm": 42.75, "learning_rate": 5.399817226410784e-07, "logits/chosen": -0.5881179571151733, "logits/rejected": -0.6071398854255676, "logps/chosen": -36.24829864501953, "logps/rejected": -48.524192810058594, "loss": 0.3274, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7448930740356445, "rewards/margins": 1.3221842050552368, "rewards/rejected": -0.5772911310195923, "step": 5000 }, { "epoch": 0.5145471368356123, "grad_norm": 49.5, "learning_rate": 5.394105551747771e-07, "logits/chosen": -0.6078559160232544, "logits/rejected": -0.6221489906311035, "logps/chosen": -38.99741744995117, "logps/rejected": -52.193519592285156, "loss": 0.3193, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5004211068153381, "rewards/margins": 1.6417585611343384, "rewards/rejected": -1.141337513923645, "step": 5005 }, { "epoch": 0.5150611699393441, "grad_norm": 31.375, "learning_rate": 5.388393877084761e-07, "logits/chosen": -0.6418801546096802, "logits/rejected": -0.6389087438583374, "logps/chosen": -40.549930572509766, "logps/rejected": -53.0611686706543, "loss": 0.3501, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7591480612754822, "rewards/margins": 1.5968832969665527, "rewards/rejected": -0.8377351760864258, "step": 5010 }, { "epoch": 0.515575203043076, "grad_norm": 31.625, "learning_rate": 5.38268220242175e-07, "logits/chosen": -0.5862824320793152, "logits/rejected": -0.6389979720115662, "logps/chosen": -38.0305290222168, "logps/rejected": -50.22937774658203, "loss": 0.3266, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5810378193855286, "rewards/margins": 1.2715847492218018, "rewards/rejected": -0.6905468106269836, "step": 5015 }, { "epoch": 0.5160892361468079, "grad_norm": 60.75, "learning_rate": 5.376970527758739e-07, "logits/chosen": -0.5658448934555054, "logits/rejected": -0.580070972442627, "logps/chosen": -35.02509307861328, "logps/rejected": -55.887611389160156, "loss": 0.3049, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7088855504989624, "rewards/margins": 1.8326492309570312, "rewards/rejected": -1.1237636804580688, "step": 5020 }, { "epoch": 0.5166032692505398, "grad_norm": 41.5, "learning_rate": 5.371258853095727e-07, "logits/chosen": -0.60016930103302, "logits/rejected": -0.6438034772872925, "logps/chosen": -46.1529541015625, "logps/rejected": -51.99370193481445, "loss": 0.2629, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9850719571113586, "rewards/margins": 1.711385726928711, "rewards/rejected": -0.7263138294219971, "step": 5025 }, { "epoch": 0.5171173023542717, "grad_norm": 36.25, "learning_rate": 5.365547178432716e-07, "logits/chosen": -0.5999141931533813, "logits/rejected": -0.6581990122795105, "logps/chosen": -36.473915100097656, "logps/rejected": -49.441078186035156, "loss": 0.3013, "rewards/accuracies": 0.875, "rewards/chosen": 0.6683362722396851, "rewards/margins": 1.467288613319397, "rewards/rejected": -0.7989522218704224, "step": 5030 }, { "epoch": 0.5176313354580035, "grad_norm": 38.0, "learning_rate": 5.359835503769705e-07, "logits/chosen": -0.6013718843460083, "logits/rejected": -0.6335473656654358, "logps/chosen": -37.744754791259766, "logps/rejected": -48.451194763183594, "loss": 0.321, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6863307952880859, "rewards/margins": 1.45012366771698, "rewards/rejected": -0.763792872428894, "step": 5035 }, { "epoch": 0.5181453685617354, "grad_norm": 30.5, "learning_rate": 5.354123829106694e-07, "logits/chosen": -0.5558117628097534, "logits/rejected": -0.5565828084945679, "logps/chosen": -39.61688232421875, "logps/rejected": -51.06938552856445, "loss": 0.3011, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8842232823371887, "rewards/margins": 1.799992322921753, "rewards/rejected": -0.9157692193984985, "step": 5040 }, { "epoch": 0.5186594016654672, "grad_norm": 29.0, "learning_rate": 5.348412154443683e-07, "logits/chosen": -0.5601394772529602, "logits/rejected": -0.5361719727516174, "logps/chosen": -36.18623352050781, "logps/rejected": -46.87627029418945, "loss": 0.3183, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8138769865036011, "rewards/margins": 1.4310222864151, "rewards/rejected": -0.6171454191207886, "step": 5045 }, { "epoch": 0.5191734347691991, "grad_norm": 28.875, "learning_rate": 5.342700479780671e-07, "logits/chosen": -0.471312940120697, "logits/rejected": -0.5537464022636414, "logps/chosen": -35.5625114440918, "logps/rejected": -45.14116668701172, "loss": 0.3081, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8340433835983276, "rewards/margins": 1.4618867635726929, "rewards/rejected": -0.6278433203697205, "step": 5050 }, { "epoch": 0.519687467872931, "grad_norm": 45.75, "learning_rate": 5.33698880511766e-07, "logits/chosen": -0.5459453463554382, "logits/rejected": -0.5971981287002563, "logps/chosen": -35.325035095214844, "logps/rejected": -51.77758026123047, "loss": 0.3382, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9278813600540161, "rewards/margins": 1.7149368524551392, "rewards/rejected": -0.787055492401123, "step": 5055 }, { "epoch": 0.5202015009766628, "grad_norm": 57.75, "learning_rate": 5.331277130454649e-07, "logits/chosen": -0.4986167848110199, "logits/rejected": -0.5675681829452515, "logps/chosen": -35.066673278808594, "logps/rejected": -53.60710906982422, "loss": 0.3511, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6834322214126587, "rewards/margins": 1.6592810153961182, "rewards/rejected": -0.9758486747741699, "step": 5060 }, { "epoch": 0.5207155340803947, "grad_norm": 27.125, "learning_rate": 5.325565455791638e-07, "logits/chosen": -0.6004576683044434, "logits/rejected": -0.6608420610427856, "logps/chosen": -43.82719039916992, "logps/rejected": -49.55614471435547, "loss": 0.3047, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6267270445823669, "rewards/margins": 1.280731201171875, "rewards/rejected": -0.6540040969848633, "step": 5065 }, { "epoch": 0.5212295671841266, "grad_norm": 39.75, "learning_rate": 5.319853781128627e-07, "logits/chosen": -0.6357198357582092, "logits/rejected": -0.5418845415115356, "logps/chosen": -44.34196090698242, "logps/rejected": -50.53485107421875, "loss": 0.3498, "rewards/accuracies": 0.875, "rewards/chosen": 0.787209153175354, "rewards/margins": 1.1566158533096313, "rewards/rejected": -0.36940670013427734, "step": 5070 }, { "epoch": 0.5217436002878585, "grad_norm": 33.5, "learning_rate": 5.314142106465615e-07, "logits/chosen": -0.5787879228591919, "logits/rejected": -0.6302369236946106, "logps/chosen": -39.148345947265625, "logps/rejected": -47.710548400878906, "loss": 0.3673, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7089246511459351, "rewards/margins": 1.3008018732070923, "rewards/rejected": -0.591877281665802, "step": 5075 }, { "epoch": 0.5222576333915904, "grad_norm": 24.25, "learning_rate": 5.308430431802604e-07, "logits/chosen": -0.5716956853866577, "logits/rejected": -0.6318680644035339, "logps/chosen": -42.18121337890625, "logps/rejected": -53.436195373535156, "loss": 0.3073, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9061332941055298, "rewards/margins": 1.8047263622283936, "rewards/rejected": -0.8985931277275085, "step": 5080 }, { "epoch": 0.5227716664953223, "grad_norm": 25.625, "learning_rate": 5.302718757139593e-07, "logits/chosen": -0.5728383660316467, "logits/rejected": -0.5854747891426086, "logps/chosen": -42.638832092285156, "logps/rejected": -54.005348205566406, "loss": 0.3008, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5813202261924744, "rewards/margins": 1.347771167755127, "rewards/rejected": -0.766450822353363, "step": 5085 }, { "epoch": 0.5232856995990541, "grad_norm": 43.0, "learning_rate": 5.297007082476583e-07, "logits/chosen": -0.5767422914505005, "logits/rejected": -0.5869737267494202, "logps/chosen": -34.80801773071289, "logps/rejected": -47.06276321411133, "loss": 0.3339, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7111290693283081, "rewards/margins": 1.3833422660827637, "rewards/rejected": -0.6722131371498108, "step": 5090 }, { "epoch": 0.523799732702786, "grad_norm": 35.25, "learning_rate": 5.29129540781357e-07, "logits/chosen": -0.5887481570243835, "logits/rejected": -0.6288881301879883, "logps/chosen": -41.79151153564453, "logps/rejected": -51.029544830322266, "loss": 0.2877, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9212445020675659, "rewards/margins": 1.6919710636138916, "rewards/rejected": -0.7707265615463257, "step": 5095 }, { "epoch": 0.5243137658065179, "grad_norm": 35.0, "learning_rate": 5.285583733150559e-07, "logits/chosen": -0.5963364839553833, "logits/rejected": -0.6288725137710571, "logps/chosen": -39.45241165161133, "logps/rejected": -49.64850616455078, "loss": 0.2806, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8032337427139282, "rewards/margins": 1.3060686588287354, "rewards/rejected": -0.5028349161148071, "step": 5100 }, { "epoch": 0.5248277989102498, "grad_norm": 30.875, "learning_rate": 5.279872058487548e-07, "logits/chosen": -0.4590228497982025, "logits/rejected": -0.5078718066215515, "logps/chosen": -37.949764251708984, "logps/rejected": -50.53196334838867, "loss": 0.2978, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7804750204086304, "rewards/margins": 1.4453803300857544, "rewards/rejected": -0.6649053692817688, "step": 5105 }, { "epoch": 0.5253418320139817, "grad_norm": 32.0, "learning_rate": 5.274160383824538e-07, "logits/chosen": -0.5738387703895569, "logits/rejected": -0.6003941297531128, "logps/chosen": -37.759788513183594, "logps/rejected": -47.87232971191406, "loss": 0.3392, "rewards/accuracies": 0.875, "rewards/chosen": 0.7557188272476196, "rewards/margins": 1.5232700109481812, "rewards/rejected": -0.7675513029098511, "step": 5110 }, { "epoch": 0.5258558651177135, "grad_norm": 33.75, "learning_rate": 5.268448709161526e-07, "logits/chosen": -0.6030657887458801, "logits/rejected": -0.5997006893157959, "logps/chosen": -38.30683898925781, "logps/rejected": -53.07283401489258, "loss": 0.2913, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8005486726760864, "rewards/margins": 1.54856276512146, "rewards/rejected": -0.7480141520500183, "step": 5115 }, { "epoch": 0.5263698982214454, "grad_norm": 28.5, "learning_rate": 5.262737034498514e-07, "logits/chosen": -0.6110216379165649, "logits/rejected": -0.6068674921989441, "logps/chosen": -37.558109283447266, "logps/rejected": -54.03557205200195, "loss": 0.282, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6898611783981323, "rewards/margins": 1.4796124696731567, "rewards/rejected": -0.789751410484314, "step": 5120 }, { "epoch": 0.5268839313251773, "grad_norm": 46.0, "learning_rate": 5.257025359835503e-07, "logits/chosen": -0.5737811326980591, "logits/rejected": -0.4905003011226654, "logps/chosen": -43.97298049926758, "logps/rejected": -55.47813034057617, "loss": 0.3053, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5685327053070068, "rewards/margins": 1.6861985921859741, "rewards/rejected": -1.1176660060882568, "step": 5125 }, { "epoch": 0.5273979644289092, "grad_norm": 32.5, "learning_rate": 5.251313685172493e-07, "logits/chosen": -0.4613369107246399, "logits/rejected": -0.5303742289543152, "logps/chosen": -36.99448776245117, "logps/rejected": -48.86695098876953, "loss": 0.3149, "rewards/accuracies": 1.0, "rewards/chosen": 0.7571269273757935, "rewards/margins": 1.668687105178833, "rewards/rejected": -0.9115601778030396, "step": 5130 }, { "epoch": 0.5279119975326411, "grad_norm": 26.5, "learning_rate": 5.245602010509482e-07, "logits/chosen": -0.5336953401565552, "logits/rejected": -0.596032977104187, "logps/chosen": -37.723201751708984, "logps/rejected": -57.26020431518555, "loss": 0.3155, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.878444492816925, "rewards/margins": 1.644811987876892, "rewards/rejected": -0.7663674354553223, "step": 5135 }, { "epoch": 0.528426030636373, "grad_norm": 39.5, "learning_rate": 5.239890335846469e-07, "logits/chosen": -0.49161943793296814, "logits/rejected": -0.5687310099601746, "logps/chosen": -30.540210723876953, "logps/rejected": -44.05038070678711, "loss": 0.3557, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8624671101570129, "rewards/margins": 1.4065247774124146, "rewards/rejected": -0.5440576672554016, "step": 5140 }, { "epoch": 0.5289400637401048, "grad_norm": 56.25, "learning_rate": 5.234178661183459e-07, "logits/chosen": -0.570682168006897, "logits/rejected": -0.5850602388381958, "logps/chosen": -36.98370361328125, "logps/rejected": -53.55597686767578, "loss": 0.3332, "rewards/accuracies": 0.875, "rewards/chosen": 0.8751300573348999, "rewards/margins": 1.6157381534576416, "rewards/rejected": -0.7406080961227417, "step": 5145 }, { "epoch": 0.5294540968438367, "grad_norm": 36.25, "learning_rate": 5.228466986520448e-07, "logits/chosen": -0.5738949179649353, "logits/rejected": -0.5909032225608826, "logps/chosen": -43.480987548828125, "logps/rejected": -53.149436950683594, "loss": 0.3442, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3440312445163727, "rewards/margins": 1.0528370141983032, "rewards/rejected": -0.7088058590888977, "step": 5150 }, { "epoch": 0.5299681299475686, "grad_norm": 36.5, "learning_rate": 5.222755311857437e-07, "logits/chosen": -0.596675455570221, "logits/rejected": -0.5928572416305542, "logps/chosen": -33.77753448486328, "logps/rejected": -46.84181594848633, "loss": 0.3471, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.757846474647522, "rewards/margins": 1.2232329845428467, "rewards/rejected": -0.46538639068603516, "step": 5155 }, { "epoch": 0.5304821630513005, "grad_norm": 36.75, "learning_rate": 5.217043637194425e-07, "logits/chosen": -0.5930576324462891, "logits/rejected": -0.5523659586906433, "logps/chosen": -35.65046310424805, "logps/rejected": -48.11235809326172, "loss": 0.3633, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5433937311172485, "rewards/margins": 1.2242352962493896, "rewards/rejected": -0.6808415651321411, "step": 5160 }, { "epoch": 0.5309961961550324, "grad_norm": 36.25, "learning_rate": 5.211331962531414e-07, "logits/chosen": -0.5565522313117981, "logits/rejected": -0.5561134219169617, "logps/chosen": -40.22990036010742, "logps/rejected": -51.513587951660156, "loss": 0.3077, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9550952911376953, "rewards/margins": 1.5849804878234863, "rewards/rejected": -0.629885196685791, "step": 5165 }, { "epoch": 0.5315102292587642, "grad_norm": 46.0, "learning_rate": 5.205620287868403e-07, "logits/chosen": -0.6121824979782104, "logits/rejected": -0.6037369966506958, "logps/chosen": -35.9212646484375, "logps/rejected": -46.26460647583008, "loss": 0.3399, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5029948353767395, "rewards/margins": 1.2181028127670288, "rewards/rejected": -0.7151080369949341, "step": 5170 }, { "epoch": 0.5320242623624961, "grad_norm": 29.875, "learning_rate": 5.199908613205392e-07, "logits/chosen": -0.5178620219230652, "logits/rejected": -0.5362637042999268, "logps/chosen": -40.19517135620117, "logps/rejected": -55.39420700073242, "loss": 0.3206, "rewards/accuracies": 0.875, "rewards/chosen": 0.5707318782806396, "rewards/margins": 1.3968112468719482, "rewards/rejected": -0.8260793685913086, "step": 5175 }, { "epoch": 0.532538295466228, "grad_norm": 30.625, "learning_rate": 5.19419693854238e-07, "logits/chosen": -0.5723553895950317, "logits/rejected": -0.635463535785675, "logps/chosen": -34.16704177856445, "logps/rejected": -49.355628967285156, "loss": 0.33, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0341691970825195, "rewards/margins": 1.6468536853790283, "rewards/rejected": -0.6126843690872192, "step": 5180 }, { "epoch": 0.5330523285699599, "grad_norm": 40.75, "learning_rate": 5.188485263879369e-07, "logits/chosen": -0.6639121770858765, "logits/rejected": -0.6821603178977966, "logps/chosen": -41.596275329589844, "logps/rejected": -51.55534744262695, "loss": 0.3131, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7309592366218567, "rewards/margins": 1.780124306678772, "rewards/rejected": -1.0491650104522705, "step": 5185 }, { "epoch": 0.5335663616736918, "grad_norm": 30.0, "learning_rate": 5.182773589216358e-07, "logits/chosen": -0.6311384439468384, "logits/rejected": -0.6490311026573181, "logps/chosen": -37.58317947387695, "logps/rejected": -55.3925895690918, "loss": 0.3221, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6578929424285889, "rewards/margins": 1.6349836587905884, "rewards/rejected": -0.9770906567573547, "step": 5190 }, { "epoch": 0.5340803947774236, "grad_norm": 27.875, "learning_rate": 5.177061914553347e-07, "logits/chosen": -0.5511196255683899, "logits/rejected": -0.5226593017578125, "logps/chosen": -43.41379928588867, "logps/rejected": -48.93183517456055, "loss": 0.3693, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.405201256275177, "rewards/margins": 1.3071978092193604, "rewards/rejected": -0.9019967317581177, "step": 5195 }, { "epoch": 0.5345944278811555, "grad_norm": 28.0, "learning_rate": 5.171350239890336e-07, "logits/chosen": -0.5559145212173462, "logits/rejected": -0.5635032653808594, "logps/chosen": -38.36058044433594, "logps/rejected": -49.16997528076172, "loss": 0.3131, "rewards/accuracies": 0.75, "rewards/chosen": 0.4930858016014099, "rewards/margins": 0.9020676612854004, "rewards/rejected": -0.4089818596839905, "step": 5200 }, { "epoch": 0.5351084609848874, "grad_norm": 37.5, "learning_rate": 5.165638565227325e-07, "logits/chosen": -0.5922843217849731, "logits/rejected": -0.6059641242027283, "logps/chosen": -39.130584716796875, "logps/rejected": -50.89771270751953, "loss": 0.3225, "rewards/accuracies": 0.875, "rewards/chosen": 0.6486445069313049, "rewards/margins": 1.355399489402771, "rewards/rejected": -0.7067548632621765, "step": 5205 }, { "epoch": 0.5356224940886193, "grad_norm": 29.75, "learning_rate": 5.159926890564313e-07, "logits/chosen": -0.5539160966873169, "logits/rejected": -0.6456956267356873, "logps/chosen": -33.93125915527344, "logps/rejected": -49.50370788574219, "loss": 0.2958, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7131765484809875, "rewards/margins": 1.5476343631744385, "rewards/rejected": -0.8344579935073853, "step": 5210 }, { "epoch": 0.5361365271923512, "grad_norm": 77.5, "learning_rate": 5.154215215901302e-07, "logits/chosen": -0.5839038491249084, "logits/rejected": -0.6860867738723755, "logps/chosen": -41.19661331176758, "logps/rejected": -56.4055061340332, "loss": 0.337, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5785764455795288, "rewards/margins": 1.5567013025283813, "rewards/rejected": -0.978124737739563, "step": 5215 }, { "epoch": 0.536650560296083, "grad_norm": 48.0, "learning_rate": 5.148503541238291e-07, "logits/chosen": -0.5540838837623596, "logits/rejected": -0.5430899262428284, "logps/chosen": -46.3670539855957, "logps/rejected": -52.55150604248047, "loss": 0.3345, "rewards/accuracies": 0.875, "rewards/chosen": 0.6687859296798706, "rewards/margins": 1.5706342458724976, "rewards/rejected": -0.901848316192627, "step": 5220 }, { "epoch": 0.5371645933998149, "grad_norm": 24.5, "learning_rate": 5.14279186657528e-07, "logits/chosen": -0.5193679928779602, "logits/rejected": -0.5892160534858704, "logps/chosen": -34.40078353881836, "logps/rejected": -53.494300842285156, "loss": 0.3111, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9442436099052429, "rewards/margins": 1.814103364944458, "rewards/rejected": -0.869859516620636, "step": 5225 }, { "epoch": 0.5376786265035468, "grad_norm": 34.25, "learning_rate": 5.137080191912268e-07, "logits/chosen": -0.5103844404220581, "logits/rejected": -0.5750161409378052, "logps/chosen": -36.838462829589844, "logps/rejected": -50.45214080810547, "loss": 0.3064, "rewards/accuracies": 0.875, "rewards/chosen": 0.7700605988502502, "rewards/margins": 1.5770100355148315, "rewards/rejected": -0.8069494962692261, "step": 5230 }, { "epoch": 0.5381926596072787, "grad_norm": 38.0, "learning_rate": 5.131368517249257e-07, "logits/chosen": -0.5328763723373413, "logits/rejected": -0.5923821330070496, "logps/chosen": -37.282413482666016, "logps/rejected": -53.10709762573242, "loss": 0.344, "rewards/accuracies": 1.0, "rewards/chosen": 0.721767008304596, "rewards/margins": 1.8225208520889282, "rewards/rejected": -1.1007537841796875, "step": 5235 }, { "epoch": 0.5387066927110106, "grad_norm": 26.5, "learning_rate": 5.125656842586246e-07, "logits/chosen": -0.5513870120048523, "logits/rejected": -0.5621718168258667, "logps/chosen": -33.305274963378906, "logps/rejected": -48.67365264892578, "loss": 0.3332, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6390558481216431, "rewards/margins": 1.4863038063049316, "rewards/rejected": -0.8472479581832886, "step": 5240 }, { "epoch": 0.5392207258147425, "grad_norm": 29.25, "learning_rate": 5.119945167923236e-07, "logits/chosen": -0.5114129185676575, "logits/rejected": -0.5936216115951538, "logps/chosen": -37.384334564208984, "logps/rejected": -50.9704704284668, "loss": 0.3049, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6941729187965393, "rewards/margins": 1.5457828044891357, "rewards/rejected": -0.8516098856925964, "step": 5245 }, { "epoch": 0.5397347589184743, "grad_norm": 32.5, "learning_rate": 5.114233493260223e-07, "logits/chosen": -0.5909678936004639, "logits/rejected": -0.5663979649543762, "logps/chosen": -38.90419387817383, "logps/rejected": -53.213233947753906, "loss": 0.3459, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6744430661201477, "rewards/margins": 1.6015437841415405, "rewards/rejected": -0.927100658416748, "step": 5250 }, { "epoch": 0.5402487920222062, "grad_norm": 24.375, "learning_rate": 5.108521818597212e-07, "logits/chosen": -0.595575213432312, "logits/rejected": -0.6815346479415894, "logps/chosen": -39.019798278808594, "logps/rejected": -46.02259063720703, "loss": 0.2942, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9640170931816101, "rewards/margins": 1.358742356300354, "rewards/rejected": -0.3947252631187439, "step": 5255 }, { "epoch": 0.5407628251259381, "grad_norm": 34.75, "learning_rate": 5.102810143934201e-07, "logits/chosen": -0.45245328545570374, "logits/rejected": -0.503128170967102, "logps/chosen": -36.053001403808594, "logps/rejected": -49.56135940551758, "loss": 0.3048, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9063347578048706, "rewards/margins": 1.7894493341445923, "rewards/rejected": -0.8831145167350769, "step": 5260 }, { "epoch": 0.54127685822967, "grad_norm": 38.0, "learning_rate": 5.097098469271191e-07, "logits/chosen": -0.48617926239967346, "logits/rejected": -0.5489295721054077, "logps/chosen": -33.642635345458984, "logps/rejected": -48.2120475769043, "loss": 0.2985, "rewards/accuracies": 0.875, "rewards/chosen": 1.062140941619873, "rewards/margins": 1.7319729328155518, "rewards/rejected": -0.6698319315910339, "step": 5265 }, { "epoch": 0.5417908913334019, "grad_norm": 50.75, "learning_rate": 5.091386794608179e-07, "logits/chosen": -0.592360258102417, "logits/rejected": -0.6293856501579285, "logps/chosen": -42.623619079589844, "logps/rejected": -53.22105026245117, "loss": 0.306, "rewards/accuracies": 0.875, "rewards/chosen": 0.6945092678070068, "rewards/margins": 1.682535171508789, "rewards/rejected": -0.9880261421203613, "step": 5270 }, { "epoch": 0.5423049244371337, "grad_norm": 26.0, "learning_rate": 5.085675119945167e-07, "logits/chosen": -0.5932270288467407, "logits/rejected": -0.5787975192070007, "logps/chosen": -36.17060470581055, "logps/rejected": -52.16851043701172, "loss": 0.3434, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6961182355880737, "rewards/margins": 1.6965587139129639, "rewards/rejected": -1.0004405975341797, "step": 5275 }, { "epoch": 0.5428189575408656, "grad_norm": 33.5, "learning_rate": 5.079963445282156e-07, "logits/chosen": -0.5643607378005981, "logits/rejected": -0.5636721849441528, "logps/chosen": -47.501468658447266, "logps/rejected": -50.66032028198242, "loss": 0.3279, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.832225501537323, "rewards/margins": 1.659336805343628, "rewards/rejected": -0.8271113634109497, "step": 5280 }, { "epoch": 0.5433329906445975, "grad_norm": 36.25, "learning_rate": 5.074251770619146e-07, "logits/chosen": -0.48864641785621643, "logits/rejected": -0.56107097864151, "logps/chosen": -35.553897857666016, "logps/rejected": -47.63959503173828, "loss": 0.2978, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8318565487861633, "rewards/margins": 1.589727520942688, "rewards/rejected": -0.7578710317611694, "step": 5285 }, { "epoch": 0.5438470237483294, "grad_norm": 31.875, "learning_rate": 5.068540095956135e-07, "logits/chosen": -0.576358437538147, "logits/rejected": -0.6152011752128601, "logps/chosen": -32.247528076171875, "logps/rejected": -51.63365936279297, "loss": 0.3053, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8498682975769043, "rewards/margins": 1.62869131565094, "rewards/rejected": -0.7788229584693909, "step": 5290 }, { "epoch": 0.5443610568520613, "grad_norm": 31.875, "learning_rate": 5.062828421293122e-07, "logits/chosen": -0.5411609411239624, "logits/rejected": -0.5670135617256165, "logps/chosen": -35.00902557373047, "logps/rejected": -50.10618591308594, "loss": 0.3302, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6729644536972046, "rewards/margins": 1.1828104257583618, "rewards/rejected": -0.5098458528518677, "step": 5295 }, { "epoch": 0.5448750899557931, "grad_norm": 34.75, "learning_rate": 5.057116746630111e-07, "logits/chosen": -0.6193909645080566, "logits/rejected": -0.5940514802932739, "logps/chosen": -40.555564880371094, "logps/rejected": -51.7638053894043, "loss": 0.3015, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7133775353431702, "rewards/margins": 1.4487088918685913, "rewards/rejected": -0.7353314757347107, "step": 5300 }, { "epoch": 0.545389123059525, "grad_norm": 28.25, "learning_rate": 5.051405071967101e-07, "logits/chosen": -0.6239813566207886, "logits/rejected": -0.6414352655410767, "logps/chosen": -36.63289260864258, "logps/rejected": -51.12995147705078, "loss": 0.2925, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6675560474395752, "rewards/margins": 1.6107795238494873, "rewards/rejected": -0.9432234764099121, "step": 5305 }, { "epoch": 0.5459031561632569, "grad_norm": 36.75, "learning_rate": 5.04569339730409e-07, "logits/chosen": -0.5544955134391785, "logits/rejected": -0.5916903614997864, "logps/chosen": -34.562782287597656, "logps/rejected": -52.90831756591797, "loss": 0.3265, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8462445139884949, "rewards/margins": 1.729039192199707, "rewards/rejected": -0.8827944993972778, "step": 5310 }, { "epoch": 0.5464171892669888, "grad_norm": 34.75, "learning_rate": 5.039981722641078e-07, "logits/chosen": -0.5300226807594299, "logits/rejected": -0.5401709675788879, "logps/chosen": -38.930442810058594, "logps/rejected": -55.4549674987793, "loss": 0.3424, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7672668099403381, "rewards/margins": 1.6247628927230835, "rewards/rejected": -0.857495903968811, "step": 5315 }, { "epoch": 0.5469312223707207, "grad_norm": 29.875, "learning_rate": 5.034270047978066e-07, "logits/chosen": -0.6458983421325684, "logits/rejected": -0.6219261884689331, "logps/chosen": -36.87651443481445, "logps/rejected": -47.53354263305664, "loss": 0.3006, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5810403227806091, "rewards/margins": 1.2663801908493042, "rewards/rejected": -0.6853397488594055, "step": 5320 }, { "epoch": 0.5474452554744526, "grad_norm": 25.125, "learning_rate": 5.028558373315056e-07, "logits/chosen": -0.5920066833496094, "logits/rejected": -0.5939021110534668, "logps/chosen": -37.57915496826172, "logps/rejected": -47.34910583496094, "loss": 0.3128, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9966033697128296, "rewards/margins": 1.5529991388320923, "rewards/rejected": -0.5563956499099731, "step": 5325 }, { "epoch": 0.5479592885781844, "grad_norm": 34.0, "learning_rate": 5.022846698652045e-07, "logits/chosen": -0.6700036525726318, "logits/rejected": -0.7104852795600891, "logps/chosen": -42.27153778076172, "logps/rejected": -56.401092529296875, "loss": 0.3411, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9898411631584167, "rewards/margins": 1.7867138385772705, "rewards/rejected": -0.7968726754188538, "step": 5330 }, { "epoch": 0.5484733216819163, "grad_norm": 26.875, "learning_rate": 5.017135023989033e-07, "logits/chosen": -0.4842751622200012, "logits/rejected": -0.562313973903656, "logps/chosen": -39.97228240966797, "logps/rejected": -52.91002655029297, "loss": 0.2808, "rewards/accuracies": 0.875, "rewards/chosen": 0.8582319021224976, "rewards/margins": 1.8121354579925537, "rewards/rejected": -0.9539035558700562, "step": 5335 }, { "epoch": 0.5489873547856482, "grad_norm": 28.25, "learning_rate": 5.011423349326021e-07, "logits/chosen": -0.5943199396133423, "logits/rejected": -0.6346994638442993, "logps/chosen": -34.76818084716797, "logps/rejected": -48.64867401123047, "loss": 0.3081, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.0416762828826904, "rewards/margins": 1.7665315866470337, "rewards/rejected": -0.7248552441596985, "step": 5340 }, { "epoch": 0.5495013878893801, "grad_norm": 29.375, "learning_rate": 5.005711674663011e-07, "logits/chosen": -0.5260311365127563, "logits/rejected": -0.5165815353393555, "logps/chosen": -37.712440490722656, "logps/rejected": -48.81612014770508, "loss": 0.3483, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5686922073364258, "rewards/margins": 1.2716538906097412, "rewards/rejected": -0.7029617428779602, "step": 5345 }, { "epoch": 0.550015420993112, "grad_norm": 25.125, "learning_rate": 5e-07, "logits/chosen": -0.6918250322341919, "logits/rejected": -0.7127255201339722, "logps/chosen": -42.168033599853516, "logps/rejected": -59.09967041015625, "loss": 0.3128, "rewards/accuracies": 0.875, "rewards/chosen": 0.5827246904373169, "rewards/margins": 1.6004564762115479, "rewards/rejected": -1.017731785774231, "step": 5350 }, { "epoch": 0.5505294540968438, "grad_norm": 33.0, "learning_rate": 4.994288325336989e-07, "logits/chosen": -0.5187739133834839, "logits/rejected": -0.5590237379074097, "logps/chosen": -39.82316589355469, "logps/rejected": -48.59222412109375, "loss": 0.352, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0252200365066528, "rewards/margins": 1.7844302654266357, "rewards/rejected": -0.7592102289199829, "step": 5355 }, { "epoch": 0.5510434872005757, "grad_norm": 26.25, "learning_rate": 4.988576650673977e-07, "logits/chosen": -0.5952342748641968, "logits/rejected": -0.6070914268493652, "logps/chosen": -33.91656494140625, "logps/rejected": -48.998191833496094, "loss": 0.3104, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5975190997123718, "rewards/margins": 1.4172455072402954, "rewards/rejected": -0.8197264671325684, "step": 5360 }, { "epoch": 0.5515575203043076, "grad_norm": 31.875, "learning_rate": 4.982864976010966e-07, "logits/chosen": -0.5682250261306763, "logits/rejected": -0.5965525507926941, "logps/chosen": -39.587581634521484, "logps/rejected": -51.56437301635742, "loss": 0.3364, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8926584124565125, "rewards/margins": 1.7932497262954712, "rewards/rejected": -0.9005910158157349, "step": 5365 }, { "epoch": 0.5520715534080395, "grad_norm": 32.0, "learning_rate": 4.977153301347955e-07, "logits/chosen": -0.6291767954826355, "logits/rejected": -0.587928831577301, "logps/chosen": -34.99839782714844, "logps/rejected": -50.805503845214844, "loss": 0.315, "rewards/accuracies": 0.875, "rewards/chosen": 0.8940040469169617, "rewards/margins": 1.420218586921692, "rewards/rejected": -0.5262144804000854, "step": 5370 }, { "epoch": 0.5525855865117714, "grad_norm": 47.25, "learning_rate": 4.971441626684944e-07, "logits/chosen": -0.4864344596862793, "logits/rejected": -0.5423449277877808, "logps/chosen": -30.107431411743164, "logps/rejected": -48.025733947753906, "loss": 0.3089, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8662234544754028, "rewards/margins": 1.6439186334609985, "rewards/rejected": -0.7776951193809509, "step": 5375 }, { "epoch": 0.5530996196155032, "grad_norm": 32.0, "learning_rate": 4.965729952021932e-07, "logits/chosen": -0.5614736080169678, "logits/rejected": -0.5659979581832886, "logps/chosen": -38.842594146728516, "logps/rejected": -49.70490646362305, "loss": 0.3549, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8725218772888184, "rewards/margins": 1.4655795097351074, "rewards/rejected": -0.5930576920509338, "step": 5380 }, { "epoch": 0.5536136527192351, "grad_norm": 34.0, "learning_rate": 4.960018277358921e-07, "logits/chosen": -0.5640846490859985, "logits/rejected": -0.6180367469787598, "logps/chosen": -32.54931640625, "logps/rejected": -47.15842056274414, "loss": 0.3074, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8350415229797363, "rewards/margins": 1.4325668811798096, "rewards/rejected": -0.5975254774093628, "step": 5385 }, { "epoch": 0.554127685822967, "grad_norm": 29.5, "learning_rate": 4.95430660269591e-07, "logits/chosen": -0.6067177057266235, "logits/rejected": -0.5993624925613403, "logps/chosen": -33.55295944213867, "logps/rejected": -47.19371795654297, "loss": 0.3121, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.0783369541168213, "rewards/margins": 1.521639108657837, "rewards/rejected": -0.4433022439479828, "step": 5390 }, { "epoch": 0.5546417189266989, "grad_norm": 29.375, "learning_rate": 4.948594928032899e-07, "logits/chosen": -0.5090099573135376, "logits/rejected": -0.556525707244873, "logps/chosen": -38.07255935668945, "logps/rejected": -49.88135528564453, "loss": 0.3119, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0055513381958008, "rewards/margins": 1.7585245370864868, "rewards/rejected": -0.7529730200767517, "step": 5395 }, { "epoch": 0.5551557520304308, "grad_norm": 44.25, "learning_rate": 4.942883253369887e-07, "logits/chosen": -0.5554569959640503, "logits/rejected": -0.6069145798683167, "logps/chosen": -38.13854217529297, "logps/rejected": -45.74602508544922, "loss": 0.3351, "rewards/accuracies": 0.875, "rewards/chosen": 0.8307322263717651, "rewards/margins": 1.5610443353652954, "rewards/rejected": -0.7303120493888855, "step": 5400 }, { "epoch": 0.5556697851341627, "grad_norm": 28.125, "learning_rate": 4.937171578706877e-07, "logits/chosen": -0.5997242331504822, "logits/rejected": -0.6344844102859497, "logps/chosen": -35.67811965942383, "logps/rejected": -51.1162109375, "loss": 0.3118, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0798351764678955, "rewards/margins": 1.8114025592803955, "rewards/rejected": -0.7315672636032104, "step": 5405 }, { "epoch": 0.5561838182378945, "grad_norm": 28.75, "learning_rate": 4.931459904043865e-07, "logits/chosen": -0.4904783368110657, "logits/rejected": -0.5412826538085938, "logps/chosen": -35.812461853027344, "logps/rejected": -50.7703857421875, "loss": 0.38, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8687294721603394, "rewards/margins": 1.6668169498443604, "rewards/rejected": -0.7980874180793762, "step": 5410 }, { "epoch": 0.5566978513416264, "grad_norm": 31.25, "learning_rate": 4.925748229380855e-07, "logits/chosen": -0.5657831430435181, "logits/rejected": -0.6014779210090637, "logps/chosen": -36.15218734741211, "logps/rejected": -49.280845642089844, "loss": 0.3289, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.4956814646720886, "rewards/margins": 1.1686210632324219, "rewards/rejected": -0.672939658164978, "step": 5415 }, { "epoch": 0.5572118844453583, "grad_norm": 26.0, "learning_rate": 4.920036554717843e-07, "logits/chosen": -0.6361567378044128, "logits/rejected": -0.6507669687271118, "logps/chosen": -38.25141525268555, "logps/rejected": -52.3405647277832, "loss": 0.3151, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.717857837677002, "rewards/margins": 1.4202219247817993, "rewards/rejected": -0.7023641467094421, "step": 5420 }, { "epoch": 0.5577259175490902, "grad_norm": 29.625, "learning_rate": 4.914324880054832e-07, "logits/chosen": -0.5913346409797668, "logits/rejected": -0.6129885315895081, "logps/chosen": -38.77965545654297, "logps/rejected": -51.826416015625, "loss": 0.3071, "rewards/accuracies": 0.875, "rewards/chosen": 0.9364654421806335, "rewards/margins": 1.5156967639923096, "rewards/rejected": -0.5792315006256104, "step": 5425 }, { "epoch": 0.5582399506528221, "grad_norm": 30.0, "learning_rate": 4.90861320539182e-07, "logits/chosen": -0.6424923539161682, "logits/rejected": -0.6605609059333801, "logps/chosen": -38.419456481933594, "logps/rejected": -53.54944610595703, "loss": 0.3181, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7132002115249634, "rewards/margins": 1.6374444961547852, "rewards/rejected": -0.924244225025177, "step": 5430 }, { "epoch": 0.5587539837565539, "grad_norm": 24.875, "learning_rate": 4.90290153072881e-07, "logits/chosen": -0.6105496287345886, "logits/rejected": -0.621708869934082, "logps/chosen": -38.57181930541992, "logps/rejected": -46.45188522338867, "loss": 0.3152, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7625431418418884, "rewards/margins": 1.4595110416412354, "rewards/rejected": -0.6969679594039917, "step": 5435 }, { "epoch": 0.5592680168602858, "grad_norm": 36.0, "learning_rate": 4.897189856065798e-07, "logits/chosen": -0.510198712348938, "logits/rejected": -0.5568181872367859, "logps/chosen": -36.5853271484375, "logps/rejected": -52.05637741088867, "loss": 0.3416, "rewards/accuracies": 0.875, "rewards/chosen": 0.8882466554641724, "rewards/margins": 1.8248571157455444, "rewards/rejected": -0.9366105794906616, "step": 5440 }, { "epoch": 0.5597820499640177, "grad_norm": 32.0, "learning_rate": 4.891478181402787e-07, "logits/chosen": -0.558326244354248, "logits/rejected": -0.6410309076309204, "logps/chosen": -47.55301284790039, "logps/rejected": -52.826271057128906, "loss": 0.3475, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5980253219604492, "rewards/margins": 1.2571439743041992, "rewards/rejected": -0.65911865234375, "step": 5445 }, { "epoch": 0.5602960830677496, "grad_norm": 33.75, "learning_rate": 4.885766506739776e-07, "logits/chosen": -0.5366206169128418, "logits/rejected": -0.5961537957191467, "logps/chosen": -39.004417419433594, "logps/rejected": -51.01286315917969, "loss": 0.3301, "rewards/accuracies": 0.875, "rewards/chosen": 0.7586712837219238, "rewards/margins": 1.6109216213226318, "rewards/rejected": -0.8522504568099976, "step": 5450 }, { "epoch": 0.5608101161714815, "grad_norm": 33.0, "learning_rate": 4.880054832076765e-07, "logits/chosen": -0.572895884513855, "logits/rejected": -0.6449416279792786, "logps/chosen": -42.7717170715332, "logps/rejected": -52.80536651611328, "loss": 0.3508, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7800756096839905, "rewards/margins": 1.747786283493042, "rewards/rejected": -0.9677106142044067, "step": 5455 }, { "epoch": 0.5613241492752133, "grad_norm": 29.125, "learning_rate": 4.874343157413754e-07, "logits/chosen": -0.598509669303894, "logits/rejected": -0.697364091873169, "logps/chosen": -37.33442306518555, "logps/rejected": -55.0048828125, "loss": 0.3318, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6378592848777771, "rewards/margins": 1.3078693151474, "rewards/rejected": -0.6700101494789124, "step": 5460 }, { "epoch": 0.5618381823789452, "grad_norm": 33.0, "learning_rate": 4.868631482750743e-07, "logits/chosen": -0.5831464529037476, "logits/rejected": -0.5991762280464172, "logps/chosen": -37.41027069091797, "logps/rejected": -45.9515380859375, "loss": 0.3411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8314011693000793, "rewards/margins": 1.1101688146591187, "rewards/rejected": -0.2787677049636841, "step": 5465 }, { "epoch": 0.5623522154826771, "grad_norm": 37.5, "learning_rate": 4.862919808087731e-07, "logits/chosen": -0.5245425701141357, "logits/rejected": -0.5264366269111633, "logps/chosen": -41.504539489746094, "logps/rejected": -48.63679122924805, "loss": 0.3369, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7026236653327942, "rewards/margins": 1.281455397605896, "rewards/rejected": -0.5788317918777466, "step": 5470 }, { "epoch": 0.562866248586409, "grad_norm": 23.0, "learning_rate": 4.85720813342472e-07, "logits/chosen": -0.5996089577674866, "logits/rejected": -0.6015630960464478, "logps/chosen": -43.60443115234375, "logps/rejected": -50.52325439453125, "loss": 0.3099, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7662347555160522, "rewards/margins": 1.7200558185577393, "rewards/rejected": -0.953821063041687, "step": 5475 }, { "epoch": 0.5633802816901409, "grad_norm": 32.75, "learning_rate": 4.851496458761709e-07, "logits/chosen": -0.6294230222702026, "logits/rejected": -0.609401524066925, "logps/chosen": -40.686126708984375, "logps/rejected": -53.11012649536133, "loss": 0.316, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7022809982299805, "rewards/margins": 1.7280280590057373, "rewards/rejected": -1.0257470607757568, "step": 5480 }, { "epoch": 0.5638943147938728, "grad_norm": 29.125, "learning_rate": 4.845784784098698e-07, "logits/chosen": -0.522826611995697, "logits/rejected": -0.5145086050033569, "logps/chosen": -38.1035041809082, "logps/rejected": -47.55299758911133, "loss": 0.3037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5162757635116577, "rewards/margins": 1.2072150707244873, "rewards/rejected": -0.6909393072128296, "step": 5485 }, { "epoch": 0.5644083478976046, "grad_norm": 30.25, "learning_rate": 4.840073109435686e-07, "logits/chosen": -0.617935299873352, "logits/rejected": -0.5859391093254089, "logps/chosen": -34.4065055847168, "logps/rejected": -42.83157730102539, "loss": 0.3185, "rewards/accuracies": 0.875, "rewards/chosen": 0.8342729806900024, "rewards/margins": 1.5910322666168213, "rewards/rejected": -0.7567592263221741, "step": 5490 }, { "epoch": 0.5649223810013365, "grad_norm": 27.25, "learning_rate": 4.834361434772675e-07, "logits/chosen": -0.5412257313728333, "logits/rejected": -0.5755349397659302, "logps/chosen": -38.1861686706543, "logps/rejected": -53.26483154296875, "loss": 0.3288, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7286289930343628, "rewards/margins": 1.5066890716552734, "rewards/rejected": -0.7780600190162659, "step": 5495 }, { "epoch": 0.5654364141050684, "grad_norm": 28.5, "learning_rate": 4.828649760109664e-07, "logits/chosen": -0.6187722086906433, "logits/rejected": -0.639434814453125, "logps/chosen": -37.34333038330078, "logps/rejected": -53.9117546081543, "loss": 0.3063, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8301005363464355, "rewards/margins": 1.5749003887176514, "rewards/rejected": -0.744799792766571, "step": 5500 }, { "epoch": 0.5659504472088003, "grad_norm": 41.0, "learning_rate": 4.822938085446653e-07, "logits/chosen": -0.5636704564094543, "logits/rejected": -0.5543387532234192, "logps/chosen": -40.96604919433594, "logps/rejected": -57.03691864013672, "loss": 0.3661, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8426015973091125, "rewards/margins": 1.7695907354354858, "rewards/rejected": -0.9269893765449524, "step": 5505 }, { "epoch": 0.5664644803125322, "grad_norm": 30.25, "learning_rate": 4.817226410783642e-07, "logits/chosen": -0.5788537859916687, "logits/rejected": -0.5639528036117554, "logps/chosen": -39.8604850769043, "logps/rejected": -47.75217819213867, "loss": 0.3148, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7439614534378052, "rewards/margins": 1.2334390878677368, "rewards/rejected": -0.48947757482528687, "step": 5510 }, { "epoch": 0.566978513416264, "grad_norm": 28.125, "learning_rate": 4.81151473612063e-07, "logits/chosen": -0.588843047618866, "logits/rejected": -0.5838462710380554, "logps/chosen": -43.08673095703125, "logps/rejected": -52.54121017456055, "loss": 0.3217, "rewards/accuracies": 0.875, "rewards/chosen": 0.8324281573295593, "rewards/margins": 1.8238624334335327, "rewards/rejected": -0.9914342164993286, "step": 5515 }, { "epoch": 0.5674925465199959, "grad_norm": 26.625, "learning_rate": 4.805803061457619e-07, "logits/chosen": -0.5308724045753479, "logits/rejected": -0.6545066237449646, "logps/chosen": -39.12418746948242, "logps/rejected": -52.9316520690918, "loss": 0.3244, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8914023637771606, "rewards/margins": 1.553481101989746, "rewards/rejected": -0.6620787382125854, "step": 5520 }, { "epoch": 0.5680065796237278, "grad_norm": 38.75, "learning_rate": 4.800091386794608e-07, "logits/chosen": -0.5161787271499634, "logits/rejected": -0.6010434031486511, "logps/chosen": -35.66443634033203, "logps/rejected": -48.58409881591797, "loss": 0.3171, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8872359991073608, "rewards/margins": 1.4891257286071777, "rewards/rejected": -0.6018894910812378, "step": 5525 }, { "epoch": 0.5685206127274597, "grad_norm": 29.75, "learning_rate": 4.794379712131597e-07, "logits/chosen": -0.5487528443336487, "logits/rejected": -0.6157029271125793, "logps/chosen": -35.728179931640625, "logps/rejected": -46.430702209472656, "loss": 0.3221, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5062443614006042, "rewards/margins": 1.1394152641296387, "rewards/rejected": -0.633171021938324, "step": 5530 }, { "epoch": 0.5690346458311916, "grad_norm": 33.5, "learning_rate": 4.788668037468585e-07, "logits/chosen": -0.5197395086288452, "logits/rejected": -0.5683005452156067, "logps/chosen": -34.56022262573242, "logps/rejected": -47.91225814819336, "loss": 0.3204, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8521226048469543, "rewards/margins": 1.5269181728363037, "rewards/rejected": -0.6747955083847046, "step": 5535 }, { "epoch": 0.5695486789349234, "grad_norm": 29.25, "learning_rate": 4.782956362805574e-07, "logits/chosen": -0.5171440243721008, "logits/rejected": -0.5464445352554321, "logps/chosen": -39.540931701660156, "logps/rejected": -50.96272659301758, "loss": 0.3143, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8332011103630066, "rewards/margins": 1.5849746465682983, "rewards/rejected": -0.7517735958099365, "step": 5540 }, { "epoch": 0.5700627120386553, "grad_norm": 33.25, "learning_rate": 4.777244688142563e-07, "logits/chosen": -0.6533052921295166, "logits/rejected": -0.6545796990394592, "logps/chosen": -41.46527862548828, "logps/rejected": -48.61058807373047, "loss": 0.3109, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7887009978294373, "rewards/margins": 1.4743165969848633, "rewards/rejected": -0.685615599155426, "step": 5545 }, { "epoch": 0.5705767451423872, "grad_norm": 32.5, "learning_rate": 4.771533013479552e-07, "logits/chosen": -0.586670994758606, "logits/rejected": -0.5917859673500061, "logps/chosen": -38.57280349731445, "logps/rejected": -53.228172302246094, "loss": 0.3174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8182114362716675, "rewards/margins": 1.6159929037094116, "rewards/rejected": -0.7977815270423889, "step": 5550 }, { "epoch": 0.5710907782461191, "grad_norm": 51.75, "learning_rate": 4.765821338816541e-07, "logits/chosen": -0.570526659488678, "logits/rejected": -0.5178820490837097, "logps/chosen": -49.723514556884766, "logps/rejected": -50.413822174072266, "loss": 0.3541, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6955579519271851, "rewards/margins": 1.2517197132110596, "rewards/rejected": -0.5561617612838745, "step": 5555 }, { "epoch": 0.571604811349851, "grad_norm": 33.75, "learning_rate": 4.760109664153529e-07, "logits/chosen": -0.5796928405761719, "logits/rejected": -0.6404918432235718, "logps/chosen": -38.47113800048828, "logps/rejected": -55.165283203125, "loss": 0.3269, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6954489946365356, "rewards/margins": 1.7130407094955444, "rewards/rejected": -1.0175917148590088, "step": 5560 }, { "epoch": 0.5721188444535829, "grad_norm": 28.375, "learning_rate": 4.7543979894905185e-07, "logits/chosen": -0.5537697076797485, "logits/rejected": -0.571895956993103, "logps/chosen": -37.81829071044922, "logps/rejected": -50.72594451904297, "loss": 0.3022, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7509074211120605, "rewards/margins": 1.7374227046966553, "rewards/rejected": -0.9865153431892395, "step": 5565 }, { "epoch": 0.5726328775573147, "grad_norm": 27.125, "learning_rate": 4.7486863148275073e-07, "logits/chosen": -0.5344904065132141, "logits/rejected": -0.5063527822494507, "logps/chosen": -39.64960479736328, "logps/rejected": -52.75616455078125, "loss": 0.3306, "rewards/accuracies": 0.875, "rewards/chosen": 0.6877033710479736, "rewards/margins": 1.6281284093856812, "rewards/rejected": -0.9404250383377075, "step": 5570 }, { "epoch": 0.5731469106610466, "grad_norm": 28.75, "learning_rate": 4.742974640164496e-07, "logits/chosen": -0.4878648817539215, "logits/rejected": -0.4992642402648926, "logps/chosen": -38.08750915527344, "logps/rejected": -55.65885543823242, "loss": 0.3325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7103012800216675, "rewards/margins": 1.6140247583389282, "rewards/rejected": -0.9037235379219055, "step": 5575 }, { "epoch": 0.5736609437647785, "grad_norm": 38.75, "learning_rate": 4.737262965501485e-07, "logits/chosen": -0.5974716544151306, "logits/rejected": -0.6958271265029907, "logps/chosen": -36.764400482177734, "logps/rejected": -54.366310119628906, "loss": 0.329, "rewards/accuracies": 0.875, "rewards/chosen": 0.8676560521125793, "rewards/margins": 1.5921671390533447, "rewards/rejected": -0.7245109677314758, "step": 5580 }, { "epoch": 0.5741749768685104, "grad_norm": 29.875, "learning_rate": 4.7315512908384736e-07, "logits/chosen": -0.5596609115600586, "logits/rejected": -0.6394937634468079, "logps/chosen": -41.77163314819336, "logps/rejected": -57.10003662109375, "loss": 0.3215, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5036495923995972, "rewards/margins": 1.3846503496170044, "rewards/rejected": -0.8810003995895386, "step": 5585 }, { "epoch": 0.5746890099722423, "grad_norm": 56.75, "learning_rate": 4.725839616175463e-07, "logits/chosen": -0.5849025249481201, "logits/rejected": -0.6235945820808411, "logps/chosen": -37.022438049316406, "logps/rejected": -54.692420959472656, "loss": 0.356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4317290186882019, "rewards/margins": 1.1267406940460205, "rewards/rejected": -0.6950114965438843, "step": 5590 }, { "epoch": 0.5752030430759741, "grad_norm": 30.0, "learning_rate": 4.720127941512451e-07, "logits/chosen": -0.6783806085586548, "logits/rejected": -0.6566216349601746, "logps/chosen": -33.78038024902344, "logps/rejected": -55.25464630126953, "loss": 0.3318, "rewards/accuracies": 0.875, "rewards/chosen": 0.8490379452705383, "rewards/margins": 1.7149345874786377, "rewards/rejected": -0.8658965826034546, "step": 5595 }, { "epoch": 0.575717076179706, "grad_norm": 38.25, "learning_rate": 4.7144162668494405e-07, "logits/chosen": -0.47051483392715454, "logits/rejected": -0.5466389656066895, "logps/chosen": -41.17162322998047, "logps/rejected": -53.72459030151367, "loss": 0.316, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7506638169288635, "rewards/margins": 1.8334366083145142, "rewards/rejected": -1.0827728509902954, "step": 5600 }, { "epoch": 0.5762311092834379, "grad_norm": 34.0, "learning_rate": 4.7087045921864287e-07, "logits/chosen": -0.5577563047409058, "logits/rejected": -0.5886602401733398, "logps/chosen": -33.696533203125, "logps/rejected": -54.000694274902344, "loss": 0.3412, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7679862976074219, "rewards/margins": 1.7642335891723633, "rewards/rejected": -0.9962472915649414, "step": 5605 }, { "epoch": 0.5767451423871698, "grad_norm": 24.75, "learning_rate": 4.702992917523418e-07, "logits/chosen": -0.5901495814323425, "logits/rejected": -0.6381442546844482, "logps/chosen": -39.261268615722656, "logps/rejected": -50.67091369628906, "loss": 0.3252, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6713961362838745, "rewards/margins": 1.4824483394622803, "rewards/rejected": -0.811052143573761, "step": 5610 }, { "epoch": 0.5772591754909017, "grad_norm": 26.375, "learning_rate": 4.6972812428604063e-07, "logits/chosen": -0.6187806725502014, "logits/rejected": -0.6792791485786438, "logps/chosen": -36.127197265625, "logps/rejected": -55.559417724609375, "loss": 0.2989, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6338039636611938, "rewards/margins": 1.703683853149414, "rewards/rejected": -1.0698798894882202, "step": 5615 }, { "epoch": 0.5777732085946335, "grad_norm": 40.75, "learning_rate": 4.6915695681973956e-07, "logits/chosen": -0.6084673404693604, "logits/rejected": -0.6116921901702881, "logps/chosen": -38.646324157714844, "logps/rejected": -54.52901077270508, "loss": 0.326, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.91680908203125, "rewards/margins": 1.6888208389282227, "rewards/rejected": -0.7720116376876831, "step": 5620 }, { "epoch": 0.5782872416983654, "grad_norm": 35.75, "learning_rate": 4.685857893534384e-07, "logits/chosen": -0.5510770678520203, "logits/rejected": -0.5721169710159302, "logps/chosen": -38.67646026611328, "logps/rejected": -54.8140754699707, "loss": 0.313, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7853614091873169, "rewards/margins": 1.7177422046661377, "rewards/rejected": -0.9323806762695312, "step": 5625 }, { "epoch": 0.5788012748020973, "grad_norm": 28.0, "learning_rate": 4.680146218871373e-07, "logits/chosen": -0.5691109299659729, "logits/rejected": -0.5796974897384644, "logps/chosen": -34.85259246826172, "logps/rejected": -51.0848503112793, "loss": 0.3105, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9834505915641785, "rewards/margins": 1.8076159954071045, "rewards/rejected": -0.8241653442382812, "step": 5630 }, { "epoch": 0.5793153079058291, "grad_norm": 28.375, "learning_rate": 4.6744345442083614e-07, "logits/chosen": -0.48538702726364136, "logits/rejected": -0.5178495645523071, "logps/chosen": -37.384212493896484, "logps/rejected": -49.84410858154297, "loss": 0.3339, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8668580055236816, "rewards/margins": 1.720973014831543, "rewards/rejected": -0.8541151285171509, "step": 5635 }, { "epoch": 0.579829341009561, "grad_norm": 29.5, "learning_rate": 4.6687228695453507e-07, "logits/chosen": -0.6420586705207825, "logits/rejected": -0.639344334602356, "logps/chosen": -37.513938903808594, "logps/rejected": -49.810874938964844, "loss": 0.3126, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.809395432472229, "rewards/margins": 1.6327040195465088, "rewards/rejected": -0.8233085870742798, "step": 5640 }, { "epoch": 0.5803433741132928, "grad_norm": 20.875, "learning_rate": 4.6630111948823394e-07, "logits/chosen": -0.44498538970947266, "logits/rejected": -0.48956623673439026, "logps/chosen": -35.95832824707031, "logps/rejected": -46.01980972290039, "loss": 0.288, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7488136887550354, "rewards/margins": 1.5328102111816406, "rewards/rejected": -0.7839964628219604, "step": 5645 }, { "epoch": 0.5808574072170247, "grad_norm": 27.25, "learning_rate": 4.657299520219328e-07, "logits/chosen": -0.6363785266876221, "logits/rejected": -0.5631476044654846, "logps/chosen": -39.401512145996094, "logps/rejected": -53.449188232421875, "loss": 0.3067, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7731329798698425, "rewards/margins": 1.5837217569351196, "rewards/rejected": -0.8105886578559875, "step": 5650 }, { "epoch": 0.5813714403207566, "grad_norm": 37.0, "learning_rate": 4.651587845556317e-07, "logits/chosen": -0.5819973945617676, "logits/rejected": -0.5723029971122742, "logps/chosen": -36.02107238769531, "logps/rejected": -52.14338302612305, "loss": 0.3183, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9124771356582642, "rewards/margins": 1.7731910943984985, "rewards/rejected": -0.8607139587402344, "step": 5655 }, { "epoch": 0.5818854734244885, "grad_norm": 28.625, "learning_rate": 4.645876170893306e-07, "logits/chosen": -0.5487987995147705, "logits/rejected": -0.597594678401947, "logps/chosen": -37.172706604003906, "logps/rejected": -48.764305114746094, "loss": 0.2756, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7913802862167358, "rewards/margins": 1.6435766220092773, "rewards/rejected": -0.8521963357925415, "step": 5660 }, { "epoch": 0.5823995065282204, "grad_norm": 32.75, "learning_rate": 4.6401644962302945e-07, "logits/chosen": -0.590233325958252, "logits/rejected": -0.6453453898429871, "logps/chosen": -40.05573272705078, "logps/rejected": -51.57451629638672, "loss": 0.3604, "rewards/accuracies": 0.875, "rewards/chosen": 1.084173560142517, "rewards/margins": 1.6982505321502686, "rewards/rejected": -0.6140770316123962, "step": 5665 }, { "epoch": 0.5829135396319522, "grad_norm": 26.875, "learning_rate": 4.6344528215672833e-07, "logits/chosen": -0.5507546067237854, "logits/rejected": -0.6035882234573364, "logps/chosen": -36.05913543701172, "logps/rejected": -53.1547966003418, "loss": 0.3063, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5562926530838013, "rewards/margins": 1.6268428564071655, "rewards/rejected": -1.0705502033233643, "step": 5670 }, { "epoch": 0.5834275727356841, "grad_norm": 32.5, "learning_rate": 4.628741146904272e-07, "logits/chosen": -0.5206809639930725, "logits/rejected": -0.5592479109764099, "logps/chosen": -41.59159469604492, "logps/rejected": -48.48657989501953, "loss": 0.3364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7500337362289429, "rewards/margins": 1.2498975992202759, "rewards/rejected": -0.49986380338668823, "step": 5675 }, { "epoch": 0.583941605839416, "grad_norm": 29.25, "learning_rate": 4.623029472241261e-07, "logits/chosen": -0.5433398485183716, "logits/rejected": -0.5875434279441833, "logps/chosen": -34.39576721191406, "logps/rejected": -48.04349136352539, "loss": 0.3503, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.8571073412895203, "rewards/margins": 1.4047021865844727, "rewards/rejected": -0.5475948452949524, "step": 5680 }, { "epoch": 0.5844556389431479, "grad_norm": 29.375, "learning_rate": 4.6173177975782496e-07, "logits/chosen": -0.5359436273574829, "logits/rejected": -0.49221786856651306, "logps/chosen": -39.17573928833008, "logps/rejected": -51.51544189453125, "loss": 0.3365, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5645514726638794, "rewards/margins": 1.382960557937622, "rewards/rejected": -0.8184091448783875, "step": 5685 }, { "epoch": 0.5849696720468798, "grad_norm": 26.375, "learning_rate": 4.611606122915239e-07, "logits/chosen": -0.572421133518219, "logits/rejected": -0.588851809501648, "logps/chosen": -34.78474426269531, "logps/rejected": -50.267295837402344, "loss": 0.3377, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6074295043945312, "rewards/margins": 1.5850121974945068, "rewards/rejected": -0.9775827527046204, "step": 5690 }, { "epoch": 0.5854837051506117, "grad_norm": 29.75, "learning_rate": 4.605894448252227e-07, "logits/chosen": -0.4973490834236145, "logits/rejected": -0.5848850011825562, "logps/chosen": -36.815216064453125, "logps/rejected": -51.70698928833008, "loss": 0.3093, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7988650798797607, "rewards/margins": 1.5875505208969116, "rewards/rejected": -0.7886855602264404, "step": 5695 }, { "epoch": 0.5859977382543435, "grad_norm": 46.0, "learning_rate": 4.6001827735892164e-07, "logits/chosen": -0.6384389400482178, "logits/rejected": -0.701396107673645, "logps/chosen": -48.252174377441406, "logps/rejected": -61.52714920043945, "loss": 0.3326, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8036373853683472, "rewards/margins": 1.737622857093811, "rewards/rejected": -0.9339853525161743, "step": 5700 }, { "epoch": 0.5865117713580754, "grad_norm": 28.0, "learning_rate": 4.5944710989262047e-07, "logits/chosen": -0.5591861009597778, "logits/rejected": -0.5968291759490967, "logps/chosen": -36.32762145996094, "logps/rejected": -46.523868560791016, "loss": 0.3298, "rewards/accuracies": 0.875, "rewards/chosen": 0.6983939409255981, "rewards/margins": 1.250826120376587, "rewards/rejected": -0.5524321794509888, "step": 5705 }, { "epoch": 0.5870258044618073, "grad_norm": 39.25, "learning_rate": 4.588759424263194e-07, "logits/chosen": -0.5991836190223694, "logits/rejected": -0.6248829364776611, "logps/chosen": -34.06307601928711, "logps/rejected": -50.95419692993164, "loss": 0.2813, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7387692332267761, "rewards/margins": 1.911855936050415, "rewards/rejected": -1.1730866432189941, "step": 5710 }, { "epoch": 0.5875398375655392, "grad_norm": 30.125, "learning_rate": 4.583047749600182e-07, "logits/chosen": -0.6902505159378052, "logits/rejected": -0.69324791431427, "logps/chosen": -38.02500534057617, "logps/rejected": -52.131317138671875, "loss": 0.3216, "rewards/accuracies": 0.875, "rewards/chosen": 0.8191086649894714, "rewards/margins": 1.758474588394165, "rewards/rejected": -0.9393658638000488, "step": 5715 }, { "epoch": 0.5880538706692711, "grad_norm": 37.25, "learning_rate": 4.5773360749371715e-07, "logits/chosen": -0.6120063662528992, "logits/rejected": -0.5579059720039368, "logps/chosen": -40.97193145751953, "logps/rejected": -49.432281494140625, "loss": 0.3372, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7539774179458618, "rewards/margins": 1.472055196762085, "rewards/rejected": -0.7180777788162231, "step": 5720 }, { "epoch": 0.5885679037730029, "grad_norm": 28.25, "learning_rate": 4.57162440027416e-07, "logits/chosen": -0.600927472114563, "logits/rejected": -0.6537433862686157, "logps/chosen": -38.52452850341797, "logps/rejected": -55.95948028564453, "loss": 0.288, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9025455713272095, "rewards/margins": 1.793055534362793, "rewards/rejected": -0.8905099630355835, "step": 5725 }, { "epoch": 0.5890819368767348, "grad_norm": 29.0, "learning_rate": 4.565912725611149e-07, "logits/chosen": -0.5388263463973999, "logits/rejected": -0.5707365274429321, "logps/chosen": -36.942779541015625, "logps/rejected": -45.73963928222656, "loss": 0.356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.44646376371383667, "rewards/margins": 1.0179989337921143, "rewards/rejected": -0.571535050868988, "step": 5730 }, { "epoch": 0.5895959699804667, "grad_norm": 33.5, "learning_rate": 4.5602010509481373e-07, "logits/chosen": -0.5486391186714172, "logits/rejected": -0.6316149234771729, "logps/chosen": -35.28985595703125, "logps/rejected": -51.277610778808594, "loss": 0.3085, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8824718594551086, "rewards/margins": 1.7628040313720703, "rewards/rejected": -0.880332350730896, "step": 5735 }, { "epoch": 0.5901100030841986, "grad_norm": 32.5, "learning_rate": 4.5544893762851266e-07, "logits/chosen": -0.5082904100418091, "logits/rejected": -0.5554991960525513, "logps/chosen": -34.71894073486328, "logps/rejected": -44.83824920654297, "loss": 0.3114, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7456015348434448, "rewards/margins": 1.2740949392318726, "rewards/rejected": -0.5284934043884277, "step": 5740 }, { "epoch": 0.5906240361879305, "grad_norm": 38.25, "learning_rate": 4.5487777016221154e-07, "logits/chosen": -0.4042229652404785, "logits/rejected": -0.5384402275085449, "logps/chosen": -34.1379280090332, "logps/rejected": -52.34333419799805, "loss": 0.3733, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6929291486740112, "rewards/margins": 1.6220424175262451, "rewards/rejected": -0.9291130900382996, "step": 5745 }, { "epoch": 0.5911380692916623, "grad_norm": 30.25, "learning_rate": 4.543066026959104e-07, "logits/chosen": -0.5701355934143066, "logits/rejected": -0.5603810548782349, "logps/chosen": -34.70872116088867, "logps/rejected": -51.6602897644043, "loss": 0.3205, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0210683345794678, "rewards/margins": 2.0287744998931885, "rewards/rejected": -1.0077059268951416, "step": 5750 }, { "epoch": 0.5916521023953942, "grad_norm": 48.75, "learning_rate": 4.537354352296093e-07, "logits/chosen": -0.4886329770088196, "logits/rejected": -0.5388873219490051, "logps/chosen": -37.20477294921875, "logps/rejected": -50.823204040527344, "loss": 0.3202, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5636073350906372, "rewards/margins": 1.096839189529419, "rewards/rejected": -0.5332318544387817, "step": 5755 }, { "epoch": 0.5921661354991261, "grad_norm": 28.375, "learning_rate": 4.5316426776330817e-07, "logits/chosen": -0.5156601071357727, "logits/rejected": -0.5228789448738098, "logps/chosen": -36.5161247253418, "logps/rejected": -49.82749557495117, "loss": 0.2993, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0172038078308105, "rewards/margins": 1.6145572662353516, "rewards/rejected": -0.597353458404541, "step": 5760 }, { "epoch": 0.592680168602858, "grad_norm": 35.0, "learning_rate": 4.5259310029700705e-07, "logits/chosen": -0.6210511922836304, "logits/rejected": -0.6319299936294556, "logps/chosen": -36.26023864746094, "logps/rejected": -51.813011169433594, "loss": 0.3104, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8181589245796204, "rewards/margins": 1.8537944555282593, "rewards/rejected": -1.0356357097625732, "step": 5765 }, { "epoch": 0.5931942017065899, "grad_norm": 31.375, "learning_rate": 4.520219328307059e-07, "logits/chosen": -0.5945835709571838, "logits/rejected": -0.5807428359985352, "logps/chosen": -32.567108154296875, "logps/rejected": -57.139503479003906, "loss": 0.3119, "rewards/accuracies": 0.875, "rewards/chosen": 0.6717483401298523, "rewards/margins": 1.7341525554656982, "rewards/rejected": -1.0624043941497803, "step": 5770 }, { "epoch": 0.5937082348103218, "grad_norm": 37.0, "learning_rate": 4.5145076536440485e-07, "logits/chosen": -0.6375588774681091, "logits/rejected": -0.6290777921676636, "logps/chosen": -35.359413146972656, "logps/rejected": -50.382972717285156, "loss": 0.3268, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8796683549880981, "rewards/margins": 1.571098804473877, "rewards/rejected": -0.6914306879043579, "step": 5775 }, { "epoch": 0.5942222679140536, "grad_norm": 31.875, "learning_rate": 4.508795978981037e-07, "logits/chosen": -0.5019595623016357, "logits/rejected": -0.5634509325027466, "logps/chosen": -36.06718444824219, "logps/rejected": -52.53822708129883, "loss": 0.3005, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7706364393234253, "rewards/margins": 1.758863091468811, "rewards/rejected": -0.9882267117500305, "step": 5780 }, { "epoch": 0.5947363010177855, "grad_norm": 42.75, "learning_rate": 4.503084304318026e-07, "logits/chosen": -0.5776488184928894, "logits/rejected": -0.5224560499191284, "logps/chosen": -41.63819122314453, "logps/rejected": -54.2601432800293, "loss": 0.3024, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8094083666801453, "rewards/margins": 1.916764497756958, "rewards/rejected": -1.1073559522628784, "step": 5785 }, { "epoch": 0.5952503341215174, "grad_norm": 24.625, "learning_rate": 4.497372629655015e-07, "logits/chosen": -0.560250997543335, "logits/rejected": -0.592552125453949, "logps/chosen": -39.934078216552734, "logps/rejected": -50.33665084838867, "loss": 0.3065, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6809595227241516, "rewards/margins": 1.6504713296890259, "rewards/rejected": -0.9695118069648743, "step": 5790 }, { "epoch": 0.5957643672252493, "grad_norm": 34.25, "learning_rate": 4.4916609549920036e-07, "logits/chosen": -0.5329676270484924, "logits/rejected": -0.5811724662780762, "logps/chosen": -38.88140869140625, "logps/rejected": -53.360748291015625, "loss": 0.3073, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7180728912353516, "rewards/margins": 1.2674694061279297, "rewards/rejected": -0.5493965744972229, "step": 5795 }, { "epoch": 0.5962784003289812, "grad_norm": 28.625, "learning_rate": 4.4859492803289924e-07, "logits/chosen": -0.6464225649833679, "logits/rejected": -0.6312161087989807, "logps/chosen": -36.981407165527344, "logps/rejected": -47.73630905151367, "loss": 0.3028, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7175685167312622, "rewards/margins": 1.435533881187439, "rewards/rejected": -0.7179654240608215, "step": 5800 }, { "epoch": 0.596792433432713, "grad_norm": 48.75, "learning_rate": 4.480237605665981e-07, "logits/chosen": -0.5975769758224487, "logits/rejected": -0.6528566479682922, "logps/chosen": -35.51677322387695, "logps/rejected": -51.25379180908203, "loss": 0.3439, "rewards/accuracies": 0.875, "rewards/chosen": 0.7710954546928406, "rewards/margins": 1.5899103879928589, "rewards/rejected": -0.8188151121139526, "step": 5805 }, { "epoch": 0.5973064665364449, "grad_norm": 56.0, "learning_rate": 4.47452593100297e-07, "logits/chosen": -0.609265923500061, "logits/rejected": -0.6302827596664429, "logps/chosen": -43.41059494018555, "logps/rejected": -54.17730712890625, "loss": 0.2877, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8186618685722351, "rewards/margins": 1.84805428981781, "rewards/rejected": -1.0293926000595093, "step": 5810 }, { "epoch": 0.5978204996401768, "grad_norm": 36.75, "learning_rate": 4.4688142563399587e-07, "logits/chosen": -0.5216805934906006, "logits/rejected": -0.5947023630142212, "logps/chosen": -36.69818115234375, "logps/rejected": -46.6397705078125, "loss": 0.3322, "rewards/accuracies": 0.875, "rewards/chosen": 0.6854179501533508, "rewards/margins": 1.2962243556976318, "rewards/rejected": -0.6108065247535706, "step": 5815 }, { "epoch": 0.5983345327439087, "grad_norm": 40.0, "learning_rate": 4.4631025816769475e-07, "logits/chosen": -0.5968950986862183, "logits/rejected": -0.6211062073707581, "logps/chosen": -37.47200012207031, "logps/rejected": -51.211402893066406, "loss": 0.3313, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5791361927986145, "rewards/margins": 1.073500394821167, "rewards/rejected": -0.4943644106388092, "step": 5820 }, { "epoch": 0.5988485658476406, "grad_norm": 29.625, "learning_rate": 4.4573909070139363e-07, "logits/chosen": -0.5500258803367615, "logits/rejected": -0.5677153468132019, "logps/chosen": -44.309791564941406, "logps/rejected": -49.00257110595703, "loss": 0.3564, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6163984537124634, "rewards/margins": 1.4163744449615479, "rewards/rejected": -0.799976110458374, "step": 5825 }, { "epoch": 0.5993625989513724, "grad_norm": 31.0, "learning_rate": 4.451679232350925e-07, "logits/chosen": -0.4720117151737213, "logits/rejected": -0.5499857664108276, "logps/chosen": -36.925235748291016, "logps/rejected": -50.35525894165039, "loss": 0.3323, "rewards/accuracies": 0.875, "rewards/chosen": 0.54684978723526, "rewards/margins": 1.3751556873321533, "rewards/rejected": -0.828305721282959, "step": 5830 }, { "epoch": 0.5998766320551043, "grad_norm": 24.625, "learning_rate": 4.4459675576879143e-07, "logits/chosen": -0.5849648118019104, "logits/rejected": -0.6099086403846741, "logps/chosen": -37.233055114746094, "logps/rejected": -49.04412841796875, "loss": 0.3365, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.838725745677948, "rewards/margins": 1.5005784034729004, "rewards/rejected": -0.6618525981903076, "step": 5835 }, { "epoch": 0.6001850519173435, "eval_logits/chosen": -0.5372914671897888, "eval_logits/rejected": -0.5974280834197998, "eval_logps/chosen": -77.66390228271484, "eval_logps/rejected": -54.96421432495117, "eval_loss": 0.315388560295105, "eval_rewards/accuracies": 0.8676470518112183, "eval_rewards/chosen": 0.7244707942008972, "eval_rewards/margins": 1.5866512060165405, "eval_rewards/rejected": -0.862180233001709, "eval_runtime": 2.1096, "eval_samples_per_second": 507.21, "eval_steps_per_second": 8.058, "step": 5838 }, { "epoch": 0.6003906651588362, "grad_norm": 22.625, "learning_rate": 4.4402558830249026e-07, "logits/chosen": -0.5263804197311401, "logits/rejected": -0.5808123350143433, "logps/chosen": -37.491737365722656, "logps/rejected": -55.09637451171875, "loss": 0.2853, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7548818588256836, "rewards/margins": 1.8998931646347046, "rewards/rejected": -1.145011305809021, "step": 5840 }, { "epoch": 0.6009046982625681, "grad_norm": 30.5, "learning_rate": 4.434544208361892e-07, "logits/chosen": -0.5250980257987976, "logits/rejected": -0.6179174184799194, "logps/chosen": -42.659942626953125, "logps/rejected": -53.50602340698242, "loss": 0.2817, "rewards/accuracies": 0.875, "rewards/chosen": 0.7824276089668274, "rewards/margins": 1.4850554466247559, "rewards/rejected": -0.7026278376579285, "step": 5845 }, { "epoch": 0.6014187313663, "grad_norm": 33.5, "learning_rate": 4.42883253369888e-07, "logits/chosen": -0.5217467546463013, "logits/rejected": -0.5952775478363037, "logps/chosen": -32.532413482666016, "logps/rejected": -48.85432052612305, "loss": 0.3261, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7468265295028687, "rewards/margins": 1.533926010131836, "rewards/rejected": -0.7870994806289673, "step": 5850 }, { "epoch": 0.6019327644700319, "grad_norm": 29.875, "learning_rate": 4.4231208590358694e-07, "logits/chosen": -0.5751782655715942, "logits/rejected": -0.5375757813453674, "logps/chosen": -41.20939254760742, "logps/rejected": -53.3175048828125, "loss": 0.3141, "rewards/accuracies": 0.875, "rewards/chosen": 0.7107683420181274, "rewards/margins": 1.3370023965835571, "rewards/rejected": -0.6262339949607849, "step": 5855 }, { "epoch": 0.6024467975737637, "grad_norm": 44.0, "learning_rate": 4.4174091843728577e-07, "logits/chosen": -0.47073474526405334, "logits/rejected": -0.5425187349319458, "logps/chosen": -33.564674377441406, "logps/rejected": -51.9908561706543, "loss": 0.329, "rewards/accuracies": 0.875, "rewards/chosen": 0.6756154298782349, "rewards/margins": 1.3989254236221313, "rewards/rejected": -0.7233098745346069, "step": 5860 }, { "epoch": 0.6029608306774956, "grad_norm": 32.0, "learning_rate": 4.411697509709847e-07, "logits/chosen": -0.5293257832527161, "logits/rejected": -0.5992894172668457, "logps/chosen": -39.66236114501953, "logps/rejected": -53.56208419799805, "loss": 0.3013, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5979223847389221, "rewards/margins": 1.5308761596679688, "rewards/rejected": -0.9329537153244019, "step": 5865 }, { "epoch": 0.6034748637812275, "grad_norm": 27.5, "learning_rate": 4.405985835046835e-07, "logits/chosen": -0.5547787547111511, "logits/rejected": -0.5664105415344238, "logps/chosen": -39.39262771606445, "logps/rejected": -50.81238555908203, "loss": 0.3143, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.673577606678009, "rewards/margins": 1.5283100605010986, "rewards/rejected": -0.8547323346138, "step": 5870 }, { "epoch": 0.6039888968849594, "grad_norm": 34.25, "learning_rate": 4.4002741603838245e-07, "logits/chosen": -0.5584791302680969, "logits/rejected": -0.5575534105300903, "logps/chosen": -37.787071228027344, "logps/rejected": -50.216827392578125, "loss": 0.2947, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.729153037071228, "rewards/margins": 1.3556063175201416, "rewards/rejected": -0.6264531016349792, "step": 5875 }, { "epoch": 0.6045029299886913, "grad_norm": 32.0, "learning_rate": 4.394562485720813e-07, "logits/chosen": -0.5500592589378357, "logits/rejected": -0.5715928077697754, "logps/chosen": -39.387306213378906, "logps/rejected": -50.51360321044922, "loss": 0.3613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.49749523401260376, "rewards/margins": 1.1955480575561523, "rewards/rejected": -0.6980527639389038, "step": 5880 }, { "epoch": 0.6050169630924231, "grad_norm": 27.0, "learning_rate": 4.388850811057802e-07, "logits/chosen": -0.5412956476211548, "logits/rejected": -0.5706241726875305, "logps/chosen": -40.293880462646484, "logps/rejected": -52.09435272216797, "loss": 0.3221, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5198123455047607, "rewards/margins": 1.4516665935516357, "rewards/rejected": -0.9318543672561646, "step": 5885 }, { "epoch": 0.605530996196155, "grad_norm": 28.25, "learning_rate": 4.383139136394791e-07, "logits/chosen": -0.5697931051254272, "logits/rejected": -0.6227530837059021, "logps/chosen": -37.47074890136719, "logps/rejected": -51.73308181762695, "loss": 0.3176, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7742520570755005, "rewards/margins": 1.2271164655685425, "rewards/rejected": -0.45286455750465393, "step": 5890 }, { "epoch": 0.6060450292998869, "grad_norm": 29.375, "learning_rate": 4.3774274617317796e-07, "logits/chosen": -0.6259880661964417, "logits/rejected": -0.6720592975616455, "logps/chosen": -37.598487854003906, "logps/rejected": -51.107364654541016, "loss": 0.3125, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9229871034622192, "rewards/margins": 1.3839541673660278, "rewards/rejected": -0.4609670639038086, "step": 5895 }, { "epoch": 0.6065590624036188, "grad_norm": 32.75, "learning_rate": 4.3717157870687684e-07, "logits/chosen": -0.5854833722114563, "logits/rejected": -0.5940893292427063, "logps/chosen": -39.788352966308594, "logps/rejected": -52.90739059448242, "loss": 0.3649, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.748259425163269, "rewards/margins": 1.2980797290802002, "rewards/rejected": -0.5498202443122864, "step": 5900 }, { "epoch": 0.6070730955073507, "grad_norm": 30.875, "learning_rate": 4.366004112405757e-07, "logits/chosen": -0.5443757772445679, "logits/rejected": -0.5546153783798218, "logps/chosen": -40.674320220947266, "logps/rejected": -52.96555709838867, "loss": 0.3399, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.817864716053009, "rewards/margins": 1.5276644229888916, "rewards/rejected": -0.7097996473312378, "step": 5905 }, { "epoch": 0.6075871286110825, "grad_norm": 26.625, "learning_rate": 4.360292437742746e-07, "logits/chosen": -0.5717006325721741, "logits/rejected": -0.6108525991439819, "logps/chosen": -41.74953842163086, "logps/rejected": -56.275726318359375, "loss": 0.3311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.63818359375, "rewards/margins": 1.5031770467758179, "rewards/rejected": -0.8649934530258179, "step": 5910 }, { "epoch": 0.6081011617148144, "grad_norm": 26.5, "learning_rate": 4.3545807630797347e-07, "logits/chosen": -0.6623461246490479, "logits/rejected": -0.612859845161438, "logps/chosen": -43.883445739746094, "logps/rejected": -51.250389099121094, "loss": 0.3029, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5314711332321167, "rewards/margins": 1.4093387126922607, "rewards/rejected": -0.8778674006462097, "step": 5915 }, { "epoch": 0.6086151948185463, "grad_norm": 36.0, "learning_rate": 4.3488690884167235e-07, "logits/chosen": -0.577355682849884, "logits/rejected": -0.6592684984207153, "logps/chosen": -37.709686279296875, "logps/rejected": -56.0356559753418, "loss": 0.3319, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9365561604499817, "rewards/margins": 1.8824800252914429, "rewards/rejected": -0.9459236860275269, "step": 5920 }, { "epoch": 0.6091292279222782, "grad_norm": 33.0, "learning_rate": 4.343157413753712e-07, "logits/chosen": -0.5865254402160645, "logits/rejected": -0.6491035223007202, "logps/chosen": -38.21077346801758, "logps/rejected": -52.44469451904297, "loss": 0.3166, "rewards/accuracies": 0.875, "rewards/chosen": 0.8613814115524292, "rewards/margins": 1.6392061710357666, "rewards/rejected": -0.7778247594833374, "step": 5925 }, { "epoch": 0.6096432610260101, "grad_norm": 28.5, "learning_rate": 4.337445739090701e-07, "logits/chosen": -0.4485880434513092, "logits/rejected": -0.4972618520259857, "logps/chosen": -40.62041473388672, "logps/rejected": -44.76218032836914, "loss": 0.3097, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6329669952392578, "rewards/margins": 1.1085193157196045, "rewards/rejected": -0.47555217146873474, "step": 5930 }, { "epoch": 0.610157294129742, "grad_norm": 27.875, "learning_rate": 4.3317340644276903e-07, "logits/chosen": -0.5585506558418274, "logits/rejected": -0.5723002552986145, "logps/chosen": -31.862468719482422, "logps/rejected": -48.41667556762695, "loss": 0.301, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8942328691482544, "rewards/margins": 1.786250352859497, "rewards/rejected": -0.8920172452926636, "step": 5935 }, { "epoch": 0.6106713272334738, "grad_norm": 36.75, "learning_rate": 4.3260223897646785e-07, "logits/chosen": -0.6209938526153564, "logits/rejected": -0.6433176398277283, "logps/chosen": -39.60226058959961, "logps/rejected": -52.45722198486328, "loss": 0.2936, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.717507541179657, "rewards/margins": 1.6354148387908936, "rewards/rejected": -0.9179073572158813, "step": 5940 }, { "epoch": 0.6111853603372057, "grad_norm": 33.75, "learning_rate": 4.320310715101668e-07, "logits/chosen": -0.534376859664917, "logits/rejected": -0.606226921081543, "logps/chosen": -36.86358642578125, "logps/rejected": -57.28660202026367, "loss": 0.273, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.827008068561554, "rewards/margins": 1.9460623264312744, "rewards/rejected": -1.1190540790557861, "step": 5945 }, { "epoch": 0.6116993934409376, "grad_norm": 41.25, "learning_rate": 4.314599040438656e-07, "logits/chosen": -0.6448413133621216, "logits/rejected": -0.6436132192611694, "logps/chosen": -40.77336883544922, "logps/rejected": -47.82203674316406, "loss": 0.3613, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5910572409629822, "rewards/margins": 1.1409618854522705, "rewards/rejected": -0.5499045848846436, "step": 5950 }, { "epoch": 0.6122134265446695, "grad_norm": 28.0, "learning_rate": 4.3088873657756454e-07, "logits/chosen": -0.5786934494972229, "logits/rejected": -0.6023791432380676, "logps/chosen": -40.02457046508789, "logps/rejected": -53.13640213012695, "loss": 0.329, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8107911944389343, "rewards/margins": 1.9430803060531616, "rewards/rejected": -1.1322892904281616, "step": 5955 }, { "epoch": 0.6127274596484014, "grad_norm": 34.25, "learning_rate": 4.3031756911126336e-07, "logits/chosen": -0.5824376344680786, "logits/rejected": -0.6184274554252625, "logps/chosen": -36.7752799987793, "logps/rejected": -55.8281364440918, "loss": 0.3251, "rewards/accuracies": 0.875, "rewards/chosen": 0.7871125936508179, "rewards/margins": 1.491281270980835, "rewards/rejected": -0.7041687965393066, "step": 5960 }, { "epoch": 0.6132414927521332, "grad_norm": 40.0, "learning_rate": 4.297464016449623e-07, "logits/chosen": -0.6167775988578796, "logits/rejected": -0.7190467119216919, "logps/chosen": -41.60444641113281, "logps/rejected": -51.963714599609375, "loss": 0.3339, "rewards/accuracies": 0.875, "rewards/chosen": 0.7057298421859741, "rewards/margins": 1.4603655338287354, "rewards/rejected": -0.7546356916427612, "step": 5965 }, { "epoch": 0.6137555258558651, "grad_norm": 39.75, "learning_rate": 4.291752341786611e-07, "logits/chosen": -0.5506775975227356, "logits/rejected": -0.5885018110275269, "logps/chosen": -37.81081008911133, "logps/rejected": -44.22455596923828, "loss": 0.341, "rewards/accuracies": 0.875, "rewards/chosen": 0.7869201898574829, "rewards/margins": 1.2683658599853516, "rewards/rejected": -0.4814457893371582, "step": 5970 }, { "epoch": 0.614269558959597, "grad_norm": 26.25, "learning_rate": 4.2860406671236005e-07, "logits/chosen": -0.582465648651123, "logits/rejected": -0.6106687784194946, "logps/chosen": -42.67621994018555, "logps/rejected": -55.17253494262695, "loss": 0.3213, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6583107113838196, "rewards/margins": 1.6979186534881592, "rewards/rejected": -1.0396078824996948, "step": 5975 }, { "epoch": 0.6147835920633289, "grad_norm": 32.0, "learning_rate": 4.2803289924605887e-07, "logits/chosen": -0.4766151010990143, "logits/rejected": -0.5167249441146851, "logps/chosen": -37.74381637573242, "logps/rejected": -45.77138137817383, "loss": 0.3145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7729898691177368, "rewards/margins": 1.3293672800064087, "rewards/rejected": -0.5563774704933167, "step": 5980 }, { "epoch": 0.6152976251670608, "grad_norm": 29.0, "learning_rate": 4.274617317797578e-07, "logits/chosen": -0.5770729184150696, "logits/rejected": -0.5821505188941956, "logps/chosen": -38.235069274902344, "logps/rejected": -52.86005783081055, "loss": 0.3174, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.661395251750946, "rewards/margins": 1.4955204725265503, "rewards/rejected": -0.8341252207756042, "step": 5985 }, { "epoch": 0.6158116582707927, "grad_norm": 27.25, "learning_rate": 4.2689056431345673e-07, "logits/chosen": -0.49685317277908325, "logits/rejected": -0.600597083568573, "logps/chosen": -37.09367370605469, "logps/rejected": -51.50550079345703, "loss": 0.3222, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6791023015975952, "rewards/margins": 1.6911300420761108, "rewards/rejected": -1.0120275020599365, "step": 5990 }, { "epoch": 0.6163256913745245, "grad_norm": 28.125, "learning_rate": 4.2631939684715556e-07, "logits/chosen": -0.602433443069458, "logits/rejected": -0.6733653545379639, "logps/chosen": -38.4224853515625, "logps/rejected": -54.27231979370117, "loss": 0.2939, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.873740553855896, "rewards/margins": 1.9360977411270142, "rewards/rejected": -1.0623571872711182, "step": 5995 }, { "epoch": 0.6168397244782564, "grad_norm": 37.5, "learning_rate": 4.257482293808545e-07, "logits/chosen": -0.5776938796043396, "logits/rejected": -0.5661753416061401, "logps/chosen": -36.924102783203125, "logps/rejected": -54.44993209838867, "loss": 0.3549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8130871057510376, "rewards/margins": 1.4818085432052612, "rewards/rejected": -0.668721616268158, "step": 6000 }, { "epoch": 0.6173537575819883, "grad_norm": 27.625, "learning_rate": 4.251770619145533e-07, "logits/chosen": -0.669303297996521, "logits/rejected": -0.675082802772522, "logps/chosen": -43.56365966796875, "logps/rejected": -58.138214111328125, "loss": 0.3287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7759415507316589, "rewards/margins": 1.8586164712905884, "rewards/rejected": -1.0826747417449951, "step": 6005 }, { "epoch": 0.6178677906857202, "grad_norm": 29.5, "learning_rate": 4.2460589444825224e-07, "logits/chosen": -0.4850102961063385, "logits/rejected": -0.500055730342865, "logps/chosen": -42.574134826660156, "logps/rejected": -53.1622200012207, "loss": 0.3273, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8961049914360046, "rewards/margins": 1.7877403497695923, "rewards/rejected": -0.8916352987289429, "step": 6010 }, { "epoch": 0.6183818237894521, "grad_norm": 43.25, "learning_rate": 4.2403472698195107e-07, "logits/chosen": -0.613563060760498, "logits/rejected": -0.6142324209213257, "logps/chosen": -39.3028564453125, "logps/rejected": -51.706504821777344, "loss": 0.275, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9389569163322449, "rewards/margins": 1.8733680248260498, "rewards/rejected": -0.9344111680984497, "step": 6015 }, { "epoch": 0.6188958568931839, "grad_norm": 28.5, "learning_rate": 4.2346355951565e-07, "logits/chosen": -0.4974898397922516, "logits/rejected": -0.5338177680969238, "logps/chosen": -35.46951675415039, "logps/rejected": -46.33216094970703, "loss": 0.3064, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9754365086555481, "rewards/margins": 1.518108606338501, "rewards/rejected": -0.5426720380783081, "step": 6020 }, { "epoch": 0.6194098899969158, "grad_norm": 25.5, "learning_rate": 4.228923920493488e-07, "logits/chosen": -0.6303747296333313, "logits/rejected": -0.5557425022125244, "logps/chosen": -37.36860656738281, "logps/rejected": -53.66881561279297, "loss": 0.3325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6728995442390442, "rewards/margins": 1.7067617177963257, "rewards/rejected": -1.033862590789795, "step": 6025 }, { "epoch": 0.6199239231006477, "grad_norm": 31.5, "learning_rate": 4.2232122458304775e-07, "logits/chosen": -0.5905717611312866, "logits/rejected": -0.6130779385566711, "logps/chosen": -41.24983596801758, "logps/rejected": -52.04716873168945, "loss": 0.317, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7033143043518066, "rewards/margins": 1.9036242961883545, "rewards/rejected": -1.2003099918365479, "step": 6030 }, { "epoch": 0.6204379562043796, "grad_norm": 28.5, "learning_rate": 4.2175005711674663e-07, "logits/chosen": -0.5342227816581726, "logits/rejected": -0.5886737108230591, "logps/chosen": -35.47758483886719, "logps/rejected": -52.390716552734375, "loss": 0.3134, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8527208566665649, "rewards/margins": 1.5843937397003174, "rewards/rejected": -0.7316728830337524, "step": 6035 }, { "epoch": 0.6209519893081115, "grad_norm": 33.0, "learning_rate": 4.211788896504455e-07, "logits/chosen": -0.5626376867294312, "logits/rejected": -0.5801733732223511, "logps/chosen": -38.542503356933594, "logps/rejected": -52.5684928894043, "loss": 0.307, "rewards/accuracies": 0.875, "rewards/chosen": 0.47716742753982544, "rewards/margins": 1.2241780757904053, "rewards/rejected": -0.7470107078552246, "step": 6040 }, { "epoch": 0.6214660224118433, "grad_norm": 38.5, "learning_rate": 4.206077221841444e-07, "logits/chosen": -0.6112022399902344, "logits/rejected": -0.6236532330513, "logps/chosen": -40.54891586303711, "logps/rejected": -48.70509338378906, "loss": 0.3346, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7577794790267944, "rewards/margins": 1.48287034034729, "rewards/rejected": -0.7250910401344299, "step": 6045 }, { "epoch": 0.6219800555155752, "grad_norm": 33.0, "learning_rate": 4.2003655471784326e-07, "logits/chosen": -0.5735376477241516, "logits/rejected": -0.6029638051986694, "logps/chosen": -43.283119201660156, "logps/rejected": -54.00172805786133, "loss": 0.3456, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5226176977157593, "rewards/margins": 1.6115872859954834, "rewards/rejected": -1.0889695882797241, "step": 6050 }, { "epoch": 0.6224940886193071, "grad_norm": 34.75, "learning_rate": 4.1946538725154214e-07, "logits/chosen": -0.5415120124816895, "logits/rejected": -0.5438119173049927, "logps/chosen": -43.25099563598633, "logps/rejected": -53.67878341674805, "loss": 0.3387, "rewards/accuracies": 0.875, "rewards/chosen": 0.7985376119613647, "rewards/margins": 1.6577255725860596, "rewards/rejected": -0.8591877818107605, "step": 6055 }, { "epoch": 0.623008121723039, "grad_norm": 26.375, "learning_rate": 4.18894219785241e-07, "logits/chosen": -0.5850507020950317, "logits/rejected": -0.5933838486671448, "logps/chosen": -36.325931549072266, "logps/rejected": -52.89921951293945, "loss": 0.3224, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7778958082199097, "rewards/margins": 1.7139606475830078, "rewards/rejected": -0.9360648393630981, "step": 6060 }, { "epoch": 0.6235221548267709, "grad_norm": 23.875, "learning_rate": 4.183230523189399e-07, "logits/chosen": -0.6227039694786072, "logits/rejected": -0.6314096450805664, "logps/chosen": -37.49051284790039, "logps/rejected": -46.18269348144531, "loss": 0.3298, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6541382670402527, "rewards/margins": 1.1770811080932617, "rewards/rejected": -0.5229429602622986, "step": 6065 }, { "epoch": 0.6240361879305028, "grad_norm": 27.625, "learning_rate": 4.1775188485263877e-07, "logits/chosen": -0.6635180711746216, "logits/rejected": -0.6972432136535645, "logps/chosen": -42.05979919433594, "logps/rejected": -48.95487594604492, "loss": 0.3262, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8795195817947388, "rewards/margins": 1.6945774555206299, "rewards/rejected": -0.8150579333305359, "step": 6070 }, { "epoch": 0.6245502210342346, "grad_norm": 38.75, "learning_rate": 4.1718071738633764e-07, "logits/chosen": -0.6102014780044556, "logits/rejected": -0.5640468597412109, "logps/chosen": -37.144657135009766, "logps/rejected": -52.82014083862305, "loss": 0.2987, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9806919097900391, "rewards/margins": 1.6282764673233032, "rewards/rejected": -0.6475844979286194, "step": 6075 }, { "epoch": 0.6250642541379665, "grad_norm": 39.25, "learning_rate": 4.166095499200366e-07, "logits/chosen": -0.49647775292396545, "logits/rejected": -0.633417546749115, "logps/chosen": -33.541046142578125, "logps/rejected": -48.6927604675293, "loss": 0.3377, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6732036471366882, "rewards/margins": 1.3736339807510376, "rewards/rejected": -0.7004302144050598, "step": 6080 }, { "epoch": 0.6255782872416984, "grad_norm": 29.375, "learning_rate": 4.160383824537354e-07, "logits/chosen": -0.5543195605278015, "logits/rejected": -0.5364891886711121, "logps/chosen": -43.23090362548828, "logps/rejected": -60.46656036376953, "loss": 0.2878, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6918272972106934, "rewards/margins": 1.7822370529174805, "rewards/rejected": -1.0904099941253662, "step": 6085 }, { "epoch": 0.6260923203454303, "grad_norm": 30.375, "learning_rate": 4.1546721498743433e-07, "logits/chosen": -0.5900058150291443, "logits/rejected": -0.5589081048965454, "logps/chosen": -38.71635437011719, "logps/rejected": -50.8583984375, "loss": 0.3462, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5120704174041748, "rewards/margins": 1.1768202781677246, "rewards/rejected": -0.6647499799728394, "step": 6090 }, { "epoch": 0.6266063534491622, "grad_norm": 22.5, "learning_rate": 4.1489604752113315e-07, "logits/chosen": -0.5659338235855103, "logits/rejected": -0.6086455583572388, "logps/chosen": -33.128509521484375, "logps/rejected": -49.25102233886719, "loss": 0.3037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5414998531341553, "rewards/margins": 1.4499056339263916, "rewards/rejected": -0.9084059000015259, "step": 6095 }, { "epoch": 0.627120386552894, "grad_norm": 33.25, "learning_rate": 4.143248800548321e-07, "logits/chosen": -0.5649323463439941, "logits/rejected": -0.6107709407806396, "logps/chosen": -37.60742950439453, "logps/rejected": -47.32973861694336, "loss": 0.3333, "rewards/accuracies": 1.0, "rewards/chosen": 0.7484627962112427, "rewards/margins": 1.61318838596344, "rewards/rejected": -0.8647255897521973, "step": 6100 }, { "epoch": 0.6276344196566259, "grad_norm": 36.5, "learning_rate": 4.137537125885309e-07, "logits/chosen": -0.6317888498306274, "logits/rejected": -0.6428834795951843, "logps/chosen": -39.076820373535156, "logps/rejected": -54.07526397705078, "loss": 0.3236, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7865627408027649, "rewards/margins": 1.699419379234314, "rewards/rejected": -0.9128566980361938, "step": 6105 }, { "epoch": 0.6281484527603578, "grad_norm": 43.25, "learning_rate": 4.1318254512222984e-07, "logits/chosen": -0.5374249219894409, "logits/rejected": -0.48089537024497986, "logps/chosen": -36.995025634765625, "logps/rejected": -48.21913146972656, "loss": 0.3343, "rewards/accuracies": 0.875, "rewards/chosen": 0.7167745232582092, "rewards/margins": 1.5158977508544922, "rewards/rejected": -0.7991231679916382, "step": 6110 }, { "epoch": 0.6286624858640897, "grad_norm": 46.0, "learning_rate": 4.1261137765592866e-07, "logits/chosen": -0.5529475212097168, "logits/rejected": -0.5635805130004883, "logps/chosen": -37.810909271240234, "logps/rejected": -55.3044548034668, "loss": 0.3422, "rewards/accuracies": 0.875, "rewards/chosen": 0.7714223861694336, "rewards/margins": 1.6578357219696045, "rewards/rejected": -0.8864132761955261, "step": 6115 }, { "epoch": 0.6291765189678216, "grad_norm": 27.375, "learning_rate": 4.120402101896276e-07, "logits/chosen": -0.5598713159561157, "logits/rejected": -0.5956839323043823, "logps/chosen": -38.31792068481445, "logps/rejected": -45.818382263183594, "loss": 0.3261, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8020793795585632, "rewards/margins": 1.4146554470062256, "rewards/rejected": -0.6125760078430176, "step": 6120 }, { "epoch": 0.6296905520715534, "grad_norm": 50.0, "learning_rate": 4.1146904272332647e-07, "logits/chosen": -0.4709319472312927, "logits/rejected": -0.5603095293045044, "logps/chosen": -39.287994384765625, "logps/rejected": -53.912841796875, "loss": 0.3354, "rewards/accuracies": 0.875, "rewards/chosen": 0.6270114183425903, "rewards/margins": 1.6551777124404907, "rewards/rejected": -1.0281665325164795, "step": 6125 }, { "epoch": 0.6302045851752853, "grad_norm": 35.75, "learning_rate": 4.1089787525702535e-07, "logits/chosen": -0.5570937395095825, "logits/rejected": -0.6099939942359924, "logps/chosen": -37.738319396972656, "logps/rejected": -46.81117630004883, "loss": 0.3415, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.61869877576828, "rewards/margins": 1.1867033243179321, "rewards/rejected": -0.5680044889450073, "step": 6130 }, { "epoch": 0.6307186182790172, "grad_norm": 27.375, "learning_rate": 4.103267077907242e-07, "logits/chosen": -0.5537891983985901, "logits/rejected": -0.5447515845298767, "logps/chosen": -35.28565216064453, "logps/rejected": -51.809364318847656, "loss": 0.2957, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7778784036636353, "rewards/margins": 1.5371391773223877, "rewards/rejected": -0.7592608332633972, "step": 6135 }, { "epoch": 0.6312326513827491, "grad_norm": 31.125, "learning_rate": 4.097555403244231e-07, "logits/chosen": -0.576437771320343, "logits/rejected": -0.5931424498558044, "logps/chosen": -42.88020706176758, "logps/rejected": -53.05785369873047, "loss": 0.3217, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8181831240653992, "rewards/margins": 1.7237884998321533, "rewards/rejected": -0.9056053161621094, "step": 6140 }, { "epoch": 0.631746684486481, "grad_norm": 34.0, "learning_rate": 4.09184372858122e-07, "logits/chosen": -0.6280516386032104, "logits/rejected": -0.6576388478279114, "logps/chosen": -37.54340362548828, "logps/rejected": -44.04956817626953, "loss": 0.3333, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6974038481712341, "rewards/margins": 0.9240224957466125, "rewards/rejected": -0.22661857306957245, "step": 6145 }, { "epoch": 0.6322607175902129, "grad_norm": 26.875, "learning_rate": 4.0861320539182085e-07, "logits/chosen": -0.5966674089431763, "logits/rejected": -0.6091368794441223, "logps/chosen": -41.562950134277344, "logps/rejected": -52.86211395263672, "loss": 0.2831, "rewards/accuracies": 0.875, "rewards/chosen": 0.7763030529022217, "rewards/margins": 1.4506332874298096, "rewards/rejected": -0.6743301153182983, "step": 6150 }, { "epoch": 0.6327747506939447, "grad_norm": 34.25, "learning_rate": 4.0804203792551973e-07, "logits/chosen": -0.5656888484954834, "logits/rejected": -0.6144009232521057, "logps/chosen": -39.89799880981445, "logps/rejected": -49.66423797607422, "loss": 0.3623, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7631144523620605, "rewards/margins": 1.39784836769104, "rewards/rejected": -0.6347337961196899, "step": 6155 }, { "epoch": 0.6332887837976766, "grad_norm": 60.25, "learning_rate": 4.074708704592186e-07, "logits/chosen": -0.5979375243186951, "logits/rejected": -0.5667296051979065, "logps/chosen": -37.175411224365234, "logps/rejected": -53.3577880859375, "loss": 0.3257, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8317616581916809, "rewards/margins": 1.5442891120910645, "rewards/rejected": -0.7125274538993835, "step": 6160 }, { "epoch": 0.6338028169014085, "grad_norm": 31.125, "learning_rate": 4.068997029929175e-07, "logits/chosen": -0.5793441534042358, "logits/rejected": -0.5985867977142334, "logps/chosen": -37.1436767578125, "logps/rejected": -49.66056442260742, "loss": 0.2973, "rewards/accuracies": 0.875, "rewards/chosen": 0.7981970906257629, "rewards/margins": 1.7628800868988037, "rewards/rejected": -0.964682936668396, "step": 6165 }, { "epoch": 0.6343168500051404, "grad_norm": 28.375, "learning_rate": 4.0632853552661636e-07, "logits/chosen": -0.5260875821113586, "logits/rejected": -0.5252823829650879, "logps/chosen": -41.35645294189453, "logps/rejected": -53.9130859375, "loss": 0.306, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.4327011704444885, "rewards/margins": 1.3517038822174072, "rewards/rejected": -0.9190027117729187, "step": 6170 }, { "epoch": 0.6348308831088723, "grad_norm": 31.0, "learning_rate": 4.0575736806031524e-07, "logits/chosen": -0.5102326273918152, "logits/rejected": -0.564828097820282, "logps/chosen": -43.993858337402344, "logps/rejected": -52.07172393798828, "loss": 0.3111, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7834130525588989, "rewards/margins": 1.5896198749542236, "rewards/rejected": -0.8062068223953247, "step": 6175 }, { "epoch": 0.6353449162126041, "grad_norm": 27.625, "learning_rate": 4.0518620059401417e-07, "logits/chosen": -0.6093959808349609, "logits/rejected": -0.6458842158317566, "logps/chosen": -35.74528503417969, "logps/rejected": -49.69989776611328, "loss": 0.3025, "rewards/accuracies": 0.875, "rewards/chosen": 0.7993605732917786, "rewards/margins": 1.3896734714508057, "rewards/rejected": -0.5903128385543823, "step": 6180 }, { "epoch": 0.635858949316336, "grad_norm": 35.5, "learning_rate": 4.04615033127713e-07, "logits/chosen": -0.4740746021270752, "logits/rejected": -0.5302911996841431, "logps/chosen": -35.029903411865234, "logps/rejected": -51.23150634765625, "loss": 0.3026, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8257178068161011, "rewards/margins": 1.8142802715301514, "rewards/rejected": -0.9885624647140503, "step": 6185 }, { "epoch": 0.6363729824200679, "grad_norm": 25.875, "learning_rate": 4.040438656614119e-07, "logits/chosen": -0.6135870218276978, "logits/rejected": -0.6668054461479187, "logps/chosen": -35.73356246948242, "logps/rejected": -57.47318649291992, "loss": 0.322, "rewards/accuracies": 0.875, "rewards/chosen": 0.8847047686576843, "rewards/margins": 1.7146282196044922, "rewards/rejected": -0.8299235105514526, "step": 6190 }, { "epoch": 0.6368870155237998, "grad_norm": 34.0, "learning_rate": 4.0347269819511075e-07, "logits/chosen": -0.5543133020401001, "logits/rejected": -0.6238716840744019, "logps/chosen": -35.679935455322266, "logps/rejected": -55.47370529174805, "loss": 0.288, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7495296597480774, "rewards/margins": 1.9681463241577148, "rewards/rejected": -1.2186167240142822, "step": 6195 }, { "epoch": 0.6374010486275317, "grad_norm": 32.25, "learning_rate": 4.029015307288097e-07, "logits/chosen": -0.5957568883895874, "logits/rejected": -0.6324573755264282, "logps/chosen": -39.39809036254883, "logps/rejected": -50.53911590576172, "loss": 0.3297, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5980271697044373, "rewards/margins": 1.1958584785461426, "rewards/rejected": -0.597831130027771, "step": 6200 }, { "epoch": 0.6379150817312635, "grad_norm": 32.25, "learning_rate": 4.0233036326250856e-07, "logits/chosen": -0.5455362200737, "logits/rejected": -0.6097668409347534, "logps/chosen": -37.824058532714844, "logps/rejected": -50.00800323486328, "loss": 0.3334, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6956779360771179, "rewards/margins": 1.4943336248397827, "rewards/rejected": -0.7986558675765991, "step": 6205 }, { "epoch": 0.6384291148349954, "grad_norm": 36.75, "learning_rate": 4.0175919579620743e-07, "logits/chosen": -0.6826270818710327, "logits/rejected": -0.7148049473762512, "logps/chosen": -40.446712493896484, "logps/rejected": -53.11431121826172, "loss": 0.332, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7327169179916382, "rewards/margins": 1.5658180713653564, "rewards/rejected": -0.8331010937690735, "step": 6210 }, { "epoch": 0.6389431479387273, "grad_norm": 30.25, "learning_rate": 4.011880283299063e-07, "logits/chosen": -0.46773916482925415, "logits/rejected": -0.5626048445701599, "logps/chosen": -41.2999382019043, "logps/rejected": -47.89286422729492, "loss": 0.3486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7588196992874146, "rewards/margins": 1.2949906587600708, "rewards/rejected": -0.5361708998680115, "step": 6215 }, { "epoch": 0.6394571810424592, "grad_norm": 27.125, "learning_rate": 4.006168608636052e-07, "logits/chosen": -0.5710068941116333, "logits/rejected": -0.5998252034187317, "logps/chosen": -40.7110481262207, "logps/rejected": -49.80135726928711, "loss": 0.307, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7300962209701538, "rewards/margins": 1.4975783824920654, "rewards/rejected": -0.7674821615219116, "step": 6220 }, { "epoch": 0.6399712141461911, "grad_norm": 34.5, "learning_rate": 4.000456933973041e-07, "logits/chosen": -0.5523272156715393, "logits/rejected": -0.65040522813797, "logps/chosen": -37.4412956237793, "logps/rejected": -52.534515380859375, "loss": 0.3201, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8703673481941223, "rewards/margins": 1.7332998514175415, "rewards/rejected": -0.862932562828064, "step": 6225 }, { "epoch": 0.6404852472499228, "grad_norm": 33.5, "learning_rate": 3.9947452593100294e-07, "logits/chosen": -0.5718914270401001, "logits/rejected": -0.640109658241272, "logps/chosen": -44.49851989746094, "logps/rejected": -53.07331085205078, "loss": 0.3104, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.751669704914093, "rewards/margins": 1.487139105796814, "rewards/rejected": -0.7354695200920105, "step": 6230 }, { "epoch": 0.6409992803536547, "grad_norm": 26.125, "learning_rate": 3.9890335846470187e-07, "logits/chosen": -0.6128534078598022, "logits/rejected": -0.6434439420700073, "logps/chosen": -37.88142395019531, "logps/rejected": -49.34844207763672, "loss": 0.3127, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9232962727546692, "rewards/margins": 1.6849639415740967, "rewards/rejected": -0.761667788028717, "step": 6235 }, { "epoch": 0.6415133134573866, "grad_norm": 31.0, "learning_rate": 3.983321909984007e-07, "logits/chosen": -0.5277296304702759, "logits/rejected": -0.5463413000106812, "logps/chosen": -39.77079391479492, "logps/rejected": -51.417724609375, "loss": 0.3196, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.693546712398529, "rewards/margins": 1.733782172203064, "rewards/rejected": -1.0402355194091797, "step": 6240 }, { "epoch": 0.6420273465611185, "grad_norm": 29.0, "learning_rate": 3.9776102353209963e-07, "logits/chosen": -0.5179370641708374, "logits/rejected": -0.5399909615516663, "logps/chosen": -40.93974685668945, "logps/rejected": -49.05671691894531, "loss": 0.3302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6549406051635742, "rewards/margins": 1.3814458847045898, "rewards/rejected": -0.7265053987503052, "step": 6245 }, { "epoch": 0.6425413796648504, "grad_norm": 29.25, "learning_rate": 3.9718985606579845e-07, "logits/chosen": -0.5653403401374817, "logits/rejected": -0.6036586761474609, "logps/chosen": -37.32725143432617, "logps/rejected": -48.881195068359375, "loss": 0.3154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7472789883613586, "rewards/margins": 1.7540483474731445, "rewards/rejected": -1.0067694187164307, "step": 6250 }, { "epoch": 0.6430554127685822, "grad_norm": 41.75, "learning_rate": 3.966186885994974e-07, "logits/chosen": -0.5942732095718384, "logits/rejected": -0.6022464036941528, "logps/chosen": -42.063629150390625, "logps/rejected": -55.5563850402832, "loss": 0.313, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7193593978881836, "rewards/margins": 1.9599555730819702, "rewards/rejected": -1.2405961751937866, "step": 6255 }, { "epoch": 0.6435694458723141, "grad_norm": 23.5, "learning_rate": 3.960475211331962e-07, "logits/chosen": -0.518065869808197, "logits/rejected": -0.5472652912139893, "logps/chosen": -41.47471237182617, "logps/rejected": -52.684120178222656, "loss": 0.2979, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7473419308662415, "rewards/margins": 1.6932880878448486, "rewards/rejected": -0.9459459185600281, "step": 6260 }, { "epoch": 0.644083478976046, "grad_norm": 48.0, "learning_rate": 3.9547635366689514e-07, "logits/chosen": -0.5882883667945862, "logits/rejected": -0.6140221357345581, "logps/chosen": -40.637489318847656, "logps/rejected": -52.753211975097656, "loss": 0.3509, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7996484041213989, "rewards/margins": 1.555996298789978, "rewards/rejected": -0.7563478350639343, "step": 6265 }, { "epoch": 0.6445975120797779, "grad_norm": 26.0, "learning_rate": 3.94905186200594e-07, "logits/chosen": -0.6055796146392822, "logits/rejected": -0.6060872077941895, "logps/chosen": -36.7097282409668, "logps/rejected": -45.910282135009766, "loss": 0.3358, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7515993118286133, "rewards/margins": 1.0918840169906616, "rewards/rejected": -0.34028464555740356, "step": 6270 }, { "epoch": 0.6451115451835098, "grad_norm": 47.0, "learning_rate": 3.943340187342929e-07, "logits/chosen": -0.5131157636642456, "logits/rejected": -0.5771640539169312, "logps/chosen": -35.957725524902344, "logps/rejected": -55.22175216674805, "loss": 0.3146, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9826990365982056, "rewards/margins": 2.0002856254577637, "rewards/rejected": -1.0175864696502686, "step": 6275 }, { "epoch": 0.6456255782872417, "grad_norm": 27.125, "learning_rate": 3.9376285126799177e-07, "logits/chosen": -0.5536524653434753, "logits/rejected": -0.5997606515884399, "logps/chosen": -41.36670684814453, "logps/rejected": -53.320106506347656, "loss": 0.299, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5662119388580322, "rewards/margins": 1.5306408405303955, "rewards/rejected": -0.9644289016723633, "step": 6280 }, { "epoch": 0.6461396113909735, "grad_norm": 31.625, "learning_rate": 3.9319168380169064e-07, "logits/chosen": -0.5529013872146606, "logits/rejected": -0.6000347137451172, "logps/chosen": -43.35363006591797, "logps/rejected": -51.71327590942383, "loss": 0.32, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6848562955856323, "rewards/margins": 1.3920150995254517, "rewards/rejected": -0.7071589231491089, "step": 6285 }, { "epoch": 0.6466536444947054, "grad_norm": 44.25, "learning_rate": 3.926205163353895e-07, "logits/chosen": -0.5865246653556824, "logits/rejected": -0.5825439691543579, "logps/chosen": -30.985361099243164, "logps/rejected": -51.15685272216797, "loss": 0.3182, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7622967958450317, "rewards/margins": 1.487160325050354, "rewards/rejected": -0.7248635292053223, "step": 6290 }, { "epoch": 0.6471676775984373, "grad_norm": 42.75, "learning_rate": 3.920493488690884e-07, "logits/chosen": -0.6238270998001099, "logits/rejected": -0.6826130747795105, "logps/chosen": -36.715213775634766, "logps/rejected": -51.690208435058594, "loss": 0.3391, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.811445415019989, "rewards/margins": 1.4078677892684937, "rewards/rejected": -0.5964223146438599, "step": 6295 }, { "epoch": 0.6476817107021692, "grad_norm": 28.125, "learning_rate": 3.914781814027873e-07, "logits/chosen": -0.5759671330451965, "logits/rejected": -0.5722204446792603, "logps/chosen": -40.18327331542969, "logps/rejected": -48.58772659301758, "loss": 0.3102, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5642583966255188, "rewards/margins": 1.2901532649993896, "rewards/rejected": -0.7258949279785156, "step": 6300 }, { "epoch": 0.6481957438059011, "grad_norm": 33.25, "learning_rate": 3.9090701393648615e-07, "logits/chosen": -0.5667942762374878, "logits/rejected": -0.5614017844200134, "logps/chosen": -38.87432098388672, "logps/rejected": -54.333282470703125, "loss": 0.3205, "rewards/accuracies": 0.875, "rewards/chosen": 0.7019263505935669, "rewards/margins": 1.3709008693695068, "rewards/rejected": -0.6689743995666504, "step": 6305 }, { "epoch": 0.6487097769096329, "grad_norm": 28.125, "learning_rate": 3.9033584647018503e-07, "logits/chosen": -0.5568113923072815, "logits/rejected": -0.6160850524902344, "logps/chosen": -35.75375747680664, "logps/rejected": -46.49337387084961, "loss": 0.3107, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8421422839164734, "rewards/margins": 1.2254453897476196, "rewards/rejected": -0.3833029866218567, "step": 6310 }, { "epoch": 0.6492238100133648, "grad_norm": 28.875, "learning_rate": 3.897646790038839e-07, "logits/chosen": -0.553541898727417, "logits/rejected": -0.530171275138855, "logps/chosen": -35.560585021972656, "logps/rejected": -49.873680114746094, "loss": 0.3478, "rewards/accuracies": 0.875, "rewards/chosen": 0.7134214043617249, "rewards/margins": 1.4496490955352783, "rewards/rejected": -0.7362276315689087, "step": 6315 }, { "epoch": 0.6497378431170967, "grad_norm": 26.75, "learning_rate": 3.891935115375828e-07, "logits/chosen": -0.6072096824645996, "logits/rejected": -0.6494132876396179, "logps/chosen": -38.54955291748047, "logps/rejected": -52.63712692260742, "loss": 0.3021, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8890652656555176, "rewards/margins": 1.664086103439331, "rewards/rejected": -0.775020956993103, "step": 6320 }, { "epoch": 0.6502518762208286, "grad_norm": 27.25, "learning_rate": 3.886223440712817e-07, "logits/chosen": -0.5236946940422058, "logits/rejected": -0.5561856031417847, "logps/chosen": -39.691619873046875, "logps/rejected": -52.96849822998047, "loss": 0.3415, "rewards/accuracies": 0.875, "rewards/chosen": 0.694573163986206, "rewards/margins": 1.4947935342788696, "rewards/rejected": -0.8002201914787292, "step": 6325 }, { "epoch": 0.6507659093245605, "grad_norm": 25.0, "learning_rate": 3.8805117660498054e-07, "logits/chosen": -0.5847314596176147, "logits/rejected": -0.641315221786499, "logps/chosen": -36.78528594970703, "logps/rejected": -52.37980270385742, "loss": 0.2892, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0766677856445312, "rewards/margins": 1.7410316467285156, "rewards/rejected": -0.6643639206886292, "step": 6330 }, { "epoch": 0.6512799424282923, "grad_norm": 26.5, "learning_rate": 3.8748000913867947e-07, "logits/chosen": -0.5561734437942505, "logits/rejected": -0.5608338117599487, "logps/chosen": -32.90951156616211, "logps/rejected": -50.90693664550781, "loss": 0.3039, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.834577739238739, "rewards/margins": 1.5769689083099365, "rewards/rejected": -0.7423912882804871, "step": 6335 }, { "epoch": 0.6517939755320242, "grad_norm": 34.25, "learning_rate": 3.869088416723783e-07, "logits/chosen": -0.6505110859870911, "logits/rejected": -0.7101349234580994, "logps/chosen": -42.988243103027344, "logps/rejected": -50.19813537597656, "loss": 0.3063, "rewards/accuracies": 0.875, "rewards/chosen": 0.9439882040023804, "rewards/margins": 1.4902400970458984, "rewards/rejected": -0.5462517142295837, "step": 6340 }, { "epoch": 0.6523080086357561, "grad_norm": 36.25, "learning_rate": 3.863376742060772e-07, "logits/chosen": -0.5607174038887024, "logits/rejected": -0.5864924192428589, "logps/chosen": -32.23534393310547, "logps/rejected": -56.14277267456055, "loss": 0.3419, "rewards/accuracies": 0.875, "rewards/chosen": 0.7909739017486572, "rewards/margins": 1.5310382843017578, "rewards/rejected": -0.740064263343811, "step": 6345 }, { "epoch": 0.652822041739488, "grad_norm": 41.5, "learning_rate": 3.8576650673977605e-07, "logits/chosen": -0.5506992936134338, "logits/rejected": -0.582848072052002, "logps/chosen": -39.86126708984375, "logps/rejected": -51.49281692504883, "loss": 0.3253, "rewards/accuracies": 0.875, "rewards/chosen": 0.7502139806747437, "rewards/margins": 1.4384262561798096, "rewards/rejected": -0.6882122755050659, "step": 6350 }, { "epoch": 0.6533360748432199, "grad_norm": 34.25, "learning_rate": 3.85195339273475e-07, "logits/chosen": -0.559023916721344, "logits/rejected": -0.5520543456077576, "logps/chosen": -41.09568786621094, "logps/rejected": -52.425453186035156, "loss": 0.3259, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8554865717887878, "rewards/margins": 1.8394911289215088, "rewards/rejected": -0.9840043187141418, "step": 6355 }, { "epoch": 0.6538501079469518, "grad_norm": 32.5, "learning_rate": 3.846241718071738e-07, "logits/chosen": -0.5477770566940308, "logits/rejected": -0.583655059337616, "logps/chosen": -42.09861373901367, "logps/rejected": -55.57175827026367, "loss": 0.2985, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8768289685249329, "rewards/margins": 1.9801349639892578, "rewards/rejected": -1.1033059358596802, "step": 6360 }, { "epoch": 0.6543641410506836, "grad_norm": 27.125, "learning_rate": 3.8405300434087273e-07, "logits/chosen": -0.5699203610420227, "logits/rejected": -0.6326855421066284, "logps/chosen": -35.329856872558594, "logps/rejected": -52.88677978515625, "loss": 0.3116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6502125859260559, "rewards/margins": 1.2572866678237915, "rewards/rejected": -0.607073962688446, "step": 6365 }, { "epoch": 0.6548781741544155, "grad_norm": 32.5, "learning_rate": 3.834818368745716e-07, "logits/chosen": -0.5215973854064941, "logits/rejected": -0.5501793026924133, "logps/chosen": -37.406028747558594, "logps/rejected": -53.63349151611328, "loss": 0.3206, "rewards/accuracies": 0.875, "rewards/chosen": 0.7953089475631714, "rewards/margins": 1.5642362833023071, "rewards/rejected": -0.768927276134491, "step": 6370 }, { "epoch": 0.6553922072581474, "grad_norm": 43.75, "learning_rate": 3.829106694082705e-07, "logits/chosen": -0.6321054697036743, "logits/rejected": -0.6672964692115784, "logps/chosen": -38.49093246459961, "logps/rejected": -56.106239318847656, "loss": 0.3525, "rewards/accuracies": 0.875, "rewards/chosen": 0.686619758605957, "rewards/margins": 1.6776230335235596, "rewards/rejected": -0.9910030364990234, "step": 6375 }, { "epoch": 0.6559062403618793, "grad_norm": 32.75, "learning_rate": 3.8233950194196936e-07, "logits/chosen": -0.5737737417221069, "logits/rejected": -0.5802602767944336, "logps/chosen": -41.30876541137695, "logps/rejected": -53.462921142578125, "loss": 0.3183, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8399672508239746, "rewards/margins": 1.7183597087860107, "rewards/rejected": -0.8783925771713257, "step": 6380 }, { "epoch": 0.6564202734656112, "grad_norm": 30.5, "learning_rate": 3.8176833447566824e-07, "logits/chosen": -0.5867705941200256, "logits/rejected": -0.6367760896682739, "logps/chosen": -37.4222526550293, "logps/rejected": -55.47607421875, "loss": 0.3077, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6970101594924927, "rewards/margins": 1.5183848142623901, "rewards/rejected": -0.8213747143745422, "step": 6385 }, { "epoch": 0.656934306569343, "grad_norm": 34.75, "learning_rate": 3.811971670093671e-07, "logits/chosen": -0.5918774604797363, "logits/rejected": -0.6382650136947632, "logps/chosen": -38.543636322021484, "logps/rejected": -53.714256286621094, "loss": 0.3546, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9049445390701294, "rewards/margins": 1.588138222694397, "rewards/rejected": -0.6831936836242676, "step": 6390 }, { "epoch": 0.6574483396730749, "grad_norm": 33.5, "learning_rate": 3.80625999543066e-07, "logits/chosen": -0.46109986305236816, "logits/rejected": -0.4972231388092041, "logps/chosen": -36.46257781982422, "logps/rejected": -43.2873649597168, "loss": 0.3569, "rewards/accuracies": 0.875, "rewards/chosen": 0.8972558975219727, "rewards/margins": 1.3306032419204712, "rewards/rejected": -0.4333474040031433, "step": 6395 }, { "epoch": 0.6579623727768068, "grad_norm": 40.75, "learning_rate": 3.8005483207676487e-07, "logits/chosen": -0.5699660181999207, "logits/rejected": -0.6034724712371826, "logps/chosen": -35.189361572265625, "logps/rejected": -46.68206787109375, "loss": 0.3547, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8448463678359985, "rewards/margins": 1.4352580308914185, "rewards/rejected": -0.5904118418693542, "step": 6400 }, { "epoch": 0.6584764058805387, "grad_norm": 37.5, "learning_rate": 3.7948366461046375e-07, "logits/chosen": -0.5827854871749878, "logits/rejected": -0.5648724436759949, "logps/chosen": -36.241554260253906, "logps/rejected": -60.047454833984375, "loss": 0.3172, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8276296854019165, "rewards/margins": 1.8843876123428345, "rewards/rejected": -1.0567578077316284, "step": 6405 }, { "epoch": 0.6589904389842706, "grad_norm": 29.625, "learning_rate": 3.7891249714416263e-07, "logits/chosen": -0.5643479228019714, "logits/rejected": -0.5783836245536804, "logps/chosen": -45.87486267089844, "logps/rejected": -48.80139923095703, "loss": 0.3209, "rewards/accuracies": 0.875, "rewards/chosen": 0.7938356399536133, "rewards/margins": 1.6630340814590454, "rewards/rejected": -0.8691985011100769, "step": 6410 }, { "epoch": 0.6595044720880024, "grad_norm": 38.75, "learning_rate": 3.7834132967786156e-07, "logits/chosen": -0.554538369178772, "logits/rejected": -0.5710011720657349, "logps/chosen": -37.66180419921875, "logps/rejected": -50.35866165161133, "loss": 0.3408, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5425837635993958, "rewards/margins": 1.5016064643859863, "rewards/rejected": -0.9590226411819458, "step": 6415 }, { "epoch": 0.6600185051917343, "grad_norm": 27.25, "learning_rate": 3.7777016221156043e-07, "logits/chosen": -0.6148771047592163, "logits/rejected": -0.5768560171127319, "logps/chosen": -44.309532165527344, "logps/rejected": -53.15008544921875, "loss": 0.2981, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8805248141288757, "rewards/margins": 1.7519956827163696, "rewards/rejected": -0.8714707493782043, "step": 6420 }, { "epoch": 0.6605325382954662, "grad_norm": 27.375, "learning_rate": 3.771989947452593e-07, "logits/chosen": -0.5954708456993103, "logits/rejected": -0.6044126749038696, "logps/chosen": -46.32086944580078, "logps/rejected": -51.653785705566406, "loss": 0.3024, "rewards/accuracies": 0.875, "rewards/chosen": 0.7608620524406433, "rewards/margins": 1.4494497776031494, "rewards/rejected": -0.6885878443717957, "step": 6425 }, { "epoch": 0.6610465713991981, "grad_norm": 24.875, "learning_rate": 3.766278272789582e-07, "logits/chosen": -0.4506637156009674, "logits/rejected": -0.5428694486618042, "logps/chosen": -33.92245864868164, "logps/rejected": -44.38158416748047, "loss": 0.3263, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7991474866867065, "rewards/margins": 1.3847845792770386, "rewards/rejected": -0.5856372117996216, "step": 6430 }, { "epoch": 0.66156060450293, "grad_norm": 26.875, "learning_rate": 3.7605665981265707e-07, "logits/chosen": -0.5722813606262207, "logits/rejected": -0.5963786840438843, "logps/chosen": -40.88620376586914, "logps/rejected": -48.43010711669922, "loss": 0.3051, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8726688623428345, "rewards/margins": 1.5444815158843994, "rewards/rejected": -0.6718127131462097, "step": 6435 }, { "epoch": 0.6620746376066619, "grad_norm": 28.125, "learning_rate": 3.7548549234635594e-07, "logits/chosen": -0.6148470044136047, "logits/rejected": -0.6597779989242554, "logps/chosen": -35.864715576171875, "logps/rejected": -55.50197219848633, "loss": 0.321, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7450498342514038, "rewards/margins": 1.456907868385315, "rewards/rejected": -0.7118580341339111, "step": 6440 }, { "epoch": 0.6625886707103937, "grad_norm": 29.25, "learning_rate": 3.749143248800548e-07, "logits/chosen": -0.5650975108146667, "logits/rejected": -0.5610609650611877, "logps/chosen": -40.78483200073242, "logps/rejected": -50.55907440185547, "loss": 0.3039, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8224369883537292, "rewards/margins": 1.5958927869796753, "rewards/rejected": -0.7734559178352356, "step": 6445 }, { "epoch": 0.6631027038141256, "grad_norm": 25.5, "learning_rate": 3.743431574137537e-07, "logits/chosen": -0.6229384541511536, "logits/rejected": -0.6470184326171875, "logps/chosen": -38.446380615234375, "logps/rejected": -55.07499313354492, "loss": 0.3328, "rewards/accuracies": 0.875, "rewards/chosen": 0.831530749797821, "rewards/margins": 1.9292080402374268, "rewards/rejected": -1.097677230834961, "step": 6450 }, { "epoch": 0.6636167369178575, "grad_norm": 30.25, "learning_rate": 3.737719899474526e-07, "logits/chosen": -0.5508572459220886, "logits/rejected": -0.580500066280365, "logps/chosen": -31.432785034179688, "logps/rejected": -47.16753387451172, "loss": 0.3112, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.8758019208908081, "rewards/margins": 1.5430638790130615, "rewards/rejected": -0.6672620177268982, "step": 6455 }, { "epoch": 0.6641307700215894, "grad_norm": 28.25, "learning_rate": 3.732008224811515e-07, "logits/chosen": -0.5663665533065796, "logits/rejected": -0.5018566846847534, "logps/chosen": -36.857887268066406, "logps/rejected": -48.821861267089844, "loss": 0.3008, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7677624821662903, "rewards/margins": 1.4480478763580322, "rewards/rejected": -0.6802853941917419, "step": 6460 }, { "epoch": 0.6646448031253213, "grad_norm": 28.5, "learning_rate": 3.7262965501485033e-07, "logits/chosen": -0.6105861663818359, "logits/rejected": -0.5695523023605347, "logps/chosen": -37.79560470581055, "logps/rejected": -49.32074737548828, "loss": 0.3239, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.9466173052787781, "rewards/margins": 1.6843839883804321, "rewards/rejected": -0.7377667427062988, "step": 6465 }, { "epoch": 0.6651588362290531, "grad_norm": 33.75, "learning_rate": 3.7205848754854926e-07, "logits/chosen": -0.5251679420471191, "logits/rejected": -0.5874218344688416, "logps/chosen": -39.49662780761719, "logps/rejected": -49.2816276550293, "loss": 0.3057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8736166954040527, "rewards/margins": 1.5082409381866455, "rewards/rejected": -0.6346243023872375, "step": 6470 }, { "epoch": 0.665672869332785, "grad_norm": 28.5, "learning_rate": 3.714873200822481e-07, "logits/chosen": -0.5144044160842896, "logits/rejected": -0.6045738458633423, "logps/chosen": -45.97319412231445, "logps/rejected": -51.37739944458008, "loss": 0.2972, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.928300678730011, "rewards/margins": 1.7179572582244873, "rewards/rejected": -0.7896565198898315, "step": 6475 }, { "epoch": 0.6661869024365169, "grad_norm": 27.375, "learning_rate": 3.70916152615947e-07, "logits/chosen": -0.5975179076194763, "logits/rejected": -0.6213350892066956, "logps/chosen": -38.31957244873047, "logps/rejected": -47.07155227661133, "loss": 0.3101, "rewards/accuracies": 0.875, "rewards/chosen": 0.8197857737541199, "rewards/margins": 1.4116488695144653, "rewards/rejected": -0.5918631553649902, "step": 6480 }, { "epoch": 0.6667009355402488, "grad_norm": 30.125, "learning_rate": 3.7034498514964584e-07, "logits/chosen": -0.5908713936805725, "logits/rejected": -0.5596396923065186, "logps/chosen": -36.99856185913086, "logps/rejected": -49.905025482177734, "loss": 0.3006, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7728275060653687, "rewards/margins": 1.4088736772537231, "rewards/rejected": -0.636046290397644, "step": 6485 }, { "epoch": 0.6672149686439807, "grad_norm": 30.125, "learning_rate": 3.6977381768334477e-07, "logits/chosen": -0.5995365381240845, "logits/rejected": -0.6122029423713684, "logps/chosen": -37.82451629638672, "logps/rejected": -57.341094970703125, "loss": 0.3086, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8849107623100281, "rewards/margins": 1.9921588897705078, "rewards/rejected": -1.107248067855835, "step": 6490 }, { "epoch": 0.6677290017477125, "grad_norm": 29.625, "learning_rate": 3.692026502170436e-07, "logits/chosen": -0.5633103847503662, "logits/rejected": -0.6285046339035034, "logps/chosen": -35.967933654785156, "logps/rejected": -47.202796936035156, "loss": 0.298, "rewards/accuracies": 0.875, "rewards/chosen": 0.9562920331954956, "rewards/margins": 1.570652723312378, "rewards/rejected": -0.6143606305122375, "step": 6495 }, { "epoch": 0.6682430348514444, "grad_norm": 59.5, "learning_rate": 3.686314827507425e-07, "logits/chosen": -0.4755156636238098, "logits/rejected": -0.49879854917526245, "logps/chosen": -33.445289611816406, "logps/rejected": -45.77444839477539, "loss": 0.3371, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8337769508361816, "rewards/margins": 1.3951936960220337, "rewards/rejected": -0.5614168047904968, "step": 6500 }, { "epoch": 0.6687570679551763, "grad_norm": 40.5, "learning_rate": 3.6806031528444135e-07, "logits/chosen": -0.5439689755439758, "logits/rejected": -0.5657305121421814, "logps/chosen": -36.022789001464844, "logps/rejected": -50.27812194824219, "loss": 0.3129, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9860197305679321, "rewards/margins": 1.4755162000656128, "rewards/rejected": -0.4894963800907135, "step": 6505 }, { "epoch": 0.6692711010589082, "grad_norm": 28.0, "learning_rate": 3.674891478181403e-07, "logits/chosen": -0.5678152441978455, "logits/rejected": -0.6147022843360901, "logps/chosen": -42.50465774536133, "logps/rejected": -54.1407585144043, "loss": 0.3182, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.46967846155166626, "rewards/margins": 1.5075114965438843, "rewards/rejected": -1.0378330945968628, "step": 6510 }, { "epoch": 0.6697851341626401, "grad_norm": 31.375, "learning_rate": 3.6691798035183915e-07, "logits/chosen": -0.5845736265182495, "logits/rejected": -0.5952835083007812, "logps/chosen": -38.04875564575195, "logps/rejected": -51.192291259765625, "loss": 0.322, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.749167799949646, "rewards/margins": 1.684149980545044, "rewards/rejected": -0.9349821209907532, "step": 6515 }, { "epoch": 0.670299167266372, "grad_norm": 28.375, "learning_rate": 3.6634681288553803e-07, "logits/chosen": -0.5098927021026611, "logits/rejected": -0.5477493405342102, "logps/chosen": -37.0514030456543, "logps/rejected": -48.1064453125, "loss": 0.3154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7478753328323364, "rewards/margins": 1.5088837146759033, "rewards/rejected": -0.7610083818435669, "step": 6520 }, { "epoch": 0.6708132003701038, "grad_norm": 26.125, "learning_rate": 3.657756454192369e-07, "logits/chosen": -0.5438401699066162, "logits/rejected": -0.6456743478775024, "logps/chosen": -39.65143585205078, "logps/rejected": -51.42863082885742, "loss": 0.2808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7451239824295044, "rewards/margins": 1.6286592483520508, "rewards/rejected": -0.8835352659225464, "step": 6525 }, { "epoch": 0.6713272334738357, "grad_norm": 34.75, "learning_rate": 3.652044779529358e-07, "logits/chosen": -0.5876379609107971, "logits/rejected": -0.568850040435791, "logps/chosen": -36.57881164550781, "logps/rejected": -47.59454345703125, "loss": 0.3284, "rewards/accuracies": 0.875, "rewards/chosen": 0.872397780418396, "rewards/margins": 1.5743507146835327, "rewards/rejected": -0.7019528150558472, "step": 6530 }, { "epoch": 0.6718412665775676, "grad_norm": 30.625, "learning_rate": 3.6463331048663466e-07, "logits/chosen": -0.5722657442092896, "logits/rejected": -0.5967468619346619, "logps/chosen": -43.173912048339844, "logps/rejected": -51.23271942138672, "loss": 0.2675, "rewards/accuracies": 0.875, "rewards/chosen": 0.5815714597702026, "rewards/margins": 1.4686858654022217, "rewards/rejected": -0.887114405632019, "step": 6535 }, { "epoch": 0.6723552996812995, "grad_norm": 30.75, "learning_rate": 3.6406214302033354e-07, "logits/chosen": -0.5180428624153137, "logits/rejected": -0.5098180174827576, "logps/chosen": -35.39967346191406, "logps/rejected": -47.0742301940918, "loss": 0.3104, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6600988507270813, "rewards/margins": 1.2451931238174438, "rewards/rejected": -0.5850942730903625, "step": 6540 }, { "epoch": 0.6728693327850314, "grad_norm": 38.25, "learning_rate": 3.634909755540324e-07, "logits/chosen": -0.5615891814231873, "logits/rejected": -0.5824823975563049, "logps/chosen": -37.79671096801758, "logps/rejected": -49.03508377075195, "loss": 0.2816, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6197007298469543, "rewards/margins": 1.4490668773651123, "rewards/rejected": -0.8293660879135132, "step": 6545 }, { "epoch": 0.6733833658887632, "grad_norm": 28.125, "learning_rate": 3.629198080877313e-07, "logits/chosen": -0.6513224244117737, "logits/rejected": -0.6893632411956787, "logps/chosen": -41.88184356689453, "logps/rejected": -49.871273040771484, "loss": 0.3388, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7837417125701904, "rewards/margins": 1.425989031791687, "rewards/rejected": -0.6422473788261414, "step": 6550 }, { "epoch": 0.6738973989924951, "grad_norm": 30.125, "learning_rate": 3.6234864062143017e-07, "logits/chosen": -0.5086066126823425, "logits/rejected": -0.5081312656402588, "logps/chosen": -37.50717544555664, "logps/rejected": -49.47705841064453, "loss": 0.3191, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.711792528629303, "rewards/margins": 1.5098965167999268, "rewards/rejected": -0.7981040477752686, "step": 6555 }, { "epoch": 0.674411432096227, "grad_norm": 42.5, "learning_rate": 3.617774731551291e-07, "logits/chosen": -0.5799996256828308, "logits/rejected": -0.5951187610626221, "logps/chosen": -39.906654357910156, "logps/rejected": -53.93434524536133, "loss": 0.3256, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8272415995597839, "rewards/margins": 1.696748971939087, "rewards/rejected": -0.8695074319839478, "step": 6560 }, { "epoch": 0.6749254651999589, "grad_norm": 27.25, "learning_rate": 3.612063056888279e-07, "logits/chosen": -0.6037696599960327, "logits/rejected": -0.5562019944190979, "logps/chosen": -38.672271728515625, "logps/rejected": -54.60719680786133, "loss": 0.3459, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5870368480682373, "rewards/margins": 1.6171869039535522, "rewards/rejected": -1.0301501750946045, "step": 6565 }, { "epoch": 0.6754394983036908, "grad_norm": 29.5, "learning_rate": 3.6063513822252686e-07, "logits/chosen": -0.5118308067321777, "logits/rejected": -0.5575879812240601, "logps/chosen": -36.132755279541016, "logps/rejected": -48.3232536315918, "loss": 0.312, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.748661994934082, "rewards/margins": 1.4837398529052734, "rewards/rejected": -0.735077977180481, "step": 6570 }, { "epoch": 0.6759535314074226, "grad_norm": 24.75, "learning_rate": 3.600639707562257e-07, "logits/chosen": -0.5420844554901123, "logits/rejected": -0.5051285028457642, "logps/chosen": -41.466487884521484, "logps/rejected": -51.25425338745117, "loss": 0.3116, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7314370274543762, "rewards/margins": 1.5419714450836182, "rewards/rejected": -0.8105343580245972, "step": 6575 }, { "epoch": 0.6764675645111545, "grad_norm": 26.125, "learning_rate": 3.594928032899246e-07, "logits/chosen": -0.5936631560325623, "logits/rejected": -0.5916523933410645, "logps/chosen": -39.508888244628906, "logps/rejected": -54.351844787597656, "loss": 0.3144, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8929344415664673, "rewards/margins": 1.7534013986587524, "rewards/rejected": -0.8604669570922852, "step": 6580 }, { "epoch": 0.6769815976148864, "grad_norm": 29.0, "learning_rate": 3.5892163582362343e-07, "logits/chosen": -0.5652129054069519, "logits/rejected": -0.4936315417289734, "logps/chosen": -39.96804428100586, "logps/rejected": -53.264625549316406, "loss": 0.3003, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7689300775527954, "rewards/margins": 1.6737182140350342, "rewards/rejected": -0.9047881364822388, "step": 6585 }, { "epoch": 0.6774956307186183, "grad_norm": 26.25, "learning_rate": 3.5835046835732236e-07, "logits/chosen": -0.6199600100517273, "logits/rejected": -0.6493330001831055, "logps/chosen": -45.820823669433594, "logps/rejected": -54.538597106933594, "loss": 0.3072, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8409161567687988, "rewards/margins": 1.794281005859375, "rewards/rejected": -0.9533647298812866, "step": 6590 }, { "epoch": 0.6780096638223502, "grad_norm": 29.875, "learning_rate": 3.577793008910212e-07, "logits/chosen": -0.5204852819442749, "logits/rejected": -0.6115708947181702, "logps/chosen": -34.61149215698242, "logps/rejected": -51.38554763793945, "loss": 0.3081, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8593713641166687, "rewards/margins": 1.6514278650283813, "rewards/rejected": -0.7920565605163574, "step": 6595 }, { "epoch": 0.678523696926082, "grad_norm": 34.25, "learning_rate": 3.572081334247201e-07, "logits/chosen": -0.5616456270217896, "logits/rejected": -0.6001805067062378, "logps/chosen": -39.2794189453125, "logps/rejected": -52.091636657714844, "loss": 0.3104, "rewards/accuracies": 0.875, "rewards/chosen": 0.6774304509162903, "rewards/margins": 1.4270045757293701, "rewards/rejected": -0.7495741248130798, "step": 6600 }, { "epoch": 0.6790377300298139, "grad_norm": 32.5, "learning_rate": 3.5663696595841894e-07, "logits/chosen": -0.5112361907958984, "logits/rejected": -0.6284658908843994, "logps/chosen": -35.90294647216797, "logps/rejected": -50.24280548095703, "loss": 0.3313, "rewards/accuracies": 0.875, "rewards/chosen": 0.7981746792793274, "rewards/margins": 1.5815551280975342, "rewards/rejected": -0.783380389213562, "step": 6605 }, { "epoch": 0.6795517631335458, "grad_norm": 28.5, "learning_rate": 3.5606579849211787e-07, "logits/chosen": -0.5898119211196899, "logits/rejected": -0.5924276113510132, "logps/chosen": -38.246585845947266, "logps/rejected": -57.51323699951172, "loss": 0.3163, "rewards/accuracies": 0.875, "rewards/chosen": 0.6650969386100769, "rewards/margins": 1.7132943868637085, "rewards/rejected": -1.0481973886489868, "step": 6610 }, { "epoch": 0.6800657962372777, "grad_norm": 43.5, "learning_rate": 3.5549463102581675e-07, "logits/chosen": -0.5780328512191772, "logits/rejected": -0.5259068012237549, "logps/chosen": -37.61781692504883, "logps/rejected": -53.37681198120117, "loss": 0.3219, "rewards/accuracies": 0.875, "rewards/chosen": 0.6395431756973267, "rewards/margins": 1.440044641494751, "rewards/rejected": -0.8005015254020691, "step": 6615 }, { "epoch": 0.6805798293410096, "grad_norm": 24.5, "learning_rate": 3.5492346355951563e-07, "logits/chosen": -0.6024438142776489, "logits/rejected": -0.6190939545631409, "logps/chosen": -36.69843292236328, "logps/rejected": -48.2143440246582, "loss": 0.3015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8209223747253418, "rewards/margins": 1.6534942388534546, "rewards/rejected": -0.8325718641281128, "step": 6620 }, { "epoch": 0.6810938624447415, "grad_norm": 35.75, "learning_rate": 3.5435229609321456e-07, "logits/chosen": -0.5678987503051758, "logits/rejected": -0.6775773763656616, "logps/chosen": -35.709964752197266, "logps/rejected": -50.27495193481445, "loss": 0.3298, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0864521265029907, "rewards/margins": 1.3982229232788086, "rewards/rejected": -0.3117709457874298, "step": 6625 }, { "epoch": 0.6816078955484733, "grad_norm": 27.25, "learning_rate": 3.537811286269134e-07, "logits/chosen": -0.5941759347915649, "logits/rejected": -0.6060600876808167, "logps/chosen": -42.61265182495117, "logps/rejected": -49.81741714477539, "loss": 0.329, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6238235235214233, "rewards/margins": 1.2173569202423096, "rewards/rejected": -0.5935332179069519, "step": 6630 }, { "epoch": 0.6821219286522052, "grad_norm": 28.75, "learning_rate": 3.532099611606123e-07, "logits/chosen": -0.5890936255455017, "logits/rejected": -0.620055615901947, "logps/chosen": -36.61260223388672, "logps/rejected": -50.851715087890625, "loss": 0.3356, "rewards/accuracies": 0.875, "rewards/chosen": 0.7722662091255188, "rewards/margins": 1.687059760093689, "rewards/rejected": -0.9147934913635254, "step": 6635 }, { "epoch": 0.6826359617559371, "grad_norm": 34.0, "learning_rate": 3.5263879369431114e-07, "logits/chosen": -0.4890469014644623, "logits/rejected": -0.5519613027572632, "logps/chosen": -39.10005569458008, "logps/rejected": -52.543785095214844, "loss": 0.3194, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.727967381477356, "rewards/margins": 1.6779181957244873, "rewards/rejected": -0.9499509930610657, "step": 6640 }, { "epoch": 0.683149994859669, "grad_norm": 47.75, "learning_rate": 3.5206762622801007e-07, "logits/chosen": -0.6216936111450195, "logits/rejected": -0.5856636166572571, "logps/chosen": -36.68206024169922, "logps/rejected": -49.93891143798828, "loss": 0.3507, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9637151956558228, "rewards/margins": 1.6084181070327759, "rewards/rejected": -0.6447029113769531, "step": 6645 }, { "epoch": 0.6836640279634009, "grad_norm": 53.75, "learning_rate": 3.514964587617089e-07, "logits/chosen": -0.5554354190826416, "logits/rejected": -0.6098548769950867, "logps/chosen": -38.34941864013672, "logps/rejected": -57.863380432128906, "loss": 0.3212, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8149875402450562, "rewards/margins": 2.0009145736694336, "rewards/rejected": -1.185927152633667, "step": 6650 }, { "epoch": 0.6841780610671327, "grad_norm": 30.5, "learning_rate": 3.509252912954078e-07, "logits/chosen": -0.5229849219322205, "logits/rejected": -0.5839934945106506, "logps/chosen": -42.180328369140625, "logps/rejected": -54.759239196777344, "loss": 0.3119, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8458555340766907, "rewards/margins": 1.7836545705795288, "rewards/rejected": -0.9377990961074829, "step": 6655 }, { "epoch": 0.6846920941708646, "grad_norm": 29.5, "learning_rate": 3.503541238291067e-07, "logits/chosen": -0.5861266851425171, "logits/rejected": -0.6555637121200562, "logps/chosen": -40.5758171081543, "logps/rejected": -53.99775314331055, "loss": 0.298, "rewards/accuracies": 0.875, "rewards/chosen": 0.738853394985199, "rewards/margins": 1.779119849205017, "rewards/rejected": -1.0402661561965942, "step": 6660 }, { "epoch": 0.6852061272745965, "grad_norm": 29.25, "learning_rate": 3.497829563628056e-07, "logits/chosen": -0.7007829546928406, "logits/rejected": -0.6621405482292175, "logps/chosen": -40.73734664916992, "logps/rejected": -56.580101013183594, "loss": 0.3229, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.9439202547073364, "rewards/margins": 1.5065797567367554, "rewards/rejected": -0.5626594424247742, "step": 6665 }, { "epoch": 0.6857201603783284, "grad_norm": 32.5, "learning_rate": 3.4921178889650445e-07, "logits/chosen": -0.5903002023696899, "logits/rejected": -0.6492711305618286, "logps/chosen": -38.76734161376953, "logps/rejected": -59.79999542236328, "loss": 0.3108, "rewards/accuracies": 0.875, "rewards/chosen": 0.692867636680603, "rewards/margins": 1.635634183883667, "rewards/rejected": -0.942766547203064, "step": 6670 }, { "epoch": 0.6862341934820603, "grad_norm": 27.125, "learning_rate": 3.4864062143020333e-07, "logits/chosen": -0.5169321894645691, "logits/rejected": -0.5934115052223206, "logps/chosen": -38.1980094909668, "logps/rejected": -53.37287521362305, "loss": 0.3272, "rewards/accuracies": 0.875, "rewards/chosen": 0.9040005803108215, "rewards/margins": 1.7927945852279663, "rewards/rejected": -0.8887939453125, "step": 6675 }, { "epoch": 0.6867482265857922, "grad_norm": 24.875, "learning_rate": 3.480694539639022e-07, "logits/chosen": -0.5220983624458313, "logits/rejected": -0.5901752710342407, "logps/chosen": -36.7931022644043, "logps/rejected": -53.47283172607422, "loss": 0.3092, "rewards/accuracies": 0.875, "rewards/chosen": 0.7243099212646484, "rewards/margins": 1.7529271841049194, "rewards/rejected": -1.0286173820495605, "step": 6680 }, { "epoch": 0.687262259689524, "grad_norm": 48.75, "learning_rate": 3.474982864976011e-07, "logits/chosen": -0.663009524345398, "logits/rejected": -0.7098935842514038, "logps/chosen": -35.26447296142578, "logps/rejected": -47.884742736816406, "loss": 0.3303, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0203001499176025, "rewards/margins": 1.7233541011810303, "rewards/rejected": -0.7030541896820068, "step": 6685 }, { "epoch": 0.6877762927932559, "grad_norm": 31.875, "learning_rate": 3.4692711903129996e-07, "logits/chosen": -0.5702277421951294, "logits/rejected": -0.6165934205055237, "logps/chosen": -32.28583526611328, "logps/rejected": -50.946937561035156, "loss": 0.3513, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7173754572868347, "rewards/margins": 1.4496291875839233, "rewards/rejected": -0.732253909111023, "step": 6690 }, { "epoch": 0.6882903258969878, "grad_norm": 44.75, "learning_rate": 3.4635595156499884e-07, "logits/chosen": -0.6045598983764648, "logits/rejected": -0.6391546726226807, "logps/chosen": -37.41290283203125, "logps/rejected": -52.243194580078125, "loss": 0.3228, "rewards/accuracies": 0.875, "rewards/chosen": 0.7207907438278198, "rewards/margins": 1.7799991369247437, "rewards/rejected": -1.0592083930969238, "step": 6695 }, { "epoch": 0.6888043590007197, "grad_norm": 30.125, "learning_rate": 3.457847840986977e-07, "logits/chosen": -0.5524998903274536, "logits/rejected": -0.6069918870925903, "logps/chosen": -35.98689270019531, "logps/rejected": -52.0197868347168, "loss": 0.3037, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8061431050300598, "rewards/margins": 1.6679664850234985, "rewards/rejected": -0.8618232607841492, "step": 6700 }, { "epoch": 0.6893183921044516, "grad_norm": 28.75, "learning_rate": 3.4521361663239664e-07, "logits/chosen": -0.6683164834976196, "logits/rejected": -0.6564679741859436, "logps/chosen": -35.41541290283203, "logps/rejected": -50.31007385253906, "loss": 0.3165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6282873749732971, "rewards/margins": 1.0867345333099365, "rewards/rejected": -0.458447128534317, "step": 6705 }, { "epoch": 0.6898324252081834, "grad_norm": 29.625, "learning_rate": 3.4464244916609547e-07, "logits/chosen": -0.5702388286590576, "logits/rejected": -0.585209846496582, "logps/chosen": -34.55183410644531, "logps/rejected": -47.72419357299805, "loss": 0.2986, "rewards/accuracies": 0.875, "rewards/chosen": 0.8095976114273071, "rewards/margins": 1.5325721502304077, "rewards/rejected": -0.722974419593811, "step": 6710 }, { "epoch": 0.6903464583119153, "grad_norm": 56.5, "learning_rate": 3.440712816997944e-07, "logits/chosen": -0.5877346992492676, "logits/rejected": -0.6263354420661926, "logps/chosen": -37.67417526245117, "logps/rejected": -47.187652587890625, "loss": 0.3078, "rewards/accuracies": 0.875, "rewards/chosen": 0.8339808583259583, "rewards/margins": 1.6695665121078491, "rewards/rejected": -0.8355857133865356, "step": 6715 }, { "epoch": 0.6908604914156472, "grad_norm": 48.25, "learning_rate": 3.435001142334932e-07, "logits/chosen": -0.527840793132782, "logits/rejected": -0.6339670419692993, "logps/chosen": -35.10261154174805, "logps/rejected": -52.88445281982422, "loss": 0.3358, "rewards/accuracies": 0.875, "rewards/chosen": 0.8337764739990234, "rewards/margins": 1.830946922302246, "rewards/rejected": -0.997170627117157, "step": 6720 }, { "epoch": 0.6913745245193791, "grad_norm": 31.125, "learning_rate": 3.4292894676719215e-07, "logits/chosen": -0.540868878364563, "logits/rejected": -0.5006425380706787, "logps/chosen": -35.649017333984375, "logps/rejected": -46.87302017211914, "loss": 0.3511, "rewards/accuracies": 0.875, "rewards/chosen": 0.8310689926147461, "rewards/margins": 1.420620083808899, "rewards/rejected": -0.5895511507987976, "step": 6725 }, { "epoch": 0.691888557623111, "grad_norm": 29.125, "learning_rate": 3.42357779300891e-07, "logits/chosen": -0.5784572958946228, "logits/rejected": -0.6057882905006409, "logps/chosen": -42.641483306884766, "logps/rejected": -48.29700469970703, "loss": 0.3136, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.102770209312439, "rewards/margins": 1.5153719186782837, "rewards/rejected": -0.4126017987728119, "step": 6730 }, { "epoch": 0.6924025907268428, "grad_norm": 40.75, "learning_rate": 3.417866118345899e-07, "logits/chosen": -0.6181451082229614, "logits/rejected": -0.5815081596374512, "logps/chosen": -35.576725006103516, "logps/rejected": -45.120361328125, "loss": 0.3722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6182773113250732, "rewards/margins": 1.1997700929641724, "rewards/rejected": -0.5814928412437439, "step": 6735 }, { "epoch": 0.6929166238305747, "grad_norm": 29.375, "learning_rate": 3.4121544436828873e-07, "logits/chosen": -0.588456928730011, "logits/rejected": -0.5520392656326294, "logps/chosen": -34.53407287597656, "logps/rejected": -47.22511291503906, "loss": 0.3301, "rewards/accuracies": 0.875, "rewards/chosen": 0.6390758156776428, "rewards/margins": 1.6187069416046143, "rewards/rejected": -0.9796310663223267, "step": 6740 }, { "epoch": 0.6934306569343066, "grad_norm": 28.875, "learning_rate": 3.4064427690198766e-07, "logits/chosen": -0.5775912404060364, "logits/rejected": -0.6420048475265503, "logps/chosen": -38.3358268737793, "logps/rejected": -52.78057098388672, "loss": 0.3085, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.088478684425354, "rewards/margins": 1.8256229162216187, "rewards/rejected": -0.7371443510055542, "step": 6745 }, { "epoch": 0.6939446900380385, "grad_norm": 27.875, "learning_rate": 3.400731094356865e-07, "logits/chosen": -0.5799140334129333, "logits/rejected": -0.6396588087081909, "logps/chosen": -41.696495056152344, "logps/rejected": -56.32643508911133, "loss": 0.315, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5499870777130127, "rewards/margins": 1.546396017074585, "rewards/rejected": -0.9964089393615723, "step": 6750 }, { "epoch": 0.6944587231417704, "grad_norm": 33.5, "learning_rate": 3.395019419693854e-07, "logits/chosen": -0.5350148677825928, "logits/rejected": -0.6038007736206055, "logps/chosen": -38.725643157958984, "logps/rejected": -55.00359344482422, "loss": 0.3191, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8902837634086609, "rewards/margins": 1.7184925079345703, "rewards/rejected": -0.8282088041305542, "step": 6755 }, { "epoch": 0.6949727562455023, "grad_norm": 35.5, "learning_rate": 3.389307745030843e-07, "logits/chosen": -0.676551878452301, "logits/rejected": -0.6780543327331543, "logps/chosen": -32.66019058227539, "logps/rejected": -48.91490936279297, "loss": 0.3668, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8395546674728394, "rewards/margins": 1.604971170425415, "rewards/rejected": -0.7654162645339966, "step": 6760 }, { "epoch": 0.6954867893492341, "grad_norm": 57.25, "learning_rate": 3.3835960703678317e-07, "logits/chosen": -0.5554944276809692, "logits/rejected": -0.5599928498268127, "logps/chosen": -39.4140510559082, "logps/rejected": -47.80232620239258, "loss": 0.3231, "rewards/accuracies": 0.875, "rewards/chosen": 0.76838219165802, "rewards/margins": 1.205732822418213, "rewards/rejected": -0.43735066056251526, "step": 6765 }, { "epoch": 0.696000822452966, "grad_norm": 34.75, "learning_rate": 3.3778843957048205e-07, "logits/chosen": -0.5398467779159546, "logits/rejected": -0.5656562447547913, "logps/chosen": -38.90705490112305, "logps/rejected": -43.6827392578125, "loss": 0.3195, "rewards/accuracies": 0.75, "rewards/chosen": 0.8933572769165039, "rewards/margins": 0.9915605783462524, "rewards/rejected": -0.09820310026407242, "step": 6770 }, { "epoch": 0.6965148555566979, "grad_norm": 28.25, "learning_rate": 3.372172721041809e-07, "logits/chosen": -0.5896262526512146, "logits/rejected": -0.6380261182785034, "logps/chosen": -40.84934616088867, "logps/rejected": -49.416114807128906, "loss": 0.3022, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5170515775680542, "rewards/margins": 1.2770617008209229, "rewards/rejected": -0.7600101828575134, "step": 6775 }, { "epoch": 0.6970288886604298, "grad_norm": 26.625, "learning_rate": 3.366461046378798e-07, "logits/chosen": -0.51703941822052, "logits/rejected": -0.5885367393493652, "logps/chosen": -38.0942497253418, "logps/rejected": -54.71366500854492, "loss": 0.307, "rewards/accuracies": 0.875, "rewards/chosen": 0.88177889585495, "rewards/margins": 1.6648868322372437, "rewards/rejected": -0.7831080555915833, "step": 6780 }, { "epoch": 0.6975429217641617, "grad_norm": 29.375, "learning_rate": 3.360749371715787e-07, "logits/chosen": -0.6384421586990356, "logits/rejected": -0.5917397737503052, "logps/chosen": -43.598533630371094, "logps/rejected": -56.81514358520508, "loss": 0.3303, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7300888895988464, "rewards/margins": 1.6092250347137451, "rewards/rejected": -0.8791363835334778, "step": 6785 }, { "epoch": 0.6980569548678935, "grad_norm": 49.5, "learning_rate": 3.3550376970527756e-07, "logits/chosen": -0.5381984710693359, "logits/rejected": -0.5865342617034912, "logps/chosen": -38.4866828918457, "logps/rejected": -52.34387969970703, "loss": 0.3496, "rewards/accuracies": 0.875, "rewards/chosen": 0.7458866834640503, "rewards/margins": 1.577416181564331, "rewards/rejected": -0.831529438495636, "step": 6790 }, { "epoch": 0.6985709879716254, "grad_norm": 37.25, "learning_rate": 3.3493260223897643e-07, "logits/chosen": -0.6160122752189636, "logits/rejected": -0.6052639484405518, "logps/chosen": -41.552879333496094, "logps/rejected": -46.9583625793457, "loss": 0.3231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8775633573532104, "rewards/margins": 1.298734426498413, "rewards/rejected": -0.42117100954055786, "step": 6795 }, { "epoch": 0.6990850210753573, "grad_norm": 27.875, "learning_rate": 3.343614347726753e-07, "logits/chosen": -0.5876093506813049, "logits/rejected": -0.5966060161590576, "logps/chosen": -40.016353607177734, "logps/rejected": -53.236671447753906, "loss": 0.2948, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.636778712272644, "rewards/margins": 1.430797815322876, "rewards/rejected": -0.7940191626548767, "step": 6800 }, { "epoch": 0.6995990541790892, "grad_norm": 28.375, "learning_rate": 3.3379026730637424e-07, "logits/chosen": -0.5421963930130005, "logits/rejected": -0.5991467237472534, "logps/chosen": -32.782752990722656, "logps/rejected": -48.24341583251953, "loss": 0.3325, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7780217528343201, "rewards/margins": 1.3973767757415771, "rewards/rejected": -0.6193551421165466, "step": 6805 }, { "epoch": 0.7001130872828211, "grad_norm": 32.0, "learning_rate": 3.3321909984007307e-07, "logits/chosen": -0.5691710710525513, "logits/rejected": -0.6164656281471252, "logps/chosen": -41.71488571166992, "logps/rejected": -52.84794235229492, "loss": 0.3461, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5266071557998657, "rewards/margins": 1.204559564590454, "rewards/rejected": -0.6779525876045227, "step": 6810 }, { "epoch": 0.7002158939035674, "eval_logits/chosen": -0.5375662446022034, "eval_logits/rejected": -0.5977898836135864, "eval_logps/chosen": -77.81807708740234, "eval_logps/rejected": -55.162235260009766, "eval_loss": 0.3146648108959198, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.7090535759925842, "eval_rewards/margins": 1.5910354852676392, "eval_rewards/rejected": -0.8819818496704102, "eval_runtime": 2.1187, "eval_samples_per_second": 505.023, "eval_steps_per_second": 8.024, "step": 6811 } ], "logging_steps": 5, "max_steps": 9727, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 973, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }