diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6297 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 4164, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007204610951008645, + "grad_norm": 14.58157977905889, + "learning_rate": 1.199040767386091e-10, + "logits/chosen": -1.901450514793396, + "logits/rejected": -1.9076323509216309, + "logps/chosen": -0.8524526953697205, + "logps/rejected": -0.9626365900039673, + "loss": 1.1927, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.704905390739441, + "rewards/margins": 0.22036786377429962, + "rewards/rejected": -1.9252731800079346, + "step": 1 + }, + { + "epoch": 0.007204610951008645, + "grad_norm": 17.709463159121455, + "learning_rate": 1.199040767386091e-09, + "logits/chosen": -2.020684242248535, + "logits/rejected": -2.0064282417297363, + "logps/chosen": -1.0048482418060303, + "logps/rejected": -1.1098697185516357, + "loss": 1.216, + "rewards/accuracies": 0.5208333134651184, + "rewards/chosen": -2.0096964836120605, + "rewards/margins": 0.21004274487495422, + "rewards/rejected": -2.2197394371032715, + "step": 10 + }, + { + "epoch": 0.01440922190201729, + "grad_norm": 22.640302051500377, + "learning_rate": 2.398081534772182e-09, + "logits/chosen": -2.021089792251587, + "logits/rejected": -2.0176689624786377, + "logps/chosen": -1.0516496896743774, + "logps/rejected": -1.1834802627563477, + "loss": 1.1858, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.103299379348755, + "rewards/margins": 0.26366108655929565, + "rewards/rejected": -2.3669605255126953, + "step": 20 + }, + { + "epoch": 0.021613832853025938, + "grad_norm": 17.8606028438409, + "learning_rate": 3.597122302158273e-09, + "logits/chosen": -1.9866092205047607, + "logits/rejected": -1.9793494939804077, + "logps/chosen": -1.0540482997894287, + "logps/rejected": -1.1519711017608643, + "loss": 1.2346, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1080965995788574, + "rewards/margins": 0.19584545493125916, + "rewards/rejected": -2.3039422035217285, + "step": 30 + }, + { + "epoch": 0.02881844380403458, + "grad_norm": 19.245572130250604, + "learning_rate": 4.796163069544364e-09, + "logits/chosen": -2.0317888259887695, + "logits/rejected": -2.031811475753784, + "logps/chosen": -1.0351777076721191, + "logps/rejected": -1.136722207069397, + "loss": 1.2355, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0703554153442383, + "rewards/margins": 0.20308911800384521, + "rewards/rejected": -2.273444414138794, + "step": 40 + }, + { + "epoch": 0.03602305475504323, + "grad_norm": 14.943806509066846, + "learning_rate": 5.995203836930456e-09, + "logits/chosen": -1.9625627994537354, + "logits/rejected": -1.9631847143173218, + "logps/chosen": -0.9414892196655273, + "logps/rejected": -1.007533311843872, + "loss": 1.2547, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.8829784393310547, + "rewards/margins": 0.13208839297294617, + "rewards/rejected": -2.015066623687744, + "step": 50 + }, + { + "epoch": 0.043227665706051875, + "grad_norm": 21.528231741291215, + "learning_rate": 7.194244604316546e-09, + "logits/chosen": -2.033930778503418, + "logits/rejected": -2.0294690132141113, + "logps/chosen": -1.0896106958389282, + "logps/rejected": -1.1459602117538452, + "loss": 1.2679, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.1792213916778564, + "rewards/margins": 0.1126992255449295, + "rewards/rejected": -2.2919204235076904, + "step": 60 + }, + { + "epoch": 0.05043227665706052, + "grad_norm": 20.70296936549822, + "learning_rate": 8.393285371702639e-09, + "logits/chosen": -2.0241129398345947, + "logits/rejected": -2.0117270946502686, + "logps/chosen": -1.1098978519439697, + "logps/rejected": -1.204820156097412, + "loss": 1.2271, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2197957038879395, + "rewards/margins": 0.1898445188999176, + "rewards/rejected": -2.409640312194824, + "step": 70 + }, + { + "epoch": 0.05763688760806916, + "grad_norm": 24.40623296093575, + "learning_rate": 9.592326139088728e-09, + "logits/chosen": -2.0398144721984863, + "logits/rejected": -2.036891222000122, + "logps/chosen": -1.1656566858291626, + "logps/rejected": -1.237831473350525, + "loss": 1.2527, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.331313371658325, + "rewards/margins": 0.14434944093227386, + "rewards/rejected": -2.47566294670105, + "step": 80 + }, + { + "epoch": 0.06484149855907781, + "grad_norm": 15.525751311455734, + "learning_rate": 1.0791366906474819e-08, + "logits/chosen": -2.0057613849639893, + "logits/rejected": -2.0072615146636963, + "logps/chosen": -1.0418776273727417, + "logps/rejected": -1.1488852500915527, + "loss": 1.215, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.0837552547454834, + "rewards/margins": 0.21401505172252655, + "rewards/rejected": -2.2977705001831055, + "step": 90 + }, + { + "epoch": 0.07204610951008646, + "grad_norm": 19.01739570575657, + "learning_rate": 1.1990407673860912e-08, + "logits/chosen": -2.0440549850463867, + "logits/rejected": -2.038007974624634, + "logps/chosen": -1.0073726177215576, + "logps/rejected": -1.114424467086792, + "loss": 1.2172, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0147452354431152, + "rewards/margins": 0.21410349011421204, + "rewards/rejected": -2.228848934173584, + "step": 100 + }, + { + "epoch": 0.0792507204610951, + "grad_norm": 16.468864603689383, + "learning_rate": 1.3189448441247003e-08, + "logits/chosen": -1.986783742904663, + "logits/rejected": -1.975547194480896, + "logps/chosen": -1.0294089317321777, + "logps/rejected": -1.1291263103485107, + "loss": 1.2279, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0588178634643555, + "rewards/margins": 0.19943459331989288, + "rewards/rejected": -2.2582526206970215, + "step": 110 + }, + { + "epoch": 0.08645533141210375, + "grad_norm": 18.27069220463476, + "learning_rate": 1.4388489208633092e-08, + "logits/chosen": -1.9731948375701904, + "logits/rejected": -1.9713401794433594, + "logps/chosen": -0.9640307426452637, + "logps/rejected": -1.0653537511825562, + "loss": 1.2087, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9280614852905273, + "rewards/margins": 0.2026461362838745, + "rewards/rejected": -2.1307075023651123, + "step": 120 + }, + { + "epoch": 0.0936599423631124, + "grad_norm": 17.232187953156046, + "learning_rate": 1.5587529976019183e-08, + "logits/chosen": -2.066575527191162, + "logits/rejected": -2.065995931625366, + "logps/chosen": -1.0801920890808105, + "logps/rejected": -1.1521753072738647, + "loss": 1.2549, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.160384178161621, + "rewards/margins": 0.14396657049655914, + "rewards/rejected": -2.3043506145477295, + "step": 130 + }, + { + "epoch": 0.10086455331412104, + "grad_norm": 20.847348575081657, + "learning_rate": 1.6786570743405277e-08, + "logits/chosen": -1.9832985401153564, + "logits/rejected": -1.9769630432128906, + "logps/chosen": -0.9781940579414368, + "logps/rejected": -1.122657060623169, + "loss": 1.1694, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9563881158828735, + "rewards/margins": 0.2889261543750763, + "rewards/rejected": -2.245314121246338, + "step": 140 + }, + { + "epoch": 0.10806916426512968, + "grad_norm": 19.95238793204191, + "learning_rate": 1.7985611510791365e-08, + "logits/chosen": -1.9963840246200562, + "logits/rejected": -1.9920928478240967, + "logps/chosen": -1.0187867879867554, + "logps/rejected": -1.136918306350708, + "loss": 1.2067, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0375735759735107, + "rewards/margins": 0.23626303672790527, + "rewards/rejected": -2.273836612701416, + "step": 150 + }, + { + "epoch": 0.11527377521613832, + "grad_norm": 17.4507491502089, + "learning_rate": 1.9184652278177456e-08, + "logits/chosen": -2.00455904006958, + "logits/rejected": -1.9985454082489014, + "logps/chosen": -0.9479260444641113, + "logps/rejected": -1.0970423221588135, + "loss": 1.1509, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.8958520889282227, + "rewards/margins": 0.2982328534126282, + "rewards/rejected": -2.194084644317627, + "step": 160 + }, + { + "epoch": 0.12247838616714697, + "grad_norm": 22.64495005377011, + "learning_rate": 2.038369304556355e-08, + "logits/chosen": -2.0030527114868164, + "logits/rejected": -1.995448350906372, + "logps/chosen": -1.0368740558624268, + "logps/rejected": -1.1604634523391724, + "loss": 1.2057, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0737481117248535, + "rewards/margins": 0.24717874825000763, + "rewards/rejected": -2.3209269046783447, + "step": 170 + }, + { + "epoch": 0.12968299711815562, + "grad_norm": 23.590437364971006, + "learning_rate": 2.1582733812949638e-08, + "logits/chosen": -2.0346579551696777, + "logits/rejected": -2.027749538421631, + "logps/chosen": -1.020750641822815, + "logps/rejected": -1.1084620952606201, + "loss": 1.2476, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.04150128364563, + "rewards/margins": 0.17542308568954468, + "rewards/rejected": -2.2169241905212402, + "step": 180 + }, + { + "epoch": 0.13688760806916425, + "grad_norm": 22.966874261128403, + "learning_rate": 2.278177458033573e-08, + "logits/chosen": -2.073704719543457, + "logits/rejected": -2.0714824199676514, + "logps/chosen": -0.9697921872138977, + "logps/rejected": -1.065453290939331, + "loss": 1.212, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9395843744277954, + "rewards/margins": 0.19132229685783386, + "rewards/rejected": -2.130906581878662, + "step": 190 + }, + { + "epoch": 0.1440922190201729, + "grad_norm": 22.638490791764895, + "learning_rate": 2.3980815347721823e-08, + "logits/chosen": -2.0427424907684326, + "logits/rejected": -2.0397419929504395, + "logps/chosen": -1.0259110927581787, + "logps/rejected": -1.1529022455215454, + "loss": 1.1871, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0518221855163574, + "rewards/margins": 0.2539823353290558, + "rewards/rejected": -2.305804491043091, + "step": 200 + }, + { + "epoch": 0.15129682997118155, + "grad_norm": 21.113736148839788, + "learning_rate": 2.517985611510791e-08, + "logits/chosen": -2.0403473377227783, + "logits/rejected": -2.037600040435791, + "logps/chosen": -1.0739350318908691, + "logps/rejected": -1.150781273841858, + "loss": 1.2504, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1478700637817383, + "rewards/margins": 0.15369237959384918, + "rewards/rejected": -2.301562547683716, + "step": 210 + }, + { + "epoch": 0.1585014409221902, + "grad_norm": 15.482070000655302, + "learning_rate": 2.6378896882494006e-08, + "logits/chosen": -1.9863160848617554, + "logits/rejected": -1.982267141342163, + "logps/chosen": -1.0080206394195557, + "logps/rejected": -1.176837682723999, + "loss": 1.1505, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0160412788391113, + "rewards/margins": 0.3376340866088867, + "rewards/rejected": -2.353675365447998, + "step": 220 + }, + { + "epoch": 0.16570605187319884, + "grad_norm": 17.014637756082593, + "learning_rate": 2.7577937649880097e-08, + "logits/chosen": -2.021378993988037, + "logits/rejected": -2.021695613861084, + "logps/chosen": -1.0124410390853882, + "logps/rejected": -1.12635338306427, + "loss": 1.2019, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0248820781707764, + "rewards/margins": 0.22782447934150696, + "rewards/rejected": -2.25270676612854, + "step": 230 + }, + { + "epoch": 0.1729106628242075, + "grad_norm": 22.32772580016105, + "learning_rate": 2.8776978417266184e-08, + "logits/chosen": -2.0529181957244873, + "logits/rejected": -2.0477967262268066, + "logps/chosen": -1.0616161823272705, + "logps/rejected": -1.1394503116607666, + "loss": 1.2614, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.123232364654541, + "rewards/margins": 0.15566802024841309, + "rewards/rejected": -2.278900623321533, + "step": 240 + }, + { + "epoch": 0.18011527377521613, + "grad_norm": 19.079728631088813, + "learning_rate": 2.997601918465228e-08, + "logits/chosen": -1.9696033000946045, + "logits/rejected": -1.9657630920410156, + "logps/chosen": -1.0835182666778564, + "logps/rejected": -1.1734166145324707, + "loss": 1.2393, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.167036533355713, + "rewards/margins": 0.17979690432548523, + "rewards/rejected": -2.3468332290649414, + "step": 250 + }, + { + "epoch": 0.1873198847262248, + "grad_norm": 21.30398020890557, + "learning_rate": 3.1175059952038366e-08, + "logits/chosen": -1.9843509197235107, + "logits/rejected": -1.9924278259277344, + "logps/chosen": -1.1062877178192139, + "logps/rejected": -1.2165796756744385, + "loss": 1.2142, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.2125754356384277, + "rewards/margins": 0.22058391571044922, + "rewards/rejected": -2.433159351348877, + "step": 260 + }, + { + "epoch": 0.19452449567723343, + "grad_norm": 20.993622960377618, + "learning_rate": 3.237410071942446e-08, + "logits/chosen": -2.0651376247406006, + "logits/rejected": -2.0571722984313965, + "logps/chosen": -1.0719540119171143, + "logps/rejected": -1.2004284858703613, + "loss": 1.181, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.1439080238342285, + "rewards/margins": 0.2569490075111389, + "rewards/rejected": -2.4008569717407227, + "step": 270 + }, + { + "epoch": 0.2017291066282421, + "grad_norm": 25.067055659781758, + "learning_rate": 3.3573141486810555e-08, + "logits/chosen": -2.014195680618286, + "logits/rejected": -2.012540102005005, + "logps/chosen": -0.935396671295166, + "logps/rejected": -1.049852967262268, + "loss": 1.1977, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.870793342590332, + "rewards/margins": 0.2289123237133026, + "rewards/rejected": -2.099705934524536, + "step": 280 + }, + { + "epoch": 0.20893371757925072, + "grad_norm": 21.777264205916122, + "learning_rate": 3.477218225419664e-08, + "logits/chosen": -2.044172763824463, + "logits/rejected": -2.0461270809173584, + "logps/chosen": -1.0135643482208252, + "logps/rejected": -1.1082309484481812, + "loss": 1.2343, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0271286964416504, + "rewards/margins": 0.18933361768722534, + "rewards/rejected": -2.2164618968963623, + "step": 290 + }, + { + "epoch": 0.21613832853025935, + "grad_norm": 20.318543545834533, + "learning_rate": 3.597122302158273e-08, + "logits/chosen": -2.0240025520324707, + "logits/rejected": -2.0156774520874023, + "logps/chosen": -1.0902036428451538, + "logps/rejected": -1.1914021968841553, + "loss": 1.2135, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1804072856903076, + "rewards/margins": 0.20239713788032532, + "rewards/rejected": -2.3828043937683105, + "step": 300 + }, + { + "epoch": 0.22334293948126802, + "grad_norm": 18.50470861360763, + "learning_rate": 3.717026378896883e-08, + "logits/chosen": -1.9557920694351196, + "logits/rejected": -1.955775260925293, + "logps/chosen": -1.0874634981155396, + "logps/rejected": -1.1727240085601807, + "loss": 1.2381, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.174926996231079, + "rewards/margins": 0.17052076756954193, + "rewards/rejected": -2.3454480171203613, + "step": 310 + }, + { + "epoch": 0.23054755043227665, + "grad_norm": 15.935054480540096, + "learning_rate": 3.836930455635491e-08, + "logits/chosen": -2.031646966934204, + "logits/rejected": -2.0232386589050293, + "logps/chosen": -1.0084177255630493, + "logps/rejected": -1.1408658027648926, + "loss": 1.1926, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0168354511260986, + "rewards/margins": 0.2648962140083313, + "rewards/rejected": -2.281731605529785, + "step": 320 + }, + { + "epoch": 0.2377521613832853, + "grad_norm": 15.808626134367197, + "learning_rate": 3.9568345323741003e-08, + "logits/chosen": -2.019885778427124, + "logits/rejected": -2.022150754928589, + "logps/chosen": -1.0463831424713135, + "logps/rejected": -1.069990873336792, + "loss": 1.3364, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -2.092766284942627, + "rewards/margins": 0.04721563309431076, + "rewards/rejected": -2.139981746673584, + "step": 330 + }, + { + "epoch": 0.24495677233429394, + "grad_norm": 18.32115617252851, + "learning_rate": 4.07673860911271e-08, + "logits/chosen": -2.0614123344421387, + "logits/rejected": -2.055767297744751, + "logps/chosen": -1.0877503156661987, + "logps/rejected": -1.16796875, + "loss": 1.2366, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1755006313323975, + "rewards/margins": 0.16043710708618164, + "rewards/rejected": -2.3359375, + "step": 340 + }, + { + "epoch": 0.2521613832853026, + "grad_norm": 19.369790564686102, + "learning_rate": 4.1966426858513185e-08, + "logits/chosen": -1.9940099716186523, + "logits/rejected": -1.9883639812469482, + "logps/chosen": -0.9887149930000305, + "logps/rejected": -1.1154861450195312, + "loss": 1.1858, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.977429986000061, + "rewards/margins": 0.25354230403900146, + "rewards/rejected": -2.2309722900390625, + "step": 350 + }, + { + "epoch": 0.25936599423631124, + "grad_norm": 21.686526135721945, + "learning_rate": 4.3165467625899276e-08, + "logits/chosen": -1.9959064722061157, + "logits/rejected": -1.9917312860488892, + "logps/chosen": -1.0866310596466064, + "logps/rejected": -1.2025970220565796, + "loss": 1.1977, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.173262119293213, + "rewards/margins": 0.23193176090717316, + "rewards/rejected": -2.405194044113159, + "step": 360 + }, + { + "epoch": 0.2665706051873199, + "grad_norm": 18.21919999535183, + "learning_rate": 4.4364508393285374e-08, + "logits/chosen": -2.0002856254577637, + "logits/rejected": -2.000253200531006, + "logps/chosen": -1.0520254373550415, + "logps/rejected": -1.180267572402954, + "loss": 1.1778, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.104050874710083, + "rewards/margins": 0.2564844489097595, + "rewards/rejected": -2.360535144805908, + "step": 370 + }, + { + "epoch": 0.2737752161383285, + "grad_norm": 16.536106044001812, + "learning_rate": 4.556354916067146e-08, + "logits/chosen": -2.028313398361206, + "logits/rejected": -2.032285213470459, + "logps/chosen": -1.0125257968902588, + "logps/rejected": -1.0858430862426758, + "loss": 1.2682, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.0250515937805176, + "rewards/margins": 0.14663462340831757, + "rewards/rejected": -2.1716861724853516, + "step": 380 + }, + { + "epoch": 0.28097982708933716, + "grad_norm": 15.31773608533987, + "learning_rate": 4.676258992805755e-08, + "logits/chosen": -2.0320096015930176, + "logits/rejected": -2.0257675647735596, + "logps/chosen": -1.0224783420562744, + "logps/rejected": -1.1486625671386719, + "loss": 1.1819, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.044956684112549, + "rewards/margins": 0.25236865878105164, + "rewards/rejected": -2.2973251342773438, + "step": 390 + }, + { + "epoch": 0.2881844380403458, + "grad_norm": 18.995537958721503, + "learning_rate": 4.796163069544365e-08, + "logits/chosen": -2.034123420715332, + "logits/rejected": -2.034450054168701, + "logps/chosen": -0.9964189529418945, + "logps/rejected": -1.0486726760864258, + "loss": 1.2726, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.992837905883789, + "rewards/margins": 0.10450725257396698, + "rewards/rejected": -2.0973453521728516, + "step": 400 + }, + { + "epoch": 0.2953890489913545, + "grad_norm": 18.624392586338367, + "learning_rate": 4.916067146282973e-08, + "logits/chosen": -2.0277891159057617, + "logits/rejected": -2.0259571075439453, + "logps/chosen": -1.0748345851898193, + "logps/rejected": -1.1457411050796509, + "loss": 1.262, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.1496691703796387, + "rewards/margins": 0.14181289076805115, + "rewards/rejected": -2.2914822101593018, + "step": 410 + }, + { + "epoch": 0.3025936599423631, + "grad_norm": 16.76581954495512, + "learning_rate": 4.999992091672379e-08, + "logits/chosen": -2.011078119277954, + "logits/rejected": -2.0153493881225586, + "logps/chosen": -1.0450259447097778, + "logps/rejected": -1.1236448287963867, + "loss": 1.2425, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.0900518894195557, + "rewards/margins": 0.15723773837089539, + "rewards/rejected": -2.2472896575927734, + "step": 420 + }, + { + "epoch": 0.30979827089337175, + "grad_norm": 17.72733209255425, + "learning_rate": 4.999851500573209e-08, + "logits/chosen": -1.9903459548950195, + "logits/rejected": -1.991233229637146, + "logps/chosen": -1.0592777729034424, + "logps/rejected": -1.0997775793075562, + "loss": 1.3022, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -2.1185555458068848, + "rewards/margins": 0.08099973201751709, + "rewards/rejected": -2.1995551586151123, + "step": 430 + }, + { + "epoch": 0.3170028818443804, + "grad_norm": 15.96665018689344, + "learning_rate": 4.999535180235972e-08, + "logits/chosen": -1.990563988685608, + "logits/rejected": -1.9907207489013672, + "logps/chosen": -1.0212013721466064, + "logps/rejected": -1.1435030698776245, + "loss": 1.1959, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.042402744293213, + "rewards/margins": 0.2446034699678421, + "rewards/rejected": -2.287006139755249, + "step": 440 + }, + { + "epoch": 0.3242074927953891, + "grad_norm": 17.84897470512453, + "learning_rate": 4.9990431528966836e-08, + "logits/chosen": -2.010443925857544, + "logits/rejected": -2.006673574447632, + "logps/chosen": -1.1450097560882568, + "logps/rejected": -1.1849489212036133, + "loss": 1.3018, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.2900195121765137, + "rewards/margins": 0.07987822592258453, + "rewards/rejected": -2.3698978424072266, + "step": 450 + }, + { + "epoch": 0.3314121037463977, + "grad_norm": 24.49190807066052, + "learning_rate": 4.9983754531428326e-08, + "logits/chosen": -2.006472110748291, + "logits/rejected": -2.00079083442688, + "logps/chosen": -1.1708580255508423, + "logps/rejected": -1.2872368097305298, + "loss": 1.2012, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.3417160511016846, + "rewards/margins": 0.23275737464427948, + "rewards/rejected": -2.5744736194610596, + "step": 460 + }, + { + "epoch": 0.33861671469740634, + "grad_norm": 23.024434569130843, + "learning_rate": 4.997532127910954e-08, + "logits/chosen": -2.0429301261901855, + "logits/rejected": -2.0308475494384766, + "logps/chosen": -1.100434422492981, + "logps/rejected": -1.2019624710083008, + "loss": 1.2198, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.200868844985962, + "rewards/margins": 0.20305626094341278, + "rewards/rejected": -2.4039249420166016, + "step": 470 + }, + { + "epoch": 0.345821325648415, + "grad_norm": 21.129827787614413, + "learning_rate": 4.996513236483331e-08, + "logits/chosen": -2.101729154586792, + "logits/rejected": -2.091571569442749, + "logps/chosen": -0.9851749539375305, + "logps/rejected": -1.106676459312439, + "loss": 1.185, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.970349907875061, + "rewards/margins": 0.2430029660463333, + "rewards/rejected": -2.213352918624878, + "step": 480 + }, + { + "epoch": 0.3530259365994236, + "grad_norm": 18.94655048736081, + "learning_rate": 4.9953188504838225e-08, + "logits/chosen": -2.0206782817840576, + "logits/rejected": -2.0197720527648926, + "logps/chosen": -0.9880903959274292, + "logps/rejected": -1.1017425060272217, + "loss": 1.1937, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9761807918548584, + "rewards/margins": 0.2273043841123581, + "rewards/rejected": -2.2034850120544434, + "step": 490 + }, + { + "epoch": 0.36023054755043227, + "grad_norm": 18.60846892662722, + "learning_rate": 4.993949053872834e-08, + "logits/chosen": -2.019057035446167, + "logits/rejected": -2.0055313110351562, + "logps/chosen": -1.0131161212921143, + "logps/rejected": -1.139453649520874, + "loss": 1.1821, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0262322425842285, + "rewards/margins": 0.2526749074459076, + "rewards/rejected": -2.278907299041748, + "step": 500 + }, + { + "epoch": 0.36743515850144093, + "grad_norm": 19.18531858517567, + "learning_rate": 4.9924039429414086e-08, + "logits/chosen": -2.0883572101593018, + "logits/rejected": -2.0818283557891846, + "logps/chosen": -1.0440417528152466, + "logps/rejected": -1.1581791639328003, + "loss": 1.2079, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.088083505630493, + "rewards/margins": 0.22827525436878204, + "rewards/rejected": -2.3163583278656006, + "step": 510 + }, + { + "epoch": 0.3746397694524496, + "grad_norm": 16.068632795684866, + "learning_rate": 4.990683626304467e-08, + "logits/chosen": -2.010894775390625, + "logits/rejected": -2.0092484951019287, + "logps/chosen": -1.1070988178253174, + "logps/rejected": -1.2031704187393188, + "loss": 1.2198, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.2141976356506348, + "rewards/margins": 0.19214320182800293, + "rewards/rejected": -2.4063408374786377, + "step": 520 + }, + { + "epoch": 0.3818443804034582, + "grad_norm": 17.727178124609676, + "learning_rate": 4.9887882248931646e-08, + "logits/chosen": -1.97884202003479, + "logits/rejected": -1.968973159790039, + "logps/chosen": -0.9846093058586121, + "logps/rejected": -1.0614283084869385, + "loss": 1.2503, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.9692186117172241, + "rewards/margins": 0.15363821387290955, + "rewards/rejected": -2.122856616973877, + "step": 530 + }, + { + "epoch": 0.38904899135446686, + "grad_norm": 22.67722196494781, + "learning_rate": 4.986717871946393e-08, + "logits/chosen": -2.004068374633789, + "logits/rejected": -1.99717116355896, + "logps/chosen": -1.0308892726898193, + "logps/rejected": -1.1323744058609009, + "loss": 1.2209, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0617785453796387, + "rewards/margins": 0.20297034084796906, + "rewards/rejected": -2.2647488117218018, + "step": 540 + }, + { + "epoch": 0.3962536023054755, + "grad_norm": 17.281352424891857, + "learning_rate": 4.984472713001416e-08, + "logits/chosen": -1.9620494842529297, + "logits/rejected": -1.962517499923706, + "logps/chosen": -1.0005210638046265, + "logps/rejected": -1.0776532888412476, + "loss": 1.2683, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.001042127609253, + "rewards/margins": 0.1542646884918213, + "rewards/rejected": -2.155306577682495, + "step": 550 + }, + { + "epoch": 0.4034582132564842, + "grad_norm": 17.117159642375974, + "learning_rate": 4.982052905883637e-08, + "logits/chosen": -2.031991481781006, + "logits/rejected": -2.0326719284057617, + "logps/chosen": -1.080214262008667, + "logps/rejected": -1.181120753288269, + "loss": 1.224, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.160428524017334, + "rewards/margins": 0.2018129527568817, + "rewards/rejected": -2.362241506576538, + "step": 560 + }, + { + "epoch": 0.4106628242074928, + "grad_norm": 16.328895540705197, + "learning_rate": 4.979458620695505e-08, + "logits/chosen": -2.029468059539795, + "logits/rejected": -2.0152204036712646, + "logps/chosen": -1.0948221683502197, + "logps/rejected": -1.208194613456726, + "loss": 1.2094, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.1896443367004395, + "rewards/margins": 0.22674505412578583, + "rewards/rejected": -2.416389226913452, + "step": 570 + }, + { + "epoch": 0.41786743515850144, + "grad_norm": 19.61140460251683, + "learning_rate": 4.976690039804555e-08, + "logits/chosen": -2.033027172088623, + "logits/rejected": -2.0314948558807373, + "logps/chosen": -0.9877282381057739, + "logps/rejected": -1.0673277378082275, + "loss": 1.2473, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9754564762115479, + "rewards/margins": 0.1591992825269699, + "rewards/rejected": -2.134655475616455, + "step": 580 + }, + { + "epoch": 0.4250720461095101, + "grad_norm": 21.430631009789273, + "learning_rate": 4.973747357830592e-08, + "logits/chosen": -2.0215108394622803, + "logits/rejected": -2.021780490875244, + "logps/chosen": -1.0275431871414185, + "logps/rejected": -1.1647249460220337, + "loss": 1.1677, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.055086374282837, + "rewards/margins": 0.2743634283542633, + "rewards/rejected": -2.3294498920440674, + "step": 590 + }, + { + "epoch": 0.4322766570605187, + "grad_norm": 19.463998303694815, + "learning_rate": 4.970630781632009e-08, + "logits/chosen": -2.0801994800567627, + "logits/rejected": -2.076254367828369, + "logps/chosen": -1.0327340364456177, + "logps/rejected": -1.1751863956451416, + "loss": 1.1681, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0654680728912354, + "rewards/margins": 0.28490471839904785, + "rewards/rejected": -2.350372791290283, + "step": 600 + }, + { + "epoch": 0.43948126801152737, + "grad_norm": 21.00995063503415, + "learning_rate": 4.967340530291242e-08, + "logits/chosen": -2.027909517288208, + "logits/rejected": -2.0180211067199707, + "logps/chosen": -1.0928115844726562, + "logps/rejected": -1.1507136821746826, + "loss": 1.2682, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.1856231689453125, + "rewards/margins": 0.11580429971218109, + "rewards/rejected": -2.3014273643493652, + "step": 610 + }, + { + "epoch": 0.44668587896253603, + "grad_norm": 24.905225792062406, + "learning_rate": 4.9638768350993755e-08, + "logits/chosen": -2.0285048484802246, + "logits/rejected": -2.021249771118164, + "logps/chosen": -0.9952943921089172, + "logps/rejected": -1.0829205513000488, + "loss": 1.2345, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.9905887842178345, + "rewards/margins": 0.17525213956832886, + "rewards/rejected": -2.1658411026000977, + "step": 620 + }, + { + "epoch": 0.4538904899135447, + "grad_norm": 20.771750563160076, + "learning_rate": 4.9602399395398786e-08, + "logits/chosen": -2.0377490520477295, + "logits/rejected": -2.037675380706787, + "logps/chosen": -1.027521014213562, + "logps/rejected": -1.1547839641571045, + "loss": 1.183, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.055042028427124, + "rewards/margins": 0.2545255422592163, + "rewards/rejected": -2.309567928314209, + "step": 630 + }, + { + "epoch": 0.4610951008645533, + "grad_norm": 16.17835710154515, + "learning_rate": 4.9564300992714914e-08, + "logits/chosen": -1.9597883224487305, + "logits/rejected": -1.9607963562011719, + "logps/chosen": -1.0108855962753296, + "logps/rejected": -1.116549015045166, + "loss": 1.2101, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.021771192550659, + "rewards/margins": 0.21132683753967285, + "rewards/rejected": -2.233098030090332, + "step": 640 + }, + { + "epoch": 0.46829971181556196, + "grad_norm": 21.86769715087536, + "learning_rate": 4.952447582110253e-08, + "logits/chosen": -2.0587735176086426, + "logits/rejected": -2.044377565383911, + "logps/chosen": -1.0383652448654175, + "logps/rejected": -1.1178988218307495, + "loss": 1.2479, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.076730489730835, + "rewards/margins": 0.15906734764575958, + "rewards/rejected": -2.235797643661499, + "step": 650 + }, + { + "epoch": 0.4755043227665706, + "grad_norm": 23.755054747254476, + "learning_rate": 4.948292668010676e-08, + "logits/chosen": -2.031721353530884, + "logits/rejected": -2.032727003097534, + "logps/chosen": -1.0880773067474365, + "logps/rejected": -1.1748898029327393, + "loss": 1.2449, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.176154613494873, + "rewards/margins": 0.17362497746944427, + "rewards/rejected": -2.3497796058654785, + "step": 660 + }, + { + "epoch": 0.4827089337175792, + "grad_norm": 20.474460354625247, + "learning_rate": 4.943965649046064e-08, + "logits/chosen": -2.0048508644104004, + "logits/rejected": -1.9955081939697266, + "logps/chosen": -1.062713384628296, + "logps/rejected": -1.1663198471069336, + "loss": 1.2154, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.125426769256592, + "rewards/margins": 0.2072126865386963, + "rewards/rejected": -2.332639694213867, + "step": 670 + }, + { + "epoch": 0.4899135446685879, + "grad_norm": 19.048186528049722, + "learning_rate": 4.9394668293879835e-08, + "logits/chosen": -1.959315538406372, + "logits/rejected": -1.9503145217895508, + "logps/chosen": -1.0368311405181885, + "logps/rejected": -1.1063206195831299, + "loss": 1.2624, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.073662281036377, + "rewards/margins": 0.13897888362407684, + "rewards/rejected": -2.2126412391662598, + "step": 680 + }, + { + "epoch": 0.49711815561959655, + "grad_norm": 24.933354819026505, + "learning_rate": 4.93479652528488e-08, + "logits/chosen": -2.020735502243042, + "logits/rejected": -2.0154590606689453, + "logps/chosen": -1.1052331924438477, + "logps/rejected": -1.209161639213562, + "loss": 1.2262, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.2104663848876953, + "rewards/margins": 0.2078566551208496, + "rewards/rejected": -2.418323278427124, + "step": 690 + }, + { + "epoch": 0.5043227665706052, + "grad_norm": 20.317629206968732, + "learning_rate": 4.929955065039848e-08, + "logits/chosen": -2.0213494300842285, + "logits/rejected": -2.0158300399780273, + "logps/chosen": -1.0192697048187256, + "logps/rejected": -1.1514381170272827, + "loss": 1.1829, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.038539409637451, + "rewards/margins": 0.2643369436264038, + "rewards/rejected": -2.3028762340545654, + "step": 700 + }, + { + "epoch": 0.5115273775216138, + "grad_norm": 19.004922715885144, + "learning_rate": 4.92494278898755e-08, + "logits/chosen": -1.985918402671814, + "logits/rejected": -1.982656717300415, + "logps/chosen": -0.8973722457885742, + "logps/rejected": -1.0216716527938843, + "loss": 1.1973, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.7947444915771484, + "rewards/margins": 0.24859857559204102, + "rewards/rejected": -2.0433433055877686, + "step": 710 + }, + { + "epoch": 0.5187319884726225, + "grad_norm": 18.960064654240945, + "learning_rate": 4.9197600494702955e-08, + "logits/chosen": -2.007420539855957, + "logits/rejected": -2.001126289367676, + "logps/chosen": -1.0426667928695679, + "logps/rejected": -1.1658456325531006, + "loss": 1.1852, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0853335857391357, + "rewards/margins": 0.2463577687740326, + "rewards/rejected": -2.331691265106201, + "step": 720 + }, + { + "epoch": 0.5259365994236311, + "grad_norm": 20.53343043509484, + "learning_rate": 4.9144072108132725e-08, + "logits/chosen": -2.0134854316711426, + "logits/rejected": -2.0023691654205322, + "logps/chosen": -1.0226707458496094, + "logps/rejected": -1.1051828861236572, + "loss": 1.2518, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0453414916992188, + "rewards/margins": 0.16502413153648376, + "rewards/rejected": -2.2103657722473145, + "step": 730 + }, + { + "epoch": 0.5331412103746398, + "grad_norm": 17.758862211588106, + "learning_rate": 4.908884649298937e-08, + "logits/chosen": -1.9972114562988281, + "logits/rejected": -2.004119634628296, + "logps/chosen": -1.0192463397979736, + "logps/rejected": -1.0796899795532227, + "loss": 1.2835, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.0384926795959473, + "rewards/margins": 0.12088724225759506, + "rewards/rejected": -2.1593799591064453, + "step": 740 + }, + { + "epoch": 0.5403458213256485, + "grad_norm": 23.124810759913256, + "learning_rate": 4.903192753140557e-08, + "logits/chosen": -2.0147690773010254, + "logits/rejected": -2.009342908859253, + "logps/chosen": -1.1004369258880615, + "logps/rejected": -1.1906808614730835, + "loss": 1.2378, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.200873851776123, + "rewards/margins": 0.18048794567584991, + "rewards/rejected": -2.381361722946167, + "step": 750 + }, + { + "epoch": 0.547550432276657, + "grad_norm": 19.72534726379729, + "learning_rate": 4.897331922454931e-08, + "logits/chosen": -1.9795690774917603, + "logits/rejected": -1.9833734035491943, + "logps/chosen": -1.0041850805282593, + "logps/rejected": -1.1136337518692017, + "loss": 1.2165, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0083701610565186, + "rewards/margins": 0.2188970297574997, + "rewards/rejected": -2.2272675037384033, + "step": 760 + }, + { + "epoch": 0.5547550432276657, + "grad_norm": 20.539097658978797, + "learning_rate": 4.891302569234256e-08, + "logits/chosen": -1.9727134704589844, + "logits/rejected": -1.9754774570465088, + "logps/chosen": -0.9772794842720032, + "logps/rejected": -1.1290626525878906, + "loss": 1.1643, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9545589685440063, + "rewards/margins": 0.3035663962364197, + "rewards/rejected": -2.2581253051757812, + "step": 770 + }, + { + "epoch": 0.5619596541786743, + "grad_norm": 22.07597844396349, + "learning_rate": 4.8851051173171656e-08, + "logits/chosen": -1.9940038919448853, + "logits/rejected": -1.9926246404647827, + "logps/chosen": -1.0405977964401245, + "logps/rejected": -1.1220670938491821, + "loss": 1.2393, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.081195592880249, + "rewards/margins": 0.1629386693239212, + "rewards/rejected": -2.2441341876983643, + "step": 780 + }, + { + "epoch": 0.569164265129683, + "grad_norm": 17.470111374688827, + "learning_rate": 4.87874000235894e-08, + "logits/chosen": -2.013667106628418, + "logits/rejected": -2.0078587532043457, + "logps/chosen": -1.0763031244277954, + "logps/rejected": -1.233242154121399, + "loss": 1.1596, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.152606248855591, + "rewards/margins": 0.3138778507709503, + "rewards/rejected": -2.466484308242798, + "step": 790 + }, + { + "epoch": 0.5763688760806917, + "grad_norm": 19.520543671943127, + "learning_rate": 4.872207671800876e-08, + "logits/chosen": -2.0354135036468506, + "logits/rejected": -2.0318105220794678, + "logps/chosen": -1.0444949865341187, + "logps/rejected": -1.1220977306365967, + "loss": 1.2567, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0889899730682373, + "rewards/margins": 0.15520496666431427, + "rewards/rejected": -2.2441954612731934, + "step": 800 + }, + { + "epoch": 0.5835734870317003, + "grad_norm": 15.931565272235597, + "learning_rate": 4.865508584838841e-08, + "logits/chosen": -2.0230934619903564, + "logits/rejected": -2.025510311126709, + "logps/chosen": -1.0136370658874512, + "logps/rejected": -1.1028186082839966, + "loss": 1.2343, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.0272741317749023, + "rewards/margins": 0.17836324870586395, + "rewards/rejected": -2.205637216567993, + "step": 810 + }, + { + "epoch": 0.590778097982709, + "grad_norm": 21.101696006896514, + "learning_rate": 4.858643212390985e-08, + "logits/chosen": -2.0232460498809814, + "logits/rejected": -2.0134730339050293, + "logps/chosen": -1.0298725366592407, + "logps/rejected": -1.1151840686798096, + "loss": 1.2504, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.0597450733184814, + "rewards/margins": 0.1706230342388153, + "rewards/rejected": -2.230368137359619, + "step": 820 + }, + { + "epoch": 0.5979827089337176, + "grad_norm": 18.384683685983724, + "learning_rate": 4.851612037064643e-08, + "logits/chosen": -2.0008656978607178, + "logits/rejected": -1.9988391399383545, + "logps/chosen": -0.96119225025177, + "logps/rejected": -1.0799301862716675, + "loss": 1.2051, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.92238450050354, + "rewards/margins": 0.2374759167432785, + "rewards/rejected": -2.159860372543335, + "step": 830 + }, + { + "epoch": 0.6051873198847262, + "grad_norm": 15.976129382373403, + "learning_rate": 4.8444155531224065e-08, + "logits/chosen": -2.0319008827209473, + "logits/rejected": -2.031928539276123, + "logps/chosen": -1.0886750221252441, + "logps/rejected": -1.1605113744735718, + "loss": 1.2625, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.1773500442504883, + "rewards/margins": 0.143672913312912, + "rewards/rejected": -2.3210227489471436, + "step": 840 + }, + { + "epoch": 0.6123919308357348, + "grad_norm": 15.372106337343025, + "learning_rate": 4.8370542664473805e-08, + "logits/chosen": -2.03184175491333, + "logits/rejected": -2.0259571075439453, + "logps/chosen": -1.0505023002624512, + "logps/rejected": -1.15494704246521, + "loss": 1.2248, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1010046005249023, + "rewards/margins": 0.20888929069042206, + "rewards/rejected": -2.30989408493042, + "step": 850 + }, + { + "epoch": 0.6195965417867435, + "grad_norm": 17.833021138756298, + "learning_rate": 4.829528694507624e-08, + "logits/chosen": -2.011185646057129, + "logits/rejected": -2.0070912837982178, + "logps/chosen": -1.161972999572754, + "logps/rejected": -1.218332290649414, + "loss": 1.28, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.323945999145508, + "rewards/margins": 0.11271880567073822, + "rewards/rejected": -2.436664581298828, + "step": 860 + }, + { + "epoch": 0.6268011527377522, + "grad_norm": 20.10043591744987, + "learning_rate": 4.821839366319768e-08, + "logits/chosen": -2.0453944206237793, + "logits/rejected": -2.0392508506774902, + "logps/chosen": -1.0053439140319824, + "logps/rejected": -1.12282395362854, + "loss": 1.1973, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.010687828063965, + "rewards/margins": 0.23495987057685852, + "rewards/rejected": -2.24564790725708, + "step": 870 + }, + { + "epoch": 0.6340057636887608, + "grad_norm": 19.633475514009838, + "learning_rate": 4.813986822411833e-08, + "logits/chosen": -2.037318706512451, + "logits/rejected": -2.035334825515747, + "logps/chosen": -1.0152684450149536, + "logps/rejected": -1.0797330141067505, + "loss": 1.2669, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.0305368900299072, + "rewards/margins": 0.12892897427082062, + "rewards/rejected": -2.159466028213501, + "step": 880 + }, + { + "epoch": 0.6412103746397695, + "grad_norm": 19.900627573984437, + "learning_rate": 4.805971614785231e-08, + "logits/chosen": -2.0658364295959473, + "logits/rejected": -2.0642929077148438, + "logps/chosen": -1.0170501470565796, + "logps/rejected": -1.11166250705719, + "loss": 1.2213, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.034100294113159, + "rewards/margins": 0.1892244815826416, + "rewards/rejected": -2.22332501411438, + "step": 890 + }, + { + "epoch": 0.6484149855907781, + "grad_norm": 20.046745017622534, + "learning_rate": 4.797794306875963e-08, + "logits/chosen": -1.9768317937850952, + "logits/rejected": -1.9782040119171143, + "logps/chosen": -1.1424155235290527, + "logps/rejected": -1.2143452167510986, + "loss": 1.2686, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.2848310470581055, + "rewards/margins": 0.14385904371738434, + "rewards/rejected": -2.4286904335021973, + "step": 900 + }, + { + "epoch": 0.6556195965417867, + "grad_norm": 20.156486798671747, + "learning_rate": 4.7894554735150076e-08, + "logits/chosen": -1.979318618774414, + "logits/rejected": -1.9829566478729248, + "logps/chosen": -1.042389154434204, + "logps/rejected": -1.108424186706543, + "loss": 1.2626, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.084778308868408, + "rewards/margins": 0.1320703774690628, + "rewards/rejected": -2.216848373413086, + "step": 910 + }, + { + "epoch": 0.6628242074927954, + "grad_norm": 23.296556306421977, + "learning_rate": 4.7809557008879185e-08, + "logits/chosen": -2.017183780670166, + "logits/rejected": -2.0119588375091553, + "logps/chosen": -0.9740872383117676, + "logps/rejected": -1.0616848468780518, + "loss": 1.2388, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.9481744766235352, + "rewards/margins": 0.1751951277256012, + "rewards/rejected": -2.1233696937561035, + "step": 920 + }, + { + "epoch": 0.670028818443804, + "grad_norm": 18.069785801871536, + "learning_rate": 4.772295586493613e-08, + "logits/chosen": -2.057365894317627, + "logits/rejected": -2.054624080657959, + "logps/chosen": -1.0349071025848389, + "logps/rejected": -1.1510379314422607, + "loss": 1.193, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0698142051696777, + "rewards/margins": 0.23226144909858704, + "rewards/rejected": -2.3020758628845215, + "step": 930 + }, + { + "epoch": 0.6772334293948127, + "grad_norm": 19.620026043686646, + "learning_rate": 4.763475739102374e-08, + "logits/chosen": -2.00927472114563, + "logits/rejected": -2.015021562576294, + "logps/chosen": -1.1269561052322388, + "logps/rejected": -1.1944589614868164, + "loss": 1.2561, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2539122104644775, + "rewards/margins": 0.13500596582889557, + "rewards/rejected": -2.388917922973633, + "step": 940 + }, + { + "epoch": 0.6844380403458213, + "grad_norm": 15.430566823053855, + "learning_rate": 4.754496778713054e-08, + "logits/chosen": -1.9693466424942017, + "logits/rejected": -1.9732694625854492, + "logps/chosen": -1.0118048191070557, + "logps/rejected": -1.1344263553619385, + "loss": 1.2008, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0236096382141113, + "rewards/margins": 0.24524304270744324, + "rewards/rejected": -2.268852710723877, + "step": 950 + }, + { + "epoch": 0.69164265129683, + "grad_norm": 21.25135809120288, + "learning_rate": 4.7453593365094926e-08, + "logits/chosen": -2.04045033454895, + "logits/rejected": -2.039541244506836, + "logps/chosen": -1.049902319908142, + "logps/rejected": -1.1590924263000488, + "loss": 1.2091, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.099804639816284, + "rewards/margins": 0.21838030219078064, + "rewards/rejected": -2.3181848526000977, + "step": 960 + }, + { + "epoch": 0.6988472622478387, + "grad_norm": 21.39072451404026, + "learning_rate": 4.736064054816145e-08, + "logits/chosen": -2.042609691619873, + "logits/rejected": -2.0387399196624756, + "logps/chosen": -0.9685258865356445, + "logps/rejected": -1.0943108797073364, + "loss": 1.1795, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.937051773071289, + "rewards/margins": 0.25157004594802856, + "rewards/rejected": -2.188621759414673, + "step": 970 + }, + { + "epoch": 0.7060518731988472, + "grad_norm": 17.20168162072602, + "learning_rate": 4.726611587052933e-08, + "logits/chosen": -1.9772526025772095, + "logits/rejected": -1.9768762588500977, + "logps/chosen": -1.1084102392196655, + "logps/rejected": -1.2353932857513428, + "loss": 1.1801, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.216820478439331, + "rewards/margins": 0.2539660334587097, + "rewards/rejected": -2.4707865715026855, + "step": 980 + }, + { + "epoch": 0.7132564841498559, + "grad_norm": 22.219628346195623, + "learning_rate": 4.71700259768931e-08, + "logits/chosen": -2.0274641513824463, + "logits/rejected": -2.0244956016540527, + "logps/chosen": -1.109879732131958, + "logps/rejected": -1.2068617343902588, + "loss": 1.2336, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.219759464263916, + "rewards/margins": 0.19396351277828217, + "rewards/rejected": -2.4137234687805176, + "step": 990 + }, + { + "epoch": 0.7204610951008645, + "grad_norm": 19.81819744621828, + "learning_rate": 4.707237762197549e-08, + "logits/chosen": -2.013184070587158, + "logits/rejected": -2.0100245475769043, + "logps/chosen": -1.0080925226211548, + "logps/rejected": -1.1278679370880127, + "loss": 1.2121, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0161850452423096, + "rewards/margins": 0.23955106735229492, + "rewards/rejected": -2.2557358741760254, + "step": 1000 + }, + { + "epoch": 0.7276657060518732, + "grad_norm": 23.320316952087914, + "learning_rate": 4.697317767005265e-08, + "logits/chosen": -2.0245862007141113, + "logits/rejected": -2.0211946964263916, + "logps/chosen": -1.002010703086853, + "logps/rejected": -1.0942790508270264, + "loss": 1.2568, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.004021406173706, + "rewards/margins": 0.1845366507768631, + "rewards/rejected": -2.1885581016540527, + "step": 1010 + }, + { + "epoch": 0.7348703170028819, + "grad_norm": 17.35614684932965, + "learning_rate": 4.6872433094471577e-08, + "logits/chosen": -2.0214576721191406, + "logits/rejected": -2.01664137840271, + "logps/chosen": -1.0324314832687378, + "logps/rejected": -1.127612590789795, + "loss": 1.2122, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0648629665374756, + "rewards/margins": 0.19036227464675903, + "rewards/rejected": -2.25522518157959, + "step": 1020 + }, + { + "epoch": 0.7420749279538905, + "grad_norm": 16.487356163413914, + "learning_rate": 4.677015097715994e-08, + "logits/chosen": -1.9668807983398438, + "logits/rejected": -1.9662902355194092, + "logps/chosen": -1.0229971408843994, + "logps/rejected": -1.1552445888519287, + "loss": 1.1997, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.045994281768799, + "rewards/margins": 0.2644946873188019, + "rewards/rejected": -2.3104891777038574, + "step": 1030 + }, + { + "epoch": 0.7492795389048992, + "grad_norm": 17.492033929105126, + "learning_rate": 4.666633850812825e-08, + "logits/chosen": -2.0216238498687744, + "logits/rejected": -2.0157718658447266, + "logps/chosen": -1.0129607915878296, + "logps/rejected": -1.0947651863098145, + "loss": 1.2367, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.025921583175659, + "rewards/margins": 0.16360855102539062, + "rewards/rejected": -2.189530372619629, + "step": 1040 + }, + { + "epoch": 0.7564841498559077, + "grad_norm": 17.49180259130834, + "learning_rate": 4.656100298496439e-08, + "logits/chosen": -1.971518874168396, + "logits/rejected": -1.9679629802703857, + "logps/chosen": -0.9385242462158203, + "logps/rejected": -1.0688835382461548, + "loss": 1.1859, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.8770484924316406, + "rewards/margins": 0.26071876287460327, + "rewards/rejected": -2.1377670764923096, + "step": 1050 + }, + { + "epoch": 0.7636887608069164, + "grad_norm": 17.905832545876255, + "learning_rate": 4.6454151812320715e-08, + "logits/chosen": -2.0001180171966553, + "logits/rejected": -1.9940083026885986, + "logps/chosen": -1.03890061378479, + "logps/rejected": -1.1473093032836914, + "loss": 1.2178, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.07780122756958, + "rewards/margins": 0.21681778132915497, + "rewards/rejected": -2.294618606567383, + "step": 1060 + }, + { + "epoch": 0.770893371757925, + "grad_norm": 20.829212072329433, + "learning_rate": 4.6345792501393434e-08, + "logits/chosen": -2.0026588439941406, + "logits/rejected": -2.0007362365722656, + "logps/chosen": -1.0745230913162231, + "logps/rejected": -1.201542615890503, + "loss": 1.2046, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.1490461826324463, + "rewards/margins": 0.2540392279624939, + "rewards/rejected": -2.403085231781006, + "step": 1070 + }, + { + "epoch": 0.7780979827089337, + "grad_norm": 20.734671350383845, + "learning_rate": 4.6235932669394676e-08, + "logits/chosen": -2.0293848514556885, + "logits/rejected": -2.030176877975464, + "logps/chosen": -1.0878403186798096, + "logps/rejected": -1.196656584739685, + "loss": 1.2186, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.175680637359619, + "rewards/margins": 0.21763241291046143, + "rewards/rejected": -2.39331316947937, + "step": 1080 + }, + { + "epoch": 0.7853025936599424, + "grad_norm": 24.317214064629283, + "learning_rate": 4.612458003901698e-08, + "logits/chosen": -2.041074514389038, + "logits/rejected": -2.0332765579223633, + "logps/chosen": -1.109058141708374, + "logps/rejected": -1.2108246088027954, + "loss": 1.2286, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.218116283416748, + "rewards/margins": 0.20353302359580994, + "rewards/rejected": -2.421649217605591, + "step": 1090 + }, + { + "epoch": 0.792507204610951, + "grad_norm": 23.34024566936978, + "learning_rate": 4.6011742437890476e-08, + "logits/chosen": -2.028428077697754, + "logits/rejected": -2.023019790649414, + "logps/chosen": -1.0458049774169922, + "logps/rejected": -1.1794006824493408, + "loss": 1.1775, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0916099548339844, + "rewards/margins": 0.2671913504600525, + "rewards/rejected": -2.3588013648986816, + "step": 1100 + }, + { + "epoch": 0.7997118155619597, + "grad_norm": 16.933139927466357, + "learning_rate": 4.589742779803259e-08, + "logits/chosen": -2.025526523590088, + "logits/rejected": -2.018397569656372, + "logps/chosen": -1.0093412399291992, + "logps/rejected": -1.129741907119751, + "loss": 1.1948, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0186824798583984, + "rewards/margins": 0.2408013790845871, + "rewards/rejected": -2.259483814239502, + "step": 1110 + }, + { + "epoch": 0.8069164265129684, + "grad_norm": 18.433386982266423, + "learning_rate": 4.5781644155290486e-08, + "logits/chosen": -1.9837512969970703, + "logits/rejected": -1.9759635925292969, + "logps/chosen": -1.047893762588501, + "logps/rejected": -1.1082303524017334, + "loss": 1.2713, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.095787525177002, + "rewards/margins": 0.12067310512065887, + "rewards/rejected": -2.216460704803467, + "step": 1120 + }, + { + "epoch": 0.8141210374639769, + "grad_norm": 18.152544924178944, + "learning_rate": 4.566439964877613e-08, + "logits/chosen": -2.0132524967193604, + "logits/rejected": -2.0092389583587646, + "logps/chosen": -0.9992140531539917, + "logps/rejected": -1.0850255489349365, + "loss": 1.2443, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9984281063079834, + "rewards/margins": 0.17162318527698517, + "rewards/rejected": -2.170051097869873, + "step": 1130 + }, + { + "epoch": 0.8213256484149856, + "grad_norm": 16.195560643437258, + "learning_rate": 4.554570252029421e-08, + "logits/chosen": -2.0523180961608887, + "logits/rejected": -2.0510833263397217, + "logps/chosen": -1.0488303899765015, + "logps/rejected": -1.1647334098815918, + "loss": 1.201, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.097660779953003, + "rewards/margins": 0.23180584609508514, + "rewards/rejected": -2.3294668197631836, + "step": 1140 + }, + { + "epoch": 0.8285302593659942, + "grad_norm": 17.9745846350065, + "learning_rate": 4.542556111376274e-08, + "logits/chosen": -2.0492236614227295, + "logits/rejected": -2.0428290367126465, + "logps/chosen": -1.0749974250793457, + "logps/rejected": -1.166634202003479, + "loss": 1.2395, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.1499948501586914, + "rewards/margins": 0.18327349424362183, + "rewards/rejected": -2.333268404006958, + "step": 1150 + }, + { + "epoch": 0.8357348703170029, + "grad_norm": 23.066926614034124, + "learning_rate": 4.5303983874626506e-08, + "logits/chosen": -1.9916588068008423, + "logits/rejected": -1.9900974035263062, + "logps/chosen": -1.0387059450149536, + "logps/rejected": -1.115934133529663, + "loss": 1.2651, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.0774118900299072, + "rewards/margins": 0.15445652604103088, + "rewards/rejected": -2.231868267059326, + "step": 1160 + }, + { + "epoch": 0.8429394812680115, + "grad_norm": 20.106291828506194, + "learning_rate": 4.518097934926339e-08, + "logits/chosen": -1.995008111000061, + "logits/rejected": -1.9863529205322266, + "logps/chosen": -1.0165393352508545, + "logps/rejected": -1.1260240077972412, + "loss": 1.2057, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.033078670501709, + "rewards/margins": 0.21896927058696747, + "rewards/rejected": -2.2520480155944824, + "step": 1170 + }, + { + "epoch": 0.8501440922190202, + "grad_norm": 22.8857527390999, + "learning_rate": 4.505655618438363e-08, + "logits/chosen": -1.9628753662109375, + "logits/rejected": -1.9588840007781982, + "logps/chosen": -1.0615794658660889, + "logps/rejected": -1.1646844148635864, + "loss": 1.2307, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1231589317321777, + "rewards/margins": 0.20620973408222198, + "rewards/rejected": -2.329368829727173, + "step": 1180 + }, + { + "epoch": 0.8573487031700289, + "grad_norm": 17.434899766590377, + "learning_rate": 4.4930723126421945e-08, + "logits/chosen": -2.0546653270721436, + "logits/rejected": -2.047938585281372, + "logps/chosen": -1.0720479488372803, + "logps/rejected": -1.1471359729766846, + "loss": 1.252, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.1440958976745605, + "rewards/margins": 0.15017575025558472, + "rewards/rejected": -2.294271945953369, + "step": 1190 + }, + { + "epoch": 0.8645533141210374, + "grad_norm": 22.14075015263452, + "learning_rate": 4.48034890209227e-08, + "logits/chosen": -1.983888030052185, + "logits/rejected": -1.9716113805770874, + "logps/chosen": -1.0871121883392334, + "logps/rejected": -1.1737545728683472, + "loss": 1.23, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.174224376678467, + "rewards/margins": 0.1732848584651947, + "rewards/rejected": -2.3475091457366943, + "step": 1200 + }, + { + "epoch": 0.8717579250720461, + "grad_norm": 18.53077050982448, + "learning_rate": 4.4674862811918155e-08, + "logits/chosen": -1.971573829650879, + "logits/rejected": -1.980055570602417, + "logps/chosen": -0.9388012886047363, + "logps/rejected": -1.091797113418579, + "loss": 1.1596, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8776025772094727, + "rewards/margins": 0.30599164962768555, + "rewards/rejected": -2.183594226837158, + "step": 1210 + }, + { + "epoch": 0.8789625360230547, + "grad_norm": 17.441252552193376, + "learning_rate": 4.454485354129966e-08, + "logits/chosen": -1.9985713958740234, + "logits/rejected": -1.994210958480835, + "logps/chosen": -1.0104329586029053, + "logps/rejected": -1.11543869972229, + "loss": 1.2194, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.0208659172058105, + "rewards/margins": 0.21001139283180237, + "rewards/rejected": -2.23087739944458, + "step": 1220 + }, + { + "epoch": 0.8861671469740634, + "grad_norm": 17.20275284474546, + "learning_rate": 4.4413470348182124e-08, + "logits/chosen": -1.9755537509918213, + "logits/rejected": -1.9634100198745728, + "logps/chosen": -0.9853811264038086, + "logps/rejected": -1.076774001121521, + "loss": 1.2316, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9707622528076172, + "rewards/margins": 0.18278571963310242, + "rewards/rejected": -2.153548002243042, + "step": 1230 + }, + { + "epoch": 0.8933717579250721, + "grad_norm": 21.253905408711432, + "learning_rate": 4.42807224682615e-08, + "logits/chosen": -1.9815731048583984, + "logits/rejected": -1.9793331623077393, + "logps/chosen": -0.9373159408569336, + "logps/rejected": -1.0729162693023682, + "loss": 1.181, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.8746318817138672, + "rewards/margins": 0.2712007462978363, + "rewards/rejected": -2.1458325386047363, + "step": 1240 + }, + { + "epoch": 0.9005763688760807, + "grad_norm": 18.803145183231678, + "learning_rate": 4.4146619233165604e-08, + "logits/chosen": -2.0230329036712646, + "logits/rejected": -2.025296688079834, + "logps/chosen": -1.0652821063995361, + "logps/rejected": -1.2190508842468262, + "loss": 1.1677, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.1305642127990723, + "rewards/margins": 0.3075374960899353, + "rewards/rejected": -2.4381017684936523, + "step": 1250 + }, + { + "epoch": 0.9077809798270894, + "grad_norm": 25.018490567837954, + "learning_rate": 4.4011170069798126e-08, + "logits/chosen": -2.020940065383911, + "logits/rejected": -2.025850296020508, + "logps/chosen": -1.1181256771087646, + "logps/rejected": -1.2433640956878662, + "loss": 1.1932, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.2362513542175293, + "rewards/margins": 0.25047701597213745, + "rewards/rejected": -2.4867281913757324, + "step": 1260 + }, + { + "epoch": 0.9149855907780979, + "grad_norm": 18.123087760553187, + "learning_rate": 4.387438449967594e-08, + "logits/chosen": -1.982254981994629, + "logits/rejected": -1.97560715675354, + "logps/chosen": -0.9658223986625671, + "logps/rejected": -1.085925579071045, + "loss": 1.1909, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9316447973251343, + "rewards/margins": 0.24020643532276154, + "rewards/rejected": -2.17185115814209, + "step": 1270 + }, + { + "epoch": 0.9221902017291066, + "grad_norm": 21.17056826903978, + "learning_rate": 4.373627213825983e-08, + "logits/chosen": -2.0719313621520996, + "logits/rejected": -2.0676798820495605, + "logps/chosen": -1.0272830724716187, + "logps/rejected": -1.1627672910690308, + "loss": 1.1829, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0545661449432373, + "rewards/margins": 0.27096837759017944, + "rewards/rejected": -2.3255345821380615, + "step": 1280 + }, + { + "epoch": 0.9293948126801153, + "grad_norm": 16.73743221772608, + "learning_rate": 4.359684269427848e-08, + "logits/chosen": -2.034970760345459, + "logits/rejected": -2.0339713096618652, + "logps/chosen": -0.9956309199333191, + "logps/rejected": -1.0993244647979736, + "loss": 1.2107, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9912618398666382, + "rewards/margins": 0.20738673210144043, + "rewards/rejected": -2.1986489295959473, + "step": 1290 + }, + { + "epoch": 0.9365994236311239, + "grad_norm": 23.479698749807888, + "learning_rate": 4.34561059690461e-08, + "logits/chosen": -2.079378843307495, + "logits/rejected": -2.0813305377960205, + "logps/chosen": -1.047837495803833, + "logps/rejected": -1.112128496170044, + "loss": 1.2707, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.095674991607666, + "rewards/margins": 0.12858203053474426, + "rewards/rejected": -2.224256992340088, + "step": 1300 + }, + { + "epoch": 0.9438040345821326, + "grad_norm": 21.373476828454745, + "learning_rate": 4.3314071855773314e-08, + "logits/chosen": -2.044544219970703, + "logits/rejected": -2.0450897216796875, + "logps/chosen": -0.9845747947692871, + "logps/rejected": -1.0792670249938965, + "loss": 1.2235, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9691495895385742, + "rewards/margins": 0.18938450515270233, + "rewards/rejected": -2.158534049987793, + "step": 1310 + }, + { + "epoch": 0.9510086455331412, + "grad_norm": 20.390675123621403, + "learning_rate": 4.3170750338871806e-08, + "logits/chosen": -2.0153450965881348, + "logits/rejected": -2.008953094482422, + "logps/chosen": -1.0770236253738403, + "logps/rejected": -1.2197729349136353, + "loss": 1.1662, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.1540472507476807, + "rewards/margins": 0.2854984402656555, + "rewards/rejected": -2.4395458698272705, + "step": 1320 + }, + { + "epoch": 0.9582132564841499, + "grad_norm": 14.835531781677203, + "learning_rate": 4.3026151493252414e-08, + "logits/chosen": -2.039367437362671, + "logits/rejected": -2.0349154472351074, + "logps/chosen": -1.0609954595565796, + "logps/rejected": -1.1818583011627197, + "loss": 1.2003, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.121990919113159, + "rewards/margins": 0.24172568321228027, + "rewards/rejected": -2.3637166023254395, + "step": 1330 + }, + { + "epoch": 0.9654178674351584, + "grad_norm": 25.71038185604989, + "learning_rate": 4.2880285483616895e-08, + "logits/chosen": -2.0069704055786133, + "logits/rejected": -2.007664680480957, + "logps/chosen": -1.0175052881240845, + "logps/rejected": -1.1325770616531372, + "loss": 1.2093, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.035010576248169, + "rewards/margins": 0.2301437109708786, + "rewards/rejected": -2.2651541233062744, + "step": 1340 + }, + { + "epoch": 0.9726224783861671, + "grad_norm": 16.092608904878997, + "learning_rate": 4.273316256374342e-08, + "logits/chosen": -1.9464366436004639, + "logits/rejected": -1.9446899890899658, + "logps/chosen": -1.01396644115448, + "logps/rejected": -1.0869011878967285, + "loss": 1.264, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.02793288230896, + "rewards/margins": 0.1458693891763687, + "rewards/rejected": -2.173802375793457, + "step": 1350 + }, + { + "epoch": 0.9798270893371758, + "grad_norm": 16.212857235886922, + "learning_rate": 4.258479307576576e-08, + "logits/chosen": -1.9840402603149414, + "logits/rejected": -1.9818894863128662, + "logps/chosen": -0.9638694524765015, + "logps/rejected": -1.0546468496322632, + "loss": 1.24, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.927738904953003, + "rewards/margins": 0.18155473470687866, + "rewards/rejected": -2.1092936992645264, + "step": 1360 + }, + { + "epoch": 0.9870317002881844, + "grad_norm": 21.341000872382455, + "learning_rate": 4.243518744944626e-08, + "logits/chosen": -2.0093555450439453, + "logits/rejected": -2.0047600269317627, + "logps/chosen": -1.0009874105453491, + "logps/rejected": -1.1216598749160767, + "loss": 1.1889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0019748210906982, + "rewards/margins": 0.24134452641010284, + "rewards/rejected": -2.2433197498321533, + "step": 1370 + }, + { + "epoch": 0.9942363112391931, + "grad_norm": 20.83793747644969, + "learning_rate": 4.22843562014427e-08, + "logits/chosen": -1.9709367752075195, + "logits/rejected": -1.9672348499298096, + "logps/chosen": -1.0514830350875854, + "logps/rejected": -1.1256954669952393, + "loss": 1.2503, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.102966070175171, + "rewards/margins": 0.14842486381530762, + "rewards/rejected": -2.2513909339904785, + "step": 1380 + }, + { + "epoch": 1.0014409221902016, + "grad_norm": 28.539886189287515, + "learning_rate": 4.2132309934569e-08, + "logits/chosen": -2.051409959793091, + "logits/rejected": -2.051856517791748, + "logps/chosen": -1.015867829322815, + "logps/rejected": -1.128615379333496, + "loss": 1.2103, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.03173565864563, + "rewards/margins": 0.22549493610858917, + "rewards/rejected": -2.257230758666992, + "step": 1390 + }, + { + "epoch": 1.0086455331412103, + "grad_norm": 18.4548159325349, + "learning_rate": 4.197905933704989e-08, + "logits/chosen": -1.9460541009902954, + "logits/rejected": -1.9434579610824585, + "logps/chosen": -1.0608787536621094, + "logps/rejected": -1.1942651271820068, + "loss": 1.2017, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1217575073242188, + "rewards/margins": 0.2667728066444397, + "rewards/rejected": -2.3885302543640137, + "step": 1400 + }, + { + "epoch": 1.015850144092219, + "grad_norm": 23.677146712392545, + "learning_rate": 4.1824615181769577e-08, + "logits/chosen": -1.992706060409546, + "logits/rejected": -1.9971202611923218, + "logps/chosen": -1.0128545761108398, + "logps/rejected": -1.138115644454956, + "loss": 1.2035, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0257091522216797, + "rewards/margins": 0.2505221366882324, + "rewards/rejected": -2.276231288909912, + "step": 1410 + }, + { + "epoch": 1.0230547550432276, + "grad_norm": 18.466913113268376, + "learning_rate": 4.1668988325514434e-08, + "logits/chosen": -2.0149149894714355, + "logits/rejected": -2.0098109245300293, + "logps/chosen": -1.1170905828475952, + "logps/rejected": -1.2321112155914307, + "loss": 1.2252, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.2341811656951904, + "rewards/margins": 0.2300410270690918, + "rewards/rejected": -2.4642224311828613, + "step": 1420 + }, + { + "epoch": 1.0302593659942363, + "grad_norm": 20.916480925982736, + "learning_rate": 4.1512189708209844e-08, + "logits/chosen": -2.0576863288879395, + "logits/rejected": -2.0563559532165527, + "logps/chosen": -0.9412269592285156, + "logps/rejected": -1.0276962518692017, + "loss": 1.2464, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.8824539184570312, + "rewards/margins": 0.17293845117092133, + "rewards/rejected": -2.0553925037384033, + "step": 1430 + }, + { + "epoch": 1.037463976945245, + "grad_norm": 22.10230375057076, + "learning_rate": 4.1354230352151143e-08, + "logits/chosen": -2.009265661239624, + "logits/rejected": -2.002540111541748, + "logps/chosen": -1.138351559638977, + "logps/rejected": -1.2199509143829346, + "loss": 1.2585, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.276703119277954, + "rewards/margins": 0.16319862008094788, + "rewards/rejected": -2.439901828765869, + "step": 1440 + }, + { + "epoch": 1.0446685878962536, + "grad_norm": 16.842031017248782, + "learning_rate": 4.119512136122882e-08, + "logits/chosen": -2.0774807929992676, + "logits/rejected": -2.086643695831299, + "logps/chosen": -0.9951038360595703, + "logps/rejected": -1.1451139450073242, + "loss": 1.1708, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9902076721191406, + "rewards/margins": 0.30002015829086304, + "rewards/rejected": -2.2902278900146484, + "step": 1450 + }, + { + "epoch": 1.0518731988472623, + "grad_norm": 15.427164808054908, + "learning_rate": 4.103487392014795e-08, + "logits/chosen": -1.992767095565796, + "logits/rejected": -1.980544090270996, + "logps/chosen": -1.0006814002990723, + "logps/rejected": -1.15886390209198, + "loss": 1.1455, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0013628005981445, + "rewards/margins": 0.31636515259742737, + "rewards/rejected": -2.31772780418396, + "step": 1460 + }, + { + "epoch": 1.059077809798271, + "grad_norm": 16.81042888795935, + "learning_rate": 4.087349929364192e-08, + "logits/chosen": -2.034682273864746, + "logits/rejected": -2.0252864360809326, + "logps/chosen": -0.9601753354072571, + "logps/rejected": -1.0913857221603394, + "loss": 1.1863, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.9203506708145142, + "rewards/margins": 0.26242080330848694, + "rewards/rejected": -2.1827714443206787, + "step": 1470 + }, + { + "epoch": 1.0662824207492796, + "grad_norm": 17.584619579081235, + "learning_rate": 4.0711008825680645e-08, + "logits/chosen": -1.979069709777832, + "logits/rejected": -1.97795832157135, + "logps/chosen": -1.0063145160675049, + "logps/rejected": -1.1248080730438232, + "loss": 1.2064, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0126290321350098, + "rewards/margins": 0.2369869500398636, + "rewards/rejected": -2.2496161460876465, + "step": 1480 + }, + { + "epoch": 1.0734870317002883, + "grad_norm": 19.98068478862068, + "learning_rate": 4.054741393867306e-08, + "logits/chosen": -1.99558424949646, + "logits/rejected": -1.9926925897598267, + "logps/chosen": -1.1117796897888184, + "logps/rejected": -1.1623036861419678, + "loss": 1.2882, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.2235593795776367, + "rewards/margins": 0.10104763507843018, + "rewards/rejected": -2.3246073722839355, + "step": 1490 + }, + { + "epoch": 1.080691642651297, + "grad_norm": 18.569188294062595, + "learning_rate": 4.038272613266419e-08, + "logits/chosen": -1.9959461688995361, + "logits/rejected": -1.9826500415802002, + "logps/chosen": -1.0095350742340088, + "logps/rejected": -1.1202278137207031, + "loss": 1.2023, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.0190701484680176, + "rewards/margins": 0.22138550877571106, + "rewards/rejected": -2.2404556274414062, + "step": 1500 + }, + { + "epoch": 1.0878962536023056, + "grad_norm": 18.129783454014866, + "learning_rate": 4.0216956984526784e-08, + "logits/chosen": -2.04606032371521, + "logits/rejected": -2.047947406768799, + "logps/chosen": -1.0161449909210205, + "logps/rejected": -1.124267339706421, + "loss": 1.2167, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.032289981842041, + "rewards/margins": 0.21624493598937988, + "rewards/rejected": -2.248534679412842, + "step": 1510 + }, + { + "epoch": 1.0951008645533142, + "grad_norm": 16.171374987629033, + "learning_rate": 4.0050118147147446e-08, + "logits/chosen": -1.9890464544296265, + "logits/rejected": -1.989335298538208, + "logps/chosen": -1.0982977151870728, + "logps/rejected": -1.110621690750122, + "loss": 1.3393, + "rewards/accuracies": 0.4375, + "rewards/chosen": -2.1965954303741455, + "rewards/margins": 0.02464829757809639, + "rewards/rejected": -2.221243381500244, + "step": 1520 + }, + { + "epoch": 1.1023054755043227, + "grad_norm": 17.66132069219183, + "learning_rate": 3.988222134860755e-08, + "logits/chosen": -2.0323548316955566, + "logits/rejected": -2.0236430168151855, + "logps/chosen": -0.9508152008056641, + "logps/rejected": -1.1158647537231445, + "loss": 1.1407, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9016304016113281, + "rewards/margins": 0.33009934425354004, + "rewards/rejected": -2.231729507446289, + "step": 1530 + }, + { + "epoch": 1.1095100864553313, + "grad_norm": 23.676130358664636, + "learning_rate": 3.9713278391358724e-08, + "logits/chosen": -2.0360183715820312, + "logits/rejected": -2.0298333168029785, + "logps/chosen": -1.025137186050415, + "logps/rejected": -1.1484403610229492, + "loss": 1.1877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.05027437210083, + "rewards/margins": 0.24660632014274597, + "rewards/rejected": -2.2968807220458984, + "step": 1540 + }, + { + "epoch": 1.11671469740634, + "grad_norm": 17.77840056029204, + "learning_rate": 3.954330115139328e-08, + "logits/chosen": -2.0122570991516113, + "logits/rejected": -2.0070974826812744, + "logps/chosen": -1.0277431011199951, + "logps/rejected": -1.1330978870391846, + "loss": 1.2216, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0554862022399902, + "rewards/margins": 0.21070995926856995, + "rewards/rejected": -2.266195774078369, + "step": 1550 + }, + { + "epoch": 1.1239193083573487, + "grad_norm": 25.812098081681867, + "learning_rate": 3.937230157740931e-08, + "logits/chosen": -2.067347764968872, + "logits/rejected": -2.0611376762390137, + "logps/chosen": -1.0478734970092773, + "logps/rejected": -1.1832859516143799, + "loss": 1.1824, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0957469940185547, + "rewards/margins": 0.2708250880241394, + "rewards/rejected": -2.3665719032287598, + "step": 1560 + }, + { + "epoch": 1.1311239193083573, + "grad_norm": 16.22328310375803, + "learning_rate": 3.920029168997077e-08, + "logits/chosen": -2.04835844039917, + "logits/rejected": -2.04648494720459, + "logps/chosen": -1.0037837028503418, + "logps/rejected": -1.131502628326416, + "loss": 1.1863, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0075674057006836, + "rewards/margins": 0.25543779134750366, + "rewards/rejected": -2.263005256652832, + "step": 1570 + }, + { + "epoch": 1.138328530259366, + "grad_norm": 29.81353401958458, + "learning_rate": 3.9027283580662476e-08, + "logits/chosen": -2.0225307941436768, + "logits/rejected": -2.0166521072387695, + "logps/chosen": -1.0478241443634033, + "logps/rejected": -1.1936235427856445, + "loss": 1.1765, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0956482887268066, + "rewards/margins": 0.2915985882282257, + "rewards/rejected": -2.387247085571289, + "step": 1580 + }, + { + "epoch": 1.1455331412103746, + "grad_norm": 16.941588748106863, + "learning_rate": 3.885328941124014e-08, + "logits/chosen": -1.991965889930725, + "logits/rejected": -1.9873225688934326, + "logps/chosen": -0.9666957855224609, + "logps/rejected": -1.1006277799606323, + "loss": 1.1706, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.9333915710449219, + "rewards/margins": 0.26786428689956665, + "rewards/rejected": -2.2012555599212646, + "step": 1590 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 20.82364621838478, + "learning_rate": 3.867832141277539e-08, + "logits/chosen": -2.0321202278137207, + "logits/rejected": -2.0232601165771484, + "logps/chosen": -1.0682156085968018, + "logps/rejected": -1.180410623550415, + "loss": 1.2096, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.1364312171936035, + "rewards/margins": 0.2243901491165161, + "rewards/rejected": -2.36082124710083, + "step": 1600 + }, + { + "epoch": 1.159942363112392, + "grad_norm": 20.912686096120964, + "learning_rate": 3.850239188479606e-08, + "logits/chosen": -1.9847033023834229, + "logits/rejected": -1.9881378412246704, + "logps/chosen": -1.0096313953399658, + "logps/rejected": -1.1002733707427979, + "loss": 1.2372, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0192627906799316, + "rewards/margins": 0.18128342926502228, + "rewards/rejected": -2.2005467414855957, + "step": 1610 + }, + { + "epoch": 1.1671469740634006, + "grad_norm": 21.899733424702635, + "learning_rate": 3.832551319442151e-08, + "logits/chosen": -2.0586349964141846, + "logits/rejected": -2.059906482696533, + "logps/chosen": -1.057755708694458, + "logps/rejected": -1.184890627861023, + "loss": 1.1897, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.115511417388916, + "rewards/margins": 0.254270076751709, + "rewards/rejected": -2.369781255722046, + "step": 1620 + }, + { + "epoch": 1.1743515850144093, + "grad_norm": 17.2910410178799, + "learning_rate": 3.81476977754933e-08, + "logits/chosen": -1.9560763835906982, + "logits/rejected": -1.9524368047714233, + "logps/chosen": -1.0269958972930908, + "logps/rejected": -1.0969812870025635, + "loss": 1.2579, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0539917945861816, + "rewards/margins": 0.13997015357017517, + "rewards/rejected": -2.193962574005127, + "step": 1630 + }, + { + "epoch": 1.181556195965418, + "grad_norm": 16.85537517324203, + "learning_rate": 3.796895812770114e-08, + "logits/chosen": -1.9784526824951172, + "logits/rejected": -1.9793262481689453, + "logps/chosen": -1.0173704624176025, + "logps/rejected": -1.1091585159301758, + "loss": 1.2408, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.034740924835205, + "rewards/margins": 0.1835760474205017, + "rewards/rejected": -2.2183170318603516, + "step": 1640 + }, + { + "epoch": 1.1887608069164266, + "grad_norm": 22.175531020521074, + "learning_rate": 3.7789306815704216e-08, + "logits/chosen": -2.009108781814575, + "logits/rejected": -2.006824254989624, + "logps/chosen": -1.0072455406188965, + "logps/rejected": -1.0781667232513428, + "loss": 1.2618, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.014491081237793, + "rewards/margins": 0.14184223115444183, + "rewards/rejected": -2.1563334465026855, + "step": 1650 + }, + { + "epoch": 1.195965417867435, + "grad_norm": 18.89404553225258, + "learning_rate": 3.760875646824795e-08, + "logits/chosen": -1.932428002357483, + "logits/rejected": -1.9363291263580322, + "logps/chosen": -0.9747514724731445, + "logps/rejected": -1.0793020725250244, + "loss": 1.2232, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.949502944946289, + "rewards/margins": 0.20910124480724335, + "rewards/rejected": -2.158604145050049, + "step": 1660 + }, + { + "epoch": 1.2031700288184437, + "grad_norm": 22.227342153467788, + "learning_rate": 3.742731977727623e-08, + "logits/chosen": -2.0331270694732666, + "logits/rejected": -2.0301709175109863, + "logps/chosen": -1.0413671731948853, + "logps/rejected": -1.1772552728652954, + "loss": 1.1795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0827343463897705, + "rewards/margins": 0.27177631855010986, + "rewards/rejected": -2.354510545730591, + "step": 1670 + }, + { + "epoch": 1.2103746397694524, + "grad_norm": 19.70333261721218, + "learning_rate": 3.7245009497039244e-08, + "logits/chosen": -1.970715880393982, + "logits/rejected": -1.9627761840820312, + "logps/chosen": -1.0116103887557983, + "logps/rejected": -1.1484659910202026, + "loss": 1.1716, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0232207775115967, + "rewards/margins": 0.2737112045288086, + "rewards/rejected": -2.2969319820404053, + "step": 1680 + }, + { + "epoch": 1.217579250720461, + "grad_norm": 18.86128397711634, + "learning_rate": 3.7061838443196886e-08, + "logits/chosen": -2.0151665210723877, + "logits/rejected": -2.016679048538208, + "logps/chosen": -1.026761531829834, + "logps/rejected": -1.150320053100586, + "loss": 1.1888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.053523063659668, + "rewards/margins": 0.2471170723438263, + "rewards/rejected": -2.300640106201172, + "step": 1690 + }, + { + "epoch": 1.2247838616714697, + "grad_norm": 22.70930817597516, + "learning_rate": 3.68778194919179e-08, + "logits/chosen": -1.983304738998413, + "logits/rejected": -1.984287977218628, + "logps/chosen": -1.0792837142944336, + "logps/rejected": -1.2009527683258057, + "loss": 1.1955, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.158567428588867, + "rewards/margins": 0.24333825707435608, + "rewards/rejected": -2.4019055366516113, + "step": 1700 + }, + { + "epoch": 1.2319884726224783, + "grad_norm": 20.280682845222326, + "learning_rate": 3.66929655789747e-08, + "logits/chosen": -2.0337467193603516, + "logits/rejected": -2.0225701332092285, + "logps/chosen": -0.9402590990066528, + "logps/rejected": -1.0919773578643799, + "loss": 1.1634, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.8805181980133057, + "rewards/margins": 0.30343663692474365, + "rewards/rejected": -2.1839547157287598, + "step": 1710 + }, + { + "epoch": 1.239193083573487, + "grad_norm": 16.359613747281564, + "learning_rate": 3.6507289698834064e-08, + "logits/chosen": -1.9774224758148193, + "logits/rejected": -1.973842978477478, + "logps/chosen": -0.98408442735672, + "logps/rejected": -1.1161837577819824, + "loss": 1.196, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.96816885471344, + "rewards/margins": 0.2641984820365906, + "rewards/rejected": -2.232367515563965, + "step": 1720 + }, + { + "epoch": 1.2463976945244957, + "grad_norm": 25.191044914408238, + "learning_rate": 3.6320804903743684e-08, + "logits/chosen": -2.026642322540283, + "logits/rejected": -2.0262598991394043, + "logps/chosen": -1.0340476036071777, + "logps/rejected": -1.1598145961761475, + "loss": 1.1976, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0680952072143555, + "rewards/margins": 0.251534104347229, + "rewards/rejected": -2.319629192352295, + "step": 1730 + }, + { + "epoch": 1.2536023054755043, + "grad_norm": 17.23248745457562, + "learning_rate": 3.61335243028146e-08, + "logits/chosen": -2.0114941596984863, + "logits/rejected": -2.016153573989868, + "logps/chosen": -1.092045783996582, + "logps/rejected": -1.2228668928146362, + "loss": 1.1916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.184091567993164, + "rewards/margins": 0.2616419494152069, + "rewards/rejected": -2.4457337856292725, + "step": 1740 + }, + { + "epoch": 1.260806916426513, + "grad_norm": 18.437527072676268, + "learning_rate": 3.5945461061099736e-08, + "logits/chosen": -1.972100853919983, + "logits/rejected": -1.9586093425750732, + "logps/chosen": -1.0443975925445557, + "logps/rejected": -1.1218501329421997, + "loss": 1.2706, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.0887951850891113, + "rewards/margins": 0.15490522980690002, + "rewards/rejected": -2.2437002658843994, + "step": 1750 + }, + { + "epoch": 1.2680115273775217, + "grad_norm": 19.695402848445642, + "learning_rate": 3.5756628398668446e-08, + "logits/chosen": -2.0573649406433105, + "logits/rejected": -2.0625429153442383, + "logps/chosen": -1.1325814723968506, + "logps/rejected": -1.232399821281433, + "loss": 1.2403, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.265162944793701, + "rewards/margins": 0.19963672757148743, + "rewards/rejected": -2.464799642562866, + "step": 1760 + }, + { + "epoch": 1.2752161383285303, + "grad_norm": 17.758331420145563, + "learning_rate": 3.556703958967716e-08, + "logits/chosen": -2.041581630706787, + "logits/rejected": -2.036958694458008, + "logps/chosen": -1.0513150691986084, + "logps/rejected": -1.1853423118591309, + "loss": 1.1882, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.102630138397217, + "rewards/margins": 0.2680542469024658, + "rewards/rejected": -2.3706846237182617, + "step": 1770 + }, + { + "epoch": 1.282420749279539, + "grad_norm": 24.11832525210908, + "learning_rate": 3.5376707961436297e-08, + "logits/chosen": -2.025054454803467, + "logits/rejected": -2.019120693206787, + "logps/chosen": -1.1408239603042603, + "logps/rejected": -1.202470064163208, + "loss": 1.2726, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.2816479206085205, + "rewards/margins": 0.12329187244176865, + "rewards/rejected": -2.404940128326416, + "step": 1780 + }, + { + "epoch": 1.2896253602305476, + "grad_norm": 12.89708045158757, + "learning_rate": 3.51856468934734e-08, + "logits/chosen": -1.9773098230361938, + "logits/rejected": -1.9786754846572876, + "logps/chosen": -0.9762522578239441, + "logps/rejected": -1.0697864294052124, + "loss": 1.2234, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9525045156478882, + "rewards/margins": 0.18706828355789185, + "rewards/rejected": -2.139572858810425, + "step": 1790 + }, + { + "epoch": 1.2968299711815563, + "grad_norm": 20.10190857160128, + "learning_rate": 3.499386981659262e-08, + "logits/chosen": -2.0595974922180176, + "logits/rejected": -2.0540311336517334, + "logps/chosen": -1.0190843343734741, + "logps/rejected": -1.208898901939392, + "loss": 1.1243, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0381686687469482, + "rewards/margins": 0.3796289563179016, + "rewards/rejected": -2.417797803878784, + "step": 1800 + }, + { + "epoch": 1.304034582132565, + "grad_norm": 20.878615577501385, + "learning_rate": 3.480139021193057e-08, + "logits/chosen": -1.9839977025985718, + "logits/rejected": -1.9858938455581665, + "logps/chosen": -0.9964865446090698, + "logps/rejected": -1.1170662641525269, + "loss": 1.212, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.9929730892181396, + "rewards/margins": 0.24115952849388123, + "rewards/rejected": -2.2341325283050537, + "step": 1810 + }, + { + "epoch": 1.3112391930835736, + "grad_norm": 28.47013732688272, + "learning_rate": 3.4608221610008666e-08, + "logits/chosen": -2.0153214931488037, + "logits/rejected": -2.010758876800537, + "logps/chosen": -0.9736091494560242, + "logps/rejected": -1.120499849319458, + "loss": 1.1707, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9472182989120483, + "rewards/margins": 0.29378125071525574, + "rewards/rejected": -2.240999698638916, + "step": 1820 + }, + { + "epoch": 1.318443804034582, + "grad_norm": 15.221657015785182, + "learning_rate": 3.4414377589782e-08, + "logits/chosen": -1.9868896007537842, + "logits/rejected": -1.9957456588745117, + "logps/chosen": -1.0180634260177612, + "logps/rejected": -1.150467872619629, + "loss": 1.1966, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0361268520355225, + "rewards/margins": 0.26480910181999207, + "rewards/rejected": -2.300935745239258, + "step": 1830 + }, + { + "epoch": 1.3256484149855907, + "grad_norm": 18.190653029469026, + "learning_rate": 3.4219871777684745e-08, + "logits/chosen": -1.9982116222381592, + "logits/rejected": -1.9859825372695923, + "logps/chosen": -0.9929243922233582, + "logps/rejected": -1.1142441034317017, + "loss": 1.2076, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9858487844467163, + "rewards/margins": 0.24263925850391388, + "rewards/rejected": -2.2284882068634033, + "step": 1840 + }, + { + "epoch": 1.3328530259365994, + "grad_norm": 17.791029774645512, + "learning_rate": 3.4024717846672364e-08, + "logits/chosen": -2.0318691730499268, + "logits/rejected": -2.025087833404541, + "logps/chosen": -0.9934013485908508, + "logps/rejected": -1.1215975284576416, + "loss": 1.1959, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.9868026971817017, + "rewards/margins": 0.25639256834983826, + "rewards/rejected": -2.243195056915283, + "step": 1850 + }, + { + "epoch": 1.340057636887608, + "grad_norm": 17.7783196169273, + "learning_rate": 3.382892951526036e-08, + "logits/chosen": -2.0219979286193848, + "logits/rejected": -2.0191798210144043, + "logps/chosen": -1.0518848896026611, + "logps/rejected": -1.20078444480896, + "loss": 1.1628, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1037697792053223, + "rewards/margins": 0.2977990508079529, + "rewards/rejected": -2.40156888961792, + "step": 1860 + }, + { + "epoch": 1.3472622478386167, + "grad_norm": 20.2114199388819, + "learning_rate": 3.3632520546559974e-08, + "logits/chosen": -1.9855273962020874, + "logits/rejected": -1.9737205505371094, + "logps/chosen": -0.926679253578186, + "logps/rejected": -1.0955464839935303, + "loss": 1.1271, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.853358507156372, + "rewards/margins": 0.3377344310283661, + "rewards/rejected": -2.1910929679870605, + "step": 1870 + }, + { + "epoch": 1.3544668587896254, + "grad_norm": 19.579421951203443, + "learning_rate": 3.34355047473107e-08, + "logits/chosen": -1.9991194009780884, + "logits/rejected": -1.9949671030044556, + "logps/chosen": -1.0290135145187378, + "logps/rejected": -1.1186621189117432, + "loss": 1.2445, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0580270290374756, + "rewards/margins": 0.1792970895767212, + "rewards/rejected": -2.2373242378234863, + "step": 1880 + }, + { + "epoch": 1.361671469740634, + "grad_norm": 22.936331468503273, + "learning_rate": 3.323789596690971e-08, + "logits/chosen": -1.966144323348999, + "logits/rejected": -1.9670454263687134, + "logps/chosen": -1.0209920406341553, + "logps/rejected": -1.1551681756973267, + "loss": 1.1787, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0419840812683105, + "rewards/margins": 0.26835212111473083, + "rewards/rejected": -2.3103363513946533, + "step": 1890 + }, + { + "epoch": 1.3688760806916427, + "grad_norm": 15.801647380635032, + "learning_rate": 3.303970809643828e-08, + "logits/chosen": -1.998286247253418, + "logits/rejected": -2.0028045177459717, + "logps/chosen": -1.0353937149047852, + "logps/rejected": -1.1643174886703491, + "loss": 1.1927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0707874298095703, + "rewards/margins": 0.25784778594970703, + "rewards/rejected": -2.3286349773406982, + "step": 1900 + }, + { + "epoch": 1.3760806916426513, + "grad_norm": 20.693514419325513, + "learning_rate": 3.2840955067685356e-08, + "logits/chosen": -2.031480312347412, + "logits/rejected": -2.035548686981201, + "logps/chosen": -1.0550123453140259, + "logps/rejected": -1.2029016017913818, + "loss": 1.1631, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.1100246906280518, + "rewards/margins": 0.29577863216400146, + "rewards/rejected": -2.4058032035827637, + "step": 1910 + }, + { + "epoch": 1.38328530259366, + "grad_norm": 16.858093329362955, + "learning_rate": 3.264165085216817e-08, + "logits/chosen": -2.038879871368408, + "logits/rejected": -2.0388429164886475, + "logps/chosen": -0.9352089166641235, + "logps/rejected": -1.10355544090271, + "loss": 1.1401, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.870417833328247, + "rewards/margins": 0.33669325709342957, + "rewards/rejected": -2.20711088180542, + "step": 1920 + }, + { + "epoch": 1.3904899135446687, + "grad_norm": 18.690349536010206, + "learning_rate": 3.244180946015008e-08, + "logits/chosen": -1.966835618019104, + "logits/rejected": -1.967462182044983, + "logps/chosen": -1.0351486206054688, + "logps/rejected": -1.0991723537445068, + "loss": 1.273, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.0702972412109375, + "rewards/margins": 0.12804751098155975, + "rewards/rejected": -2.1983447074890137, + "step": 1930 + }, + { + "epoch": 1.397694524495677, + "grad_norm": 15.348372078288971, + "learning_rate": 3.224144493965578e-08, + "logits/chosen": -2.0523886680603027, + "logits/rejected": -2.0558857917785645, + "logps/chosen": -0.9908173680305481, + "logps/rejected": -1.0954809188842773, + "loss": 1.2177, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9816347360610962, + "rewards/margins": 0.20932729542255402, + "rewards/rejected": -2.1909618377685547, + "step": 1940 + }, + { + "epoch": 1.4048991354466858, + "grad_norm": 17.879874010257755, + "learning_rate": 3.204057137548371e-08, + "logits/chosen": -2.014993667602539, + "logits/rejected": -2.0096094608306885, + "logps/chosen": -0.9776601791381836, + "logps/rejected": -1.0827131271362305, + "loss": 1.2165, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9553203582763672, + "rewards/margins": 0.2101059854030609, + "rewards/rejected": -2.165426254272461, + "step": 1950 + }, + { + "epoch": 1.4121037463976944, + "grad_norm": 19.498418734777132, + "learning_rate": 3.183920288821597e-08, + "logits/chosen": -1.9974403381347656, + "logits/rejected": -1.9938675165176392, + "logps/chosen": -1.002251386642456, + "logps/rejected": -1.1633012294769287, + "loss": 1.1473, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.004502773284912, + "rewards/margins": 0.32209956645965576, + "rewards/rejected": -2.3266024589538574, + "step": 1960 + }, + { + "epoch": 1.419308357348703, + "grad_norm": 23.526801289262714, + "learning_rate": 3.1637353633225735e-08, + "logits/chosen": -2.0382392406463623, + "logits/rejected": -2.0321145057678223, + "logps/chosen": -1.0285365581512451, + "logps/rejected": -1.1749916076660156, + "loss": 1.1708, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0570731163024902, + "rewards/margins": 0.2929099202156067, + "rewards/rejected": -2.3499832153320312, + "step": 1970 + }, + { + "epoch": 1.4265129682997117, + "grad_norm": 19.581143803282398, + "learning_rate": 3.143503779968213e-08, + "logits/chosen": -2.0113444328308105, + "logits/rejected": -2.011580467224121, + "logps/chosen": -1.0159164667129517, + "logps/rejected": -1.1518559455871582, + "loss": 1.196, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0318329334259033, + "rewards/margins": 0.2718789875507355, + "rewards/rejected": -2.3037118911743164, + "step": 1980 + }, + { + "epoch": 1.4337175792507204, + "grad_norm": 18.05404403193421, + "learning_rate": 3.1232269609552875e-08, + "logits/chosen": -1.9945173263549805, + "logits/rejected": -1.9919058084487915, + "logps/chosen": -0.9980915188789368, + "logps/rejected": -1.1186559200286865, + "loss": 1.2014, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9961830377578735, + "rewards/margins": 0.24112899601459503, + "rewards/rejected": -2.237311840057373, + "step": 1990 + }, + { + "epoch": 1.440922190201729, + "grad_norm": 16.09307467422962, + "learning_rate": 3.102906331660444e-08, + "logits/chosen": -2.0536019802093506, + "logits/rejected": -2.045327663421631, + "logps/chosen": -0.9929038882255554, + "logps/rejected": -1.1643650531768799, + "loss": 1.1353, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9858077764511108, + "rewards/margins": 0.34292247891426086, + "rewards/rejected": -2.3287301063537598, + "step": 2000 + }, + { + "epoch": 1.4481268011527377, + "grad_norm": 16.081259631225404, + "learning_rate": 3.082543320540015e-08, + "logits/chosen": -1.9962193965911865, + "logits/rejected": -1.9890375137329102, + "logps/chosen": -1.0065879821777344, + "logps/rejected": -1.1499404907226562, + "loss": 1.1679, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0131759643554688, + "rewards/margins": 0.2867050766944885, + "rewards/rejected": -2.2998809814453125, + "step": 2010 + }, + { + "epoch": 1.4553314121037464, + "grad_norm": 18.028275293304183, + "learning_rate": 3.062139359029599e-08, + "logits/chosen": -2.031736373901367, + "logits/rejected": -2.0316202640533447, + "logps/chosen": -1.0291239023208618, + "logps/rejected": -1.1133326292037964, + "loss": 1.2476, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.0582478046417236, + "rewards/margins": 0.16841746866703033, + "rewards/rejected": -2.2266652584075928, + "step": 2020 + }, + { + "epoch": 1.462536023054755, + "grad_norm": 18.739986191205507, + "learning_rate": 3.041695881443437e-08, + "logits/chosen": -2.0472700595855713, + "logits/rejected": -2.0425424575805664, + "logps/chosen": -0.9730477333068848, + "logps/rejected": -1.1086480617523193, + "loss": 1.1771, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.9460954666137695, + "rewards/margins": 0.271200567483902, + "rewards/rejected": -2.2172961235046387, + "step": 2030 + }, + { + "epoch": 1.4697406340057637, + "grad_norm": 22.13845084834241, + "learning_rate": 3.0212143248735886e-08, + "logits/chosen": -2.0294270515441895, + "logits/rejected": -2.029846668243408, + "logps/chosen": -0.9991294741630554, + "logps/rejected": -1.1360986232757568, + "loss": 1.1761, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9982589483261108, + "rewards/margins": 0.27393826842308044, + "rewards/rejected": -2.2721972465515137, + "step": 2040 + }, + { + "epoch": 1.4769452449567724, + "grad_norm": 19.822466302624346, + "learning_rate": 3.0006961290889077e-08, + "logits/chosen": -2.0190889835357666, + "logits/rejected": -2.0099661350250244, + "logps/chosen": -1.1185331344604492, + "logps/rejected": -1.286892056465149, + "loss": 1.1647, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.2370662689208984, + "rewards/margins": 0.33671754598617554, + "rewards/rejected": -2.573784112930298, + "step": 2050 + }, + { + "epoch": 1.484149855907781, + "grad_norm": 21.925715491881135, + "learning_rate": 2.980142736433833e-08, + "logits/chosen": -2.01119327545166, + "logits/rejected": -2.004316806793213, + "logps/chosen": -1.0309051275253296, + "logps/rejected": -1.0949158668518066, + "loss": 1.2751, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.061810255050659, + "rewards/margins": 0.12802138924598694, + "rewards/rejected": -2.1898317337036133, + "step": 2060 + }, + { + "epoch": 1.4913544668587897, + "grad_norm": 24.46772736032293, + "learning_rate": 2.9595555917269997e-08, + "logits/chosen": -2.03961181640625, + "logits/rejected": -2.0247714519500732, + "logps/chosen": -1.1411329507827759, + "logps/rejected": -1.2373685836791992, + "loss": 1.2153, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.2822659015655518, + "rewards/margins": 0.19247153401374817, + "rewards/rejected": -2.4747371673583984, + "step": 2070 + }, + { + "epoch": 1.4985590778097984, + "grad_norm": 18.449968646671344, + "learning_rate": 2.9389361421596725e-08, + "logits/chosen": -1.9533805847167969, + "logits/rejected": -1.9556515216827393, + "logps/chosen": -1.0595835447311401, + "logps/rejected": -1.1933083534240723, + "loss": 1.1832, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1191670894622803, + "rewards/margins": 0.2674497365951538, + "rewards/rejected": -2.3866167068481445, + "step": 2080 + }, + { + "epoch": 1.505763688760807, + "grad_norm": 20.34455177562933, + "learning_rate": 2.9182858371940126e-08, + "logits/chosen": -2.0372543334960938, + "logits/rejected": -2.031832218170166, + "logps/chosen": -1.0473064184188843, + "logps/rejected": -1.1757621765136719, + "loss": 1.188, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0946128368377686, + "rewards/margins": 0.25691163539886475, + "rewards/rejected": -2.3515243530273438, + "step": 2090 + }, + { + "epoch": 1.5129682997118157, + "grad_norm": 18.90759740416456, + "learning_rate": 2.8976061284611908e-08, + "logits/chosen": -1.9889026880264282, + "logits/rejected": -1.9977174997329712, + "logps/chosen": -0.9364235997200012, + "logps/rejected": -1.0655431747436523, + "loss": 1.1944, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.8728471994400024, + "rewards/margins": 0.2582393288612366, + "rewards/rejected": -2.1310863494873047, + "step": 2100 + }, + { + "epoch": 1.5201729106628243, + "grad_norm": 21.434032214198695, + "learning_rate": 2.8768984696593384e-08, + "logits/chosen": -1.9844554662704468, + "logits/rejected": -1.974907636642456, + "logps/chosen": -1.0168864727020264, + "logps/rejected": -1.1343626976013184, + "loss": 1.2154, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0337729454040527, + "rewards/margins": 0.23495233058929443, + "rewards/rejected": -2.2687253952026367, + "step": 2110 + }, + { + "epoch": 1.527377521613833, + "grad_norm": 18.047284778863265, + "learning_rate": 2.8561643164513637e-08, + "logits/chosen": -1.9067440032958984, + "logits/rejected": -1.9029529094696045, + "logps/chosen": -1.0492842197418213, + "logps/rejected": -1.1676268577575684, + "loss": 1.2013, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.0985684394836426, + "rewards/margins": 0.23668520152568817, + "rewards/rejected": -2.3352537155151367, + "step": 2120 + }, + { + "epoch": 1.5345821325648417, + "grad_norm": 18.95635925899202, + "learning_rate": 2.8354051263626227e-08, + "logits/chosen": -1.9887434244155884, + "logits/rejected": -1.994476556777954, + "logps/chosen": -1.0596574544906616, + "logps/rejected": -1.1733436584472656, + "loss": 1.206, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.1193149089813232, + "rewards/margins": 0.22737233340740204, + "rewards/rejected": -2.3466873168945312, + "step": 2130 + }, + { + "epoch": 1.54178674351585, + "grad_norm": 19.584229312796637, + "learning_rate": 2.8146223586784573e-08, + "logits/chosen": -1.9805008172988892, + "logits/rejected": -1.9726234674453735, + "logps/chosen": -1.0646823644638062, + "logps/rejected": -1.1987252235412598, + "loss": 1.1873, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.1293647289276123, + "rewards/margins": 0.26808565855026245, + "rewards/rejected": -2.3974504470825195, + "step": 2140 + }, + { + "epoch": 1.5489913544668588, + "grad_norm": 25.50415369546022, + "learning_rate": 2.7938174743416205e-08, + "logits/chosen": -1.9369735717773438, + "logits/rejected": -1.933683156967163, + "logps/chosen": -1.050445795059204, + "logps/rejected": -1.1607348918914795, + "loss": 1.2135, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.100891590118408, + "rewards/margins": 0.220577672123909, + "rewards/rejected": -2.321469783782959, + "step": 2150 + }, + { + "epoch": 1.5561959654178674, + "grad_norm": 19.684619038178205, + "learning_rate": 2.7729919358495728e-08, + "logits/chosen": -2.005277395248413, + "logits/rejected": -2.0062077045440674, + "logps/chosen": -1.1121950149536133, + "logps/rejected": -1.19098699092865, + "loss": 1.2586, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.2243900299072266, + "rewards/margins": 0.1575840413570404, + "rewards/rejected": -2.3819739818573, + "step": 2160 + }, + { + "epoch": 1.563400576368876, + "grad_norm": 19.293166467927325, + "learning_rate": 2.7521472071516772e-08, + "logits/chosen": -2.0027170181274414, + "logits/rejected": -2.0016961097717285, + "logps/chosen": -0.9449695348739624, + "logps/rejected": -1.0605154037475586, + "loss": 1.2076, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.8899390697479248, + "rewards/margins": 0.2310914546251297, + "rewards/rejected": -2.121030807495117, + "step": 2170 + }, + { + "epoch": 1.5706051873198847, + "grad_norm": 22.062496687144794, + "learning_rate": 2.731284753546289e-08, + "logits/chosen": -1.9856891632080078, + "logits/rejected": -1.9836666584014893, + "logps/chosen": -1.081839919090271, + "logps/rejected": -1.2224990129470825, + "loss": 1.1741, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.163679838180542, + "rewards/margins": 0.2813180387020111, + "rewards/rejected": -2.444998025894165, + "step": 2180 + }, + { + "epoch": 1.5778097982708934, + "grad_norm": 21.803351526445823, + "learning_rate": 2.710406041577751e-08, + "logits/chosen": -2.04976224899292, + "logits/rejected": -2.0463500022888184, + "logps/chosen": -1.0325794219970703, + "logps/rejected": -1.1858645677566528, + "loss": 1.1631, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0651588439941406, + "rewards/margins": 0.3065701127052307, + "rewards/rejected": -2.3717291355133057, + "step": 2190 + }, + { + "epoch": 1.585014409221902, + "grad_norm": 18.0281741107113, + "learning_rate": 2.6895125389333017e-08, + "logits/chosen": -2.0131421089172363, + "logits/rejected": -2.0089025497436523, + "logps/chosen": -1.0270203351974487, + "logps/rejected": -1.177971363067627, + "loss": 1.1622, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0540406703948975, + "rewards/margins": 0.3019018769264221, + "rewards/rejected": -2.355942726135254, + "step": 2200 + }, + { + "epoch": 1.5922190201729105, + "grad_norm": 17.129921710950377, + "learning_rate": 2.6686057143399028e-08, + "logits/chosen": -2.010429620742798, + "logits/rejected": -2.0119571685791016, + "logps/chosen": -1.0614731311798096, + "logps/rejected": -1.1593468189239502, + "loss": 1.2433, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.122946262359619, + "rewards/margins": 0.1957472264766693, + "rewards/rejected": -2.3186936378479004, + "step": 2210 + }, + { + "epoch": 1.5994236311239192, + "grad_norm": 19.402506516811066, + "learning_rate": 2.647687037460996e-08, + "logits/chosen": -2.0160350799560547, + "logits/rejected": -2.0153958797454834, + "logps/chosen": -1.0873758792877197, + "logps/rejected": -1.2827941179275513, + "loss": 1.1246, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1747517585754395, + "rewards/margins": 0.3908364176750183, + "rewards/rejected": -2.5655882358551025, + "step": 2220 + }, + { + "epoch": 1.6066282420749278, + "grad_norm": 20.370963061014333, + "learning_rate": 2.626757978793187e-08, + "logits/chosen": -2.0244648456573486, + "logits/rejected": -2.0181853771209717, + "logps/chosen": -1.0852004289627075, + "logps/rejected": -1.2089064121246338, + "loss": 1.2036, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.170400857925415, + "rewards/margins": 0.24741193652153015, + "rewards/rejected": -2.4178128242492676, + "step": 2230 + }, + { + "epoch": 1.6138328530259365, + "grad_norm": 23.538795309630903, + "learning_rate": 2.6058200095628797e-08, + "logits/chosen": -1.9968335628509521, + "logits/rejected": -2.000123977661133, + "logps/chosen": -0.9178045392036438, + "logps/rejected": -1.086455225944519, + "loss": 1.1446, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.8356090784072876, + "rewards/margins": 0.3373013734817505, + "rewards/rejected": -2.172910451889038, + "step": 2240 + }, + { + "epoch": 1.6210374639769451, + "grad_norm": 18.7834477811749, + "learning_rate": 2.584874601622854e-08, + "logits/chosen": -2.0577359199523926, + "logits/rejected": -2.0486464500427246, + "logps/chosen": -1.0842779874801636, + "logps/rejected": -1.2169630527496338, + "loss": 1.2055, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.168555974960327, + "rewards/margins": 0.26537027955055237, + "rewards/rejected": -2.4339261054992676, + "step": 2250 + }, + { + "epoch": 1.6282420749279538, + "grad_norm": 21.50129735883824, + "learning_rate": 2.5639232273487993e-08, + "logits/chosen": -1.9792057275772095, + "logits/rejected": -1.9694305658340454, + "logps/chosen": -0.9786102175712585, + "logps/rejected": -1.0999042987823486, + "loss": 1.2022, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.957220435142517, + "rewards/margins": 0.24258823692798615, + "rewards/rejected": -2.1998085975646973, + "step": 2260 + }, + { + "epoch": 1.6354466858789625, + "grad_norm": 20.836699972853967, + "learning_rate": 2.5429673595358142e-08, + "logits/chosen": -2.0185582637786865, + "logits/rejected": -2.0170459747314453, + "logps/chosen": -1.043128490447998, + "logps/rejected": -1.165533185005188, + "loss": 1.1967, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.086256980895996, + "rewards/margins": 0.24480919539928436, + "rewards/rejected": -2.331066370010376, + "step": 2270 + }, + { + "epoch": 1.6426512968299711, + "grad_norm": 23.73080611195804, + "learning_rate": 2.5220084712948764e-08, + "logits/chosen": -1.9826107025146484, + "logits/rejected": -1.9717817306518555, + "logps/chosen": -1.1198116540908813, + "logps/rejected": -1.2381196022033691, + "loss": 1.1906, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.2396233081817627, + "rewards/margins": 0.23661574721336365, + "rewards/rejected": -2.4762392044067383, + "step": 2280 + }, + { + "epoch": 1.6498559077809798, + "grad_norm": 19.691578624312058, + "learning_rate": 2.5010480359492838e-08, + "logits/chosen": -1.9650003910064697, + "logits/rejected": -1.9621715545654297, + "logps/chosen": -1.0505197048187256, + "logps/rejected": -1.1106680631637573, + "loss": 1.2861, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.101039409637451, + "rewards/margins": 0.12029679119586945, + "rewards/rejected": -2.2213361263275146, + "step": 2290 + }, + { + "epoch": 1.6570605187319885, + "grad_norm": 21.239713120458195, + "learning_rate": 2.480087526931091e-08, + "logits/chosen": -2.0088658332824707, + "logits/rejected": -1.9966083765029907, + "logps/chosen": -1.0031955242156982, + "logps/rejected": -1.1170064210891724, + "loss": 1.2166, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0063910484313965, + "rewards/margins": 0.22762183845043182, + "rewards/rejected": -2.2340128421783447, + "step": 2300 + }, + { + "epoch": 1.6642651296829971, + "grad_norm": 19.4121166793283, + "learning_rate": 2.4591284176775326e-08, + "logits/chosen": -1.9742721319198608, + "logits/rejected": -1.9704244136810303, + "logps/chosen": -1.07572340965271, + "logps/rejected": -1.159128189086914, + "loss": 1.2565, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.15144681930542, + "rewards/margins": 0.16680975258350372, + "rewards/rejected": -2.318256378173828, + "step": 2310 + }, + { + "epoch": 1.6714697406340058, + "grad_norm": 21.976079747343572, + "learning_rate": 2.4381721815274443e-08, + "logits/chosen": -2.0400168895721436, + "logits/rejected": -2.0402297973632812, + "logps/chosen": -1.019706130027771, + "logps/rejected": -1.1512401103973389, + "loss": 1.1928, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.039412260055542, + "rewards/margins": 0.26306766271591187, + "rewards/rejected": -2.3024802207946777, + "step": 2320 + }, + { + "epoch": 1.6786743515850144, + "grad_norm": 19.583839102475277, + "learning_rate": 2.4172202916176936e-08, + "logits/chosen": -2.0487046241760254, + "logits/rejected": -2.0510191917419434, + "logps/chosen": -0.9676868319511414, + "logps/rejected": -1.1354566812515259, + "loss": 1.1567, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.9353736639022827, + "rewards/margins": 0.3355395197868347, + "rewards/rejected": -2.2709133625030518, + "step": 2330 + }, + { + "epoch": 1.685878962536023, + "grad_norm": 19.19182662272249, + "learning_rate": 2.3962742207796268e-08, + "logits/chosen": -1.9858787059783936, + "logits/rejected": -1.9837026596069336, + "logps/chosen": -0.9570139050483704, + "logps/rejected": -1.1192009449005127, + "loss": 1.1603, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9140278100967407, + "rewards/margins": 0.32437413930892944, + "rewards/rejected": -2.2384018898010254, + "step": 2340 + }, + { + "epoch": 1.6930835734870318, + "grad_norm": 22.691977894194924, + "learning_rate": 2.3753354414355334e-08, + "logits/chosen": -1.9461901187896729, + "logits/rejected": -1.9355932474136353, + "logps/chosen": -1.0649149417877197, + "logps/rejected": -1.1817948818206787, + "loss": 1.213, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1298298835754395, + "rewards/margins": 0.2337600290775299, + "rewards/rejected": -2.3635897636413574, + "step": 2350 + }, + { + "epoch": 1.7002881844380404, + "grad_norm": 18.59945891396093, + "learning_rate": 2.3544054254951408e-08, + "logits/chosen": -1.9878515005111694, + "logits/rejected": -1.9792087078094482, + "logps/chosen": -0.9375497102737427, + "logps/rejected": -1.1345303058624268, + "loss": 1.114, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.8750994205474854, + "rewards/margins": 0.393961638212204, + "rewards/rejected": -2.2690606117248535, + "step": 2360 + }, + { + "epoch": 1.707492795389049, + "grad_norm": 18.203541895462912, + "learning_rate": 2.3334856442521435e-08, + "logits/chosen": -2.0370235443115234, + "logits/rejected": -2.0295424461364746, + "logps/chosen": -1.0964655876159668, + "logps/rejected": -1.166515588760376, + "loss": 1.27, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.1929311752319336, + "rewards/margins": 0.1401001363992691, + "rewards/rejected": -2.333031177520752, + "step": 2370 + }, + { + "epoch": 1.7146974063400577, + "grad_norm": 19.123513495613718, + "learning_rate": 2.3125775682807826e-08, + "logits/chosen": -2.0507147312164307, + "logits/rejected": -2.0506680011749268, + "logps/chosen": -1.1658060550689697, + "logps/rejected": -1.2665237188339233, + "loss": 1.2309, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.3316121101379395, + "rewards/margins": 0.2014356553554535, + "rewards/rejected": -2.5330474376678467, + "step": 2380 + }, + { + "epoch": 1.7219020172910664, + "grad_norm": 20.583955091856193, + "learning_rate": 2.291682667332464e-08, + "logits/chosen": -2.0658912658691406, + "logits/rejected": -2.0607848167419434, + "logps/chosen": -1.0484416484832764, + "logps/rejected": -1.1794416904449463, + "loss": 1.1918, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.0968832969665527, + "rewards/margins": 0.2620001435279846, + "rewards/rejected": -2.3588833808898926, + "step": 2390 + }, + { + "epoch": 1.729106628242075, + "grad_norm": 15.255925002553854, + "learning_rate": 2.2708024102324454e-08, + "logits/chosen": -2.0251784324645996, + "logits/rejected": -2.0195064544677734, + "logps/chosen": -1.0335527658462524, + "logps/rejected": -1.2097657918930054, + "loss": 1.1498, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.067105531692505, + "rewards/margins": 0.35242635011672974, + "rewards/rejected": -2.4195315837860107, + "step": 2400 + }, + { + "epoch": 1.7363112391930837, + "grad_norm": 22.44593573299748, + "learning_rate": 2.2499382647765797e-08, + "logits/chosen": -2.0198001861572266, + "logits/rejected": -2.016092538833618, + "logps/chosen": -1.0722882747650146, + "logps/rejected": -1.161583662033081, + "loss": 1.2463, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.1445765495300293, + "rewards/margins": 0.17859075963497162, + "rewards/rejected": -2.323167324066162, + "step": 2410 + }, + { + "epoch": 1.7435158501440924, + "grad_norm": 20.996477598226324, + "learning_rate": 2.2290916976281427e-08, + "logits/chosen": -1.997984528541565, + "logits/rejected": -1.991624116897583, + "logps/chosen": -0.9992947578430176, + "logps/rejected": -1.1312335729599, + "loss": 1.2149, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.9985895156860352, + "rewards/margins": 0.26387742161750793, + "rewards/rejected": -2.2624671459198, + "step": 2420 + }, + { + "epoch": 1.7507204610951008, + "grad_norm": 18.145146158512926, + "learning_rate": 2.2082641742147238e-08, + "logits/chosen": -1.9863611459732056, + "logits/rejected": -1.9797251224517822, + "logps/chosen": -1.0165367126464844, + "logps/rejected": -1.2077693939208984, + "loss": 1.115, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0330734252929688, + "rewards/margins": 0.38246554136276245, + "rewards/rejected": -2.415538787841797, + "step": 2430 + }, + { + "epoch": 1.7579250720461095, + "grad_norm": 20.511354788346416, + "learning_rate": 2.1874571586252177e-08, + "logits/chosen": -2.0291788578033447, + "logits/rejected": -2.0222790241241455, + "logps/chosen": -1.0278832912445068, + "logps/rejected": -1.1068981885910034, + "loss": 1.256, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0557665824890137, + "rewards/margins": 0.15802964568138123, + "rewards/rejected": -2.213796377182007, + "step": 2440 + }, + { + "epoch": 1.7651296829971181, + "grad_norm": 20.78736849578736, + "learning_rate": 2.1666721135069037e-08, + "logits/chosen": -2.022594928741455, + "logits/rejected": -2.019284725189209, + "logps/chosen": -1.1104170083999634, + "logps/rejected": -1.2043354511260986, + "loss": 1.2436, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.2208340167999268, + "rewards/margins": 0.18783698976039886, + "rewards/rejected": -2.4086709022521973, + "step": 2450 + }, + { + "epoch": 1.7723342939481268, + "grad_norm": 15.559026450288725, + "learning_rate": 2.145910499962628e-08, + "logits/chosen": -2.0644400119781494, + "logits/rejected": -2.0565133094787598, + "logps/chosen": -0.9585247039794922, + "logps/rejected": -1.101301908493042, + "loss": 1.1824, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9170494079589844, + "rewards/margins": 0.28555426001548767, + "rewards/rejected": -2.202603816986084, + "step": 2460 + }, + { + "epoch": 1.7795389048991355, + "grad_norm": 23.88258329458798, + "learning_rate": 2.1251737774480915e-08, + "logits/chosen": -2.0456204414367676, + "logits/rejected": -2.036010265350342, + "logps/chosen": -1.1689999103546143, + "logps/rejected": -1.2592300176620483, + "loss": 1.2704, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.3379998207092285, + "rewards/margins": 0.18046024441719055, + "rewards/rejected": -2.5184600353240967, + "step": 2470 + }, + { + "epoch": 1.7867435158501441, + "grad_norm": 17.478397647824718, + "learning_rate": 2.104463403669264e-08, + "logits/chosen": -1.9978790283203125, + "logits/rejected": -1.9951884746551514, + "logps/chosen": -1.0451444387435913, + "logps/rejected": -1.189968466758728, + "loss": 1.1806, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0902888774871826, + "rewards/margins": 0.2896478772163391, + "rewards/rejected": -2.379936933517456, + "step": 2480 + }, + { + "epoch": 1.7939481268011528, + "grad_norm": 17.10039588248249, + "learning_rate": 2.0837808344799028e-08, + "logits/chosen": -1.9799407720565796, + "logits/rejected": -1.9754537343978882, + "logps/chosen": -0.9404538869857788, + "logps/rejected": -1.0731335878372192, + "loss": 1.1826, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.8809077739715576, + "rewards/margins": 0.26535919308662415, + "rewards/rejected": -2.1462671756744385, + "step": 2490 + }, + { + "epoch": 1.8011527377521612, + "grad_norm": 18.18583469521082, + "learning_rate": 2.063127523779219e-08, + "logits/chosen": -1.9833685159683228, + "logits/rejected": -1.9792015552520752, + "logps/chosen": -1.0076165199279785, + "logps/rejected": -1.1942096948623657, + "loss": 1.1139, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.015233039855957, + "rewards/margins": 0.373186320066452, + "rewards/rejected": -2.3884193897247314, + "step": 2500 + }, + { + "epoch": 1.8083573487031699, + "grad_norm": 19.97417842705391, + "learning_rate": 2.0425049234096737e-08, + "logits/chosen": -1.9911282062530518, + "logits/rejected": -1.9853017330169678, + "logps/chosen": -1.0088120698928833, + "logps/rejected": -1.1258007287979126, + "loss": 1.2158, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0176241397857666, + "rewards/margins": 0.2339775562286377, + "rewards/rejected": -2.251601457595825, + "step": 2510 + }, + { + "epoch": 1.8155619596541785, + "grad_norm": 19.435490123277745, + "learning_rate": 2.0219144830549163e-08, + "logits/chosen": -1.9644883871078491, + "logits/rejected": -1.9634536504745483, + "logps/chosen": -1.0153688192367554, + "logps/rejected": -1.161481261253357, + "loss": 1.1831, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0307376384735107, + "rewards/margins": 0.2922249436378479, + "rewards/rejected": -2.322962522506714, + "step": 2520 + }, + { + "epoch": 1.8227665706051872, + "grad_norm": 19.09312194813426, + "learning_rate": 2.0013576501378823e-08, + "logits/chosen": -1.9830167293548584, + "logits/rejected": -1.9765300750732422, + "logps/chosen": -1.0100529193878174, + "logps/rejected": -1.144884705543518, + "loss": 1.1939, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0201058387756348, + "rewards/margins": 0.2696635127067566, + "rewards/rejected": -2.289769411087036, + "step": 2530 + }, + { + "epoch": 1.8299711815561959, + "grad_norm": 20.224925594213033, + "learning_rate": 1.9808358697190426e-08, + "logits/chosen": -1.9704053401947021, + "logits/rejected": -1.966780662536621, + "logps/chosen": -0.9303935766220093, + "logps/rejected": -1.0650821924209595, + "loss": 1.1986, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.8607871532440186, + "rewards/margins": 0.26937711238861084, + "rewards/rejected": -2.130164384841919, + "step": 2540 + }, + { + "epoch": 1.8371757925072045, + "grad_norm": 21.09688980967129, + "learning_rate": 1.9603505843948214e-08, + "logits/chosen": -2.01230525970459, + "logits/rejected": -2.002260208129883, + "logps/chosen": -0.948139488697052, + "logps/rejected": -1.1198240518569946, + "loss": 1.1395, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.896278977394104, + "rewards/margins": 0.3433689475059509, + "rewards/rejected": -2.2396481037139893, + "step": 2550 + }, + { + "epoch": 1.8443804034582132, + "grad_norm": 20.232449119924333, + "learning_rate": 1.9399032341961886e-08, + "logits/chosen": -1.9766803979873657, + "logits/rejected": -1.960636854171753, + "logps/chosen": -0.9899090528488159, + "logps/rejected": -1.0627111196517944, + "loss": 1.2699, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.9798181056976318, + "rewards/margins": 0.1456039845943451, + "rewards/rejected": -2.125422239303589, + "step": 2560 + }, + { + "epoch": 1.8515850144092219, + "grad_norm": 26.201248917968616, + "learning_rate": 1.9194952564874323e-08, + "logits/chosen": -2.0239641666412354, + "logits/rejected": -2.0179200172424316, + "logps/chosen": -1.0649644136428833, + "logps/rejected": -1.2079579830169678, + "loss": 1.1683, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1299288272857666, + "rewards/margins": 0.2859875559806824, + "rewards/rejected": -2.4159159660339355, + "step": 2570 + }, + { + "epoch": 1.8587896253602305, + "grad_norm": 20.644198497609576, + "learning_rate": 1.8991280858651157e-08, + "logits/chosen": -1.9820836782455444, + "logits/rejected": -1.9764404296875, + "logps/chosen": -1.0636051893234253, + "logps/rejected": -1.14960777759552, + "loss": 1.2503, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.1272103786468506, + "rewards/margins": 0.1720050871372223, + "rewards/rejected": -2.29921555519104, + "step": 2580 + }, + { + "epoch": 1.8659942363112392, + "grad_norm": 16.854142688708556, + "learning_rate": 1.8788031540572327e-08, + "logits/chosen": -1.9858763217926025, + "logits/rejected": -1.977818489074707, + "logps/chosen": -0.9995776414871216, + "logps/rejected": -1.1453144550323486, + "loss": 1.1718, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9991552829742432, + "rewards/margins": 0.29147323966026306, + "rewards/rejected": -2.2906289100646973, + "step": 2590 + }, + { + "epoch": 1.8731988472622478, + "grad_norm": 16.996398857656907, + "learning_rate": 1.858521889822565e-08, + "logits/chosen": -2.0046029090881348, + "logits/rejected": -2.007223129272461, + "logps/chosen": -0.9735875129699707, + "logps/rejected": -1.0832773447036743, + "loss": 1.2233, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.9471750259399414, + "rewards/margins": 0.21937978267669678, + "rewards/rejected": -2.1665546894073486, + "step": 2600 + }, + { + "epoch": 1.8804034582132565, + "grad_norm": 16.42143731996496, + "learning_rate": 1.8382857188502422e-08, + "logits/chosen": -1.9885772466659546, + "logits/rejected": -1.9836734533309937, + "logps/chosen": -0.9854310750961304, + "logps/rejected": -1.1128942966461182, + "loss": 1.1824, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9708621501922607, + "rewards/margins": 0.25492629408836365, + "rewards/rejected": -2.2257885932922363, + "step": 2610 + }, + { + "epoch": 1.8876080691642652, + "grad_norm": 22.507165910966208, + "learning_rate": 1.8180960636595234e-08, + "logits/chosen": -1.9683783054351807, + "logits/rejected": -1.966205358505249, + "logps/chosen": -1.0359306335449219, + "logps/rejected": -1.1797659397125244, + "loss": 1.1791, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.0718612670898438, + "rewards/margins": 0.28767016530036926, + "rewards/rejected": -2.359531879425049, + "step": 2620 + }, + { + "epoch": 1.8948126801152738, + "grad_norm": 20.53168247865903, + "learning_rate": 1.7979543434998015e-08, + "logits/chosen": -2.038526773452759, + "logits/rejected": -2.0433452129364014, + "logps/chosen": -1.1234701871871948, + "logps/rejected": -1.2116920948028564, + "loss": 1.2381, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.2469403743743896, + "rewards/margins": 0.17644372582435608, + "rewards/rejected": -2.423384189605713, + "step": 2630 + }, + { + "epoch": 1.9020172910662825, + "grad_norm": 26.15291556775582, + "learning_rate": 1.7778619742508345e-08, + "logits/chosen": -1.9968883991241455, + "logits/rejected": -1.9899314641952515, + "logps/chosen": -1.0930571556091309, + "logps/rejected": -1.1869739294052124, + "loss": 1.2543, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1861143112182617, + "rewards/margins": 0.18783339858055115, + "rewards/rejected": -2.373947858810425, + "step": 2640 + }, + { + "epoch": 1.9092219020172911, + "grad_norm": 23.18245485008842, + "learning_rate": 1.757820368323213e-08, + "logits/chosen": -1.9929345846176147, + "logits/rejected": -1.9831962585449219, + "logps/chosen": -1.1062356233596802, + "logps/rejected": -1.2650859355926514, + "loss": 1.161, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.2124712467193604, + "rewards/margins": 0.3177003860473633, + "rewards/rejected": -2.5301718711853027, + "step": 2650 + }, + { + "epoch": 1.9164265129682998, + "grad_norm": 22.34671676050883, + "learning_rate": 1.7378309345590803e-08, + "logits/chosen": -2.006321907043457, + "logits/rejected": -2.015603542327881, + "logps/chosen": -1.0863068103790283, + "logps/rejected": -1.2286168336868286, + "loss": 1.1821, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1726136207580566, + "rewards/margins": 0.2846204340457916, + "rewards/rejected": -2.4572336673736572, + "step": 2660 + }, + { + "epoch": 1.9236311239193085, + "grad_norm": 20.09934555506027, + "learning_rate": 1.717895078133088e-08, + "logits/chosen": -2.059466600418091, + "logits/rejected": -2.0556395053863525, + "logps/chosen": -1.0593311786651611, + "logps/rejected": -1.2005198001861572, + "loss": 1.1828, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.1186623573303223, + "rewards/margins": 0.28237712383270264, + "rewards/rejected": -2.4010396003723145, + "step": 2670 + }, + { + "epoch": 1.9308357348703171, + "grad_norm": 21.005378635461394, + "learning_rate": 1.698014200453624e-08, + "logits/chosen": -2.0109024047851562, + "logits/rejected": -2.0184760093688965, + "logps/chosen": -1.031286597251892, + "logps/rejected": -1.1622233390808105, + "loss": 1.1776, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.062573194503784, + "rewards/margins": 0.26187336444854736, + "rewards/rejected": -2.324446678161621, + "step": 2680 + }, + { + "epoch": 1.9380403458213258, + "grad_norm": 24.456240122864646, + "learning_rate": 1.6781896990642964e-08, + "logits/chosen": -1.9447215795516968, + "logits/rejected": -1.942016839981079, + "logps/chosen": -1.1477292776107788, + "logps/rejected": -1.2380485534667969, + "loss": 1.2441, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.2954585552215576, + "rewards/margins": 0.18063834309577942, + "rewards/rejected": -2.4760971069335938, + "step": 2690 + }, + { + "epoch": 1.9452449567723344, + "grad_norm": 24.379018095612878, + "learning_rate": 1.658422967545693e-08, + "logits/chosen": -2.0516204833984375, + "logits/rejected": -2.0385327339172363, + "logps/chosen": -1.0048881769180298, + "logps/rejected": -1.1195095777511597, + "loss": 1.2189, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -2.0097763538360596, + "rewards/margins": 0.2292429655790329, + "rewards/rejected": -2.2390191555023193, + "step": 2700 + }, + { + "epoch": 1.952449567723343, + "grad_norm": 20.606423235238847, + "learning_rate": 1.638715395417418e-08, + "logits/chosen": -2.0263454914093018, + "logits/rejected": -2.024291515350342, + "logps/chosen": -1.069252610206604, + "logps/rejected": -1.2053518295288086, + "loss": 1.1848, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.138505220413208, + "rewards/margins": 0.27219831943511963, + "rewards/rejected": -2.410703659057617, + "step": 2710 + }, + { + "epoch": 1.9596541786743515, + "grad_norm": 22.535979632799137, + "learning_rate": 1.619068368040416e-08, + "logits/chosen": -2.024005174636841, + "logits/rejected": -2.0195693969726562, + "logps/chosen": -1.0005989074707031, + "logps/rejected": -1.178637981414795, + "loss": 1.1296, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0011978149414062, + "rewards/margins": 0.356078177690506, + "rewards/rejected": -2.35727596282959, + "step": 2720 + }, + { + "epoch": 1.9668587896253602, + "grad_norm": 17.493969053743083, + "learning_rate": 1.5994832665195853e-08, + "logits/chosen": -1.9611831903457642, + "logits/rejected": -1.9615755081176758, + "logps/chosen": -1.0340797901153564, + "logps/rejected": -1.146831750869751, + "loss": 1.2119, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.068159580230713, + "rewards/margins": 0.22550389170646667, + "rewards/rejected": -2.293663501739502, + "step": 2730 + }, + { + "epoch": 1.9740634005763689, + "grad_norm": 20.261024993446156, + "learning_rate": 1.5799614676066906e-08, + "logits/chosen": -2.068851947784424, + "logits/rejected": -2.065795421600342, + "logps/chosen": -0.9484384655952454, + "logps/rejected": -1.0868208408355713, + "loss": 1.1744, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8968769311904907, + "rewards/margins": 0.2767646610736847, + "rewards/rejected": -2.1736416816711426, + "step": 2740 + }, + { + "epoch": 1.9812680115273775, + "grad_norm": 16.03971358941223, + "learning_rate": 1.560504343603587e-08, + "logits/chosen": -1.9830427169799805, + "logits/rejected": -1.983306884765625, + "logps/chosen": -1.0689435005187988, + "logps/rejected": -1.224401593208313, + "loss": 1.1606, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1378870010375977, + "rewards/margins": 0.3109160363674164, + "rewards/rejected": -2.448803186416626, + "step": 2750 + }, + { + "epoch": 1.9884726224783862, + "grad_norm": 18.729955235435014, + "learning_rate": 1.541113262265748e-08, + "logits/chosen": -2.0666756629943848, + "logits/rejected": -2.0645081996917725, + "logps/chosen": -1.0288206338882446, + "logps/rejected": -1.1466666460037231, + "loss": 1.2071, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0576412677764893, + "rewards/margins": 0.23569221794605255, + "rewards/rejected": -2.2933332920074463, + "step": 2760 + }, + { + "epoch": 1.9956772334293948, + "grad_norm": 25.946584240501473, + "learning_rate": 1.5217895867061227e-08, + "logits/chosen": -2.00740385055542, + "logits/rejected": -2.0015203952789307, + "logps/chosen": -1.0842344760894775, + "logps/rejected": -1.1836035251617432, + "loss": 1.2464, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.168468952178955, + "rewards/margins": 0.19873787462711334, + "rewards/rejected": -2.3672070503234863, + "step": 2770 + }, + { + "epoch": 2.0028818443804033, + "grad_norm": 22.724338628633177, + "learning_rate": 1.5025346752993098e-08, + "logits/chosen": -1.9985382556915283, + "logits/rejected": -2.000462532043457, + "logps/chosen": -1.071683406829834, + "logps/rejected": -1.1988445520401, + "loss": 1.1999, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.143366813659668, + "rewards/margins": 0.25432220101356506, + "rewards/rejected": -2.3976891040802, + "step": 2780 + }, + { + "epoch": 2.010086455331412, + "grad_norm": 23.240965924702092, + "learning_rate": 1.4833498815860756e-08, + "logits/chosen": -2.053358554840088, + "logits/rejected": -2.055558443069458, + "logps/chosen": -0.9996848106384277, + "logps/rejected": -1.1848304271697998, + "loss": 1.149, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9993696212768555, + "rewards/margins": 0.3702912926673889, + "rewards/rejected": -2.3696608543395996, + "step": 2790 + }, + { + "epoch": 2.0172910662824206, + "grad_norm": 18.225560415881105, + "learning_rate": 1.4642365541781993e-08, + "logits/chosen": -1.9646400213241577, + "logits/rejected": -1.9561887979507446, + "logps/chosen": -1.0267730951309204, + "logps/rejected": -1.1928188800811768, + "loss": 1.1511, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.053546190261841, + "rewards/margins": 0.3320915699005127, + "rewards/rejected": -2.3856377601623535, + "step": 2800 + }, + { + "epoch": 2.0244956772334293, + "grad_norm": 17.931282922261985, + "learning_rate": 1.4451960366636745e-08, + "logits/chosen": -2.026698589324951, + "logits/rejected": -2.0378384590148926, + "logps/chosen": -1.0406183004379272, + "logps/rejected": -1.1752769947052002, + "loss": 1.1819, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0812366008758545, + "rewards/margins": 0.26931747794151306, + "rewards/rejected": -2.3505539894104004, + "step": 2810 + }, + { + "epoch": 2.031700288184438, + "grad_norm": 19.290431128690432, + "learning_rate": 1.4262296675122592e-08, + "logits/chosen": -2.0173158645629883, + "logits/rejected": -2.0136220455169678, + "logps/chosen": -1.030659556388855, + "logps/rejected": -1.1910489797592163, + "loss": 1.1523, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.06131911277771, + "rewards/margins": 0.32077842950820923, + "rewards/rejected": -2.3820979595184326, + "step": 2820 + }, + { + "epoch": 2.0389048991354466, + "grad_norm": 17.902852888321604, + "learning_rate": 1.407338779981389e-08, + "logits/chosen": -1.9934546947479248, + "logits/rejected": -1.9913368225097656, + "logps/chosen": -0.9143481254577637, + "logps/rejected": -1.0957781076431274, + "loss": 1.1116, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8286962509155273, + "rewards/margins": 0.36285993456840515, + "rewards/rejected": -2.191556215286255, + "step": 2830 + }, + { + "epoch": 2.0461095100864553, + "grad_norm": 21.306516095869544, + "learning_rate": 1.3885247020224534e-08, + "logits/chosen": -2.0094637870788574, + "logits/rejected": -2.004822015762329, + "logps/chosen": -1.0016566514968872, + "logps/rejected": -1.1311957836151123, + "loss": 1.1913, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0033133029937744, + "rewards/margins": 0.25907841324806213, + "rewards/rejected": -2.2623915672302246, + "step": 2840 + }, + { + "epoch": 2.053314121037464, + "grad_norm": 17.105370578566056, + "learning_rate": 1.369788756187445e-08, + "logits/chosen": -2.008868455886841, + "logits/rejected": -2.0054876804351807, + "logps/chosen": -1.0270612239837646, + "logps/rejected": -1.1226084232330322, + "loss": 1.2343, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.0541224479675293, + "rewards/margins": 0.19109439849853516, + "rewards/rejected": -2.2452168464660645, + "step": 2850 + }, + { + "epoch": 2.0605187319884726, + "grad_norm": 18.695157813530198, + "learning_rate": 1.3511322595359925e-08, + "logits/chosen": -2.033163547515869, + "logits/rejected": -2.0247857570648193, + "logps/chosen": -0.9382683634757996, + "logps/rejected": -1.1057795286178589, + "loss": 1.14, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.8765367269515991, + "rewards/margins": 0.3350227475166321, + "rewards/rejected": -2.2115590572357178, + "step": 2860 + }, + { + "epoch": 2.0677233429394812, + "grad_norm": 17.33211536858926, + "learning_rate": 1.3325565235427716e-08, + "logits/chosen": -2.028552770614624, + "logits/rejected": -2.0268807411193848, + "logps/chosen": -0.9831819534301758, + "logps/rejected": -1.1274645328521729, + "loss": 1.177, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9663639068603516, + "rewards/margins": 0.288565069437027, + "rewards/rejected": -2.2549290657043457, + "step": 2870 + }, + { + "epoch": 2.07492795389049, + "grad_norm": 17.173051243263835, + "learning_rate": 1.3140628540053218e-08, + "logits/chosen": -1.9946448802947998, + "logits/rejected": -1.9966709613800049, + "logps/chosen": -0.9759955406188965, + "logps/rejected": -1.1114940643310547, + "loss": 1.1833, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.951991081237793, + "rewards/margins": 0.2709970772266388, + "rewards/rejected": -2.2229881286621094, + "step": 2880 + }, + { + "epoch": 2.0821325648414986, + "grad_norm": 19.25779046293631, + "learning_rate": 1.2956525509522451e-08, + "logits/chosen": -1.9791204929351807, + "logits/rejected": -1.97879159450531, + "logps/chosen": -1.1120542287826538, + "logps/rejected": -1.2156860828399658, + "loss": 1.234, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.2241084575653076, + "rewards/margins": 0.20726370811462402, + "rewards/rejected": -2.4313721656799316, + "step": 2890 + }, + { + "epoch": 2.089337175792507, + "grad_norm": 19.696970893217582, + "learning_rate": 1.2773269085518267e-08, + "logits/chosen": -2.011164426803589, + "logits/rejected": -2.0127670764923096, + "logps/chosen": -1.0766938924789429, + "logps/rejected": -1.2073553800582886, + "loss": 1.1837, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1533877849578857, + "rewards/margins": 0.2613227963447571, + "rewards/rejected": -2.414710760116577, + "step": 2900 + }, + { + "epoch": 2.096541786743516, + "grad_norm": 20.176935063380885, + "learning_rate": 1.2590872150210574e-08, + "logits/chosen": -2.0675017833709717, + "logits/rejected": -2.0605273246765137, + "logps/chosen": -1.057830810546875, + "logps/rejected": -1.1687963008880615, + "loss": 1.2247, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.11566162109375, + "rewards/margins": 0.22193074226379395, + "rewards/rejected": -2.337592601776123, + "step": 2910 + }, + { + "epoch": 2.1037463976945245, + "grad_norm": 20.572020361191345, + "learning_rate": 1.2409347525350775e-08, + "logits/chosen": -2.0273613929748535, + "logits/rejected": -2.0172836780548096, + "logps/chosen": -1.1093732118606567, + "logps/rejected": -1.2572507858276367, + "loss": 1.166, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2187464237213135, + "rewards/margins": 0.2957550585269928, + "rewards/rejected": -2.5145015716552734, + "step": 2920 + }, + { + "epoch": 2.110951008645533, + "grad_norm": 22.752346590850024, + "learning_rate": 1.2228707971370421e-08, + "logits/chosen": -2.018433094024658, + "logits/rejected": -2.01145601272583, + "logps/chosen": -0.9928643107414246, + "logps/rejected": -1.1063158512115479, + "loss": 1.2256, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9857286214828491, + "rewards/margins": 0.22690317034721375, + "rewards/rejected": -2.2126317024230957, + "step": 2930 + }, + { + "epoch": 2.118155619596542, + "grad_norm": 21.233577131341413, + "learning_rate": 1.2048966186484282e-08, + "logits/chosen": -2.015031337738037, + "logits/rejected": -1.9982612133026123, + "logps/chosen": -1.117865800857544, + "logps/rejected": -1.232062578201294, + "loss": 1.2127, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.235731601715088, + "rewards/margins": 0.2283933460712433, + "rewards/rejected": -2.464125156402588, + "step": 2940 + }, + { + "epoch": 2.1253602305475505, + "grad_norm": 28.100094599633593, + "learning_rate": 1.187013480579762e-08, + "logits/chosen": -2.010659694671631, + "logits/rejected": -2.0132699012756348, + "logps/chosen": -1.0425684452056885, + "logps/rejected": -1.1762912273406982, + "loss": 1.201, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.085136890411377, + "rewards/margins": 0.26744550466537476, + "rewards/rejected": -2.3525824546813965, + "step": 2950 + }, + { + "epoch": 2.132564841498559, + "grad_norm": 40.79478390193748, + "learning_rate": 1.1692226400418073e-08, + "logits/chosen": -1.9510002136230469, + "logits/rejected": -1.9495048522949219, + "logps/chosen": -1.0822184085845947, + "logps/rejected": -1.212081789970398, + "loss": 1.2188, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1644368171691895, + "rewards/margins": 0.2597268521785736, + "rewards/rejected": -2.424163579940796, + "step": 2960 + }, + { + "epoch": 2.139769452449568, + "grad_norm": 16.326675723252357, + "learning_rate": 1.1515253476571923e-08, + "logits/chosen": -1.9815658330917358, + "logits/rejected": -1.975783109664917, + "logps/chosen": -1.0105996131896973, + "logps/rejected": -1.192287564277649, + "loss": 1.1208, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0211992263793945, + "rewards/margins": 0.36337584257125854, + "rewards/rejected": -2.384575128555298, + "step": 2970 + }, + { + "epoch": 2.1469740634005765, + "grad_norm": 19.921977505309442, + "learning_rate": 1.133922847472496e-08, + "logits/chosen": -1.9953126907348633, + "logits/rejected": -1.9960988759994507, + "logps/chosen": -1.110705018043518, + "logps/rejected": -1.209160566329956, + "loss": 1.2518, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.221410036087036, + "rewards/margins": 0.19691102206707, + "rewards/rejected": -2.418321132659912, + "step": 2980 + }, + { + "epoch": 2.154178674351585, + "grad_norm": 22.89240067306987, + "learning_rate": 1.1164163768707952e-08, + "logits/chosen": -2.003279209136963, + "logits/rejected": -1.9980405569076538, + "logps/chosen": -1.0043981075286865, + "logps/rejected": -1.1427768468856812, + "loss": 1.1843, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.008796215057373, + "rewards/margins": 0.27675721049308777, + "rewards/rejected": -2.2855536937713623, + "step": 2990 + }, + { + "epoch": 2.161383285302594, + "grad_norm": 17.89862895130365, + "learning_rate": 1.0990071664846861e-08, + "logits/chosen": -1.9780519008636475, + "logits/rejected": -1.9771487712860107, + "logps/chosen": -1.0197398662567139, + "logps/rejected": -1.1996923685073853, + "loss": 1.1581, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0394797325134277, + "rewards/margins": 0.35990482568740845, + "rewards/rejected": -2.3993847370147705, + "step": 3000 + }, + { + "epoch": 2.1685878962536025, + "grad_norm": 18.90659171579793, + "learning_rate": 1.0816964401097739e-08, + "logits/chosen": -1.964535117149353, + "logits/rejected": -1.9613316059112549, + "logps/chosen": -0.9563964009284973, + "logps/rejected": -1.0800330638885498, + "loss": 1.2052, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.9127928018569946, + "rewards/margins": 0.24727335572242737, + "rewards/rejected": -2.1600661277770996, + "step": 3010 + }, + { + "epoch": 2.175792507204611, + "grad_norm": 19.878484764331017, + "learning_rate": 1.0644854146186406e-08, + "logits/chosen": -2.0236928462982178, + "logits/rejected": -2.0173866748809814, + "logps/chosen": -1.0241403579711914, + "logps/rejected": -1.1835166215896606, + "loss": 1.1624, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.048280715942383, + "rewards/margins": 0.318752646446228, + "rewards/rejected": -2.3670332431793213, + "step": 3020 + }, + { + "epoch": 2.18299711815562, + "grad_norm": 19.088076538610206, + "learning_rate": 1.0473752998753114e-08, + "logits/chosen": -2.004102945327759, + "logits/rejected": -1.9954335689544678, + "logps/chosen": -1.0195882320404053, + "logps/rejected": -1.1798489093780518, + "loss": 1.1535, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0391764640808105, + "rewards/margins": 0.3205214738845825, + "rewards/rejected": -2.3596978187561035, + "step": 3030 + }, + { + "epoch": 2.1902017291066285, + "grad_norm": 19.31361091042759, + "learning_rate": 1.030367298650201e-08, + "logits/chosen": -2.023881196975708, + "logits/rejected": -2.0238354206085205, + "logps/chosen": -1.0392138957977295, + "logps/rejected": -1.19191312789917, + "loss": 1.1578, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.078427791595459, + "rewards/margins": 0.3053986132144928, + "rewards/rejected": -2.38382625579834, + "step": 3040 + }, + { + "epoch": 2.1974063400576367, + "grad_norm": 22.093759972479646, + "learning_rate": 1.0134626065355675e-08, + "logits/chosen": -2.0746548175811768, + "logits/rejected": -2.0715177059173584, + "logps/chosen": -1.0234037637710571, + "logps/rejected": -1.1665077209472656, + "loss": 1.1883, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0468075275421143, + "rewards/margins": 0.2862081527709961, + "rewards/rejected": -2.3330154418945312, + "step": 3050 + }, + { + "epoch": 2.2046109510086453, + "grad_norm": 19.64286406855496, + "learning_rate": 9.966624118614611e-09, + "logits/chosen": -2.006706476211548, + "logits/rejected": -2.0016489028930664, + "logps/chosen": -1.0631506443023682, + "logps/rejected": -1.2085468769073486, + "loss": 1.1876, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.1263012886047363, + "rewards/margins": 0.29079198837280273, + "rewards/rejected": -2.4170937538146973, + "step": 3060 + }, + { + "epoch": 2.211815561959654, + "grad_norm": 14.856155733229528, + "learning_rate": 9.799678956121976e-09, + "logits/chosen": -1.9717843532562256, + "logits/rejected": -1.9674240350723267, + "logps/chosen": -1.0307656526565552, + "logps/rejected": -1.1394712924957275, + "loss": 1.2006, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0615313053131104, + "rewards/margins": 0.21741144359111786, + "rewards/rejected": -2.278942584991455, + "step": 3070 + }, + { + "epoch": 2.2190201729106627, + "grad_norm": 23.633018389781732, + "learning_rate": 9.633802313433314e-09, + "logits/chosen": -1.9454095363616943, + "logits/rejected": -1.9511306285858154, + "logps/chosen": -1.0190519094467163, + "logps/rejected": -1.1248835325241089, + "loss": 1.2055, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0381038188934326, + "rewards/margins": 0.21166305243968964, + "rewards/rejected": -2.2497670650482178, + "step": 3080 + }, + { + "epoch": 2.2262247838616713, + "grad_norm": 20.794315619142072, + "learning_rate": 9.469005850991705e-09, + "logits/chosen": -2.0088305473327637, + "logits/rejected": -2.003154754638672, + "logps/chosen": -1.0144597291946411, + "logps/rejected": -1.1316652297973633, + "loss": 1.2343, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.0289194583892822, + "rewards/margins": 0.23441116511821747, + "rewards/rejected": -2.2633304595947266, + "step": 3090 + }, + { + "epoch": 2.23342939481268, + "grad_norm": 18.67038535819961, + "learning_rate": 9.305301153307949e-09, + "logits/chosen": -2.0057482719421387, + "logits/rejected": -2.0133614540100098, + "logps/chosen": -0.9462668299674988, + "logps/rejected": -1.1108109951019287, + "loss": 1.1573, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.8925336599349976, + "rewards/margins": 0.3290883004665375, + "rewards/rejected": -2.2216219902038574, + "step": 3100 + }, + { + "epoch": 2.2406340057636887, + "grad_norm": 18.08187045245269, + "learning_rate": 9.142699728146336e-09, + "logits/chosen": -1.9763425588607788, + "logits/rejected": -1.9695403575897217, + "logps/chosen": -1.0319360494613647, + "logps/rejected": -1.1644192934036255, + "loss": 1.2014, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0638720989227295, + "rewards/margins": 0.26496636867523193, + "rewards/rejected": -2.328838586807251, + "step": 3110 + }, + { + "epoch": 2.2478386167146973, + "grad_norm": 16.765059853307356, + "learning_rate": 8.981213005715627e-09, + "logits/chosen": -2.002300500869751, + "logits/rejected": -2.005335569381714, + "logps/chosen": -0.9925374984741211, + "logps/rejected": -1.1654067039489746, + "loss": 1.1484, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.9850749969482422, + "rewards/margins": 0.34573858976364136, + "rewards/rejected": -2.330813407897949, + "step": 3120 + }, + { + "epoch": 2.255043227665706, + "grad_norm": 21.826707648017194, + "learning_rate": 8.820852337865611e-09, + "logits/chosen": -2.0354738235473633, + "logits/rejected": -2.031705617904663, + "logps/chosen": -0.9956668019294739, + "logps/rejected": -1.1440733671188354, + "loss": 1.1717, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.9913336038589478, + "rewards/margins": 0.296813428401947, + "rewards/rejected": -2.288146734237671, + "step": 3130 + }, + { + "epoch": 2.2622478386167146, + "grad_norm": 17.005533531498173, + "learning_rate": 8.661628997289044e-09, + "logits/chosen": -1.9752384424209595, + "logits/rejected": -1.97113835811615, + "logps/chosen": -1.0153406858444214, + "logps/rejected": -1.1710517406463623, + "loss": 1.1687, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0306813716888428, + "rewards/margins": 0.3114221394062042, + "rewards/rejected": -2.3421034812927246, + "step": 3140 + }, + { + "epoch": 2.2694524495677233, + "grad_norm": 16.411029038337308, + "learning_rate": 8.503554176729341e-09, + "logits/chosen": -1.976362943649292, + "logits/rejected": -1.974590539932251, + "logps/chosen": -1.026755928993225, + "logps/rejected": -1.1865880489349365, + "loss": 1.1696, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.05351185798645, + "rewards/margins": 0.3196641802787781, + "rewards/rejected": -2.373176097869873, + "step": 3150 + }, + { + "epoch": 2.276657060518732, + "grad_norm": 24.940392400474, + "learning_rate": 8.346638988193636e-09, + "logits/chosen": -2.0030248165130615, + "logits/rejected": -1.9979517459869385, + "logps/chosen": -0.9251815676689148, + "logps/rejected": -1.0761079788208008, + "loss": 1.1768, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.8503631353378296, + "rewards/margins": 0.30185258388519287, + "rewards/rejected": -2.1522159576416016, + "step": 3160 + }, + { + "epoch": 2.2838616714697406, + "grad_norm": 23.2363909978899, + "learning_rate": 8.19089446217176e-09, + "logits/chosen": -1.9777719974517822, + "logits/rejected": -1.9676783084869385, + "logps/chosen": -1.0022261142730713, + "logps/rejected": -1.1919556856155396, + "loss": 1.1202, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0044522285461426, + "rewards/margins": 0.3794591426849365, + "rewards/rejected": -2.383911371231079, + "step": 3170 + }, + { + "epoch": 2.2910662824207493, + "grad_norm": 17.14618373707155, + "learning_rate": 8.036331546860777e-09, + "logits/chosen": -1.982940435409546, + "logits/rejected": -1.982465386390686, + "logps/chosen": -0.9494163393974304, + "logps/rejected": -1.039945125579834, + "loss": 1.2442, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.8988326787948608, + "rewards/margins": 0.18105748295783997, + "rewards/rejected": -2.079890251159668, + "step": 3180 + }, + { + "epoch": 2.298270893371758, + "grad_norm": 23.5198522631464, + "learning_rate": 7.882961107395416e-09, + "logits/chosen": -1.9984643459320068, + "logits/rejected": -1.9926011562347412, + "logps/chosen": -1.130748987197876, + "logps/rejected": -1.177819848060608, + "loss": 1.315, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -2.261497974395752, + "rewards/margins": 0.09414196014404297, + "rewards/rejected": -2.355639696121216, + "step": 3190 + }, + { + "epoch": 2.3054755043227666, + "grad_norm": 25.590315233089598, + "learning_rate": 7.73079392508428e-09, + "logits/chosen": -1.9712879657745361, + "logits/rejected": -1.9705880880355835, + "logps/chosen": -1.0907241106033325, + "logps/rejected": -1.2799861431121826, + "loss": 1.1523, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.181448221206665, + "rewards/margins": 0.37852445244789124, + "rewards/rejected": -2.5599722862243652, + "step": 3200 + }, + { + "epoch": 2.3126801152737753, + "grad_norm": 21.478168268234054, + "learning_rate": 7.579840696651938e-09, + "logits/chosen": -1.9963871240615845, + "logits/rejected": -1.9932276010513306, + "logps/chosen": -1.0474956035614014, + "logps/rejected": -1.172515869140625, + "loss": 1.2079, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0949912071228027, + "rewards/margins": 0.2500404119491577, + "rewards/rejected": -2.34503173828125, + "step": 3210 + }, + { + "epoch": 2.319884726224784, + "grad_norm": 20.9972814315902, + "learning_rate": 7.43011203348704e-09, + "logits/chosen": -1.9149713516235352, + "logits/rejected": -1.9116861820220947, + "logps/chosen": -1.0514217615127563, + "logps/rejected": -1.1269280910491943, + "loss": 1.2686, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.1028435230255127, + "rewards/margins": 0.1510128229856491, + "rewards/rejected": -2.2538561820983887, + "step": 3220 + }, + { + "epoch": 2.3270893371757926, + "grad_norm": 18.760848272652197, + "learning_rate": 7.281618460896344e-09, + "logits/chosen": -1.995486855506897, + "logits/rejected": -1.9929373264312744, + "logps/chosen": -0.9654563665390015, + "logps/rejected": -1.1074378490447998, + "loss": 1.1729, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.930912733078003, + "rewards/margins": 0.2839628756046295, + "rewards/rejected": -2.2148756980895996, + "step": 3230 + }, + { + "epoch": 2.3342939481268012, + "grad_norm": 20.42845258559301, + "learning_rate": 7.134370417364849e-09, + "logits/chosen": -1.9683917760849, + "logits/rejected": -1.9679603576660156, + "logps/chosen": -1.001461386680603, + "logps/rejected": -1.1403329372406006, + "loss": 1.2027, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.002922773361206, + "rewards/margins": 0.2777433395385742, + "rewards/rejected": -2.280665874481201, + "step": 3240 + }, + { + "epoch": 2.34149855907781, + "grad_norm": 23.164358986342677, + "learning_rate": 6.988378253821981e-09, + "logits/chosen": -1.9697679281234741, + "logits/rejected": -1.9687258005142212, + "logps/chosen": -1.0258748531341553, + "logps/rejected": -1.143920660018921, + "loss": 1.209, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0517497062683105, + "rewards/margins": 0.23609168827533722, + "rewards/rejected": -2.287841320037842, + "step": 3250 + }, + { + "epoch": 2.3487031700288186, + "grad_norm": 20.30991552682094, + "learning_rate": 6.8436522329140186e-09, + "logits/chosen": -1.9788017272949219, + "logits/rejected": -1.985569715499878, + "logps/chosen": -1.0339092016220093, + "logps/rejected": -1.1592271327972412, + "loss": 1.2106, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.0678184032440186, + "rewards/margins": 0.2506362795829773, + "rewards/rejected": -2.3184542655944824, + "step": 3260 + }, + { + "epoch": 2.3559077809798272, + "grad_norm": 21.894995604840652, + "learning_rate": 6.700202528282603e-09, + "logits/chosen": -1.978734016418457, + "logits/rejected": -1.969061255455017, + "logps/chosen": -1.0286333560943604, + "logps/rejected": -1.1439108848571777, + "loss": 1.2158, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0572667121887207, + "rewards/margins": 0.23055517673492432, + "rewards/rejected": -2.2878217697143555, + "step": 3270 + }, + { + "epoch": 2.363112391930836, + "grad_norm": 21.434677454334327, + "learning_rate": 6.558039223849668e-09, + "logits/chosen": -2.0271782875061035, + "logits/rejected": -2.0174622535705566, + "logps/chosen": -1.0365641117095947, + "logps/rejected": -1.2443287372589111, + "loss": 1.1155, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0731282234191895, + "rewards/margins": 0.41552942991256714, + "rewards/rejected": -2.4886574745178223, + "step": 3280 + }, + { + "epoch": 2.3703170028818445, + "grad_norm": 22.134638764373964, + "learning_rate": 6.417172313108471e-09, + "logits/chosen": -1.95876944065094, + "logits/rejected": -1.9533073902130127, + "logps/chosen": -0.9859912991523743, + "logps/rejected": -1.1151400804519653, + "loss": 1.1979, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9719825983047485, + "rewards/margins": 0.2582974135875702, + "rewards/rejected": -2.2302801609039307, + "step": 3290 + }, + { + "epoch": 2.377521613832853, + "grad_norm": 21.658570611710445, + "learning_rate": 6.277611698421179e-09, + "logits/chosen": -2.017606735229492, + "logits/rejected": -2.0095458030700684, + "logps/chosen": -0.9050453305244446, + "logps/rejected": -1.098288893699646, + "loss": 1.1252, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.8100906610488892, + "rewards/margins": 0.3864876627922058, + "rewards/rejected": -2.196577787399292, + "step": 3300 + }, + { + "epoch": 2.3847262247838614, + "grad_norm": 22.8559892529762, + "learning_rate": 6.139367190322714e-09, + "logits/chosen": -2.0034892559051514, + "logits/rejected": -2.0032081604003906, + "logps/chosen": -1.0592529773712158, + "logps/rejected": -1.2185790538787842, + "loss": 1.1609, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.1185059547424316, + "rewards/margins": 0.3186524510383606, + "rewards/rejected": -2.4371581077575684, + "step": 3310 + }, + { + "epoch": 2.39193083573487, + "grad_norm": 17.198608533100995, + "learning_rate": 6.002448506831171e-09, + "logits/chosen": -2.0061838626861572, + "logits/rejected": -2.0014090538024902, + "logps/chosen": -0.9808699488639832, + "logps/rejected": -1.1246802806854248, + "loss": 1.1731, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9617398977279663, + "rewards/margins": 0.2876203954219818, + "rewards/rejected": -2.2493605613708496, + "step": 3320 + }, + { + "epoch": 2.3991354466858787, + "grad_norm": 18.199025209277288, + "learning_rate": 5.866865272764607e-09, + "logits/chosen": -2.023648262023926, + "logits/rejected": -2.0234923362731934, + "logps/chosen": -1.0167878866195679, + "logps/rejected": -1.1612762212753296, + "loss": 1.1772, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0335757732391357, + "rewards/margins": 0.28897663950920105, + "rewards/rejected": -2.322552442550659, + "step": 3330 + }, + { + "epoch": 2.4063400576368874, + "grad_norm": 23.302252487813124, + "learning_rate": 5.7326270190645595e-09, + "logits/chosen": -1.896691918373108, + "logits/rejected": -1.8979320526123047, + "logps/chosen": -1.0594362020492554, + "logps/rejected": -1.1698405742645264, + "loss": 1.2168, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.1188724040985107, + "rewards/margins": 0.2208089381456375, + "rewards/rejected": -2.3396811485290527, + "step": 3340 + }, + { + "epoch": 2.413544668587896, + "grad_norm": 18.446092862588884, + "learning_rate": 5.599743182125938e-09, + "logits/chosen": -2.043023109436035, + "logits/rejected": -2.043013095855713, + "logps/chosen": -1.0480068922042847, + "logps/rejected": -1.1850215196609497, + "loss": 1.179, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.0960137844085693, + "rewards/margins": 0.2740294933319092, + "rewards/rejected": -2.3700430393218994, + "step": 3350 + }, + { + "epoch": 2.4207492795389047, + "grad_norm": 20.220307143059344, + "learning_rate": 5.46822310313379e-09, + "logits/chosen": -2.0473732948303223, + "logits/rejected": -2.0569522380828857, + "logps/chosen": -1.09086012840271, + "logps/rejected": -1.1955832242965698, + "loss": 1.2353, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.18172025680542, + "rewards/margins": 0.2094462662935257, + "rewards/rejected": -2.3911664485931396, + "step": 3360 + }, + { + "epoch": 2.4279538904899134, + "grad_norm": 20.780990431383444, + "learning_rate": 5.33807602740658e-09, + "logits/chosen": -2.022789478302002, + "logits/rejected": -2.0159573554992676, + "logps/chosen": -0.9560559988021851, + "logps/rejected": -1.160628318786621, + "loss": 1.111, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.9121119976043701, + "rewards/margins": 0.40914446115493774, + "rewards/rejected": -2.321256637573242, + "step": 3370 + }, + { + "epoch": 2.435158501440922, + "grad_norm": 21.245348975655457, + "learning_rate": 5.209311103746334e-09, + "logits/chosen": -2.0008084774017334, + "logits/rejected": -2.0011303424835205, + "logps/chosen": -1.0523884296417236, + "logps/rejected": -1.224974274635315, + "loss": 1.1587, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.1047768592834473, + "rewards/margins": 0.3451715409755707, + "rewards/rejected": -2.44994854927063, + "step": 3380 + }, + { + "epoch": 2.4423631123919307, + "grad_norm": 24.352598699910715, + "learning_rate": 5.081937383795484e-09, + "logits/chosen": -1.972608208656311, + "logits/rejected": -1.9717302322387695, + "logps/chosen": -0.9721845388412476, + "logps/rejected": -1.137112021446228, + "loss": 1.1484, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.9443690776824951, + "rewards/margins": 0.32985490560531616, + "rewards/rejected": -2.274224042892456, + "step": 3390 + }, + { + "epoch": 2.4495677233429394, + "grad_norm": 18.574622449743107, + "learning_rate": 4.955963821400599e-09, + "logits/chosen": -2.0249781608581543, + "logits/rejected": -2.019134759902954, + "logps/chosen": -1.029394507408142, + "logps/rejected": -1.167999505996704, + "loss": 1.1922, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.058789014816284, + "rewards/margins": 0.2772100567817688, + "rewards/rejected": -2.335999011993408, + "step": 3400 + }, + { + "epoch": 2.456772334293948, + "grad_norm": 15.429482416255146, + "learning_rate": 4.831399271982928e-09, + "logits/chosen": -1.9512850046157837, + "logits/rejected": -1.9432300329208374, + "logps/chosen": -1.0431114435195923, + "logps/rejected": -1.1738238334655762, + "loss": 1.2095, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.0862228870391846, + "rewards/margins": 0.26142507791519165, + "rewards/rejected": -2.3476476669311523, + "step": 3410 + }, + { + "epoch": 2.4639769452449567, + "grad_norm": 25.103110732614255, + "learning_rate": 4.708252491915951e-09, + "logits/chosen": -2.030299663543701, + "logits/rejected": -2.024203300476074, + "logps/chosen": -1.0459200143814087, + "logps/rejected": -1.1930789947509766, + "loss": 1.197, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0918400287628174, + "rewards/margins": 0.2943178713321686, + "rewards/rejected": -2.386157989501953, + "step": 3420 + }, + { + "epoch": 2.4711815561959654, + "grad_norm": 25.280433628761124, + "learning_rate": 4.58653213790981e-09, + "logits/chosen": -2.006598472595215, + "logits/rejected": -1.9983857870101929, + "logps/chosen": -1.025721549987793, + "logps/rejected": -1.1746145486831665, + "loss": 1.1792, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.051443099975586, + "rewards/margins": 0.29778599739074707, + "rewards/rejected": -2.349229097366333, + "step": 3430 + }, + { + "epoch": 2.478386167146974, + "grad_norm": 18.242083284353217, + "learning_rate": 4.466246766402773e-09, + "logits/chosen": -1.9907543659210205, + "logits/rejected": -1.9845359325408936, + "logps/chosen": -1.0393613576889038, + "logps/rejected": -1.1935051679611206, + "loss": 1.1827, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0787227153778076, + "rewards/margins": 0.30828770995140076, + "rewards/rejected": -2.387010335922241, + "step": 3440 + }, + { + "epoch": 2.4855907780979827, + "grad_norm": 22.018503196573274, + "learning_rate": 4.347404832959775e-09, + "logits/chosen": -2.0336432456970215, + "logits/rejected": -2.0338807106018066, + "logps/chosen": -1.032832384109497, + "logps/rejected": -1.1933454275131226, + "loss": 1.1612, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.065664768218994, + "rewards/margins": 0.32102587819099426, + "rewards/rejected": -2.386690855026245, + "step": 3450 + }, + { + "epoch": 2.4927953890489913, + "grad_norm": 33.037428693429234, + "learning_rate": 4.230014691678016e-09, + "logits/chosen": -1.9883922338485718, + "logits/rejected": -1.9890626668930054, + "logps/chosen": -1.0595102310180664, + "logps/rejected": -1.126479148864746, + "loss": 1.272, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.119020462036133, + "rewards/margins": 0.1339379847049713, + "rewards/rejected": -2.252958297729492, + "step": 3460 + }, + { + "epoch": 2.5, + "grad_norm": 17.82843912451702, + "learning_rate": 4.114084594599707e-09, + "logits/chosen": -1.9903564453125, + "logits/rejected": -1.9900470972061157, + "logps/chosen": -1.0114375352859497, + "logps/rejected": -1.229552984237671, + "loss": 1.1011, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0228750705718994, + "rewards/margins": 0.4362305998802185, + "rewards/rejected": -2.459105968475342, + "step": 3470 + }, + { + "epoch": 2.5072046109510087, + "grad_norm": 22.102059612075095, + "learning_rate": 3.9996226911319546e-09, + "logits/chosen": -1.993326187133789, + "logits/rejected": -1.981066107749939, + "logps/chosen": -1.0159366130828857, + "logps/rejected": -1.1457350254058838, + "loss": 1.1898, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0318732261657715, + "rewards/margins": 0.25959664583206177, + "rewards/rejected": -2.2914700508117676, + "step": 3480 + }, + { + "epoch": 2.5144092219020173, + "grad_norm": 17.973603590541654, + "learning_rate": 3.886637027473949e-09, + "logits/chosen": -2.0013790130615234, + "logits/rejected": -2.0035085678100586, + "logps/chosen": -1.076293706893921, + "logps/rejected": -1.2393258810043335, + "loss": 1.1562, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.152587413787842, + "rewards/margins": 0.3260645270347595, + "rewards/rejected": -2.478651762008667, + "step": 3490 + }, + { + "epoch": 2.521613832853026, + "grad_norm": 19.37957776631117, + "learning_rate": 3.775135546051295e-09, + "logits/chosen": -1.9389715194702148, + "logits/rejected": -1.9399712085723877, + "logps/chosen": -1.0256609916687012, + "logps/rejected": -1.1517935991287231, + "loss": 1.1987, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0513219833374023, + "rewards/margins": 0.252265065908432, + "rewards/rejected": -2.3035871982574463, + "step": 3500 + }, + { + "epoch": 2.5288184438040346, + "grad_norm": 23.33567215234884, + "learning_rate": 3.665126084957723e-09, + "logits/chosen": -1.984487533569336, + "logits/rejected": -1.9886258840560913, + "logps/chosen": -1.1339917182922363, + "logps/rejected": -1.233039140701294, + "loss": 1.2585, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.2679834365844727, + "rewards/margins": 0.19809459149837494, + "rewards/rejected": -2.466078281402588, + "step": 3510 + }, + { + "epoch": 2.5360230547550433, + "grad_norm": 19.609640038869685, + "learning_rate": 3.556616377404101e-09, + "logits/chosen": -2.00850248336792, + "logits/rejected": -2.006412982940674, + "logps/chosen": -1.07861328125, + "logps/rejected": -1.236485242843628, + "loss": 1.154, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1572265625, + "rewards/margins": 0.31574416160583496, + "rewards/rejected": -2.472970485687256, + "step": 3520 + }, + { + "epoch": 2.543227665706052, + "grad_norm": 19.767394228725337, + "learning_rate": 3.4496140511748125e-09, + "logits/chosen": -1.9994624853134155, + "logits/rejected": -1.9942439794540405, + "logps/chosen": -1.0551049709320068, + "logps/rejected": -1.1994330883026123, + "loss": 1.1752, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1102099418640137, + "rewards/margins": 0.28865596652030945, + "rewards/rejected": -2.3988661766052246, + "step": 3530 + }, + { + "epoch": 2.5504322766570606, + "grad_norm": 31.22852578343729, + "learning_rate": 3.3441266280915427e-09, + "logits/chosen": -1.9827390909194946, + "logits/rejected": -1.983473539352417, + "logps/chosen": -1.093752145767212, + "logps/rejected": -1.20872163772583, + "loss": 1.2131, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.187504291534424, + "rewards/margins": 0.229939267039299, + "rewards/rejected": -2.41744327545166, + "step": 3540 + }, + { + "epoch": 2.5576368876080693, + "grad_norm": 23.706412362537016, + "learning_rate": 3.2401615234845693e-09, + "logits/chosen": -2.0068211555480957, + "logits/rejected": -2.0008292198181152, + "logps/chosen": -1.0923867225646973, + "logps/rejected": -1.2357128858566284, + "loss": 1.1896, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1847734451293945, + "rewards/margins": 0.28665226697921753, + "rewards/rejected": -2.471425771713257, + "step": 3550 + }, + { + "epoch": 2.564841498559078, + "grad_norm": 16.13728529223842, + "learning_rate": 3.1377260456714375e-09, + "logits/chosen": -1.901414155960083, + "logits/rejected": -1.8929615020751953, + "logps/chosen": -1.0596764087677002, + "logps/rejected": -1.2023025751113892, + "loss": 1.1686, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1193528175354004, + "rewards/margins": 0.28525251150131226, + "rewards/rejected": -2.4046051502227783, + "step": 3560 + }, + { + "epoch": 2.5720461095100866, + "grad_norm": 18.028717215705484, + "learning_rate": 3.0368273954432698e-09, + "logits/chosen": -2.0307531356811523, + "logits/rejected": -2.022324800491333, + "logps/chosen": -1.0493528842926025, + "logps/rejected": -1.1534329652786255, + "loss": 1.2252, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.098705768585205, + "rewards/margins": 0.20816004276275635, + "rewards/rejected": -2.306865930557251, + "step": 3570 + }, + { + "epoch": 2.5792507204610953, + "grad_norm": 17.367490930434325, + "learning_rate": 2.937472665558541e-09, + "logits/chosen": -2.019484281539917, + "logits/rejected": -2.020643711090088, + "logps/chosen": -1.036195993423462, + "logps/rejected": -1.147991418838501, + "loss": 1.2267, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.072391986846924, + "rewards/margins": 0.22359101474285126, + "rewards/rejected": -2.295982837677002, + "step": 3580 + }, + { + "epoch": 2.586455331412104, + "grad_norm": 21.805325598847563, + "learning_rate": 2.8396688402445053e-09, + "logits/chosen": -2.0637335777282715, + "logits/rejected": -2.0563552379608154, + "logps/chosen": -1.0100147724151611, + "logps/rejected": -1.2180942296981812, + "loss": 1.1063, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.0200295448303223, + "rewards/margins": 0.4161592125892639, + "rewards/rejected": -2.4361884593963623, + "step": 3590 + }, + { + "epoch": 2.5936599423631126, + "grad_norm": 24.439180591540023, + "learning_rate": 2.7434227947062324e-09, + "logits/chosen": -2.0062127113342285, + "logits/rejected": -1.9998852014541626, + "logps/chosen": -1.1317923069000244, + "logps/rejected": -1.2398041486740112, + "loss": 1.2326, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.263584613800049, + "rewards/margins": 0.21602365374565125, + "rewards/rejected": -2.4796082973480225, + "step": 3600 + }, + { + "epoch": 2.6008645533141213, + "grad_norm": 18.265831934479376, + "learning_rate": 2.6487412946432976e-09, + "logits/chosen": -1.9716873168945312, + "logits/rejected": -1.966560959815979, + "logps/chosen": -1.0693469047546387, + "logps/rejected": -1.2056225538253784, + "loss": 1.1912, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.1386938095092773, + "rewards/margins": 0.27255168557167053, + "rewards/rejected": -2.411245107650757, + "step": 3610 + }, + { + "epoch": 2.60806916426513, + "grad_norm": 22.910160520824302, + "learning_rate": 2.5556309957742024e-09, + "logits/chosen": -1.97675359249115, + "logits/rejected": -1.9716304540634155, + "logps/chosen": -1.0250674486160278, + "logps/rejected": -1.2212371826171875, + "loss": 1.1161, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.0501348972320557, + "rewards/margins": 0.3923397660255432, + "rewards/rejected": -2.442474365234375, + "step": 3620 + }, + { + "epoch": 2.6152737752161386, + "grad_norm": 22.671601957903725, + "learning_rate": 2.4640984433684758e-09, + "logits/chosen": -2.0380005836486816, + "logits/rejected": -2.0387332439422607, + "logps/chosen": -1.1190853118896484, + "logps/rejected": -1.235012173652649, + "loss": 1.2353, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.238170623779297, + "rewards/margins": 0.231853649020195, + "rewards/rejected": -2.470024347305298, + "step": 3630 + }, + { + "epoch": 2.6224783861671472, + "grad_norm": 17.024567886757257, + "learning_rate": 2.3741500717865987e-09, + "logits/chosen": -1.9916216135025024, + "logits/rejected": -2.0025291442871094, + "logps/chosen": -1.0068811178207397, + "logps/rejected": -1.1515626907348633, + "loss": 1.1789, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.0137622356414795, + "rewards/margins": 0.2893627882003784, + "rewards/rejected": -2.3031253814697266, + "step": 3640 + }, + { + "epoch": 2.629682997118156, + "grad_norm": 17.494575910236158, + "learning_rate": 2.285792204027678e-09, + "logits/chosen": -1.9781382083892822, + "logits/rejected": -1.9753141403198242, + "logps/chosen": -1.013346791267395, + "logps/rejected": -1.211428165435791, + "loss": 1.1021, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.02669358253479, + "rewards/margins": 0.39616289734840393, + "rewards/rejected": -2.422856330871582, + "step": 3650 + }, + { + "epoch": 2.636887608069164, + "grad_norm": 20.794166929263792, + "learning_rate": 2.199031051284972e-09, + "logits/chosen": -2.007935047149658, + "logits/rejected": -2.0033650398254395, + "logps/chosen": -1.069888710975647, + "logps/rejected": -1.1960642337799072, + "loss": 1.219, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.139777421951294, + "rewards/margins": 0.25235068798065186, + "rewards/rejected": -2.3921284675598145, + "step": 3660 + }, + { + "epoch": 2.6440922190201728, + "grad_norm": 16.98386285768041, + "learning_rate": 2.113872712509254e-09, + "logits/chosen": -1.9919393062591553, + "logits/rejected": -1.9845707416534424, + "logps/chosen": -1.1297125816345215, + "logps/rejected": -1.241287112236023, + "loss": 1.2279, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.259425163269043, + "rewards/margins": 0.22314925491809845, + "rewards/rejected": -2.482574224472046, + "step": 3670 + }, + { + "epoch": 2.6512968299711814, + "grad_norm": 14.064145090241722, + "learning_rate": 2.0303231739801143e-09, + "logits/chosen": -1.9741106033325195, + "logits/rejected": -1.9633283615112305, + "logps/chosen": -1.0185304880142212, + "logps/rejected": -1.1587377786636353, + "loss": 1.1825, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0370609760284424, + "rewards/margins": 0.28041452169418335, + "rewards/rejected": -2.3174755573272705, + "step": 3680 + }, + { + "epoch": 2.65850144092219, + "grad_norm": 23.56396327392751, + "learning_rate": 1.948388308885102e-09, + "logits/chosen": -2.0366296768188477, + "logits/rejected": -2.02805757522583, + "logps/chosen": -1.0637743473052979, + "logps/rejected": -1.1742548942565918, + "loss": 1.2171, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1275486946105957, + "rewards/margins": 0.22096149623394012, + "rewards/rejected": -2.3485097885131836, + "step": 3690 + }, + { + "epoch": 2.6657060518731988, + "grad_norm": 25.21273485809688, + "learning_rate": 1.86807387690692e-09, + "logits/chosen": -2.0631988048553467, + "logits/rejected": -2.0600669384002686, + "logps/chosen": -1.0889419317245483, + "logps/rejected": -1.2770618200302124, + "loss": 1.1162, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1778838634490967, + "rewards/margins": 0.3762398660182953, + "rewards/rejected": -2.554123640060425, + "step": 3700 + }, + { + "epoch": 2.6729106628242074, + "grad_norm": 19.210332180481718, + "learning_rate": 1.789385523818493e-09, + "logits/chosen": -2.027967929840088, + "logits/rejected": -2.0294251441955566, + "logps/chosen": -1.0404349565505981, + "logps/rejected": -1.209099531173706, + "loss": 1.149, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0808699131011963, + "rewards/margins": 0.33732882142066956, + "rewards/rejected": -2.418199062347412, + "step": 3710 + }, + { + "epoch": 2.680115273775216, + "grad_norm": 25.919412237452388, + "learning_rate": 1.712328781086131e-09, + "logits/chosen": -2.0483665466308594, + "logits/rejected": -2.0430164337158203, + "logps/chosen": -1.1230162382125854, + "logps/rejected": -1.2185190916061401, + "loss": 1.2401, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.246032476425171, + "rewards/margins": 0.191005676984787, + "rewards/rejected": -2.4370381832122803, + "step": 3720 + }, + { + "epoch": 2.6873198847262247, + "grad_norm": 21.180605350865044, + "learning_rate": 1.6369090654806543e-09, + "logits/chosen": -2.0540661811828613, + "logits/rejected": -2.0474164485931396, + "logps/chosen": -1.0206701755523682, + "logps/rejected": -1.1645678281784058, + "loss": 1.1684, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0413403511047363, + "rewards/margins": 0.2877953350543976, + "rewards/rejected": -2.3291356563568115, + "step": 3730 + }, + { + "epoch": 2.6945244956772334, + "grad_norm": 19.151584962250723, + "learning_rate": 1.5631316786966498e-09, + "logits/chosen": -1.9853427410125732, + "logits/rejected": -1.978816270828247, + "logps/chosen": -1.0220520496368408, + "logps/rejected": -1.1623871326446533, + "loss": 1.1969, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0441040992736816, + "rewards/margins": 0.2806701064109802, + "rewards/rejected": -2.3247742652893066, + "step": 3740 + }, + { + "epoch": 2.701729106628242, + "grad_norm": 18.499060326329523, + "learning_rate": 1.491001806979772e-09, + "logits/chosen": -2.035274028778076, + "logits/rejected": -2.028480052947998, + "logps/chosen": -1.077332854270935, + "logps/rejected": -1.2263195514678955, + "loss": 1.1742, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.15466570854187, + "rewards/margins": 0.29797306656837463, + "rewards/rejected": -2.452639102935791, + "step": 3750 + }, + { + "epoch": 2.7089337175792507, + "grad_norm": 29.16839407251503, + "learning_rate": 1.4205245207621508e-09, + "logits/chosen": -1.9820353984832764, + "logits/rejected": -1.9796836376190186, + "logps/chosen": -1.1182725429534912, + "logps/rejected": -1.2864872217178345, + "loss": 1.1548, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.2365450859069824, + "rewards/margins": 0.33642950654029846, + "rewards/rejected": -2.572974443435669, + "step": 3760 + }, + { + "epoch": 2.7161383285302594, + "grad_norm": 17.613059928527296, + "learning_rate": 1.3517047743059978e-09, + "logits/chosen": -2.0181725025177, + "logits/rejected": -2.0215516090393066, + "logps/chosen": -1.0734504461288452, + "logps/rejected": -1.2342610359191895, + "loss": 1.1565, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1469008922576904, + "rewards/margins": 0.3216209411621094, + "rewards/rejected": -2.468522071838379, + "step": 3770 + }, + { + "epoch": 2.723342939481268, + "grad_norm": 17.0753116834011, + "learning_rate": 1.2845474053553156e-09, + "logits/chosen": -2.0134406089782715, + "logits/rejected": -2.0093090534210205, + "logps/chosen": -1.0317740440368652, + "logps/rejected": -1.168919324874878, + "loss": 1.2025, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0635480880737305, + "rewards/margins": 0.2742905914783478, + "rewards/rejected": -2.337838649749756, + "step": 3780 + }, + { + "epoch": 2.7305475504322767, + "grad_norm": 22.583791154808193, + "learning_rate": 1.2190571347958422e-09, + "logits/chosen": -2.044787883758545, + "logits/rejected": -2.046135187149048, + "logps/chosen": -0.9667074084281921, + "logps/rejected": -1.1686309576034546, + "loss": 1.1099, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9334148168563843, + "rewards/margins": 0.4038470387458801, + "rewards/rejected": -2.337261915206909, + "step": 3790 + }, + { + "epoch": 2.7377521613832854, + "grad_norm": 18.287187828533536, + "learning_rate": 1.1552385663231634e-09, + "logits/chosen": -1.9937756061553955, + "logits/rejected": -1.9841327667236328, + "logps/chosen": -1.0935721397399902, + "logps/rejected": -1.190500020980835, + "loss": 1.2381, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.1871442794799805, + "rewards/margins": 0.19385603070259094, + "rewards/rejected": -2.38100004196167, + "step": 3800 + }, + { + "epoch": 2.744956772334294, + "grad_norm": 18.99456309056716, + "learning_rate": 1.0930961861191302e-09, + "logits/chosen": -1.9580612182617188, + "logits/rejected": -1.9627430438995361, + "logps/chosen": -1.0382286310195923, + "logps/rejected": -1.1804331541061401, + "loss": 1.2008, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.0764572620391846, + "rewards/margins": 0.2844088673591614, + "rewards/rejected": -2.3608663082122803, + "step": 3810 + }, + { + "epoch": 2.7521613832853027, + "grad_norm": 16.764883597440075, + "learning_rate": 1.0326343625364608e-09, + "logits/chosen": -1.9691221714019775, + "logits/rejected": -1.9639511108398438, + "logps/chosen": -1.0410795211791992, + "logps/rejected": -1.2136642932891846, + "loss": 1.138, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0821590423583984, + "rewards/margins": 0.3451697826385498, + "rewards/rejected": -2.427328586578369, + "step": 3820 + }, + { + "epoch": 2.7593659942363113, + "grad_norm": 18.44911571731718, + "learning_rate": 9.738573457917066e-10, + "logits/chosen": -2.043980836868286, + "logits/rejected": -2.042267084121704, + "logps/chosen": -1.0499022006988525, + "logps/rejected": -1.2412595748901367, + "loss": 1.11, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.099804401397705, + "rewards/margins": 0.3827148973941803, + "rewards/rejected": -2.4825191497802734, + "step": 3830 + }, + { + "epoch": 2.76657060518732, + "grad_norm": 18.764417824451066, + "learning_rate": 9.16769267666434e-10, + "logits/chosen": -2.0091567039489746, + "logits/rejected": -2.0069632530212402, + "logps/chosen": -1.074094295501709, + "logps/rejected": -1.1498383283615112, + "loss": 1.2637, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.148188591003418, + "rewards/margins": 0.15148821473121643, + "rewards/rejected": -2.2996766567230225, + "step": 3840 + }, + { + "epoch": 2.7737752161383287, + "grad_norm": 20.046312375742783, + "learning_rate": 8.613741412168113e-10, + "logits/chosen": -2.027498245239258, + "logits/rejected": -2.026846408843994, + "logps/chosen": -1.0808565616607666, + "logps/rejected": -1.2099745273590088, + "loss": 1.1798, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.161713123321533, + "rewards/margins": 0.2582358717918396, + "rewards/rejected": -2.4199490547180176, + "step": 3850 + }, + { + "epoch": 2.7809798270893373, + "grad_norm": 19.84763607582755, + "learning_rate": 8.076758604914802e-10, + "logits/chosen": -1.957332968711853, + "logits/rejected": -1.9527628421783447, + "logps/chosen": -0.9819733500480652, + "logps/rejected": -1.114538550376892, + "loss": 1.1997, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9639467000961304, + "rewards/margins": 0.2651303708553314, + "rewards/rejected": -2.229077100753784, + "step": 3860 + }, + { + "epoch": 2.7881844380403455, + "grad_norm": 22.904658084781477, + "learning_rate": 7.55678200257856e-10, + "logits/chosen": -1.9844300746917725, + "logits/rejected": -1.9776723384857178, + "logps/chosen": -1.0327974557876587, + "logps/rejected": -1.1763405799865723, + "loss": 1.1751, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0655949115753174, + "rewards/margins": 0.2870861887931824, + "rewards/rejected": -2.3526811599731445, + "step": 3870 + }, + { + "epoch": 2.795389048991354, + "grad_norm": 17.15291998943784, + "learning_rate": 7.053848157367315e-10, + "logits/chosen": -1.9995343685150146, + "logits/rejected": -1.9940645694732666, + "logps/chosen": -1.0412391424179077, + "logps/rejected": -1.1907306909561157, + "loss": 1.1831, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0824782848358154, + "rewards/margins": 0.29898306727409363, + "rewards/rejected": -2.3814613819122314, + "step": 3880 + }, + { + "epoch": 2.802593659942363, + "grad_norm": 15.812884819551362, + "learning_rate": 6.567992423453794e-10, + "logits/chosen": -2.0206310749053955, + "logits/rejected": -2.019430637359619, + "logps/chosen": -0.9630235433578491, + "logps/rejected": -1.0794202089309692, + "loss": 1.2021, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9260470867156982, + "rewards/margins": 0.23279304802417755, + "rewards/rejected": -2.1588404178619385, + "step": 3890 + }, + { + "epoch": 2.8097982708933715, + "grad_norm": 19.54993986750196, + "learning_rate": 6.099248954489794e-10, + "logits/chosen": -1.953507423400879, + "logits/rejected": -1.9511181116104126, + "logps/chosen": -1.0681465864181519, + "logps/rejected": -1.229273796081543, + "loss": 1.1568, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1362931728363037, + "rewards/margins": 0.32225483655929565, + "rewards/rejected": -2.458547592163086, + "step": 3900 + }, + { + "epoch": 2.81700288184438, + "grad_norm": 22.672929732957467, + "learning_rate": 5.647650701205653e-10, + "logits/chosen": -2.026876449584961, + "logits/rejected": -2.018667697906494, + "logps/chosen": -1.1109135150909424, + "logps/rejected": -1.2674014568328857, + "loss": 1.1766, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.2218270301818848, + "rewards/margins": 0.3129761219024658, + "rewards/rejected": -2.5348029136657715, + "step": 3910 + }, + { + "epoch": 2.824207492795389, + "grad_norm": 16.28695288206369, + "learning_rate": 5.213229409093856e-10, + "logits/chosen": -2.0310721397399902, + "logits/rejected": -2.0254709720611572, + "logps/chosen": -1.05387282371521, + "logps/rejected": -1.1856187582015991, + "loss": 1.2009, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.10774564743042, + "rewards/margins": 0.263491690158844, + "rewards/rejected": -2.3712375164031982, + "step": 3920 + }, + { + "epoch": 2.8314121037463975, + "grad_norm": 20.975683447759703, + "learning_rate": 4.796015616177401e-10, + "logits/chosen": -2.0005106925964355, + "logits/rejected": -1.994783639907837, + "logps/chosen": -1.0665435791015625, + "logps/rejected": -1.1785615682601929, + "loss": 1.2143, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.133087158203125, + "rewards/margins": 0.22403590381145477, + "rewards/rejected": -2.3571231365203857, + "step": 3930 + }, + { + "epoch": 2.838616714697406, + "grad_norm": 16.71900486734478, + "learning_rate": 4.3960386508631595e-10, + "logits/chosen": -1.937182068824768, + "logits/rejected": -1.9297128915786743, + "logps/chosen": -0.9666848182678223, + "logps/rejected": -1.0865039825439453, + "loss": 1.2256, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9333696365356445, + "rewards/margins": 0.23963849246501923, + "rewards/rejected": -2.1730079650878906, + "step": 3940 + }, + { + "epoch": 2.845821325648415, + "grad_norm": 35.92105468101964, + "learning_rate": 4.013326629880243e-10, + "logits/chosen": -1.9777085781097412, + "logits/rejected": -1.968076467514038, + "logps/chosen": -1.1062713861465454, + "logps/rejected": -1.2339928150177002, + "loss": 1.2044, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.212542772293091, + "rewards/margins": 0.25544288754463196, + "rewards/rejected": -2.4679856300354004, + "step": 3950 + }, + { + "epoch": 2.8530259365994235, + "grad_norm": 19.697159928360417, + "learning_rate": 3.64790645630339e-10, + "logits/chosen": -1.942647933959961, + "logits/rejected": -1.9421268701553345, + "logps/chosen": -1.0547170639038086, + "logps/rejected": -1.1250708103179932, + "loss": 1.2627, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.109434127807617, + "rewards/margins": 0.1407076120376587, + "rewards/rejected": -2.2501416206359863, + "step": 3960 + }, + { + "epoch": 2.860230547550432, + "grad_norm": 21.359517678769173, + "learning_rate": 3.2998038176619e-10, + "logits/chosen": -1.9776138067245483, + "logits/rejected": -1.9692010879516602, + "logps/chosen": -1.0567617416381836, + "logps/rejected": -1.1803498268127441, + "loss": 1.2064, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.113523483276367, + "rewards/margins": 0.24717645347118378, + "rewards/rejected": -2.3606996536254883, + "step": 3970 + }, + { + "epoch": 2.867435158501441, + "grad_norm": 20.96219918565088, + "learning_rate": 2.969043184133907e-10, + "logits/chosen": -2.046151638031006, + "logits/rejected": -2.044818639755249, + "logps/chosen": -0.9711786508560181, + "logps/rejected": -1.1876708269119263, + "loss": 1.0771, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9423573017120361, + "rewards/margins": 0.43298429250717163, + "rewards/rejected": -2.3753416538238525, + "step": 3980 + }, + { + "epoch": 2.8746397694524495, + "grad_norm": 17.910920824523004, + "learning_rate": 2.6556478068261447e-10, + "logits/chosen": -1.9732444286346436, + "logits/rejected": -1.9708236455917358, + "logps/chosen": -0.9731259346008301, + "logps/rejected": -1.1022650003433228, + "loss": 1.2084, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9462518692016602, + "rewards/margins": 0.2582783102989197, + "rewards/rejected": -2.2045300006866455, + "step": 3990 + }, + { + "epoch": 2.881844380403458, + "grad_norm": 20.734203195977276, + "learning_rate": 2.3596397161395607e-10, + "logits/chosen": -2.044921875, + "logits/rejected": -2.0331034660339355, + "logps/chosen": -1.0672800540924072, + "logps/rejected": -1.2321101427078247, + "loss": 1.1588, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1345601081848145, + "rewards/margins": 0.32965999841690063, + "rewards/rejected": -2.4642202854156494, + "step": 4000 + }, + { + "epoch": 2.889048991354467, + "grad_norm": 25.7016655841959, + "learning_rate": 2.0810397202206399e-10, + "logits/chosen": -1.9520553350448608, + "logits/rejected": -1.9573888778686523, + "logps/chosen": -1.063836693763733, + "logps/rejected": -1.193362832069397, + "loss": 1.1905, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.127673387527466, + "rewards/margins": 0.2590521574020386, + "rewards/rejected": -2.386725664138794, + "step": 4010 + }, + { + "epoch": 2.8962536023054755, + "grad_norm": 22.599478343097772, + "learning_rate": 1.819867403498737e-10, + "logits/chosen": -2.0386157035827637, + "logits/rejected": -2.036118984222412, + "logps/chosen": -1.0686347484588623, + "logps/rejected": -1.2007242441177368, + "loss": 1.2018, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1372694969177246, + "rewards/margins": 0.26417914032936096, + "rewards/rejected": -2.4014484882354736, + "step": 4020 + }, + { + "epoch": 2.903458213256484, + "grad_norm": 21.70269511981427, + "learning_rate": 1.5761411253092382e-10, + "logits/chosen": -1.964998483657837, + "logits/rejected": -1.9548912048339844, + "logps/chosen": -0.9872833490371704, + "logps/rejected": -1.1099205017089844, + "loss": 1.1994, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9745666980743408, + "rewards/margins": 0.24527449905872345, + "rewards/rejected": -2.2198410034179688, + "step": 4030 + }, + { + "epoch": 2.910662824207493, + "grad_norm": 20.259335859045336, + "learning_rate": 1.3498780186031455e-10, + "logits/chosen": -2.010437488555908, + "logits/rejected": -2.0069775581359863, + "logps/chosen": -1.162232756614685, + "logps/rejected": -1.281508207321167, + "loss": 1.2266, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.32446551322937, + "rewards/margins": 0.2385510504245758, + "rewards/rejected": -2.563016414642334, + "step": 4040 + }, + { + "epoch": 2.9178674351585014, + "grad_norm": 15.586122569686582, + "learning_rate": 1.1410939887425141e-10, + "logits/chosen": -1.9975839853286743, + "logits/rejected": -1.9993999004364014, + "logps/chosen": -1.045862078666687, + "logps/rejected": -1.1738336086273193, + "loss": 1.212, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.091724157333374, + "rewards/margins": 0.25594305992126465, + "rewards/rejected": -2.3476672172546387, + "step": 4050 + }, + { + "epoch": 2.92507204610951, + "grad_norm": 18.548109162992386, + "learning_rate": 9.498037123825686e-11, + "logits/chosen": -2.0100975036621094, + "logits/rejected": -2.0066418647766113, + "logps/chosen": -1.0210684537887573, + "logps/rejected": -1.1468260288238525, + "loss": 1.1987, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0421369075775146, + "rewards/margins": 0.2515150308609009, + "rewards/rejected": -2.293652057647705, + "step": 4060 + }, + { + "epoch": 2.9322766570605188, + "grad_norm": 21.54852206068809, + "learning_rate": 7.760206364398614e-11, + "logits/chosen": -2.0660743713378906, + "logits/rejected": -2.063163995742798, + "logps/chosen": -1.0767936706542969, + "logps/rejected": -1.2189406156539917, + "loss": 1.1849, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.1535873413085938, + "rewards/margins": 0.2842939794063568, + "rewards/rejected": -2.4378812313079834, + "step": 4070 + }, + { + "epoch": 2.9394812680115274, + "grad_norm": 21.178294648611878, + "learning_rate": 6.19756977147029e-11, + "logits/chosen": -1.9951989650726318, + "logits/rejected": -1.9918142557144165, + "logps/chosen": -1.02787446975708, + "logps/rejected": -1.2345163822174072, + "loss": 1.1095, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.05574893951416, + "rewards/margins": 0.41328415274620056, + "rewards/rejected": -2.4690327644348145, + "step": 4080 + }, + { + "epoch": 2.946685878962536, + "grad_norm": 20.81149789203122, + "learning_rate": 4.810237191940625e-11, + "logits/chosen": -1.974111557006836, + "logits/rejected": -1.9727462530136108, + "logps/chosen": -1.0376461744308472, + "logps/rejected": -1.1693501472473145, + "loss": 1.217, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.0752923488616943, + "rewards/margins": 0.2634081244468689, + "rewards/rejected": -2.338700294494629, + "step": 4090 + }, + { + "epoch": 2.9538904899135447, + "grad_norm": 20.108876799029805, + "learning_rate": 3.5983061495617476e-11, + "logits/chosen": -2.032691240310669, + "logits/rejected": -2.0327444076538086, + "logps/chosen": -1.1233651638031006, + "logps/rejected": -1.2714459896087646, + "loss": 1.1825, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.246730327606201, + "rewards/margins": 0.2961619794368744, + "rewards/rejected": -2.5428919792175293, + "step": 4100 + }, + { + "epoch": 2.9610951008645534, + "grad_norm": 21.51546113795096, + "learning_rate": 2.5618618380812694e-11, + "logits/chosen": -2.0210888385772705, + "logits/rejected": -2.0106148719787598, + "logps/chosen": -1.002300500869751, + "logps/rejected": -1.166154146194458, + "loss": 1.1714, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.004601001739502, + "rewards/margins": 0.32770711183547974, + "rewards/rejected": -2.332308292388916, + "step": 4110 + }, + { + "epoch": 2.968299711815562, + "grad_norm": 22.762605833671383, + "learning_rate": 1.700977115254576e-11, + "logits/chosen": -1.9953645467758179, + "logits/rejected": -1.9921376705169678, + "logps/chosen": -0.9968992471694946, + "logps/rejected": -1.1455665826797485, + "loss": 1.1674, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9937984943389893, + "rewards/margins": 0.29733437299728394, + "rewards/rejected": -2.291133165359497, + "step": 4120 + }, + { + "epoch": 2.9755043227665707, + "grad_norm": 20.444100868277733, + "learning_rate": 1.0157124977230868e-11, + "logits/chosen": -1.9724935293197632, + "logits/rejected": -1.9707790613174438, + "logps/chosen": -0.9694275856018066, + "logps/rejected": -1.117763876914978, + "loss": 1.1687, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9388551712036133, + "rewards/margins": 0.2966724932193756, + "rewards/rejected": -2.235527753829956, + "step": 4130 + }, + { + "epoch": 2.9827089337175794, + "grad_norm": 21.99215491997881, + "learning_rate": 5.061161567596061e-12, + "logits/chosen": -1.9936805963516235, + "logits/rejected": -1.98941171169281, + "logps/chosen": -1.0571701526641846, + "logps/rejected": -1.1420987844467163, + "loss": 1.2615, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.114340305328369, + "rewards/margins": 0.16985730826854706, + "rewards/rejected": -2.2841975688934326, + "step": 4140 + }, + { + "epoch": 2.989913544668588, + "grad_norm": 20.878532080632212, + "learning_rate": 1.7222391488297406e-12, + "logits/chosen": -2.013947010040283, + "logits/rejected": -2.010057210922241, + "logps/chosen": -1.1070269346237183, + "logps/rejected": -1.254369854927063, + "loss": 1.1756, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2140538692474365, + "rewards/margins": 0.2946857511997223, + "rewards/rejected": -2.508739709854126, + "step": 4150 + }, + { + "epoch": 2.9971181556195967, + "grad_norm": 19.732213045865922, + "learning_rate": 1.4059243338693238e-13, + "logits/chosen": -1.9882125854492188, + "logits/rejected": -1.9810755252838135, + "logps/chosen": -1.059184193611145, + "logps/rejected": -1.1826164722442627, + "loss": 1.1942, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.11836838722229, + "rewards/margins": 0.2468646764755249, + "rewards/rejected": -2.3652329444885254, + "step": 4160 + }, + { + "epoch": 3.0, + "step": 4164, + "total_flos": 0.0, + "train_loss": 1.2025116606473236, + "train_runtime": 6278.9508, + "train_samples_per_second": 10.608, + "train_steps_per_second": 0.663 + } + ], + "logging_steps": 10, + "max_steps": 4164, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}