{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 4.997292206017657, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.0650075227022171, "logits/rejected": 0.14139436185359955, "logps/chosen": -1.715768814086914, "logps/rejected": -1.8891878128051758, "loss": 0.5914, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.715768814086914, "rewards/margins": 0.17341896891593933, "rewards/rejected": -1.8891878128051758, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 10.091215827063065, "learning_rate": 1.7825311942959e-08, "logits/chosen": 0.001026798039674759, "logits/rejected": 0.11991753429174423, "logps/chosen": -1.8027427196502686, "logps/rejected": -1.8455826044082642, "loss": 0.6754, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8027427196502686, "rewards/margins": 0.04283960536122322, "rewards/rejected": -1.8455826044082642, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 11.37900660330619, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.036550603806972504, "logits/rejected": 0.059779636561870575, "logps/chosen": -1.6349197626113892, "logps/rejected": -1.7649036645889282, "loss": 0.6378, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6349197626113892, "rewards/margins": 0.12998399138450623, "rewards/rejected": -1.7649036645889282, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 5.139826320879387, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.051127839833498, "logits/rejected": 0.03362053632736206, "logps/chosen": -1.7249982357025146, "logps/rejected": -1.806664228439331, "loss": 0.6651, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7249982357025146, "rewards/margins": 0.08166613429784775, "rewards/rejected": -1.806664228439331, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 17.01737462233637, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.054554302245378494, "logits/rejected": 0.028359418734908104, "logps/chosen": -1.8694196939468384, "logps/rejected": -1.7773698568344116, "loss": 0.7741, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -1.8694196939468384, "rewards/margins": -0.0920499712228775, "rewards/rejected": -1.7773698568344116, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 8.865882116136179, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.09447486698627472, "logits/rejected": -0.0034280805848538876, "logps/chosen": -1.90848708152771, "logps/rejected": -1.8321399688720703, "loss": 0.7341, "rewards/accuracies": 0.4375, "rewards/chosen": -1.90848708152771, "rewards/margins": -0.07634714990854263, "rewards/rejected": -1.8321399688720703, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 10.616869291378759, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.06270752102136612, "logits/rejected": 0.09645139425992966, "logps/chosen": -1.846631646156311, "logps/rejected": -1.9980738162994385, "loss": 0.7007, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.846631646156311, "rewards/margins": 0.15144218504428864, "rewards/rejected": -1.9980738162994385, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 9.524474173489631, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.04470617324113846, "logits/rejected": 0.2189619541168213, "logps/chosen": -1.8801885843276978, "logps/rejected": -1.7427860498428345, "loss": 0.744, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.8801885843276978, "rewards/margins": -0.1374024599790573, "rewards/rejected": -1.7427860498428345, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 15.734880400638977, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.025678545236587524, "logits/rejected": 0.22277244925498962, "logps/chosen": -1.8355839252471924, "logps/rejected": -1.8700122833251953, "loss": 0.7125, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8355839252471924, "rewards/margins": 0.034428536891937256, "rewards/rejected": -1.8700122833251953, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 12.420776663891322, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.05149418115615845, "logits/rejected": 0.0976494774222374, "logps/chosen": -1.8951835632324219, "logps/rejected": -1.7750908136367798, "loss": 0.7646, "rewards/accuracies": 0.5, "rewards/chosen": -1.8951835632324219, "rewards/margins": -0.12009288370609283, "rewards/rejected": -1.7750908136367798, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 7.939859620000488, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.12006314843893051, "logits/rejected": 0.09383951127529144, "logps/chosen": -1.8273239135742188, "logps/rejected": -1.8619701862335205, "loss": 0.735, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.8273239135742188, "rewards/margins": 0.03464624285697937, "rewards/rejected": -1.8619701862335205, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 7.543470497849205, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.0898396223783493, "logits/rejected": 0.09876315295696259, "logps/chosen": -1.782485008239746, "logps/rejected": -1.8871591091156006, "loss": 0.6553, "rewards/accuracies": 0.53125, "rewards/chosen": -1.782485008239746, "rewards/margins": 0.10467412322759628, "rewards/rejected": -1.8871591091156006, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 6.459991241643717, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.014550602063536644, "logits/rejected": 0.13192692399024963, "logps/chosen": -1.6315174102783203, "logps/rejected": -1.7604926824569702, "loss": 0.6044, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6315174102783203, "rewards/margins": 0.1289752572774887, "rewards/rejected": -1.7604926824569702, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 12.32012993602907, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.07437841594219208, "logits/rejected": 0.07500018924474716, "logps/chosen": -1.7604013681411743, "logps/rejected": -1.805875539779663, "loss": 0.7147, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7604013681411743, "rewards/margins": 0.045474208891391754, "rewards/rejected": -1.805875539779663, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 12.9808863109987, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.062056612223386765, "logits/rejected": 0.11871856451034546, "logps/chosen": -1.7678260803222656, "logps/rejected": -2.027315616607666, "loss": 0.6123, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7678260803222656, "rewards/margins": 0.25948935747146606, "rewards/rejected": -2.027315616607666, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 8.768288583462276, "learning_rate": 1.42602495543672e-07, "logits/chosen": 0.024402568116784096, "logits/rejected": 0.12938539683818817, "logps/chosen": -1.698667287826538, "logps/rejected": -1.7314586639404297, "loss": 0.6669, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.698667287826538, "rewards/margins": 0.03279133886098862, "rewards/rejected": -1.7314586639404297, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 5.533795080195625, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.14096300303936005, "logits/rejected": 0.10672752559185028, "logps/chosen": -1.7641658782958984, "logps/rejected": -1.9355589151382446, "loss": 0.6348, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7641658782958984, "rewards/margins": 0.17139312624931335, "rewards/rejected": -1.9355589151382446, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 15.323649915405372, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.08658097684383392, "logits/rejected": 0.0475747287273407, "logps/chosen": -1.715468168258667, "logps/rejected": -1.7499656677246094, "loss": 0.6927, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.715468168258667, "rewards/margins": 0.034497782588005066, "rewards/rejected": -1.7499656677246094, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 6.373423419885825, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.06424698233604431, "logits/rejected": 0.08378218114376068, "logps/chosen": -1.758479356765747, "logps/rejected": -1.8787391185760498, "loss": 0.6609, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.758479356765747, "rewards/margins": 0.12025998532772064, "rewards/rejected": -1.8787391185760498, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 4.628128705810752, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.054062556475400925, "logits/rejected": 0.006810496095567942, "logps/chosen": -1.6636714935302734, "logps/rejected": -1.7686607837677002, "loss": 0.6278, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6636714935302734, "rewards/margins": 0.10498923063278198, "rewards/rejected": -1.7686607837677002, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 10.76571656590462, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.029249349609017372, "logits/rejected": 0.05458509176969528, "logps/chosen": -1.6104685068130493, "logps/rejected": -1.7785825729370117, "loss": 0.6008, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6104685068130493, "rewards/margins": 0.16811402142047882, "rewards/rejected": -1.7785825729370117, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 6.2104305681148695, "learning_rate": 1.96078431372549e-07, "logits/chosen": -0.00672105560079217, "logits/rejected": 0.08606382459402084, "logps/chosen": -1.6111024618148804, "logps/rejected": -1.6694419384002686, "loss": 0.6442, "rewards/accuracies": 0.5, "rewards/chosen": -1.6111024618148804, "rewards/margins": 0.058339525014162064, "rewards/rejected": -1.6694419384002686, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 10.887697988594628, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.006419995333999395, "logits/rejected": 0.21102726459503174, "logps/chosen": -1.5864602327346802, "logps/rejected": -1.8474352359771729, "loss": 0.567, "rewards/accuracies": 0.625, "rewards/chosen": -1.5864602327346802, "rewards/margins": 0.26097503304481506, "rewards/rejected": -1.8474352359771729, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 6.253295735653898, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.09053299576044083, "logits/rejected": 0.07829716056585312, "logps/chosen": -1.6399381160736084, "logps/rejected": -1.7526248693466187, "loss": 0.6147, "rewards/accuracies": 0.5, "rewards/chosen": -1.6399381160736084, "rewards/margins": 0.11268671602010727, "rewards/rejected": -1.7526248693466187, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 4.83956164224399, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.07525237649679184, "logits/rejected": 0.05489847809076309, "logps/chosen": -1.5773794651031494, "logps/rejected": -1.5425481796264648, "loss": 0.6576, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.5773794651031494, "rewards/margins": -0.03483119606971741, "rewards/rejected": -1.5425481796264648, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 9.057747459706842, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.009263654239475727, "logits/rejected": 0.14098505675792694, "logps/chosen": -1.6142938137054443, "logps/rejected": -1.733374834060669, "loss": 0.5787, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.6142938137054443, "rewards/margins": 0.11908096075057983, "rewards/rejected": -1.733374834060669, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 16.179075190028655, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.057406358420848846, "logits/rejected": 0.05737648159265518, "logps/chosen": -1.6625877618789673, "logps/rejected": -1.6941664218902588, "loss": 0.6564, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.6625877618789673, "rewards/margins": 0.0315786749124527, "rewards/rejected": -1.6941664218902588, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 7.771597679709212, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.04857847839593887, "logits/rejected": 0.11689283698797226, "logps/chosen": -1.6241111755371094, "logps/rejected": -1.7432191371917725, "loss": 0.6057, "rewards/accuracies": 0.5, "rewards/chosen": -1.6241111755371094, "rewards/margins": 0.11910827457904816, "rewards/rejected": -1.7432191371917725, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 9.955536383876971, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.0398310124874115, "logits/rejected": 0.11099239438772202, "logps/chosen": -1.5300779342651367, "logps/rejected": -1.637990951538086, "loss": 0.5925, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.5300779342651367, "rewards/margins": 0.10791321098804474, "rewards/rejected": -1.637990951538086, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 12.71097851691078, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.05772867053747177, "logits/rejected": 0.10352079570293427, "logps/chosen": -1.4744627475738525, "logps/rejected": -1.4740225076675415, "loss": 0.6277, "rewards/accuracies": 0.5, "rewards/chosen": -1.4744627475738525, "rewards/margins": -0.00044009386328980327, "rewards/rejected": -1.4740225076675415, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 8.49105304418165, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.1042834147810936, "logits/rejected": -0.05865399166941643, "logps/chosen": -1.4623711109161377, "logps/rejected": -1.565828561782837, "loss": 0.5773, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4623711109161377, "rewards/margins": 0.10345745086669922, "rewards/rejected": -1.565828561782837, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 8.948745877830188, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.1614638864994049, "logits/rejected": -0.02310488559305668, "logps/chosen": -1.5294479131698608, "logps/rejected": -1.5084502696990967, "loss": 0.6479, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.5294479131698608, "rewards/margins": -0.020997820422053337, "rewards/rejected": -1.5084502696990967, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 7.513050225098715, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.09502000361680984, "logits/rejected": 0.06740803271532059, "logps/chosen": -1.3736889362335205, "logps/rejected": -1.4614773988723755, "loss": 0.582, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3736889362335205, "rewards/margins": 0.08778859674930573, "rewards/rejected": -1.4614773988723755, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 14.744283364253823, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.12818047404289246, "logits/rejected": -0.08105675876140594, "logps/chosen": -1.471206545829773, "logps/rejected": -1.5415713787078857, "loss": 0.6002, "rewards/accuracies": 0.5, "rewards/chosen": -1.471206545829773, "rewards/margins": 0.07036472111940384, "rewards/rejected": -1.5415713787078857, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 6.49805330539499, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.012623664923012257, "logits/rejected": 0.00812039989978075, "logps/chosen": -1.3477891683578491, "logps/rejected": -1.4434601068496704, "loss": 0.5617, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3477891683578491, "rewards/margins": 0.09567093849182129, "rewards/rejected": -1.4434601068496704, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 6.049899887898944, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.07556477934122086, "logits/rejected": -0.08180755376815796, "logps/chosen": -1.3605897426605225, "logps/rejected": -1.5618489980697632, "loss": 0.5631, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3605897426605225, "rewards/margins": 0.20125916600227356, "rewards/rejected": -1.5618489980697632, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 6.791246778200485, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.20081105828285217, "logits/rejected": -0.1206185594201088, "logps/chosen": -1.3488131761550903, "logps/rejected": -1.3990856409072876, "loss": 0.587, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3488131761550903, "rewards/margins": 0.05027235299348831, "rewards/rejected": -1.3990856409072876, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 6.200598909581532, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.12239708006381989, "logits/rejected": -0.014081318862736225, "logps/chosen": -1.2821367979049683, "logps/rejected": -1.4122705459594727, "loss": 0.5364, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2821367979049683, "rewards/margins": 0.13013386726379395, "rewards/rejected": -1.4122705459594727, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 4.529892748275824, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.038916219025850296, "logits/rejected": 0.10655897855758667, "logps/chosen": -1.250950813293457, "logps/rejected": -1.4165217876434326, "loss": 0.5153, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.250950813293457, "rewards/margins": 0.16557088494300842, "rewards/rejected": -1.4165217876434326, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 15.405680226869018, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.13693484663963318, "logits/rejected": -0.008512438274919987, "logps/chosen": -1.373589277267456, "logps/rejected": -1.4141457080841064, "loss": 0.5795, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.373589277267456, "rewards/margins": 0.040556520223617554, "rewards/rejected": -1.4141457080841064, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 10.754004507355, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.10695330053567886, "logits/rejected": 0.02462707832455635, "logps/chosen": -1.2877451181411743, "logps/rejected": -1.3560765981674194, "loss": 0.5621, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2877451181411743, "rewards/margins": 0.0683315247297287, "rewards/rejected": -1.3560765981674194, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 7.07016358761246, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.17576855421066284, "logits/rejected": 0.0044981567189097404, "logps/chosen": -1.3614462614059448, "logps/rejected": -1.4754022359848022, "loss": 0.5707, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3614462614059448, "rewards/margins": 0.1139560341835022, "rewards/rejected": -1.4754022359848022, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 5.111042263756739, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.23045091331005096, "logits/rejected": 0.00015527979121543467, "logps/chosen": -1.3824539184570312, "logps/rejected": -1.4372602701187134, "loss": 0.5673, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3824539184570312, "rewards/margins": 0.054806292057037354, "rewards/rejected": -1.4372602701187134, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 13.021513115122358, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.007768121547996998, "logits/rejected": 0.09841375052928925, "logps/chosen": -1.307355284690857, "logps/rejected": -1.4525563716888428, "loss": 0.5472, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.307355284690857, "rewards/margins": 0.14520104229450226, "rewards/rejected": -1.4525563716888428, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 4.58593198948831, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.14940662682056427, "logits/rejected": 0.004696237854659557, "logps/chosen": -1.310438871383667, "logps/rejected": -1.4395650625228882, "loss": 0.5343, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.310438871383667, "rewards/margins": 0.1291261613368988, "rewards/rejected": -1.4395650625228882, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 4.766481284957794, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.06688622385263443, "logits/rejected": 0.003094717860221863, "logps/chosen": -1.3155782222747803, "logps/rejected": -1.4722493886947632, "loss": 0.5311, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3155782222747803, "rewards/margins": 0.15667104721069336, "rewards/rejected": -1.4722493886947632, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 6.035967069114767, "learning_rate": 4.188948306595365e-07, "logits/chosen": -0.03757713362574577, "logits/rejected": 0.08385910838842392, "logps/chosen": -1.2831577062606812, "logps/rejected": -1.4498207569122314, "loss": 0.52, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2831577062606812, "rewards/margins": 0.1666627675294876, "rewards/rejected": -1.4498207569122314, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 4.366636780094945, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.055832862854003906, "logits/rejected": 0.0617792010307312, "logps/chosen": -1.2991082668304443, "logps/rejected": -1.4824906587600708, "loss": 0.5211, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2991082668304443, "rewards/margins": 0.18338236212730408, "rewards/rejected": -1.4824906587600708, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 6.335140394106502, "learning_rate": 4.3672014260249554e-07, "logits/chosen": -0.014485550113022327, "logits/rejected": 0.09521909058094025, "logps/chosen": -1.417596459388733, "logps/rejected": -1.4422318935394287, "loss": 0.5984, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.417596459388733, "rewards/margins": 0.024635523557662964, "rewards/rejected": -1.4422318935394287, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 8.125982697908114, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.10700766742229462, "logits/rejected": 0.04556097090244293, "logps/chosen": -1.2985507249832153, "logps/rejected": -1.356838583946228, "loss": 0.576, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2985507249832153, "rewards/margins": 0.05828779935836792, "rewards/rejected": -1.356838583946228, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 5.930552617408809, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.07873818278312683, "logits/rejected": 0.04857013747096062, "logps/chosen": -1.2638494968414307, "logps/rejected": -1.3712407350540161, "loss": 0.5329, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2638494968414307, "rewards/margins": 0.10739123821258545, "rewards/rejected": -1.3712407350540161, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 4.985684638195133, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.2606425881385803, "logits/rejected": -0.16278347373008728, "logps/chosen": -1.3510710000991821, "logps/rejected": -1.5093480348587036, "loss": 0.5293, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3510710000991821, "rewards/margins": 0.15827706456184387, "rewards/rejected": -1.5093480348587036, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 5.540166205478578, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.14383761584758759, "logits/rejected": -0.06741148233413696, "logps/chosen": -1.335313081741333, "logps/rejected": -1.5062923431396484, "loss": 0.5446, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.335313081741333, "rewards/margins": 0.1709790676832199, "rewards/rejected": -1.5062923431396484, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 3.8342937333513953, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.10686127096414566, "logits/rejected": 0.01976863667368889, "logps/chosen": -1.3166064023971558, "logps/rejected": -1.4147472381591797, "loss": 0.5451, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3166064023971558, "rewards/margins": 0.09814073890447617, "rewards/rejected": -1.4147472381591797, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 4.92197842948576, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.07850853353738785, "logits/rejected": 0.011839762330055237, "logps/chosen": -1.2637743949890137, "logps/rejected": -1.4215466976165771, "loss": 0.5162, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2637743949890137, "rewards/margins": 0.15777233242988586, "rewards/rejected": -1.4215466976165771, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 5.1499292682560975, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.1415228545665741, "logits/rejected": 0.0021017298568040133, "logps/chosen": -1.316033959388733, "logps/rejected": -1.4149783849716187, "loss": 0.5541, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.316033959388733, "rewards/margins": 0.09894455969333649, "rewards/rejected": -1.4149783849716187, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 5.857510095120647, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.09641702473163605, "logits/rejected": 0.036232173442840576, "logps/chosen": -1.3443809747695923, "logps/rejected": -1.4279015064239502, "loss": 0.5688, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3443809747695923, "rewards/margins": 0.08352051675319672, "rewards/rejected": -1.4279015064239502, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 5.759387156933552, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.13510069251060486, "logits/rejected": 0.15287984907627106, "logps/chosen": -1.3645200729370117, "logps/rejected": -1.484561562538147, "loss": 0.5484, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3645200729370117, "rewards/margins": 0.12004146724939346, "rewards/rejected": -1.484561562538147, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 5.7920127530327505, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.08213040977716446, "logits/rejected": -0.02538430318236351, "logps/chosen": -1.2643084526062012, "logps/rejected": -1.4034979343414307, "loss": 0.5237, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2643084526062012, "rewards/margins": 0.13918954133987427, "rewards/rejected": -1.4034979343414307, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 5.8063292665534645, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.095992311835289, "logits/rejected": 0.06030214577913284, "logps/chosen": -1.3031138181686401, "logps/rejected": -1.379305124282837, "loss": 0.5498, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3031138181686401, "rewards/margins": 0.07619120180606842, "rewards/rejected": -1.379305124282837, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 4.236484782244229, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.07263144105672836, "logits/rejected": -0.005951849278062582, "logps/chosen": -1.407050371170044, "logps/rejected": -1.4133589267730713, "loss": 0.6157, "rewards/accuracies": 0.5, "rewards/chosen": -1.407050371170044, "rewards/margins": 0.006308439187705517, "rewards/rejected": -1.4133589267730713, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 6.178703277848546, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.25677943229675293, "logits/rejected": -0.17639651894569397, "logps/chosen": -1.3696058988571167, "logps/rejected": -1.4542629718780518, "loss": 0.5839, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3696058988571167, "rewards/margins": 0.08465705066919327, "rewards/rejected": -1.4542629718780518, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 6.53912026335561, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.046952664852142334, "logits/rejected": 0.10501468181610107, "logps/chosen": -1.3593002557754517, "logps/rejected": -1.5090728998184204, "loss": 0.5704, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3593002557754517, "rewards/margins": 0.14977267384529114, "rewards/rejected": -1.5090728998184204, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 4.2758260615511166, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.10539977252483368, "logits/rejected": 0.017837602645158768, "logps/chosen": -1.317861795425415, "logps/rejected": -1.3703514337539673, "loss": 0.5588, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.317861795425415, "rewards/margins": 0.05248980596661568, "rewards/rejected": -1.3703514337539673, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 5.082868847961363, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.12865692377090454, "logits/rejected": -0.01492916326969862, "logps/chosen": -1.3153475522994995, "logps/rejected": -1.5646100044250488, "loss": 0.5188, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3153475522994995, "rewards/margins": 0.24926233291625977, "rewards/rejected": -1.5646100044250488, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 8.986749440603885, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.06766274571418762, "logits/rejected": 0.07352413982152939, "logps/chosen": -1.3340603113174438, "logps/rejected": -1.5065033435821533, "loss": 0.537, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3340603113174438, "rewards/margins": 0.17244306206703186, "rewards/rejected": -1.5065033435821533, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 8.28125102869866, "learning_rate": 5.971479500891266e-07, "logits/chosen": -0.013677099719643593, "logits/rejected": 0.08276298642158508, "logps/chosen": -1.3442156314849854, "logps/rejected": -1.379740595817566, "loss": 0.5674, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3442156314849854, "rewards/margins": 0.03552498295903206, "rewards/rejected": -1.379740595817566, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 7.478483752039282, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.08246646076440811, "logits/rejected": 0.051125071942806244, "logps/chosen": -1.4028606414794922, "logps/rejected": -1.48906672000885, "loss": 0.5858, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4028606414794922, "rewards/margins": 0.08620607107877731, "rewards/rejected": -1.48906672000885, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 9.575323995250752, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.011692523956298828, "logits/rejected": 0.039510972797870636, "logps/chosen": -1.3106107711791992, "logps/rejected": -1.455093264579773, "loss": 0.5361, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3106107711791992, "rewards/margins": 0.14448246359825134, "rewards/rejected": -1.455093264579773, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 5.629607174633591, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.0400688461959362, "logits/rejected": 0.0447857566177845, "logps/chosen": -1.2890903949737549, "logps/rejected": -1.4140267372131348, "loss": 0.5445, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2890903949737549, "rewards/margins": 0.12493647634983063, "rewards/rejected": -1.4140267372131348, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 5.6597045326758035, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.12457478046417236, "logits/rejected": 0.08468835055828094, "logps/chosen": -1.392717957496643, "logps/rejected": -1.4253339767456055, "loss": 0.5938, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.392717957496643, "rewards/margins": 0.032616131007671356, "rewards/rejected": -1.4253339767456055, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 5.712289395885944, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.11917763948440552, "logits/rejected": -0.04837574064731598, "logps/chosen": -1.3209375143051147, "logps/rejected": -1.4500617980957031, "loss": 0.537, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3209375143051147, "rewards/margins": 0.12912428379058838, "rewards/rejected": -1.4500617980957031, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 7.4203621152345445, "learning_rate": 6.506238859180035e-07, "logits/chosen": -0.04344567283987999, "logits/rejected": 0.03031911887228489, "logps/chosen": -1.3005120754241943, "logps/rejected": -1.405129075050354, "loss": 0.5544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3005120754241943, "rewards/margins": 0.10461703687906265, "rewards/rejected": -1.405129075050354, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 4.569538195063222, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.04294773191213608, "logits/rejected": 0.04473670572042465, "logps/chosen": -1.2950992584228516, "logps/rejected": -1.3428256511688232, "loss": 0.5661, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2950992584228516, "rewards/margins": 0.04772632196545601, "rewards/rejected": -1.3428256511688232, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 5.976203868026363, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.09484561532735825, "logits/rejected": 0.05638428404927254, "logps/chosen": -1.274760365486145, "logps/rejected": -1.4223711490631104, "loss": 0.5326, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.274760365486145, "rewards/margins": 0.14761099219322205, "rewards/rejected": -1.4223711490631104, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 4.7498159537535924, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.06276343762874603, "logits/rejected": 0.015439057722687721, "logps/chosen": -1.290292739868164, "logps/rejected": -1.456376552581787, "loss": 0.5232, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.290292739868164, "rewards/margins": 0.16608384251594543, "rewards/rejected": -1.456376552581787, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 4.063803980310148, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.01725992001593113, "logits/rejected": 0.056481532752513885, "logps/chosen": -1.3906378746032715, "logps/rejected": -1.3861762285232544, "loss": 0.6176, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3906378746032715, "rewards/margins": -0.0044616335071623325, "rewards/rejected": -1.3861762285232544, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 7.7331092443834955, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.022324861958622932, "logits/rejected": 0.17924316227436066, "logps/chosen": -1.3844705820083618, "logps/rejected": -1.4491915702819824, "loss": 0.5871, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.3844705820083618, "rewards/margins": 0.0647209957242012, "rewards/rejected": -1.4491915702819824, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 5.8234894666247, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.05575888231396675, "logits/rejected": 0.1007302775979042, "logps/chosen": -1.3317315578460693, "logps/rejected": -1.349384069442749, "loss": 0.5705, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3317315578460693, "rewards/margins": 0.017652403563261032, "rewards/rejected": -1.349384069442749, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 4.447903960250947, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.046619199216365814, "logits/rejected": 0.1348242461681366, "logps/chosen": -1.3104583024978638, "logps/rejected": -1.4140068292617798, "loss": 0.547, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3104583024978638, "rewards/margins": 0.10354839265346527, "rewards/rejected": -1.4140068292617798, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.2673715054988861, "eval_logits/rejected": 0.3541051149368286, "eval_logps/chosen": -1.3437494039535522, "eval_logps/rejected": -1.4811930656433105, "eval_loss": 0.5455443859100342, "eval_rewards/accuracies": 0.5578634738922119, "eval_rewards/chosen": -1.3437494039535522, "eval_rewards/margins": 0.13744349777698517, "eval_rewards/rejected": -1.4811930656433105, "eval_runtime": 41.3957, "eval_samples_per_second": 32.491, "eval_steps_per_second": 8.141, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 7.329405113300886, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.02488850988447666, "logits/rejected": 0.06474941223859787, "logps/chosen": -1.3190172910690308, "logps/rejected": -1.390777587890625, "loss": 0.5592, "rewards/accuracies": 0.5, "rewards/chosen": -1.3190172910690308, "rewards/margins": 0.07176025211811066, "rewards/rejected": -1.390777587890625, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 4.8599078860714435, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.012596473097801208, "logits/rejected": 0.13915999233722687, "logps/chosen": -1.2880048751831055, "logps/rejected": -1.3827186822891235, "loss": 0.5491, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2880048751831055, "rewards/margins": 0.09471400082111359, "rewards/rejected": -1.3827186822891235, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 4.280161921094163, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.007571890950202942, "logits/rejected": 0.0262184739112854, "logps/chosen": -1.2837762832641602, "logps/rejected": -1.4591214656829834, "loss": 0.5332, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2837762832641602, "rewards/margins": 0.17534509301185608, "rewards/rejected": -1.4591214656829834, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 5.068325891189537, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.03773319721221924, "logits/rejected": 0.14650122821331024, "logps/chosen": -1.2738547325134277, "logps/rejected": -1.3744902610778809, "loss": 0.5489, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2738547325134277, "rewards/margins": 0.10063556581735611, "rewards/rejected": -1.3744902610778809, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 4.961837287805456, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.06659626215696335, "logits/rejected": 0.1286463439464569, "logps/chosen": -1.3133299350738525, "logps/rejected": -1.4799951314926147, "loss": 0.524, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3133299350738525, "rewards/margins": 0.1666652262210846, "rewards/rejected": -1.4799951314926147, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 5.303448214369556, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.10566394031047821, "logits/rejected": 0.08219563215970993, "logps/chosen": -1.3465242385864258, "logps/rejected": -1.48542320728302, "loss": 0.5355, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3465242385864258, "rewards/margins": 0.1388988494873047, "rewards/rejected": -1.48542320728302, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 6.275833372260895, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.02558537758886814, "logits/rejected": 0.0613640621304512, "logps/chosen": -1.221283197402954, "logps/rejected": -1.3607943058013916, "loss": 0.5139, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.221283197402954, "rewards/margins": 0.13951103389263153, "rewards/rejected": -1.3607943058013916, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 4.384721242164579, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.02669503726065159, "logits/rejected": 0.06073420122265816, "logps/chosen": -1.2970921993255615, "logps/rejected": -1.3958443403244019, "loss": 0.5413, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2970921993255615, "rewards/margins": 0.09875227510929108, "rewards/rejected": -1.3958443403244019, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 5.4322389816266705, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.09648775309324265, "logits/rejected": 0.006433696951717138, "logps/chosen": -1.3273687362670898, "logps/rejected": -1.465628743171692, "loss": 0.5558, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3273687362670898, "rewards/margins": 0.1382599174976349, "rewards/rejected": -1.465628743171692, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 8.125005839395463, "learning_rate": 8.02139037433155e-07, "logits/chosen": -0.014400298707187176, "logits/rejected": 0.10641209781169891, "logps/chosen": -1.3204365968704224, "logps/rejected": -1.4638994932174683, "loss": 0.5179, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3204365968704224, "rewards/margins": 0.14346301555633545, "rewards/rejected": -1.4638994932174683, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 6.066865393104314, "learning_rate": 8.110516934046346e-07, "logits/chosen": -0.021703166887164116, "logits/rejected": 0.06482100486755371, "logps/chosen": -1.2544450759887695, "logps/rejected": -1.4514362812042236, "loss": 0.5173, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2544450759887695, "rewards/margins": 0.19699141383171082, "rewards/rejected": -1.4514362812042236, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 5.99486424606713, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.14163680374622345, "logits/rejected": -0.022943483665585518, "logps/chosen": -1.3841049671173096, "logps/rejected": -1.445151448249817, "loss": 0.5795, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3841049671173096, "rewards/margins": 0.06104659289121628, "rewards/rejected": -1.445151448249817, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 6.120565542656682, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.099403515458107, "logits/rejected": 0.11395237594842911, "logps/chosen": -1.29450523853302, "logps/rejected": -1.4713361263275146, "loss": 0.5251, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.29450523853302, "rewards/margins": 0.17683091759681702, "rewards/rejected": -1.4713361263275146, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 6.051401721675843, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.12253554165363312, "logits/rejected": 0.07182620465755463, "logps/chosen": -1.248956322669983, "logps/rejected": -1.4495285749435425, "loss": 0.5109, "rewards/accuracies": 0.59375, "rewards/chosen": -1.248956322669983, "rewards/margins": 0.200572207570076, "rewards/rejected": -1.4495285749435425, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 6.135072891224238, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.06563135236501694, "logits/rejected": 0.07042310386896133, "logps/chosen": -1.3125526905059814, "logps/rejected": -1.5333964824676514, "loss": 0.514, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3125526905059814, "rewards/margins": 0.22084379196166992, "rewards/rejected": -1.5333964824676514, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 5.415752081894695, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.03428914397954941, "logits/rejected": 0.16688553988933563, "logps/chosen": -1.2855195999145508, "logps/rejected": -1.3598757982254028, "loss": 0.5372, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2855195999145508, "rewards/margins": 0.07435639202594757, "rewards/rejected": -1.3598757982254028, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 7.446849535942868, "learning_rate": 8.645276292335115e-07, "logits/chosen": -0.009201598353683949, "logits/rejected": 0.031318169087171555, "logps/chosen": -1.378387212753296, "logps/rejected": -1.4738420248031616, "loss": 0.5636, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.378387212753296, "rewards/margins": 0.09545484185218811, "rewards/rejected": -1.4738420248031616, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 5.039160728151553, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.006572410464286804, "logits/rejected": 0.07648377120494843, "logps/chosen": -1.3202940225601196, "logps/rejected": -1.4016218185424805, "loss": 0.5709, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3202940225601196, "rewards/margins": 0.08132799714803696, "rewards/rejected": -1.4016218185424805, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 6.6754668971239335, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.03051861748099327, "logits/rejected": -0.01233578659594059, "logps/chosen": -1.3303617238998413, "logps/rejected": -1.4489643573760986, "loss": 0.5473, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3303617238998413, "rewards/margins": 0.11860283464193344, "rewards/rejected": -1.4489643573760986, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 5.287490837064926, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.04981597140431404, "logits/rejected": 0.047952570021152496, "logps/chosen": -1.2380788326263428, "logps/rejected": -1.3998790979385376, "loss": 0.5289, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2380788326263428, "rewards/margins": 0.16180023550987244, "rewards/rejected": -1.3998790979385376, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 8.725492185758931, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.060777224600315094, "logits/rejected": 0.0757264643907547, "logps/chosen": -1.3563997745513916, "logps/rejected": -1.4081957340240479, "loss": 0.5757, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3563997745513916, "rewards/margins": 0.051796041429042816, "rewards/rejected": -1.4081957340240479, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 6.225287947468693, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.08399702608585358, "logits/rejected": 0.14649052917957306, "logps/chosen": -1.308882474899292, "logps/rejected": -1.4875096082687378, "loss": 0.5232, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.308882474899292, "rewards/margins": 0.17862707376480103, "rewards/rejected": -1.4875096082687378, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 4.114290898285474, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.049279578030109406, "logits/rejected": 0.14412793517112732, "logps/chosen": -1.2506322860717773, "logps/rejected": -1.4143574237823486, "loss": 0.5233, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2506322860717773, "rewards/margins": 0.16372530162334442, "rewards/rejected": -1.4143574237823486, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 4.267106553630775, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.07548630982637405, "logits/rejected": 0.0632234588265419, "logps/chosen": -1.2995474338531494, "logps/rejected": -1.4129770994186401, "loss": 0.5443, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2995474338531494, "rewards/margins": 0.11342976242303848, "rewards/rejected": -1.4129770994186401, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 10.440498420854608, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.11123859882354736, "logits/rejected": 0.18095359206199646, "logps/chosen": -1.2776366472244263, "logps/rejected": -1.4725145101547241, "loss": 0.5287, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2776366472244263, "rewards/margins": 0.1948779672384262, "rewards/rejected": -1.4725145101547241, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 4.349061556733843, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.07784408330917358, "logits/rejected": 0.16160567104816437, "logps/chosen": -1.2566584348678589, "logps/rejected": -1.3696025609970093, "loss": 0.5369, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2566584348678589, "rewards/margins": 0.11294414103031158, "rewards/rejected": -1.3696025609970093, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 4.490857459702983, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.07565474510192871, "logits/rejected": 0.18794603645801544, "logps/chosen": -1.2686631679534912, "logps/rejected": -1.3478660583496094, "loss": 0.5407, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2686631679534912, "rewards/margins": 0.07920284569263458, "rewards/rejected": -1.3478660583496094, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 4.418199849273589, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.015441028401255608, "logits/rejected": 0.083626389503479, "logps/chosen": -1.3971021175384521, "logps/rejected": -1.4681593179702759, "loss": 0.571, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3971021175384521, "rewards/margins": 0.0710570439696312, "rewards/rejected": -1.4681593179702759, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 4.06924242086372, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.08940805494785309, "logits/rejected": 0.10712240636348724, "logps/chosen": -1.301166296005249, "logps/rejected": -1.4282652139663696, "loss": 0.5221, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.301166296005249, "rewards/margins": 0.1270989179611206, "rewards/rejected": -1.4282652139663696, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 4.3132264379406315, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.05499836057424545, "logits/rejected": 0.12131629139184952, "logps/chosen": -1.2971409559249878, "logps/rejected": -1.4440648555755615, "loss": 0.5258, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2971409559249878, "rewards/margins": 0.1469239890575409, "rewards/rejected": -1.4440648555755615, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 6.602284178432305, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.061866700649261475, "logits/rejected": 0.06235383078455925, "logps/chosen": -1.3719993829727173, "logps/rejected": -1.4457218647003174, "loss": 0.5812, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3719993829727173, "rewards/margins": 0.0737224668264389, "rewards/rejected": -1.4457218647003174, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 5.71838028235444, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.04728525131940842, "logits/rejected": 0.06046200543642044, "logps/chosen": -1.2291576862335205, "logps/rejected": -1.378359079360962, "loss": 0.5103, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2291576862335205, "rewards/margins": 0.14920127391815186, "rewards/rejected": -1.378359079360962, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 3.869283112792342, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.04899100586771965, "logits/rejected": 0.1006133183836937, "logps/chosen": -1.3458400964736938, "logps/rejected": -1.4805446863174438, "loss": 0.5451, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3458400964736938, "rewards/margins": 0.13470450043678284, "rewards/rejected": -1.4805446863174438, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 6.516840407676028, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.05578559637069702, "logits/rejected": 0.16043071448802948, "logps/chosen": -1.3167650699615479, "logps/rejected": -1.4205187559127808, "loss": 0.5497, "rewards/accuracies": 0.5, "rewards/chosen": -1.3167650699615479, "rewards/margins": 0.10375366359949112, "rewards/rejected": -1.4205187559127808, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 4.724964060989278, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.003188273636624217, "logits/rejected": 0.05413530394434929, "logps/chosen": -1.2521110773086548, "logps/rejected": -1.3818458318710327, "loss": 0.5189, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2521110773086548, "rewards/margins": 0.1297347992658615, "rewards/rejected": -1.3818458318710327, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 5.331034156049935, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.03645291179418564, "logits/rejected": 0.11539416015148163, "logps/chosen": -1.259407877922058, "logps/rejected": -1.4439879655838013, "loss": 0.5044, "rewards/accuracies": 0.59375, "rewards/chosen": -1.259407877922058, "rewards/margins": 0.18458007276058197, "rewards/rejected": -1.4439879655838013, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 5.8770136833440185, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.06189862638711929, "logits/rejected": 0.03979547694325447, "logps/chosen": -1.3996909856796265, "logps/rejected": -1.4749270677566528, "loss": 0.5803, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3996909856796265, "rewards/margins": 0.07523597031831741, "rewards/rejected": -1.4749270677566528, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 6.736737620710777, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.019210344180464745, "logits/rejected": 0.10791017860174179, "logps/chosen": -1.3719546794891357, "logps/rejected": -1.419940710067749, "loss": 0.5783, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3719546794891357, "rewards/margins": 0.047986168414354324, "rewards/rejected": -1.419940710067749, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 5.295772339505157, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.030385833233594894, "logits/rejected": 0.1614394634962082, "logps/chosen": -1.3022531270980835, "logps/rejected": -1.3948819637298584, "loss": 0.5458, "rewards/accuracies": 0.5, "rewards/chosen": -1.3022531270980835, "rewards/margins": 0.09262903034687042, "rewards/rejected": -1.3948819637298584, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 4.749289114840434, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.018210317939519882, "logits/rejected": 0.12962272763252258, "logps/chosen": -1.299984097480774, "logps/rejected": -1.3446775674819946, "loss": 0.5634, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.299984097480774, "rewards/margins": 0.044693369418382645, "rewards/rejected": -1.3446775674819946, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 5.065752863341894, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.05975578352808952, "logits/rejected": 0.025919277220964432, "logps/chosen": -1.3635542392730713, "logps/rejected": -1.5370919704437256, "loss": 0.5409, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3635542392730713, "rewards/margins": 0.17353768646717072, "rewards/rejected": -1.5370919704437256, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 13.545785018683377, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.06286518275737762, "logits/rejected": 0.21027450263500214, "logps/chosen": -1.3381394147872925, "logps/rejected": -1.448525071144104, "loss": 0.552, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3381394147872925, "rewards/margins": 0.11038555949926376, "rewards/rejected": -1.448525071144104, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 5.6761092341719595, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.04209239035844803, "logits/rejected": 0.06514161080121994, "logps/chosen": -1.33099365234375, "logps/rejected": -1.50978684425354, "loss": 0.5379, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.33099365234375, "rewards/margins": 0.17879299819469452, "rewards/rejected": -1.50978684425354, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 4.823516488433901, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.058346133679151535, "logits/rejected": 0.12314023077487946, "logps/chosen": -1.402769923210144, "logps/rejected": -1.5322433710098267, "loss": 0.5658, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.402769923210144, "rewards/margins": 0.12947334349155426, "rewards/rejected": -1.5322433710098267, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 6.129078945850509, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.051918160170316696, "logits/rejected": 0.16169193387031555, "logps/chosen": -1.308782935142517, "logps/rejected": -1.4762122631072998, "loss": 0.5103, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.308782935142517, "rewards/margins": 0.16742947697639465, "rewards/rejected": -1.4762122631072998, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 6.459088468695512, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.01012207381427288, "logits/rejected": 0.09323824197053909, "logps/chosen": -1.3465960025787354, "logps/rejected": -1.523953914642334, "loss": 0.5186, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3465960025787354, "rewards/margins": 0.17735782265663147, "rewards/rejected": -1.523953914642334, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 4.315226944086073, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.0067606656812131405, "logits/rejected": 0.2296857386827469, "logps/chosen": -1.421900749206543, "logps/rejected": -1.506696343421936, "loss": 0.58, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.421900749206543, "rewards/margins": 0.08479563891887665, "rewards/rejected": -1.506696343421936, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 7.43584986238729, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.08050470799207687, "logits/rejected": 0.11659035831689835, "logps/chosen": -1.3447065353393555, "logps/rejected": -1.4826867580413818, "loss": 0.5357, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3447065353393555, "rewards/margins": 0.13798017799854279, "rewards/rejected": -1.4826867580413818, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 5.6322489353116385, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.09534216672182083, "logits/rejected": -0.013799980282783508, "logps/chosen": -1.2738784551620483, "logps/rejected": -1.4801595211029053, "loss": 0.4936, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2738784551620483, "rewards/margins": 0.20628109574317932, "rewards/rejected": -1.4801595211029053, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 7.242103902740337, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.023808851838111877, "logits/rejected": 0.11441246420145035, "logps/chosen": -1.416351079940796, "logps/rejected": -1.6008179187774658, "loss": 0.5422, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.416351079940796, "rewards/margins": 0.18446668982505798, "rewards/rejected": -1.6008179187774658, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 7.167683282052005, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.04834640771150589, "logits/rejected": 0.05940045788884163, "logps/chosen": -1.3551456928253174, "logps/rejected": -1.5722651481628418, "loss": 0.5446, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3551456928253174, "rewards/margins": 0.21711936593055725, "rewards/rejected": -1.5722651481628418, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 5.255359721595041, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.025629941374063492, "logits/rejected": 0.1553245633840561, "logps/chosen": -1.3227972984313965, "logps/rejected": -1.4089815616607666, "loss": 0.5456, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3227972984313965, "rewards/margins": 0.08618433773517609, "rewards/rejected": -1.4089815616607666, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 7.72294801740959, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.008045777678489685, "logits/rejected": 0.12675219774246216, "logps/chosen": -1.2815535068511963, "logps/rejected": -1.4503428936004639, "loss": 0.5206, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2815535068511963, "rewards/margins": 0.16878940165042877, "rewards/rejected": -1.4503428936004639, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 5.212012803754731, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.052673738449811935, "logits/rejected": 0.15249976515769958, "logps/chosen": -1.3192873001098633, "logps/rejected": -1.4144233465194702, "loss": 0.5376, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3192873001098633, "rewards/margins": 0.09513608366250992, "rewards/rejected": -1.4144233465194702, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 5.511680359218717, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.05590473487973213, "logits/rejected": 0.021316641941666603, "logps/chosen": -1.3147975206375122, "logps/rejected": -1.5639809370040894, "loss": 0.5006, "rewards/accuracies": 0.625, "rewards/chosen": -1.3147975206375122, "rewards/margins": 0.2491835355758667, "rewards/rejected": -1.5639809370040894, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 5.066286058056469, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.06047447398304939, "logits/rejected": 0.22424538433551788, "logps/chosen": -1.3669906854629517, "logps/rejected": -1.4786553382873535, "loss": 0.5889, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3669906854629517, "rewards/margins": 0.11166461557149887, "rewards/rejected": -1.4786553382873535, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 7.946926284411594, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.02485055662691593, "logits/rejected": 0.18203561007976532, "logps/chosen": -1.3349308967590332, "logps/rejected": -1.4446461200714111, "loss": 0.5492, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3349308967590332, "rewards/margins": 0.1097152978181839, "rewards/rejected": -1.4446461200714111, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 6.1525373445507805, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.03793618455529213, "logits/rejected": 0.07665838301181793, "logps/chosen": -1.3384425640106201, "logps/rejected": -1.4931614398956299, "loss": 0.5546, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3384425640106201, "rewards/margins": 0.15471890568733215, "rewards/rejected": -1.4931614398956299, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 4.955180859368702, "learning_rate": 9.982589180787532e-07, "logits/chosen": 0.0001690015196800232, "logits/rejected": 0.086110919713974, "logps/chosen": -1.2402942180633545, "logps/rejected": -1.4379851818084717, "loss": 0.5033, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2402942180633545, "rewards/margins": 0.19769081473350525, "rewards/rejected": -1.4379851818084717, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 6.945649569477848, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.11522463709115982, "logits/rejected": 0.022990206256508827, "logps/chosen": -1.4119195938110352, "logps/rejected": -1.5175782442092896, "loss": 0.5624, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4119195938110352, "rewards/margins": 0.10565869510173798, "rewards/rejected": -1.5175782442092896, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 5.022258973590583, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.07284384965896606, "logits/rejected": 0.09999080002307892, "logps/chosen": -1.3468621969223022, "logps/rejected": -1.5806243419647217, "loss": 0.5065, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3468621969223022, "rewards/margins": 0.23376211524009705, "rewards/rejected": -1.5806243419647217, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 4.877927257840908, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.053192656487226486, "logits/rejected": 0.14972971379756927, "logps/chosen": -1.3105436563491821, "logps/rejected": -1.4844410419464111, "loss": 0.5105, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3105436563491821, "rewards/margins": 0.17389757931232452, "rewards/rejected": -1.4844410419464111, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 7.732431430334909, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.027588630095124245, "logits/rejected": 0.13879923522472382, "logps/chosen": -1.3882601261138916, "logps/rejected": -1.5453174114227295, "loss": 0.5257, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3882601261138916, "rewards/margins": 0.1570572555065155, "rewards/rejected": -1.5453174114227295, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 5.805995004474029, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.04018913581967354, "logits/rejected": 0.22701886296272278, "logps/chosen": -1.4326589107513428, "logps/rejected": -1.5781805515289307, "loss": 0.566, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4326589107513428, "rewards/margins": 0.1455215960741043, "rewards/rejected": -1.5781805515289307, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 5.582868853835462, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.09483565390110016, "logits/rejected": 0.03965367004275322, "logps/chosen": -1.2555270195007324, "logps/rejected": -1.555360198020935, "loss": 0.4685, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2555270195007324, "rewards/margins": 0.299833208322525, "rewards/rejected": -1.555360198020935, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 5.46747019444681, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.1380842626094818, "logits/rejected": 0.019442027434706688, "logps/chosen": -1.3697642087936401, "logps/rejected": -1.5161153078079224, "loss": 0.5496, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3697642087936401, "rewards/margins": 0.14635103940963745, "rewards/rejected": -1.5161153078079224, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 5.406071730564995, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.08595935255289078, "logits/rejected": 0.01669185794889927, "logps/chosen": -1.388689637184143, "logps/rejected": -1.513307809829712, "loss": 0.533, "rewards/accuracies": 0.5625, "rewards/chosen": -1.388689637184143, "rewards/margins": 0.12461821734905243, "rewards/rejected": -1.513307809829712, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 8.38847467086693, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.06321905553340912, "logits/rejected": 0.10910911858081818, "logps/chosen": -1.4576215744018555, "logps/rejected": -1.5602877140045166, "loss": 0.5785, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.4576215744018555, "rewards/margins": 0.102665975689888, "rewards/rejected": -1.5602877140045166, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 4.862284036967087, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.043889619410037994, "logits/rejected": 0.1210487112402916, "logps/chosen": -1.4275610446929932, "logps/rejected": -1.6838836669921875, "loss": 0.5556, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4275610446929932, "rewards/margins": 0.25632280111312866, "rewards/rejected": -1.6838836669921875, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 6.129119388683074, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.04303457587957382, "logits/rejected": 0.11394067108631134, "logps/chosen": -1.3670246601104736, "logps/rejected": -1.5888017416000366, "loss": 0.5169, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3670246601104736, "rewards/margins": 0.2217770367860794, "rewards/rejected": -1.5888017416000366, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 5.029792942976596, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.004350324161350727, "logits/rejected": 0.1381688266992569, "logps/chosen": -1.3755730390548706, "logps/rejected": -1.529867172241211, "loss": 0.554, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3755730390548706, "rewards/margins": 0.15429410338401794, "rewards/rejected": -1.529867172241211, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 4.3595227609659695, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.07074405252933502, "logits/rejected": 0.001279197633266449, "logps/chosen": -1.3895020484924316, "logps/rejected": -1.6054500341415405, "loss": 0.5221, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3895020484924316, "rewards/margins": 0.21594806015491486, "rewards/rejected": -1.6054500341415405, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 5.717738257404781, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.03715671971440315, "logits/rejected": 0.053750135004520416, "logps/chosen": -1.3766443729400635, "logps/rejected": -1.5211693048477173, "loss": 0.5365, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3766443729400635, "rewards/margins": 0.14452481269836426, "rewards/rejected": -1.5211693048477173, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 6.043026391613739, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.05102987959980965, "logits/rejected": 0.17382799088954926, "logps/chosen": -1.3235329389572144, "logps/rejected": -1.640259027481079, "loss": 0.502, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3235329389572144, "rewards/margins": 0.3167259395122528, "rewards/rejected": -1.640259027481079, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 5.899612276753233, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.11595108360052109, "logits/rejected": 0.04694090038537979, "logps/chosen": -1.389408826828003, "logps/rejected": -1.6059459447860718, "loss": 0.537, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.389408826828003, "rewards/margins": 0.21653728187084198, "rewards/rejected": -1.6059459447860718, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 4.274341095660232, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.1434163749217987, "logits/rejected": 0.0008094683289527893, "logps/chosen": -1.321092128753662, "logps/rejected": -1.5486948490142822, "loss": 0.5051, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.321092128753662, "rewards/margins": 0.2276027500629425, "rewards/rejected": -1.5486948490142822, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 4.807908971738302, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.018433118239045143, "logits/rejected": 0.14530541002750397, "logps/chosen": -1.3062713146209717, "logps/rejected": -1.5849571228027344, "loss": 0.4978, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3062713146209717, "rewards/margins": 0.27868571877479553, "rewards/rejected": -1.5849571228027344, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 5.497726008072939, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.06224647909402847, "logits/rejected": 0.06625714153051376, "logps/chosen": -1.3578323125839233, "logps/rejected": -1.5364539623260498, "loss": 0.5282, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3578323125839233, "rewards/margins": 0.17862167954444885, "rewards/rejected": -1.5364539623260498, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 6.20614801341624, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.07702189683914185, "logits/rejected": 0.11128165572881699, "logps/chosen": -1.3404291868209839, "logps/rejected": -1.6096493005752563, "loss": 0.5189, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3404291868209839, "rewards/margins": 0.2692199647426605, "rewards/rejected": -1.6096493005752563, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 5.168853654669418, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.02232012338936329, "logits/rejected": 0.06391916424036026, "logps/chosen": -1.3363145589828491, "logps/rejected": -1.5072184801101685, "loss": 0.5301, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3363145589828491, "rewards/margins": 0.17090372741222382, "rewards/rejected": -1.5072184801101685, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.3345325291156769, "eval_logits/rejected": 0.4271507263183594, "eval_logps/chosen": -1.3889254331588745, "eval_logps/rejected": -1.641539454460144, "eval_loss": 0.5165265202522278, "eval_rewards/accuracies": 0.5927299857139587, "eval_rewards/chosen": -1.3889254331588745, "eval_rewards/margins": 0.25261396169662476, "eval_rewards/rejected": -1.641539454460144, "eval_runtime": 40.2669, "eval_samples_per_second": 33.402, "eval_steps_per_second": 8.369, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 6.867723153992824, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.08789472281932831, "logits/rejected": 0.07033085823059082, "logps/chosen": -1.3889178037643433, "logps/rejected": -1.6786119937896729, "loss": 0.5134, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3889178037643433, "rewards/margins": 0.2896941602230072, "rewards/rejected": -1.6786119937896729, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 5.723183861220142, "learning_rate": 9.939967071845424e-07, "logits/chosen": -0.009989236481487751, "logits/rejected": 0.06023658066987991, "logps/chosen": -1.3250126838684082, "logps/rejected": -1.5736223459243774, "loss": 0.506, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3250126838684082, "rewards/margins": 0.24860961735248566, "rewards/rejected": -1.5736223459243774, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 9.694762079668816, "learning_rate": 9.937536987168413e-07, "logits/chosen": -0.007894647307693958, "logits/rejected": 0.12403736263513565, "logps/chosen": -1.325869083404541, "logps/rejected": -1.6757539510726929, "loss": 0.4865, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.325869083404541, "rewards/margins": 0.3498847782611847, "rewards/rejected": -1.6757539510726929, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 7.210031214431489, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.014616812579333782, "logits/rejected": 0.0667891651391983, "logps/chosen": -1.4115034341812134, "logps/rejected": -1.6969811916351318, "loss": 0.5194, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4115034341812134, "rewards/margins": 0.28547775745391846, "rewards/rejected": -1.6969811916351318, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 15.425344000250135, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.03439576178789139, "logits/rejected": 0.08923866599798203, "logps/chosen": -1.3346027135849, "logps/rejected": -1.5731122493743896, "loss": 0.5242, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3346027135849, "rewards/margins": 0.23850946128368378, "rewards/rejected": -1.5731122493743896, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 6.673020386908522, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.03903502970933914, "logits/rejected": 0.1999625414609909, "logps/chosen": -1.4093892574310303, "logps/rejected": -1.5844175815582275, "loss": 0.5424, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4093892574310303, "rewards/margins": 0.17502841353416443, "rewards/rejected": -1.5844175815582275, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 6.385082271910167, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.024074595421552658, "logits/rejected": 0.15999475121498108, "logps/chosen": -1.3286621570587158, "logps/rejected": -1.5429725646972656, "loss": 0.5174, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3286621570587158, "rewards/margins": 0.2143104523420334, "rewards/rejected": -1.5429725646972656, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 5.024404260019362, "learning_rate": 9.924668491496474e-07, "logits/chosen": 0.02226843126118183, "logits/rejected": 0.17963920533657074, "logps/chosen": -1.3726035356521606, "logps/rejected": -1.6485439538955688, "loss": 0.5219, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3726035356521606, "rewards/margins": 0.2759404480457306, "rewards/rejected": -1.6485439538955688, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 3.030834681375728, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.11405900865793228, "logits/rejected": 0.18516018986701965, "logps/chosen": -1.3711360692977905, "logps/rejected": -1.5120737552642822, "loss": 0.5549, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3711360692977905, "rewards/margins": 0.14093779027462006, "rewards/rejected": -1.5120737552642822, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 6.589592862405669, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.02984592318534851, "logits/rejected": 0.10063725709915161, "logps/chosen": -1.3376272916793823, "logps/rejected": -1.544937014579773, "loss": 0.4954, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3376272916793823, "rewards/margins": 0.2073097974061966, "rewards/rejected": -1.544937014579773, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 9.496694363006233, "learning_rate": 9.9163738435372e-07, "logits/chosen": 0.010763334110379219, "logits/rejected": 0.16096653044223785, "logps/chosen": -1.4010120630264282, "logps/rejected": -1.7015081644058228, "loss": 0.5269, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4010120630264282, "rewards/margins": 0.300495982170105, "rewards/rejected": -1.7015081644058228, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 6.3409093771873, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.0608900785446167, "logits/rejected": 0.10356438159942627, "logps/chosen": -1.4408351182937622, "logps/rejected": -1.7620872259140015, "loss": 0.5112, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4408351182937622, "rewards/margins": 0.3212522864341736, "rewards/rejected": -1.7620872259140015, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 8.746126076561591, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.022087926045060158, "logits/rejected": 0.12186546623706818, "logps/chosen": -1.352374792098999, "logps/rejected": -1.7108958959579468, "loss": 0.5077, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.352374792098999, "rewards/margins": 0.358521044254303, "rewards/rejected": -1.7108958959579468, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 5.0354223247568415, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.06913277506828308, "logits/rejected": 0.19995860755443573, "logps/chosen": -1.3958160877227783, "logps/rejected": -1.6552776098251343, "loss": 0.5249, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3958160877227783, "rewards/margins": 0.25946158170700073, "rewards/rejected": -1.6552776098251343, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 7.054774741452929, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.04347441345453262, "logits/rejected": 0.12112858146429062, "logps/chosen": -1.4564518928527832, "logps/rejected": -1.6596553325653076, "loss": 0.5732, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.4564518928527832, "rewards/margins": 0.20320363342761993, "rewards/rejected": -1.6596553325653076, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 5.733991119421861, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.04044445976614952, "logits/rejected": 0.2108727991580963, "logps/chosen": -1.481805443763733, "logps/rejected": -1.7172458171844482, "loss": 0.5431, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.481805443763733, "rewards/margins": 0.2354406863451004, "rewards/rejected": -1.7172458171844482, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 5.530635173157297, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.04263751208782196, "logits/rejected": 0.03728199750185013, "logps/chosen": -1.371861219406128, "logps/rejected": -1.6569620370864868, "loss": 0.483, "rewards/accuracies": 0.65625, "rewards/chosen": -1.371861219406128, "rewards/margins": 0.2851008176803589, "rewards/rejected": -1.6569620370864868, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 4.773134329236131, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.14747408032417297, "logits/rejected": -0.01884273812174797, "logps/chosen": -1.4116828441619873, "logps/rejected": -1.631593942642212, "loss": 0.522, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4116828441619873, "rewards/margins": 0.2199108898639679, "rewards/rejected": -1.631593942642212, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 4.76858704019432, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.05141037702560425, "logits/rejected": 0.1624143421649933, "logps/chosen": -1.2881485223770142, "logps/rejected": -1.4786533117294312, "loss": 0.5115, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2881485223770142, "rewards/margins": 0.1905047595500946, "rewards/rejected": -1.4786533117294312, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 5.257980888327947, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.09822763502597809, "logits/rejected": -0.034055404365062714, "logps/chosen": -1.2929599285125732, "logps/rejected": -1.5975313186645508, "loss": 0.5017, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2929599285125732, "rewards/margins": 0.3045715093612671, "rewards/rejected": -1.5975313186645508, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 6.24161916541279, "learning_rate": 9.885628971850641e-07, "logits/chosen": 0.032900020480155945, "logits/rejected": 0.22491376101970673, "logps/chosen": -1.3883392810821533, "logps/rejected": -1.6684930324554443, "loss": 0.5272, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3883392810821533, "rewards/margins": 0.28015369176864624, "rewards/rejected": -1.6684930324554443, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 3.181353790037136, "learning_rate": 9.882293271315481e-07, "logits/chosen": 0.011284579522907734, "logits/rejected": 0.12082312256097794, "logps/chosen": -1.409759283065796, "logps/rejected": -1.5833348035812378, "loss": 0.5506, "rewards/accuracies": 0.5625, "rewards/chosen": -1.409759283065796, "rewards/margins": 0.1735754907131195, "rewards/rejected": -1.5833348035812378, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 5.932737489720043, "learning_rate": 9.878910202749589e-07, "logits/chosen": 0.00841023214161396, "logits/rejected": 0.19599978625774384, "logps/chosen": -1.3585389852523804, "logps/rejected": -1.5755045413970947, "loss": 0.5249, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3585389852523804, "rewards/margins": 0.2169654667377472, "rewards/rejected": -1.5755045413970947, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 6.167063413799225, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.12913081049919128, "logits/rejected": 0.27021074295043945, "logps/chosen": -1.2946172952651978, "logps/rejected": -1.5694751739501953, "loss": 0.5156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2946172952651978, "rewards/margins": 0.2748579978942871, "rewards/rejected": -1.5694751739501953, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 6.816851728700204, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.02681029960513115, "logits/rejected": 0.13866981863975525, "logps/chosen": -1.4194364547729492, "logps/rejected": -1.5732085704803467, "loss": 0.5494, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4194364547729492, "rewards/margins": 0.15377211570739746, "rewards/rejected": -1.5732085704803467, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 8.635072802913916, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.038931023329496384, "logits/rejected": 0.0773131251335144, "logps/chosen": -1.354738473892212, "logps/rejected": -1.7007625102996826, "loss": 0.5032, "rewards/accuracies": 0.59375, "rewards/chosen": -1.354738473892212, "rewards/margins": 0.3460239768028259, "rewards/rejected": -1.7007625102996826, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 4.961759758713111, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.04948309063911438, "logits/rejected": 0.09357891231775284, "logps/chosen": -1.3162089586257935, "logps/rejected": -1.5830973386764526, "loss": 0.506, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3162089586257935, "rewards/margins": 0.2668883800506592, "rewards/rejected": -1.5830973386764526, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 7.127071898721723, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.01768878661096096, "logits/rejected": 0.09037254750728607, "logps/chosen": -1.3815598487854004, "logps/rejected": -1.5617825984954834, "loss": 0.5282, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3815598487854004, "rewards/margins": 0.18022283911705017, "rewards/rejected": -1.5617825984954834, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 6.120853864036778, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.03440701216459274, "logits/rejected": 0.09808699786663055, "logps/chosen": -1.3681036233901978, "logps/rejected": -1.6880896091461182, "loss": 0.4861, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3681036233901978, "rewards/margins": 0.31998589634895325, "rewards/rejected": -1.6880896091461182, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 6.272420511771226, "learning_rate": 9.853905232845727e-07, "logits/chosen": 0.01540914736688137, "logits/rejected": 0.19565069675445557, "logps/chosen": -1.477325201034546, "logps/rejected": -1.6573213338851929, "loss": 0.5718, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.477325201034546, "rewards/margins": 0.17999598383903503, "rewards/rejected": -1.6573213338851929, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 5.665616613103236, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.03142506629228592, "logits/rejected": 0.26532477140426636, "logps/chosen": -1.4752228260040283, "logps/rejected": -1.689875602722168, "loss": 0.5511, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4752228260040283, "rewards/margins": 0.21465297043323517, "rewards/rejected": -1.689875602722168, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 6.133716998944082, "learning_rate": 9.846336591393832e-07, "logits/chosen": 0.011161714792251587, "logits/rejected": 0.16765785217285156, "logps/chosen": -1.410712480545044, "logps/rejected": -1.6017589569091797, "loss": 0.5402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.410712480545044, "rewards/margins": 0.19104662537574768, "rewards/rejected": -1.6017589569091797, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 5.908463600538246, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.07233011722564697, "logits/rejected": 0.07298247516155243, "logps/chosen": -1.4521138668060303, "logps/rejected": -1.766834020614624, "loss": 0.5268, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4521138668060303, "rewards/margins": 0.3147200644016266, "rewards/rejected": -1.766834020614624, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 5.2691705715676065, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.07627938687801361, "logits/rejected": 0.08463269472122192, "logps/chosen": -1.323203444480896, "logps/rejected": -1.567244052886963, "loss": 0.511, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.323203444480896, "rewards/margins": 0.24404048919677734, "rewards/rejected": -1.567244052886963, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 5.407138703239494, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.07204820215702057, "logits/rejected": 0.11519613116979599, "logps/chosen": -1.4009150266647339, "logps/rejected": -1.623420000076294, "loss": 0.5084, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4009150266647339, "rewards/margins": 0.22250494360923767, "rewards/rejected": -1.623420000076294, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 8.250164012225381, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.06445921212434769, "logits/rejected": 0.12994752824306488, "logps/chosen": -1.4836039543151855, "logps/rejected": -1.7131847143173218, "loss": 0.5483, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4836039543151855, "rewards/margins": 0.22958076000213623, "rewards/rejected": -1.7131847143173218, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 6.339621960255445, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.03547795116901398, "logits/rejected": 0.2278524935245514, "logps/chosen": -1.4204121828079224, "logps/rejected": -1.6703943014144897, "loss": 0.5169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4204121828079224, "rewards/margins": 0.249982088804245, "rewards/rejected": -1.6703943014144897, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 4.541350215069407, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.10296173393726349, "logits/rejected": 0.13710102438926697, "logps/chosen": -1.2761070728302002, "logps/rejected": -1.6117852926254272, "loss": 0.466, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2761070728302002, "rewards/margins": 0.33567824959754944, "rewards/rejected": -1.6117852926254272, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 5.471242972401267, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.12126004695892334, "logits/rejected": 0.2068725824356079, "logps/chosen": -1.3954887390136719, "logps/rejected": -1.5668925046920776, "loss": 0.5589, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3954887390136719, "rewards/margins": 0.171403706073761, "rewards/rejected": -1.5668925046920776, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 7.609321988417904, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.10276105254888535, "logits/rejected": 0.22932276129722595, "logps/chosen": -1.4259310960769653, "logps/rejected": -1.716629981994629, "loss": 0.5394, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4259310960769653, "rewards/margins": 0.29069894552230835, "rewards/rejected": -1.716629981994629, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 7.859467894470004, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.069122813642025, "logits/rejected": 0.240271657705307, "logps/chosen": -1.4581899642944336, "logps/rejected": -1.7403833866119385, "loss": 0.532, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4581899642944336, "rewards/margins": 0.28219345211982727, "rewards/rejected": -1.7403833866119385, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 5.296857939068007, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.035430826246738434, "logits/rejected": 0.22953574359416962, "logps/chosen": -1.2969932556152344, "logps/rejected": -1.682795524597168, "loss": 0.4767, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2969932556152344, "rewards/margins": 0.38580238819122314, "rewards/rejected": -1.682795524597168, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 6.423894017184109, "learning_rate": 9.801355442251625e-07, "logits/chosen": 0.017559360712766647, "logits/rejected": 0.19751058518886566, "logps/chosen": -1.3591115474700928, "logps/rejected": -1.638343095779419, "loss": 0.5025, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3591115474700928, "rewards/margins": 0.27923136949539185, "rewards/rejected": -1.638343095779419, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 7.098522112638251, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.013466214761137962, "logits/rejected": 0.15423551201820374, "logps/chosen": -1.3965883255004883, "logps/rejected": -1.694798469543457, "loss": 0.4955, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3965883255004883, "rewards/margins": 0.29821014404296875, "rewards/rejected": -1.694798469543457, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 5.405304696399745, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.03098270855844021, "logits/rejected": 0.08919452130794525, "logps/chosen": -1.3344242572784424, "logps/rejected": -1.6978425979614258, "loss": 0.4922, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3344242572784424, "rewards/margins": 0.3634182810783386, "rewards/rejected": -1.6978425979614258, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 4.96257571210325, "learning_rate": 9.788107332632493e-07, "logits/chosen": -0.0030382529366761446, "logits/rejected": 0.08272795379161835, "logps/chosen": -1.4018604755401611, "logps/rejected": -1.6715812683105469, "loss": 0.5196, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4018604755401611, "rewards/margins": 0.26972097158432007, "rewards/rejected": -1.6715812683105469, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 7.192791272436166, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.02500019036233425, "logits/rejected": 0.09126511961221695, "logps/chosen": -1.4696487188339233, "logps/rejected": -1.6167863607406616, "loss": 0.5514, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4696487188339233, "rewards/margins": 0.14713750779628754, "rewards/rejected": -1.6167863607406616, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 8.516856257108142, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.0037988275289535522, "logits/rejected": 0.15684238076210022, "logps/chosen": -1.3643237352371216, "logps/rejected": -1.6388190984725952, "loss": 0.5103, "rewards/accuracies": 0.625, "rewards/chosen": -1.3643237352371216, "rewards/margins": 0.27449530363082886, "rewards/rejected": -1.6388190984725952, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 3.983244072512603, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.04210857301950455, "logits/rejected": 0.10226224362850189, "logps/chosen": -1.3435189723968506, "logps/rejected": -1.6800181865692139, "loss": 0.4969, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3435189723968506, "rewards/margins": 0.33649933338165283, "rewards/rejected": -1.6800181865692139, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 10.027690433687457, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.08854939788579941, "logits/rejected": 0.01890941523015499, "logps/chosen": -1.4280637502670288, "logps/rejected": -1.7624692916870117, "loss": 0.5088, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4280637502670288, "rewards/margins": 0.33440545201301575, "rewards/rejected": -1.7624692916870117, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 8.174126196811576, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.010158751159906387, "logits/rejected": 0.07046304643154144, "logps/chosen": -1.4567835330963135, "logps/rejected": -1.7600619792938232, "loss": 0.5052, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4567835330963135, "rewards/margins": 0.3032783567905426, "rewards/rejected": -1.7600619792938232, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 7.960548884224933, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.020586049184203148, "logits/rejected": 0.19689294695854187, "logps/chosen": -1.5394012928009033, "logps/rejected": -1.8067468404769897, "loss": 0.5338, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5394012928009033, "rewards/margins": 0.2673453390598297, "rewards/rejected": -1.8067468404769897, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 4.0025871285407595, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.03965837508440018, "logits/rejected": 0.17310525476932526, "logps/chosen": -1.4224742650985718, "logps/rejected": -1.8390239477157593, "loss": 0.4698, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4224742650985718, "rewards/margins": 0.4165496826171875, "rewards/rejected": -1.8390239477157593, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 5.785588745851021, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.10320504009723663, "logits/rejected": 0.13574329018592834, "logps/chosen": -1.475862979888916, "logps/rejected": -1.8260608911514282, "loss": 0.5037, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.475862979888916, "rewards/margins": 0.350197970867157, "rewards/rejected": -1.8260608911514282, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 5.870629160782091, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.0093617532402277, "logits/rejected": 0.12015372514724731, "logps/chosen": -1.4062837362289429, "logps/rejected": -1.784282922744751, "loss": 0.4893, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4062837362289429, "rewards/margins": 0.3779989182949066, "rewards/rejected": -1.784282922744751, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 5.758631972109694, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.0846220999956131, "logits/rejected": 0.02005021460354328, "logps/chosen": -1.5154950618743896, "logps/rejected": -1.8075637817382812, "loss": 0.5203, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5154950618743896, "rewards/margins": 0.29206863045692444, "rewards/rejected": -1.8075637817382812, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 8.559501715418515, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.0036075368989259005, "logits/rejected": 0.1623397022485733, "logps/chosen": -1.4639016389846802, "logps/rejected": -1.588873267173767, "loss": 0.5755, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.4639016389846802, "rewards/margins": 0.12497171014547348, "rewards/rejected": -1.588873267173767, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 4.873242977969351, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.003058557165786624, "logits/rejected": 0.10238274186849594, "logps/chosen": -1.4307628870010376, "logps/rejected": -1.571894884109497, "loss": 0.5561, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4307628870010376, "rewards/margins": 0.14113202691078186, "rewards/rejected": -1.571894884109497, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 5.427126471936945, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.14496079087257385, "logits/rejected": -0.012730265036225319, "logps/chosen": -1.3742672204971313, "logps/rejected": -1.6538225412368774, "loss": 0.4943, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3742672204971313, "rewards/margins": 0.27955543994903564, "rewards/rejected": -1.6538225412368774, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 5.875934855775138, "learning_rate": 9.720774478544218e-07, "logits/chosen": 0.0059554786421358585, "logits/rejected": 0.10440125316381454, "logps/chosen": -1.2823108434677124, "logps/rejected": -1.760848045349121, "loss": 0.4603, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2823108434677124, "rewards/margins": 0.4785371720790863, "rewards/rejected": -1.760848045349121, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 4.643643448996381, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.06064309924840927, "logits/rejected": 0.020380396395921707, "logps/chosen": -1.4019039869308472, "logps/rejected": -1.6243879795074463, "loss": 0.527, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4019039869308472, "rewards/margins": 0.22248411178588867, "rewards/rejected": -1.6243879795074463, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 6.161161661391256, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.07677425444126129, "logits/rejected": 0.03764699772000313, "logps/chosen": -1.3823782205581665, "logps/rejected": -1.5533250570297241, "loss": 0.5265, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3823782205581665, "rewards/margins": 0.17094692587852478, "rewards/rejected": -1.5533250570297241, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 13.48317471930264, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.028439337387681007, "logits/rejected": 0.14889821410179138, "logps/chosen": -1.3010786771774292, "logps/rejected": -1.636910080909729, "loss": 0.47, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3010786771774292, "rewards/margins": 0.33583131432533264, "rewards/rejected": -1.636910080909729, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 5.756367218599875, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.16760501265525818, "logits/rejected": -0.06681571155786514, "logps/chosen": -1.3700599670410156, "logps/rejected": -1.7020937204360962, "loss": 0.4972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3700599670410156, "rewards/margins": 0.3320337235927582, "rewards/rejected": -1.7020937204360962, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 7.741884937707179, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.1593383401632309, "logits/rejected": 0.02142031118273735, "logps/chosen": -1.4394105672836304, "logps/rejected": -1.8156394958496094, "loss": 0.492, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4394105672836304, "rewards/margins": 0.3762288987636566, "rewards/rejected": -1.8156394958496094, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 6.494844592224876, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.01569398306310177, "logits/rejected": 0.07012617588043213, "logps/chosen": -1.4582710266113281, "logps/rejected": -1.7543818950653076, "loss": 0.5238, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4582710266113281, "rewards/margins": 0.29611092805862427, "rewards/rejected": -1.7543818950653076, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 8.985183809573478, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.06667536497116089, "logits/rejected": 0.08631397783756256, "logps/chosen": -1.5618860721588135, "logps/rejected": -1.9092543125152588, "loss": 0.5394, "rewards/accuracies": 0.625, "rewards/chosen": -1.5618860721588135, "rewards/margins": 0.3473680913448334, "rewards/rejected": -1.9092543125152588, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 6.864197237978456, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.08967851102352142, "logits/rejected": -0.04634971171617508, "logps/chosen": -1.4985218048095703, "logps/rejected": -1.659868836402893, "loss": 0.5632, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4985218048095703, "rewards/margins": 0.16134698688983917, "rewards/rejected": -1.659868836402893, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 5.896529883214944, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.21506735682487488, "logits/rejected": -0.0964222177863121, "logps/chosen": -1.4657273292541504, "logps/rejected": -1.810084342956543, "loss": 0.5274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4657273292541504, "rewards/margins": 0.34435713291168213, "rewards/rejected": -1.810084342956543, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 5.526393175759484, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.18463262915611267, "logits/rejected": -0.05510406941175461, "logps/chosen": -1.354295015335083, "logps/rejected": -1.7084376811981201, "loss": 0.4817, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.354295015335083, "rewards/margins": 0.354142427444458, "rewards/rejected": -1.7084376811981201, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 5.353266277252248, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.08403745293617249, "logits/rejected": 0.006090373732149601, "logps/chosen": -1.4233505725860596, "logps/rejected": -1.6430718898773193, "loss": 0.5283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4233505725860596, "rewards/margins": 0.21972115337848663, "rewards/rejected": -1.6430718898773193, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 6.165896643499533, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.006598341278731823, "logits/rejected": 0.10143008083105087, "logps/chosen": -1.3574113845825195, "logps/rejected": -1.6438875198364258, "loss": 0.4857, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3574113845825195, "rewards/margins": 0.286476194858551, "rewards/rejected": -1.6438875198364258, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 4.426234964620725, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.07528576999902725, "logits/rejected": 7.554479088867083e-05, "logps/chosen": -1.345879316329956, "logps/rejected": -1.585338830947876, "loss": 0.5008, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.345879316329956, "rewards/margins": 0.23945967853069305, "rewards/rejected": -1.585338830947876, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 4.582377320303087, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.08677862584590912, "logits/rejected": 0.07029275596141815, "logps/chosen": -1.5806987285614014, "logps/rejected": -1.7804940938949585, "loss": 0.5711, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5806987285614014, "rewards/margins": 0.19979527592658997, "rewards/rejected": -1.7804940938949585, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 4.043201851904302, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.07626475393772125, "logits/rejected": 0.10053463280200958, "logps/chosen": -1.3774032592773438, "logps/rejected": -1.6782779693603516, "loss": 0.5154, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3774032592773438, "rewards/margins": 0.3008746802806854, "rewards/rejected": -1.6782779693603516, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 5.399330326107109, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.08205243945121765, "logits/rejected": 0.05793525651097298, "logps/chosen": -1.4979830980300903, "logps/rejected": -1.823978066444397, "loss": 0.5019, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4979830980300903, "rewards/margins": 0.32599499821662903, "rewards/rejected": -1.823978066444397, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 6.729120377452131, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.022509945556521416, "logits/rejected": 0.11508312076330185, "logps/chosen": -1.5400029420852661, "logps/rejected": -1.813910722732544, "loss": 0.5406, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5400029420852661, "rewards/margins": 0.273907870054245, "rewards/rejected": -1.813910722732544, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 6.0085050752754965, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.09502875059843063, "logits/rejected": 0.04676276072859764, "logps/chosen": -1.4716498851776123, "logps/rejected": -1.8823015689849854, "loss": 0.472, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4716498851776123, "rewards/margins": 0.4106515944004059, "rewards/rejected": -1.8823015689849854, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 4.483766598490534, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.03190872445702553, "logits/rejected": 0.08330757915973663, "logps/chosen": -1.352972149848938, "logps/rejected": -1.7502959966659546, "loss": 0.4743, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.352972149848938, "rewards/margins": 0.3973238468170166, "rewards/rejected": -1.7502959966659546, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 10.3435091646708, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.10002975165843964, "logits/rejected": -0.0333033949136734, "logps/chosen": -1.4428231716156006, "logps/rejected": -1.735801100730896, "loss": 0.5265, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4428231716156006, "rewards/margins": 0.2929779589176178, "rewards/rejected": -1.735801100730896, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.27598199248313904, "eval_logits/rejected": 0.36994704604148865, "eval_logps/chosen": -1.4579088687896729, "eval_logps/rejected": -1.8204032182693481, "eval_loss": 0.4984978139400482, "eval_rewards/accuracies": 0.6224035620689392, "eval_rewards/chosen": -1.4579088687896729, "eval_rewards/margins": 0.36249470710754395, "eval_rewards/rejected": -1.8204032182693481, "eval_runtime": 40.5018, "eval_samples_per_second": 33.208, "eval_steps_per_second": 8.321, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 7.908665683909004, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.18065521121025085, "logits/rejected": 0.0032704889308661222, "logps/chosen": -1.4316219091415405, "logps/rejected": -1.7270113229751587, "loss": 0.5082, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4316219091415405, "rewards/margins": 0.29538923501968384, "rewards/rejected": -1.7270113229751587, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 6.257472904419201, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.08583327382802963, "logits/rejected": 0.11713147163391113, "logps/chosen": -1.4150996208190918, "logps/rejected": -1.7114330530166626, "loss": 0.4957, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4150996208190918, "rewards/margins": 0.296333372592926, "rewards/rejected": -1.7114330530166626, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 4.803693368960952, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.13445450365543365, "logits/rejected": -0.008361694402992725, "logps/chosen": -1.4496839046478271, "logps/rejected": -1.6363105773925781, "loss": 0.5456, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4496839046478271, "rewards/margins": 0.18662682175636292, "rewards/rejected": -1.6363105773925781, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 6.237904235489264, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.046252332627773285, "logits/rejected": 0.15435494482517242, "logps/chosen": -1.3530142307281494, "logps/rejected": -1.6336700916290283, "loss": 0.507, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3530142307281494, "rewards/margins": 0.28065595030784607, "rewards/rejected": -1.6336700916290283, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 5.573443742989475, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.1921064853668213, "logits/rejected": 0.013531038537621498, "logps/chosen": -1.4798482656478882, "logps/rejected": -1.72818922996521, "loss": 0.5303, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4798482656478882, "rewards/margins": 0.24834099411964417, "rewards/rejected": -1.72818922996521, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 6.785956009509046, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.06566251814365387, "logits/rejected": 0.03361748903989792, "logps/chosen": -1.4336591958999634, "logps/rejected": -1.7152431011199951, "loss": 0.5184, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4336591958999634, "rewards/margins": 0.2815838158130646, "rewards/rejected": -1.7152431011199951, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 4.830230188477797, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.06023001670837402, "logits/rejected": 0.06755174696445465, "logps/chosen": -1.4487342834472656, "logps/rejected": -1.6709034442901611, "loss": 0.5198, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4487342834472656, "rewards/margins": 0.2221692055463791, "rewards/rejected": -1.6709034442901611, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 5.72841613425029, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.16768716275691986, "logits/rejected": -0.03171756863594055, "logps/chosen": -1.424757719039917, "logps/rejected": -1.724419355392456, "loss": 0.4898, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.424757719039917, "rewards/margins": 0.29966163635253906, "rewards/rejected": -1.724419355392456, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 5.206008931706437, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.08406689763069153, "logits/rejected": 0.09542421251535416, "logps/chosen": -1.4782005548477173, "logps/rejected": -1.7310537099838257, "loss": 0.5471, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4782005548477173, "rewards/margins": 0.252853125333786, "rewards/rejected": -1.7310537099838257, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 8.025543035424315, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.07827457040548325, "logits/rejected": 0.11927123367786407, "logps/chosen": -1.5623528957366943, "logps/rejected": -1.7948954105377197, "loss": 0.5421, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5623528957366943, "rewards/margins": 0.23254244029521942, "rewards/rejected": -1.7948954105377197, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 7.962895088953915, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.07577751576900482, "logits/rejected": 0.04966237023472786, "logps/chosen": -1.4360148906707764, "logps/rejected": -1.7415977716445923, "loss": 0.5114, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4360148906707764, "rewards/margins": 0.3055829107761383, "rewards/rejected": -1.7415977716445923, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 4.973069459702906, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.10902514308691025, "logits/rejected": 0.020555516704916954, "logps/chosen": -1.4325511455535889, "logps/rejected": -1.8250415325164795, "loss": 0.4909, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4325511455535889, "rewards/margins": 0.3924906253814697, "rewards/rejected": -1.8250415325164795, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 6.894341645618598, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.013774867169559002, "logits/rejected": 0.13044282793998718, "logps/chosen": -1.4474672079086304, "logps/rejected": -1.7993195056915283, "loss": 0.5016, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4474672079086304, "rewards/margins": 0.3518521189689636, "rewards/rejected": -1.7993195056915283, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 6.0144868242422715, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.08132084459066391, "logits/rejected": 0.024117471650242805, "logps/chosen": -1.4442468881607056, "logps/rejected": -1.8393704891204834, "loss": 0.4942, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4442468881607056, "rewards/margins": 0.39512354135513306, "rewards/rejected": -1.8393704891204834, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 7.465424390730532, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.058660756796598434, "logits/rejected": 0.05175963044166565, "logps/chosen": -1.5137755870819092, "logps/rejected": -1.7574217319488525, "loss": 0.5405, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5137755870819092, "rewards/margins": 0.2436460703611374, "rewards/rejected": -1.7574217319488525, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 7.792220800086334, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.08259814232587814, "logits/rejected": 0.07982233911752701, "logps/chosen": -1.5181019306182861, "logps/rejected": -1.7567436695098877, "loss": 0.5472, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5181019306182861, "rewards/margins": 0.23864145576953888, "rewards/rejected": -1.7567436695098877, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 5.999447061049972, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.04458165913820267, "logits/rejected": 0.09139233827590942, "logps/chosen": -1.3784546852111816, "logps/rejected": -1.6395727396011353, "loss": 0.5067, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3784546852111816, "rewards/margins": 0.261118084192276, "rewards/rejected": -1.6395727396011353, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 8.498665122012218, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.1903972625732422, "logits/rejected": -0.037992868572473526, "logps/chosen": -1.4717209339141846, "logps/rejected": -1.6574962139129639, "loss": 0.5526, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.4717209339141846, "rewards/margins": 0.1857752948999405, "rewards/rejected": -1.6574962139129639, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 5.947033068921262, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.1107846274971962, "logits/rejected": -0.09414137899875641, "logps/chosen": -1.4529798030853271, "logps/rejected": -1.8207781314849854, "loss": 0.4902, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4529798030853271, "rewards/margins": 0.3677983582019806, "rewards/rejected": -1.8207781314849854, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 5.359574540793647, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.10759031772613525, "logits/rejected": 0.1485261172056198, "logps/chosen": -1.4005746841430664, "logps/rejected": -1.7886213064193726, "loss": 0.4934, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4005746841430664, "rewards/margins": 0.3880467414855957, "rewards/rejected": -1.7886213064193726, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 5.008426763612491, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.026275474578142166, "logits/rejected": 0.07831399142742157, "logps/chosen": -1.4283883571624756, "logps/rejected": -1.7190501689910889, "loss": 0.5199, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4283883571624756, "rewards/margins": 0.2906617522239685, "rewards/rejected": -1.7190501689910889, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 3.910334610704942, "learning_rate": 9.465519589364099e-07, "logits/chosen": -0.009321535006165504, "logits/rejected": 0.07055697590112686, "logps/chosen": -1.4001070261001587, "logps/rejected": -1.6995279788970947, "loss": 0.5082, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4001070261001587, "rewards/margins": 0.2994207739830017, "rewards/rejected": -1.6995279788970947, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 6.485808994794514, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.06786038726568222, "logits/rejected": 0.11454223096370697, "logps/chosen": -1.3821821212768555, "logps/rejected": -1.7559916973114014, "loss": 0.4811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3821821212768555, "rewards/margins": 0.37380948662757874, "rewards/rejected": -1.7559916973114014, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 4.6467203909569905, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.18313808739185333, "logits/rejected": 0.02190874144434929, "logps/chosen": -1.3848965167999268, "logps/rejected": -1.7262376546859741, "loss": 0.4852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3848965167999268, "rewards/margins": 0.34134113788604736, "rewards/rejected": -1.7262376546859741, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 8.532355387897956, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.03529299050569534, "logits/rejected": 0.08005812764167786, "logps/chosen": -1.4549922943115234, "logps/rejected": -1.7455005645751953, "loss": 0.5571, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4549922943115234, "rewards/margins": 0.29050832986831665, "rewards/rejected": -1.7455005645751953, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 6.795535847462301, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.059025686234235764, "logits/rejected": 0.05710027739405632, "logps/chosen": -1.4092142581939697, "logps/rejected": -1.665091872215271, "loss": 0.5208, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4092142581939697, "rewards/margins": 0.2558777630329132, "rewards/rejected": -1.665091872215271, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 8.2194256272664, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.020594831556081772, "logits/rejected": 0.05829160287976265, "logps/chosen": -1.4768677949905396, "logps/rejected": -1.735068917274475, "loss": 0.5215, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4768677949905396, "rewards/margins": 0.2582012116909027, "rewards/rejected": -1.735068917274475, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 7.290421556793591, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.026541341096162796, "logits/rejected": 0.010924230329692364, "logps/chosen": -1.402931809425354, "logps/rejected": -1.6360347270965576, "loss": 0.5222, "rewards/accuracies": 0.625, "rewards/chosen": -1.402931809425354, "rewards/margins": 0.233102947473526, "rewards/rejected": -1.6360347270965576, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 4.527691997384787, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.017451774328947067, "logits/rejected": 0.2115047723054886, "logps/chosen": -1.400193452835083, "logps/rejected": -1.689227819442749, "loss": 0.4954, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.400193452835083, "rewards/margins": 0.2890344262123108, "rewards/rejected": -1.689227819442749, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 7.118405582736356, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.01962551847100258, "logits/rejected": 0.07980718463659286, "logps/chosen": -1.4294536113739014, "logps/rejected": -1.5361769199371338, "loss": 0.5559, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4294536113739014, "rewards/margins": 0.106723353266716, "rewards/rejected": -1.5361769199371338, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 5.347543565648047, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.018346868455410004, "logits/rejected": 0.27618369460105896, "logps/chosen": -1.4729280471801758, "logps/rejected": -1.6998929977416992, "loss": 0.5365, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4729280471801758, "rewards/margins": 0.2269648313522339, "rewards/rejected": -1.6998929977416992, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 5.884561411747062, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.1286102831363678, "logits/rejected": 0.023300817236304283, "logps/chosen": -1.5276930332183838, "logps/rejected": -1.8919912576675415, "loss": 0.506, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5276930332183838, "rewards/margins": 0.3642980456352234, "rewards/rejected": -1.8919912576675415, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 4.7191310043451455, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.00572115508839488, "logits/rejected": 0.13647417724132538, "logps/chosen": -1.459595799446106, "logps/rejected": -1.7756010293960571, "loss": 0.4995, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.459595799446106, "rewards/margins": 0.31600522994995117, "rewards/rejected": -1.7756010293960571, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 33.57473982252121, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.012426319532096386, "logits/rejected": 0.11027495563030243, "logps/chosen": -1.5561888217926025, "logps/rejected": -2.056885242462158, "loss": 0.4739, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5561888217926025, "rewards/margins": 0.5006963610649109, "rewards/rejected": -2.056885242462158, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 3.6504865058099103, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.03958544880151749, "logits/rejected": 0.12107968330383301, "logps/chosen": -1.4807069301605225, "logps/rejected": -1.9494879245758057, "loss": 0.4763, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4807069301605225, "rewards/margins": 0.468780517578125, "rewards/rejected": -1.9494879245758057, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 5.064035248108785, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.11789293587207794, "logits/rejected": 0.013043287210166454, "logps/chosen": -1.5244688987731934, "logps/rejected": -1.7435407638549805, "loss": 0.5551, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5244688987731934, "rewards/margins": 0.2190718650817871, "rewards/rejected": -1.7435407638549805, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 7.924599963083553, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.08733045309782028, "logits/rejected": 0.04726707935333252, "logps/chosen": -1.3983521461486816, "logps/rejected": -1.7090288400650024, "loss": 0.5024, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3983521461486816, "rewards/margins": 0.3106767535209656, "rewards/rejected": -1.7090288400650024, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 6.779872090810086, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.0072370367124676704, "logits/rejected": 0.10666964203119278, "logps/chosen": -1.475639820098877, "logps/rejected": -1.7652795314788818, "loss": 0.5123, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.475639820098877, "rewards/margins": 0.2896398901939392, "rewards/rejected": -1.7652795314788818, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 4.443007004877035, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.027853738516569138, "logits/rejected": 0.11909045279026031, "logps/chosen": -1.4143445491790771, "logps/rejected": -1.6390422582626343, "loss": 0.501, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4143445491790771, "rewards/margins": 0.22469770908355713, "rewards/rejected": -1.6390422582626343, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 5.739601432615447, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.189456507563591, "logits/rejected": -0.04669322818517685, "logps/chosen": -1.5486042499542236, "logps/rejected": -1.7674516439437866, "loss": 0.5398, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5486042499542236, "rewards/margins": 0.2188473492860794, "rewards/rejected": -1.7674516439437866, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 4.418297511956971, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.056449275463819504, "logits/rejected": 0.13056030869483948, "logps/chosen": -1.3486518859863281, "logps/rejected": -1.7589184045791626, "loss": 0.4804, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3486518859863281, "rewards/margins": 0.41026654839515686, "rewards/rejected": -1.7589184045791626, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 5.926204932887733, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.08927891403436661, "logits/rejected": 0.09832639247179031, "logps/chosen": -1.4799705743789673, "logps/rejected": -1.7990564107894897, "loss": 0.4981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4799705743789673, "rewards/margins": 0.31908589601516724, "rewards/rejected": -1.7990564107894897, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 7.6832810065653385, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.00827641412615776, "logits/rejected": 0.14384965598583221, "logps/chosen": -1.4864349365234375, "logps/rejected": -1.6136372089385986, "loss": 0.5516, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4864349365234375, "rewards/margins": 0.1272020936012268, "rewards/rejected": -1.6136372089385986, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 4.450910431634495, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.014309501275420189, "logits/rejected": 0.12172539532184601, "logps/chosen": -1.4140266180038452, "logps/rejected": -1.8954118490219116, "loss": 0.487, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4140266180038452, "rewards/margins": 0.4813852906227112, "rewards/rejected": -1.8954118490219116, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 6.622160222711323, "learning_rate": 9.293065361002563e-07, "logits/chosen": 0.03671379014849663, "logits/rejected": 0.13247384130954742, "logps/chosen": -1.439561128616333, "logps/rejected": -1.8650518655776978, "loss": 0.4803, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.439561128616333, "rewards/margins": 0.42549094557762146, "rewards/rejected": -1.8650518655776978, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 7.713726945117117, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.10064462572336197, "logits/rejected": 0.035080667585134506, "logps/chosen": -1.5082744359970093, "logps/rejected": -1.8194735050201416, "loss": 0.4973, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5082744359970093, "rewards/margins": 0.31119903922080994, "rewards/rejected": -1.8194735050201416, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 4.3499852297132175, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.048877209424972534, "logits/rejected": 0.10567761957645416, "logps/chosen": -1.4382750988006592, "logps/rejected": -1.8945884704589844, "loss": 0.4747, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4382750988006592, "rewards/margins": 0.4563133716583252, "rewards/rejected": -1.8945884704589844, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 4.264751935928952, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.06024724245071411, "logits/rejected": 0.05379495024681091, "logps/chosen": -1.4873998165130615, "logps/rejected": -1.720198392868042, "loss": 0.5348, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4873998165130615, "rewards/margins": 0.23279878497123718, "rewards/rejected": -1.720198392868042, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 7.661403295405585, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.1148437038064003, "logits/rejected": 0.1047263890504837, "logps/chosen": -1.5335047245025635, "logps/rejected": -1.7926723957061768, "loss": 0.5415, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5335047245025635, "rewards/margins": 0.25916776061058044, "rewards/rejected": -1.7926723957061768, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 10.932138436229337, "learning_rate": 9.252628226650389e-07, "logits/chosen": 0.04269786924123764, "logits/rejected": 0.14088895916938782, "logps/chosen": -1.4841639995574951, "logps/rejected": -1.7393276691436768, "loss": 0.5411, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4841639995574951, "rewards/margins": 0.2551637291908264, "rewards/rejected": -1.7393276691436768, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 6.980769013246553, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.03086697682738304, "logits/rejected": 0.13706521689891815, "logps/chosen": -1.431242823600769, "logps/rejected": -1.7777916193008423, "loss": 0.5021, "rewards/accuracies": 0.625, "rewards/chosen": -1.431242823600769, "rewards/margins": 0.3465487062931061, "rewards/rejected": -1.7777916193008423, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 4.635668891474701, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.000745361321605742, "logits/rejected": 0.17292192578315735, "logps/chosen": -1.403519630432129, "logps/rejected": -1.7861814498901367, "loss": 0.4823, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.403519630432129, "rewards/margins": 0.38266175985336304, "rewards/rejected": -1.7861814498901367, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 4.539540214002763, "learning_rate": 9.227870209296395e-07, "logits/chosen": 0.021961960941553116, "logits/rejected": 0.14273618161678314, "logps/chosen": -1.5254724025726318, "logps/rejected": -1.807578682899475, "loss": 0.5315, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5254724025726318, "rewards/margins": 0.28210610151290894, "rewards/rejected": -1.807578682899475, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 4.8688853185759795, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.07666613906621933, "logits/rejected": -0.044794876128435135, "logps/chosen": -1.469786286354065, "logps/rejected": -1.7875521183013916, "loss": 0.5177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.469786286354065, "rewards/margins": 0.3177659213542938, "rewards/rejected": -1.7875521183013916, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 7.620445994005098, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.09366847574710846, "logits/rejected": 0.08611170947551727, "logps/chosen": -1.4434967041015625, "logps/rejected": -1.7823482751846313, "loss": 0.4976, "rewards/accuracies": 0.625, "rewards/chosen": -1.4434967041015625, "rewards/margins": 0.3388514518737793, "rewards/rejected": -1.7823482751846313, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 5.89271586792414, "learning_rate": 9.202743024992367e-07, "logits/chosen": 0.018907012417912483, "logits/rejected": 0.13424673676490784, "logps/chosen": -1.385901689529419, "logps/rejected": -1.8066761493682861, "loss": 0.4804, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.385901689529419, "rewards/margins": 0.4207743704319, "rewards/rejected": -1.8066761493682861, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 5.526303282779358, "learning_rate": 9.194285638083293e-07, "logits/chosen": 0.01408832985907793, "logits/rejected": 0.18304958939552307, "logps/chosen": -1.5246706008911133, "logps/rejected": -1.929800033569336, "loss": 0.5035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5246706008911133, "rewards/margins": 0.40512943267822266, "rewards/rejected": -1.929800033569336, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 6.0936178595886705, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.03774349018931389, "logits/rejected": 0.08406372368335724, "logps/chosen": -1.4400547742843628, "logps/rejected": -1.7179491519927979, "loss": 0.5313, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4400547742843628, "rewards/margins": 0.2778942286968231, "rewards/rejected": -1.7179491519927979, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 6.119336069229655, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.04709386080503464, "logits/rejected": 0.06358008086681366, "logps/chosen": -1.6116845607757568, "logps/rejected": -1.776435136795044, "loss": 0.5742, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6116845607757568, "rewards/margins": 0.16475048661231995, "rewards/rejected": -1.776435136795044, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 8.178112975128583, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.08561250567436218, "logits/rejected": 0.017532307654619217, "logps/chosen": -1.470578908920288, "logps/rejected": -1.8447043895721436, "loss": 0.4997, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.470578908920288, "rewards/margins": 0.37412557005882263, "rewards/rejected": -1.8447043895721436, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 6.571782609499443, "learning_rate": 9.16004998712373e-07, "logits/chosen": 0.0012682542437687516, "logits/rejected": 0.07107420265674591, "logps/chosen": -1.4586286544799805, "logps/rejected": -1.8900283575057983, "loss": 0.4686, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4586286544799805, "rewards/margins": 0.4313996732234955, "rewards/rejected": -1.8900283575057983, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 3.601145874812561, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.10148704051971436, "logits/rejected": 0.18319007754325867, "logps/chosen": -1.5077013969421387, "logps/rejected": -1.91641366481781, "loss": 0.4866, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5077013969421387, "rewards/margins": 0.4087122976779938, "rewards/rejected": -1.91641366481781, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 5.608307004347856, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.004740949720144272, "logits/rejected": 0.0674985721707344, "logps/chosen": -1.4612431526184082, "logps/rejected": -1.7987353801727295, "loss": 0.5042, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4612431526184082, "rewards/margins": 0.33749234676361084, "rewards/rejected": -1.7987353801727295, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 8.579927118368724, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.018311481922864914, "logits/rejected": 0.07289181649684906, "logps/chosen": -1.4907166957855225, "logps/rejected": -1.8232371807098389, "loss": 0.4956, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4907166957855225, "rewards/margins": 0.3325203061103821, "rewards/rejected": -1.8232371807098389, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 4.853094515852543, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.054845988750457764, "logits/rejected": 0.13690079748630524, "logps/chosen": -1.4775035381317139, "logps/rejected": -1.74541437625885, "loss": 0.5351, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4775035381317139, "rewards/margins": 0.2679109573364258, "rewards/rejected": -1.74541437625885, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 7.278612133456522, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.06240853667259216, "logits/rejected": 0.05636941269040108, "logps/chosen": -1.5159069299697876, "logps/rejected": -1.813144326210022, "loss": 0.5279, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5159069299697876, "rewards/margins": 0.2972373366355896, "rewards/rejected": -1.813144326210022, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 9.150078091336669, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.06936490535736084, "logits/rejected": 0.11537766456604004, "logps/chosen": -1.4352937936782837, "logps/rejected": -1.774214506149292, "loss": 0.4759, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4352937936782837, "rewards/margins": 0.3389210104942322, "rewards/rejected": -1.774214506149292, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 8.592046808392759, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.028295382857322693, "logits/rejected": 0.14062240719795227, "logps/chosen": -1.4800852537155151, "logps/rejected": -1.985321044921875, "loss": 0.4529, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4800852537155151, "rewards/margins": 0.5052357912063599, "rewards/rejected": -1.985321044921875, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 6.267182092694673, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.014219577424228191, "logits/rejected": 0.156729057431221, "logps/chosen": -1.4822317361831665, "logps/rejected": -1.8133710622787476, "loss": 0.508, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4822317361831665, "rewards/margins": 0.331139475107193, "rewards/rejected": -1.8133710622787476, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 5.986896965246944, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.14449085295200348, "logits/rejected": 0.1369493305683136, "logps/chosen": -1.529266595840454, "logps/rejected": -1.9521434307098389, "loss": 0.5008, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.529266595840454, "rewards/margins": 0.42287692427635193, "rewards/rejected": -1.9521434307098389, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 5.70299468181621, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.11426404863595963, "logits/rejected": 0.15424251556396484, "logps/chosen": -1.5450958013534546, "logps/rejected": -2.0076801776885986, "loss": 0.5173, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5450958013534546, "rewards/margins": 0.4625841975212097, "rewards/rejected": -2.0076801776885986, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 5.825775960834924, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.014088431373238564, "logits/rejected": 0.05968545004725456, "logps/chosen": -1.6076838970184326, "logps/rejected": -1.920397162437439, "loss": 0.5454, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6076838970184326, "rewards/margins": 0.3127134442329407, "rewards/rejected": -1.920397162437439, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 4.946140113225051, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.02006353996694088, "logits/rejected": 0.07335661351680756, "logps/chosen": -1.5875966548919678, "logps/rejected": -1.9044586420059204, "loss": 0.5245, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5875966548919678, "rewards/margins": 0.3168618977069855, "rewards/rejected": -1.9044586420059204, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 16.262465483292893, "learning_rate": 9.044352511642661e-07, "logits/chosen": 0.005182269029319286, "logits/rejected": 0.021438756957650185, "logps/chosen": -1.4802757501602173, "logps/rejected": -1.729795217514038, "loss": 0.5379, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4802757501602173, "rewards/margins": 0.24951967597007751, "rewards/rejected": -1.729795217514038, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 5.412141510807797, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.020915767177939415, "logits/rejected": 0.09182947129011154, "logps/chosen": -1.47804856300354, "logps/rejected": -1.7549498081207275, "loss": 0.5164, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.47804856300354, "rewards/margins": 0.2769010663032532, "rewards/rejected": -1.7549498081207275, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 8.837963633141088, "learning_rate": 9.025959508580436e-07, "logits/chosen": 0.018215443938970566, "logits/rejected": 0.2762666344642639, "logps/chosen": -1.522731065750122, "logps/rejected": -1.9062726497650146, "loss": 0.4916, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.522731065750122, "rewards/margins": 0.3835414946079254, "rewards/rejected": -1.9062726497650146, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 5.154834510424645, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.0346483439207077, "logits/rejected": 0.13558995723724365, "logps/chosen": -1.4193522930145264, "logps/rejected": -1.8866379261016846, "loss": 0.4634, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4193522930145264, "rewards/margins": 0.46728554368019104, "rewards/rejected": -1.8866379261016846, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 10.008208018923805, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.08091981709003448, "logits/rejected": 0.011999507434666157, "logps/chosen": -1.4495495557785034, "logps/rejected": -1.7186473608016968, "loss": 0.5119, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4495495557785034, "rewards/margins": 0.2690978944301605, "rewards/rejected": -1.7186473608016968, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 8.36693448648541, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.012645977549254894, "logits/rejected": 0.07616965472698212, "logps/chosen": -1.6090993881225586, "logps/rejected": -1.8103091716766357, "loss": 0.5713, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6090993881225586, "rewards/margins": 0.20120970904827118, "rewards/rejected": -1.8103091716766357, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 3.6132442909474047, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.1480007767677307, "logits/rejected": 0.057016193866729736, "logps/chosen": -1.5228550434112549, "logps/rejected": -1.9889719486236572, "loss": 0.4765, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5228550434112549, "rewards/margins": 0.46611684560775757, "rewards/rejected": -1.9889719486236572, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.22571302950382233, "eval_logits/rejected": 0.3197654187679291, "eval_logps/chosen": -1.4993666410446167, "eval_logps/rejected": -1.8829439878463745, "eval_loss": 0.4934570789337158, "eval_rewards/accuracies": 0.637982189655304, "eval_rewards/chosen": -1.4993666410446167, "eval_rewards/margins": 0.38357746601104736, "eval_rewards/rejected": -1.8829439878463745, "eval_runtime": 40.529, "eval_samples_per_second": 33.186, "eval_steps_per_second": 8.315, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 6.240186464854856, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.13780352473258972, "logits/rejected": 0.09622332453727722, "logps/chosen": -1.481416940689087, "logps/rejected": -1.8968534469604492, "loss": 0.4796, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.481416940689087, "rewards/margins": 0.4154367446899414, "rewards/rejected": -1.8968534469604492, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 4.586748424735531, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.1500522494316101, "logits/rejected": 0.02734406292438507, "logps/chosen": -1.4626126289367676, "logps/rejected": -1.9681785106658936, "loss": 0.4687, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4626126289367676, "rewards/margins": 0.5055657625198364, "rewards/rejected": -1.9681785106658936, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 8.205566693967892, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.10353459417819977, "logits/rejected": 0.00754410307854414, "logps/chosen": -1.5526597499847412, "logps/rejected": -1.995067834854126, "loss": 0.5116, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5526597499847412, "rewards/margins": 0.4424077868461609, "rewards/rejected": -1.995067834854126, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 5.975262229898736, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.07767478376626968, "logits/rejected": 0.06707130372524261, "logps/chosen": -1.433227777481079, "logps/rejected": -1.8736705780029297, "loss": 0.4812, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.433227777481079, "rewards/margins": 0.44044286012649536, "rewards/rejected": -1.8736705780029297, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 6.694091566051099, "learning_rate": 8.941267982915213e-07, "logits/chosen": -0.013508329167962074, "logits/rejected": 0.030974358320236206, "logps/chosen": -1.6147053241729736, "logps/rejected": -1.84378182888031, "loss": 0.5763, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6147053241729736, "rewards/margins": 0.2290763556957245, "rewards/rejected": -1.84378182888031, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 6.531324337059562, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.06521056592464447, "logits/rejected": 0.08396237343549728, "logps/chosen": -1.5116019248962402, "logps/rejected": -1.8753362894058228, "loss": 0.512, "rewards/accuracies": 0.625, "rewards/chosen": -1.5116019248962402, "rewards/margins": 0.3637344241142273, "rewards/rejected": -1.8753362894058228, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 5.366529845516928, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.17120163142681122, "logits/rejected": -0.053713906556367874, "logps/chosen": -1.3891178369522095, "logps/rejected": -1.8246076107025146, "loss": 0.4738, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3891178369522095, "rewards/margins": 0.4354899525642395, "rewards/rejected": -1.8246076107025146, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 5.93490788918495, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.13274401426315308, "logits/rejected": -0.008592364378273487, "logps/chosen": -1.441159725189209, "logps/rejected": -1.9064738750457764, "loss": 0.4765, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.441159725189209, "rewards/margins": 0.4653142988681793, "rewards/rejected": -1.9064738750457764, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 5.396190516866145, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.05011202022433281, "logits/rejected": 0.007425924297422171, "logps/chosen": -1.4389212131500244, "logps/rejected": -1.73294997215271, "loss": 0.5025, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4389212131500244, "rewards/margins": 0.29402872920036316, "rewards/rejected": -1.73294997215271, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 8.18119462865105, "learning_rate": 8.892874524469537e-07, "logits/chosen": -0.015568910166621208, "logits/rejected": 0.04986589774489403, "logps/chosen": -1.448279857635498, "logps/rejected": -1.8130333423614502, "loss": 0.4716, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.448279857635498, "rewards/margins": 0.36475324630737305, "rewards/rejected": -1.8130333423614502, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 6.204518874135094, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.09690272063016891, "logits/rejected": 0.018508389592170715, "logps/chosen": -1.4901759624481201, "logps/rejected": -1.789602279663086, "loss": 0.5126, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4901759624481201, "rewards/margins": 0.29942628741264343, "rewards/rejected": -1.789602279663086, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 5.873114000665186, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.0340433306992054, "logits/rejected": -0.011099529452621937, "logps/chosen": -1.3688690662384033, "logps/rejected": -1.6865575313568115, "loss": 0.4949, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3688690662384033, "rewards/margins": 0.3176884353160858, "rewards/rejected": -1.6865575313568115, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 4.075123034425838, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.04490387439727783, "logits/rejected": 0.024842102080583572, "logps/chosen": -1.5182523727416992, "logps/rejected": -1.7427375316619873, "loss": 0.5479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5182523727416992, "rewards/margins": 0.22448499500751495, "rewards/rejected": -1.7427375316619873, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 5.118099509963928, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.003239439334720373, "logits/rejected": 0.16879113018512726, "logps/chosen": -1.5358818769454956, "logps/rejected": -1.760401725769043, "loss": 0.5705, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5358818769454956, "rewards/margins": 0.2245200127363205, "rewards/rejected": -1.760401725769043, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 5.4976592326976945, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.015959907323122025, "logits/rejected": 0.20223025977611542, "logps/chosen": -1.5157901048660278, "logps/rejected": -1.9324439764022827, "loss": 0.4976, "rewards/accuracies": 0.625, "rewards/chosen": -1.5157901048660278, "rewards/margins": 0.41665396094322205, "rewards/rejected": -1.9324439764022827, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 6.033635648010546, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.11629164218902588, "logits/rejected": -0.0010228529572486877, "logps/chosen": -1.527944803237915, "logps/rejected": -1.7946611642837524, "loss": 0.5247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.527944803237915, "rewards/margins": 0.266716331243515, "rewards/rejected": -1.7946611642837524, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 7.6341954017791585, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.07460661232471466, "logits/rejected": 0.16159269213676453, "logps/chosen": -1.4611696004867554, "logps/rejected": -1.8025248050689697, "loss": 0.5328, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4611696004867554, "rewards/margins": 0.3413551449775696, "rewards/rejected": -1.8025248050689697, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 5.136998915165465, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.07986936718225479, "logits/rejected": 0.11600703001022339, "logps/chosen": -1.3908650875091553, "logps/rejected": -1.7613322734832764, "loss": 0.464, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3908650875091553, "rewards/margins": 0.37046709656715393, "rewards/rejected": -1.7613322734832764, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 7.93364783787613, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.18879325687885284, "logits/rejected": -0.03199527785181999, "logps/chosen": -1.4757874011993408, "logps/rejected": -1.7853939533233643, "loss": 0.5125, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4757874011993408, "rewards/margins": 0.30960649251937866, "rewards/rejected": -1.7853939533233643, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 5.9358795019616535, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.03666179999709129, "logits/rejected": -0.08555471152067184, "logps/chosen": -1.528098225593567, "logps/rejected": -1.680551290512085, "loss": 0.5557, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.528098225593567, "rewards/margins": 0.15245315432548523, "rewards/rejected": -1.680551290512085, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 8.579269680670874, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.055487632751464844, "logits/rejected": 0.09787547588348389, "logps/chosen": -1.4669617414474487, "logps/rejected": -1.7616409063339233, "loss": 0.4996, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4669617414474487, "rewards/margins": 0.2946791648864746, "rewards/rejected": -1.7616409063339233, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 5.463037939699619, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.0742371529340744, "logits/rejected": 0.011323767714202404, "logps/chosen": -1.4721739292144775, "logps/rejected": -1.7645984888076782, "loss": 0.5021, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4721739292144775, "rewards/margins": 0.29242438077926636, "rewards/rejected": -1.7645984888076782, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 6.880759096596561, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.06508682668209076, "logits/rejected": 0.14892472326755524, "logps/chosen": -1.5663446187973022, "logps/rejected": -1.8233978748321533, "loss": 0.5572, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5663446187973022, "rewards/margins": 0.2570531964302063, "rewards/rejected": -1.8233978748321533, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 6.505371290412826, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.04669763892889023, "logits/rejected": 0.030667606741189957, "logps/chosen": -1.3593701124191284, "logps/rejected": -1.8305232524871826, "loss": 0.4563, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3593701124191284, "rewards/margins": 0.4711533188819885, "rewards/rejected": -1.8305232524871826, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 4.333614184584677, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.025619516149163246, "logits/rejected": 0.09301630407571793, "logps/chosen": -1.5569355487823486, "logps/rejected": -2.016434669494629, "loss": 0.4889, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5569355487823486, "rewards/margins": 0.45949918031692505, "rewards/rejected": -2.016434669494629, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 6.0463693214439385, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.060136694461107254, "logits/rejected": 0.11894433200359344, "logps/chosen": -1.4942710399627686, "logps/rejected": -1.7523078918457031, "loss": 0.53, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4942710399627686, "rewards/margins": 0.2580370008945465, "rewards/rejected": -1.7523078918457031, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 6.948697198910093, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.13472218811511993, "logits/rejected": 0.04240645468235016, "logps/chosen": -1.4713555574417114, "logps/rejected": -2.108682155609131, "loss": 0.4491, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4713555574417114, "rewards/margins": 0.6373263597488403, "rewards/rejected": -2.108682155609131, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 10.038714215029996, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.062398456037044525, "logits/rejected": 0.10127653926610947, "logps/chosen": -1.5524652004241943, "logps/rejected": -1.9174381494522095, "loss": 0.4955, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5524652004241943, "rewards/margins": 0.3649727404117584, "rewards/rejected": -1.9174381494522095, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 4.306870887553488, "learning_rate": 8.700471013287424e-07, "logits/chosen": 0.02232358232140541, "logits/rejected": 0.06499762833118439, "logps/chosen": -1.5197092294692993, "logps/rejected": -1.8415558338165283, "loss": 0.5141, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5197092294692993, "rewards/margins": 0.3218465745449066, "rewards/rejected": -1.8415558338165283, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 11.433375611557802, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.027724886313080788, "logits/rejected": 0.04084404557943344, "logps/chosen": -1.5688155889511108, "logps/rejected": -1.9133634567260742, "loss": 0.5265, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5688155889511108, "rewards/margins": 0.34454816579818726, "rewards/rejected": -1.9133634567260742, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 5.308903913788746, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.06558410823345184, "logits/rejected": 0.044756826013326645, "logps/chosen": -1.493861198425293, "logps/rejected": -1.8257402181625366, "loss": 0.4983, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.493861198425293, "rewards/margins": 0.3318789601325989, "rewards/rejected": -1.8257402181625366, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 6.928642399276298, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.04401123896241188, "logits/rejected": 0.08938965946435928, "logps/chosen": -1.4386156797409058, "logps/rejected": -1.7950365543365479, "loss": 0.4864, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4386156797409058, "rewards/margins": 0.35642069578170776, "rewards/rejected": -1.7950365543365479, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 5.961246096550599, "learning_rate": 8.658290552963827e-07, "logits/chosen": 0.0432157889008522, "logits/rejected": 0.06540998071432114, "logps/chosen": -1.4995759725570679, "logps/rejected": -1.822391152381897, "loss": 0.5112, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4995759725570679, "rewards/margins": 0.3228151202201843, "rewards/rejected": -1.822391152381897, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 5.829287536662387, "learning_rate": 8.647656449367966e-07, "logits/chosen": 0.03461442142724991, "logits/rejected": 0.19764934480190277, "logps/chosen": -1.537506341934204, "logps/rejected": -1.7612464427947998, "loss": 0.5214, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.537506341934204, "rewards/margins": 0.22374019026756287, "rewards/rejected": -1.7612464427947998, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 5.668594051643107, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.04334365576505661, "logits/rejected": 0.06287690252065659, "logps/chosen": -1.45168137550354, "logps/rejected": -1.7881801128387451, "loss": 0.4913, "rewards/accuracies": 0.65625, "rewards/chosen": -1.45168137550354, "rewards/margins": 0.33649885654449463, "rewards/rejected": -1.7881801128387451, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 5.100612642504561, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.06139695644378662, "logits/rejected": 0.06763395667076111, "logps/chosen": -1.413381576538086, "logps/rejected": -1.777255654335022, "loss": 0.4785, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.413381576538086, "rewards/margins": 0.36387428641319275, "rewards/rejected": -1.777255654335022, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 5.936028957421941, "learning_rate": 8.615542215511389e-07, "logits/chosen": 0.06372909247875214, "logits/rejected": 0.14290915429592133, "logps/chosen": -1.3759276866912842, "logps/rejected": -1.5791761875152588, "loss": 0.5035, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3759276866912842, "rewards/margins": 0.203248530626297, "rewards/rejected": -1.5791761875152588, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 6.744215812642503, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.05716240406036377, "logits/rejected": 0.11519630253314972, "logps/chosen": -1.494706630706787, "logps/rejected": -1.747436761856079, "loss": 0.5174, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.494706630706787, "rewards/margins": 0.2527301013469696, "rewards/rejected": -1.747436761856079, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 4.711807507094175, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.07095207273960114, "logits/rejected": 0.08118069916963577, "logps/chosen": -1.4247040748596191, "logps/rejected": -1.8081430196762085, "loss": 0.4799, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4247040748596191, "rewards/margins": 0.3834388852119446, "rewards/rejected": -1.8081430196762085, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 5.005290181363208, "learning_rate": 8.58311228163888e-07, "logits/chosen": 0.003749051596969366, "logits/rejected": 0.08958594501018524, "logps/chosen": -1.4678596258163452, "logps/rejected": -1.7771354913711548, "loss": 0.4961, "rewards/accuracies": 0.625, "rewards/chosen": -1.4678596258163452, "rewards/margins": 0.3092755079269409, "rewards/rejected": -1.7771354913711548, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 5.692419950018119, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.14677509665489197, "logits/rejected": -0.002401140984147787, "logps/chosen": -1.4272067546844482, "logps/rejected": -1.944446325302124, "loss": 0.4446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4272067546844482, "rewards/margins": 0.5172396898269653, "rewards/rejected": -1.944446325302124, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 4.921507978567713, "learning_rate": 8.561318334069511e-07, "logits/chosen": 0.005840711295604706, "logits/rejected": 0.1641826182603836, "logps/chosen": -1.4567228555679321, "logps/rejected": -1.79598867893219, "loss": 0.4979, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4567228555679321, "rewards/margins": 0.3392660319805145, "rewards/rejected": -1.79598867893219, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 5.549369897229857, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.03739506006240845, "logits/rejected": 0.11871230602264404, "logps/chosen": -1.5096032619476318, "logps/rejected": -1.8402252197265625, "loss": 0.4979, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5096032619476318, "rewards/margins": 0.330622136592865, "rewards/rejected": -1.8402252197265625, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 6.3556884538300285, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.01189727708697319, "logits/rejected": 0.07292185723781586, "logps/chosen": -1.5445812940597534, "logps/rejected": -1.8706490993499756, "loss": 0.5003, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5445812940597534, "rewards/margins": 0.32606783509254456, "rewards/rejected": -1.8706490993499756, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 6.65077539509938, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.09568434208631516, "logits/rejected": 0.07311873137950897, "logps/chosen": -1.459269404411316, "logps/rejected": -1.9391835927963257, "loss": 0.4883, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.459269404411316, "rewards/margins": 0.47991424798965454, "rewards/rejected": -1.9391835927963257, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 4.621103798919587, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.08115889877080917, "logits/rejected": 0.12485666573047638, "logps/chosen": -1.5900990962982178, "logps/rejected": -1.8542354106903076, "loss": 0.5179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5900990962982178, "rewards/margins": 0.2641362249851227, "rewards/rejected": -1.8542354106903076, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 6.554115567702426, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.054566167294979095, "logits/rejected": 0.017829354852437973, "logps/chosen": -1.5193006992340088, "logps/rejected": -1.7150980234146118, "loss": 0.5404, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5193006992340088, "rewards/margins": 0.19579732418060303, "rewards/rejected": -1.7150980234146118, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 6.754777354949413, "learning_rate": 8.495110657042488e-07, "logits/chosen": 0.004966604523360729, "logits/rejected": 0.12692096829414368, "logps/chosen": -1.5372395515441895, "logps/rejected": -1.9817256927490234, "loss": 0.4742, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5372395515441895, "rewards/margins": 0.4444860816001892, "rewards/rejected": -1.9817256927490234, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 7.868815607531026, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.024876123294234276, "logits/rejected": 0.08235087990760803, "logps/chosen": -1.5736982822418213, "logps/rejected": -1.877713918685913, "loss": 0.5479, "rewards/accuracies": 0.625, "rewards/chosen": -1.5736982822418213, "rewards/margins": 0.30401548743247986, "rewards/rejected": -1.877713918685913, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 11.366120819447582, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.19370611011981964, "logits/rejected": -0.05516909435391426, "logps/chosen": -1.6420161724090576, "logps/rejected": -1.8877147436141968, "loss": 0.5638, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6420161724090576, "rewards/margins": 0.24569861590862274, "rewards/rejected": -1.8877147436141968, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 13.719420465380503, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.07690098136663437, "logits/rejected": 0.05073133856058121, "logps/chosen": -1.5330612659454346, "logps/rejected": -1.9170949459075928, "loss": 0.4988, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5330612659454346, "rewards/margins": 0.3840336799621582, "rewards/rejected": -1.9170949459075928, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 7.557758620373599, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.04783378168940544, "logits/rejected": 0.048353202641010284, "logps/chosen": -1.6503095626831055, "logps/rejected": -1.9672205448150635, "loss": 0.5136, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6503095626831055, "rewards/margins": 0.3169107735157013, "rewards/rejected": -1.9672205448150635, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 5.359547484492175, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.05837645381689072, "logits/rejected": 0.02458125166594982, "logps/chosen": -1.4253261089324951, "logps/rejected": -1.788265585899353, "loss": 0.4929, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4253261089324951, "rewards/margins": 0.36293941736221313, "rewards/rejected": -1.788265585899353, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 8.99576964337197, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.0869135931134224, "logits/rejected": 0.07817542552947998, "logps/chosen": -1.572312831878662, "logps/rejected": -1.8892767429351807, "loss": 0.5084, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.572312831878662, "rewards/margins": 0.3169638514518738, "rewards/rejected": -1.8892767429351807, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 12.990460481237285, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.13233283162117004, "logits/rejected": 0.002775478409603238, "logps/chosen": -1.5400633811950684, "logps/rejected": -1.7788797616958618, "loss": 0.5347, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5400633811950684, "rewards/margins": 0.23881638050079346, "rewards/rejected": -1.7788797616958618, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 5.455906774625539, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.030131524428725243, "logits/rejected": 0.1480594426393509, "logps/chosen": -1.4806350469589233, "logps/rejected": -1.9758703708648682, "loss": 0.4642, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4806350469589233, "rewards/margins": 0.49523526430130005, "rewards/rejected": -1.9758703708648682, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 6.183705237299571, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.08053741604089737, "logits/rejected": -0.0017719387542456388, "logps/chosen": -1.4277750253677368, "logps/rejected": -1.7450214624404907, "loss": 0.512, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4277750253677368, "rewards/margins": 0.31724652647972107, "rewards/rejected": -1.7450214624404907, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 5.92106751829389, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.09591381251811981, "logits/rejected": 0.1108134537935257, "logps/chosen": -1.5133988857269287, "logps/rejected": -1.8597700595855713, "loss": 0.4745, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5133988857269287, "rewards/margins": 0.3463711440563202, "rewards/rejected": -1.8597700595855713, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 6.271521690268862, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.04966556280851364, "logits/rejected": 0.0062907664105296135, "logps/chosen": -1.4901273250579834, "logps/rejected": -1.7681535482406616, "loss": 0.4859, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4901273250579834, "rewards/margins": 0.27802610397338867, "rewards/rejected": -1.7681535482406616, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 5.178194950192017, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.17626014351844788, "logits/rejected": 0.0876087099313736, "logps/chosen": -1.5115207433700562, "logps/rejected": -1.8606668710708618, "loss": 0.4731, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5115207433700562, "rewards/margins": 0.3491460680961609, "rewards/rejected": -1.8606668710708618, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 4.100645684270545, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.09048594534397125, "logits/rejected": 0.06907562166452408, "logps/chosen": -1.5318912267684937, "logps/rejected": -1.8323986530303955, "loss": 0.5096, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5318912267684937, "rewards/margins": 0.300507515668869, "rewards/rejected": -1.8323986530303955, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 5.857895232685873, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.19771865010261536, "logits/rejected": -0.06738439202308655, "logps/chosen": -1.5121411085128784, "logps/rejected": -1.7908271551132202, "loss": 0.5012, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5121411085128784, "rewards/margins": 0.2786860167980194, "rewards/rejected": -1.7908271551132202, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 4.898597011190574, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.12142401933670044, "logits/rejected": -0.04009947180747986, "logps/chosen": -1.517547845840454, "logps/rejected": -1.7601168155670166, "loss": 0.5196, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.517547845840454, "rewards/margins": 0.2425689995288849, "rewards/rejected": -1.7601168155670166, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 6.117338166444186, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.15110328793525696, "logits/rejected": -0.07298944145441055, "logps/chosen": -1.3734453916549683, "logps/rejected": -1.735055685043335, "loss": 0.4662, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3734453916549683, "rewards/margins": 0.3616102933883667, "rewards/rejected": -1.735055685043335, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 7.529065008586553, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.2617657482624054, "logits/rejected": -0.047545768320560455, "logps/chosen": -1.5493519306182861, "logps/rejected": -1.8553383350372314, "loss": 0.5054, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5493519306182861, "rewards/margins": 0.3059864640235901, "rewards/rejected": -1.8553383350372314, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 7.119850916010988, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.17046763002872467, "logits/rejected": -0.03215178847312927, "logps/chosen": -1.425768494606018, "logps/rejected": -1.825882911682129, "loss": 0.452, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.425768494606018, "rewards/margins": 0.4001145362854004, "rewards/rejected": -1.825882911682129, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 6.900915292585026, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.219212144613266, "logits/rejected": -0.02634129486978054, "logps/chosen": -1.5492130517959595, "logps/rejected": -1.9858119487762451, "loss": 0.4614, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5492130517959595, "rewards/margins": 0.4365990161895752, "rewards/rejected": -1.9858119487762451, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 5.403636311969691, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.13509558141231537, "logits/rejected": -0.007983637042343616, "logps/chosen": -1.5350006818771362, "logps/rejected": -2.0086822509765625, "loss": 0.4578, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5350006818771362, "rewards/margins": 0.47368139028549194, "rewards/rejected": -2.0086822509765625, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 4.154295052380319, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.23039202392101288, "logits/rejected": -0.058211155235767365, "logps/chosen": -1.568195104598999, "logps/rejected": -2.110253095626831, "loss": 0.4582, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.568195104598999, "rewards/margins": 0.542057991027832, "rewards/rejected": -2.110253095626831, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 7.932019867779892, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.1511477679014206, "logits/rejected": 0.07033874094486237, "logps/chosen": -1.6436907052993774, "logps/rejected": -2.076551914215088, "loss": 0.4907, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6436907052993774, "rewards/margins": 0.4328608512878418, "rewards/rejected": -2.076551914215088, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 5.366643623315821, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.1267443299293518, "logits/rejected": -0.057616185396909714, "logps/chosen": -1.482744574546814, "logps/rejected": -1.8760229349136353, "loss": 0.4666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.482744574546814, "rewards/margins": 0.3932782709598541, "rewards/rejected": -1.8760229349136353, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 5.877101502131333, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.18132105469703674, "logits/rejected": -0.1439904272556305, "logps/chosen": -1.533780813217163, "logps/rejected": -1.8776013851165771, "loss": 0.4962, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.533780813217163, "rewards/margins": 0.34382060170173645, "rewards/rejected": -1.8776013851165771, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 7.367038468639522, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.0965513214468956, "logits/rejected": -0.026915382593870163, "logps/chosen": -1.459919810295105, "logps/rejected": -2.0469250679016113, "loss": 0.4359, "rewards/accuracies": 0.65625, "rewards/chosen": -1.459919810295105, "rewards/margins": 0.5870050191879272, "rewards/rejected": -2.0469250679016113, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 4.675354107373429, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.13914284110069275, "logits/rejected": 0.01275468897074461, "logps/chosen": -1.4393274784088135, "logps/rejected": -1.8212321996688843, "loss": 0.4713, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4393274784088135, "rewards/margins": 0.3819049298763275, "rewards/rejected": -1.8212321996688843, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 6.892223003096059, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.05053325742483139, "logits/rejected": 0.04154837876558304, "logps/chosen": -1.3958252668380737, "logps/rejected": -1.7833083868026733, "loss": 0.4615, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3958252668380737, "rewards/margins": 0.3874832093715668, "rewards/rejected": -1.7833083868026733, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 5.8520648016525465, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.027234962210059166, "logits/rejected": 0.09547891467809677, "logps/chosen": -1.5229977369308472, "logps/rejected": -1.9171310663223267, "loss": 0.4986, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5229977369308472, "rewards/margins": 0.3941333591938019, "rewards/rejected": -1.9171310663223267, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 6.5077629122760365, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.10885421186685562, "logits/rejected": 0.04782998189330101, "logps/chosen": -1.521545171737671, "logps/rejected": -1.7875175476074219, "loss": 0.521, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.521545171737671, "rewards/margins": 0.26597243547439575, "rewards/rejected": -1.7875175476074219, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 7.612941084974896, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.12629219889640808, "logits/rejected": -0.02760654129087925, "logps/chosen": -1.4988399744033813, "logps/rejected": -1.8203222751617432, "loss": 0.5028, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4988399744033813, "rewards/margins": 0.3214821219444275, "rewards/rejected": -1.8203222751617432, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 4.823318082953958, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.0352267324924469, "logits/rejected": 0.12130987644195557, "logps/chosen": -1.4055982828140259, "logps/rejected": -1.9282000064849854, "loss": 0.434, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4055982828140259, "rewards/margins": 0.5226019024848938, "rewards/rejected": -1.9282000064849854, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 8.34013866621319, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.1113995686173439, "logits/rejected": 0.05337010696530342, "logps/chosen": -1.5497267246246338, "logps/rejected": -1.876712441444397, "loss": 0.5542, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5497267246246338, "rewards/margins": 0.32698577642440796, "rewards/rejected": -1.876712441444397, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.2090035080909729, "eval_logits/rejected": 0.3053840100765228, "eval_logps/chosen": -1.4686795473098755, "eval_logps/rejected": -1.8581894636154175, "eval_loss": 0.4872437119483948, "eval_rewards/accuracies": 0.6372403502464294, "eval_rewards/chosen": -1.4686795473098755, "eval_rewards/margins": 0.3895101249217987, "eval_rewards/rejected": -1.8581894636154175, "eval_runtime": 40.5634, "eval_samples_per_second": 33.158, "eval_steps_per_second": 8.308, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 6.236373702205982, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.08853862434625626, "logits/rejected": 0.002719242125749588, "logps/chosen": -1.5109493732452393, "logps/rejected": -1.7682702541351318, "loss": 0.5376, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5109493732452393, "rewards/margins": 0.25732117891311646, "rewards/rejected": -1.7682702541351318, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 6.354163447035549, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.02527523972094059, "logits/rejected": 0.09005782753229141, "logps/chosen": -1.3973536491394043, "logps/rejected": -1.7135419845581055, "loss": 0.477, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3973536491394043, "rewards/margins": 0.31618839502334595, "rewards/rejected": -1.7135419845581055, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 6.72804178976049, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.1025487557053566, "logits/rejected": 0.04611871764063835, "logps/chosen": -1.3783191442489624, "logps/rejected": -1.765669584274292, "loss": 0.4628, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3783191442489624, "rewards/margins": 0.3873503804206848, "rewards/rejected": -1.765669584274292, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 6.339112259399924, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.1630805879831314, "logits/rejected": 0.036900248378515244, "logps/chosen": -1.5076767206192017, "logps/rejected": -1.9129406213760376, "loss": 0.4781, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5076767206192017, "rewards/margins": 0.4052638113498688, "rewards/rejected": -1.9129406213760376, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 5.7952330260114016, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.025143707171082497, "logits/rejected": 0.1194809228181839, "logps/chosen": -1.4730809926986694, "logps/rejected": -1.935746431350708, "loss": 0.469, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4730809926986694, "rewards/margins": 0.46266525983810425, "rewards/rejected": -1.935746431350708, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 5.041981060931971, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.11958135664463043, "logits/rejected": -0.09215328097343445, "logps/chosen": -1.4268603324890137, "logps/rejected": -1.736210823059082, "loss": 0.4938, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4268603324890137, "rewards/margins": 0.3093504309654236, "rewards/rejected": -1.736210823059082, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 8.012147116656964, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.13880446553230286, "logits/rejected": 0.04056922346353531, "logps/chosen": -1.4889777898788452, "logps/rejected": -1.9353151321411133, "loss": 0.4874, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4889777898788452, "rewards/margins": 0.44633737206459045, "rewards/rejected": -1.9353151321411133, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 31.464722109184223, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.19712121784687042, "logits/rejected": -0.11911998689174652, "logps/chosen": -1.5142052173614502, "logps/rejected": -1.8804069757461548, "loss": 0.5089, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5142052173614502, "rewards/margins": 0.3662016987800598, "rewards/rejected": -1.8804069757461548, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 7.423857743103119, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.04131780192255974, "logits/rejected": 0.001157002174295485, "logps/chosen": -1.5440315008163452, "logps/rejected": -1.9551403522491455, "loss": 0.4788, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5440315008163452, "rewards/margins": 0.4111087918281555, "rewards/rejected": -1.9551403522491455, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 7.2700464692904205, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.1901923269033432, "logits/rejected": -0.05527256056666374, "logps/chosen": -1.5041463375091553, "logps/rejected": -1.8498035669326782, "loss": 0.5074, "rewards/accuracies": 0.625, "rewards/chosen": -1.5041463375091553, "rewards/margins": 0.34565702080726624, "rewards/rejected": -1.8498035669326782, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 5.920791242809537, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.08778829127550125, "logits/rejected": 0.0543634295463562, "logps/chosen": -1.491448163986206, "logps/rejected": -1.926989197731018, "loss": 0.4736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.491448163986206, "rewards/margins": 0.4355408251285553, "rewards/rejected": -1.926989197731018, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 6.031379560153673, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.05811872333288193, "logits/rejected": -0.0019346773624420166, "logps/chosen": -1.5684421062469482, "logps/rejected": -2.0479769706726074, "loss": 0.4707, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5684421062469482, "rewards/margins": 0.4795348644256592, "rewards/rejected": -2.0479769706726074, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 6.220839355506025, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.07601083815097809, "logits/rejected": 0.06897013634443283, "logps/chosen": -1.5472190380096436, "logps/rejected": -1.9796726703643799, "loss": 0.4874, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5472190380096436, "rewards/margins": 0.43245354294776917, "rewards/rejected": -1.9796726703643799, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 7.841546356690113, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.0913565382361412, "logits/rejected": 0.06955388933420181, "logps/chosen": -1.4142158031463623, "logps/rejected": -1.9294054508209229, "loss": 0.4368, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4142158031463623, "rewards/margins": 0.5151897668838501, "rewards/rejected": -1.9294054508209229, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 8.298268242527833, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.12061178684234619, "logits/rejected": -0.005510507617145777, "logps/chosen": -1.565507411956787, "logps/rejected": -2.0275635719299316, "loss": 0.4819, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.565507411956787, "rewards/margins": 0.4620559811592102, "rewards/rejected": -2.0275635719299316, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 5.3792000674785685, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.1755526065826416, "logits/rejected": 0.0011181235313415527, "logps/chosen": -1.4745380878448486, "logps/rejected": -1.9291636943817139, "loss": 0.4431, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4745380878448486, "rewards/margins": 0.4546256959438324, "rewards/rejected": -1.9291636943817139, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 10.21606678379247, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.14807187020778656, "logits/rejected": -0.027882719412446022, "logps/chosen": -1.552650809288025, "logps/rejected": -2.0372233390808105, "loss": 0.4687, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.552650809288025, "rewards/margins": 0.4845725893974304, "rewards/rejected": -2.0372233390808105, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 6.003899535182584, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.19446441531181335, "logits/rejected": -0.18677999079227448, "logps/chosen": -1.524290919303894, "logps/rejected": -1.8046903610229492, "loss": 0.5161, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.524290919303894, "rewards/margins": 0.2803994417190552, "rewards/rejected": -1.8046903610229492, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 7.307839594836412, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.15240326523780823, "logits/rejected": 0.002622663974761963, "logps/chosen": -1.7244608402252197, "logps/rejected": -2.0785250663757324, "loss": 0.5284, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7244608402252197, "rewards/margins": 0.35406431555747986, "rewards/rejected": -2.0785250663757324, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 9.089962799529955, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.07988885045051575, "logits/rejected": -0.024965696036815643, "logps/chosen": -1.7074930667877197, "logps/rejected": -2.0484671592712402, "loss": 0.521, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7074930667877197, "rewards/margins": 0.340974360704422, "rewards/rejected": -2.0484671592712402, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 9.273179662484484, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.03168075904250145, "logits/rejected": 0.10370880365371704, "logps/chosen": -1.5638258457183838, "logps/rejected": -2.0377678871154785, "loss": 0.4811, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5638258457183838, "rewards/margins": 0.47394195199012756, "rewards/rejected": -2.0377678871154785, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 10.477463506306018, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.1415366381406784, "logits/rejected": 0.02591909095644951, "logps/chosen": -1.525785207748413, "logps/rejected": -2.136225461959839, "loss": 0.4751, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.525785207748413, "rewards/margins": 0.6104402542114258, "rewards/rejected": -2.136225461959839, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 10.953975752002748, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.01825452782213688, "logits/rejected": 0.04235028475522995, "logps/chosen": -1.4925674200057983, "logps/rejected": -1.9161365032196045, "loss": 0.4852, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4925674200057983, "rewards/margins": 0.4235692620277405, "rewards/rejected": -1.9161365032196045, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 7.179380766016577, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.12350162118673325, "logits/rejected": -0.0759977474808693, "logps/chosen": -1.5033982992172241, "logps/rejected": -1.860610008239746, "loss": 0.4805, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5033982992172241, "rewards/margins": 0.3572116494178772, "rewards/rejected": -1.860610008239746, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 5.9163124287858375, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.09556813538074493, "logits/rejected": -0.03338911384344101, "logps/chosen": -1.4860813617706299, "logps/rejected": -1.9500808715820312, "loss": 0.4476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4860813617706299, "rewards/margins": 0.46399959921836853, "rewards/rejected": -1.9500808715820312, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 12.965854789024, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.0474545881152153, "logits/rejected": 0.21912598609924316, "logps/chosen": -1.4726083278656006, "logps/rejected": -1.9423186779022217, "loss": 0.4658, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4726083278656006, "rewards/margins": 0.4697105288505554, "rewards/rejected": -1.9423186779022217, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 4.895896345129165, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.07785522192716599, "logits/rejected": 0.0888616219162941, "logps/chosen": -1.5202147960662842, "logps/rejected": -2.028317928314209, "loss": 0.4607, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5202147960662842, "rewards/margins": 0.5081032514572144, "rewards/rejected": -2.028317928314209, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 9.61600856344291, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.08627970516681671, "logits/rejected": 0.0657169371843338, "logps/chosen": -1.521667242050171, "logps/rejected": -1.9318917989730835, "loss": 0.4945, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.521667242050171, "rewards/margins": 0.4102245271205902, "rewards/rejected": -1.9318917989730835, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 7.157496335920197, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.06417248398065567, "logits/rejected": 0.03503220155835152, "logps/chosen": -1.407430648803711, "logps/rejected": -1.961836814880371, "loss": 0.4319, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.407430648803711, "rewards/margins": 0.5544062852859497, "rewards/rejected": -1.961836814880371, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 5.518443480534409, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.12172581255435944, "logits/rejected": -0.05900086089968681, "logps/chosen": -1.5806515216827393, "logps/rejected": -2.014547109603882, "loss": 0.4753, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5806515216827393, "rewards/margins": 0.4338955283164978, "rewards/rejected": -2.014547109603882, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 8.791853201539597, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.055629897862672806, "logits/rejected": 0.019487783312797546, "logps/chosen": -1.6698133945465088, "logps/rejected": -2.065542459487915, "loss": 0.5015, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6698133945465088, "rewards/margins": 0.395729124546051, "rewards/rejected": -2.065542459487915, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 8.711075083572526, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.12839803099632263, "logits/rejected": -0.024078816175460815, "logps/chosen": -1.5495182275772095, "logps/rejected": -2.0599279403686523, "loss": 0.4697, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5495182275772095, "rewards/margins": 0.5104095339775085, "rewards/rejected": -2.0599279403686523, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 6.161978324416473, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.10593986511230469, "logits/rejected": 0.08534537255764008, "logps/chosen": -1.5857932567596436, "logps/rejected": -2.1109843254089355, "loss": 0.4657, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5857932567596436, "rewards/margins": 0.5251911282539368, "rewards/rejected": -2.1109843254089355, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 5.268687724416399, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.10057272017002106, "logits/rejected": 0.08127640187740326, "logps/chosen": -1.5055793523788452, "logps/rejected": -2.0212976932525635, "loss": 0.4698, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5055793523788452, "rewards/margins": 0.5157182812690735, "rewards/rejected": -2.0212976932525635, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 5.3938340490868155, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.11679784953594208, "logits/rejected": 0.07069636881351471, "logps/chosen": -1.5785627365112305, "logps/rejected": -1.974876046180725, "loss": 0.4838, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5785627365112305, "rewards/margins": 0.3963134288787842, "rewards/rejected": -1.974876046180725, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 4.958934240276058, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.12677395343780518, "logits/rejected": -0.038053855299949646, "logps/chosen": -1.5308269262313843, "logps/rejected": -2.0098366737365723, "loss": 0.462, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5308269262313843, "rewards/margins": 0.4790096879005432, "rewards/rejected": -2.0098366737365723, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 7.404238077330564, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.015866786241531372, "logits/rejected": 0.13566181063652039, "logps/chosen": -1.5643783807754517, "logps/rejected": -1.9376657009124756, "loss": 0.4934, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5643783807754517, "rewards/margins": 0.373287171125412, "rewards/rejected": -1.9376657009124756, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 6.552315271173891, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.08182038366794586, "logits/rejected": 0.14377811551094055, "logps/chosen": -1.5467259883880615, "logps/rejected": -1.9314239025115967, "loss": 0.4924, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5467259883880615, "rewards/margins": 0.3846980631351471, "rewards/rejected": -1.9314239025115967, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 8.446467150194819, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.030015503987669945, "logits/rejected": 0.07091988623142242, "logps/chosen": -1.6226377487182617, "logps/rejected": -2.1339879035949707, "loss": 0.4738, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6226377487182617, "rewards/margins": 0.5113499164581299, "rewards/rejected": -2.1339879035949707, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 7.795596581815276, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.09162668883800507, "logits/rejected": -0.09633249789476395, "logps/chosen": -1.5701854228973389, "logps/rejected": -1.9686763286590576, "loss": 0.5012, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5701854228973389, "rewards/margins": 0.3984907269477844, "rewards/rejected": -1.9686763286590576, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 4.788257723309533, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.13180482387542725, "logits/rejected": 0.0633571445941925, "logps/chosen": -1.456046462059021, "logps/rejected": -1.9325265884399414, "loss": 0.4466, "rewards/accuracies": 0.6875, "rewards/chosen": -1.456046462059021, "rewards/margins": 0.476480096578598, "rewards/rejected": -1.9325265884399414, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 7.877534464420654, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.16618433594703674, "logits/rejected": -0.025913292542099953, "logps/chosen": -1.377068281173706, "logps/rejected": -1.9640827178955078, "loss": 0.4167, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.377068281173706, "rewards/margins": 0.5870144367218018, "rewards/rejected": -1.9640827178955078, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 10.41178947790382, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.14068502187728882, "logits/rejected": -0.047222062945365906, "logps/chosen": -1.6247119903564453, "logps/rejected": -2.1102609634399414, "loss": 0.486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6247119903564453, "rewards/margins": 0.48554906249046326, "rewards/rejected": -2.1102609634399414, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 7.7059998532261895, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.19745001196861267, "logits/rejected": -0.06136977672576904, "logps/chosen": -1.4447778463363647, "logps/rejected": -1.8753626346588135, "loss": 0.4891, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4447778463363647, "rewards/margins": 0.4305848181247711, "rewards/rejected": -1.8753626346588135, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 6.856383584782873, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.16775110363960266, "logits/rejected": 0.059480249881744385, "logps/chosen": -1.4274357557296753, "logps/rejected": -1.9839521646499634, "loss": 0.436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4274357557296753, "rewards/margins": 0.556516706943512, "rewards/rejected": -1.9839521646499634, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 8.311795573748897, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.15321232378482819, "logits/rejected": 0.006391219794750214, "logps/chosen": -1.5258538722991943, "logps/rejected": -2.1389529705047607, "loss": 0.4514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5258538722991943, "rewards/margins": 0.6130992770195007, "rewards/rejected": -2.1389529705047607, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 7.239034739038024, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.13408470153808594, "logits/rejected": 0.07148827612400055, "logps/chosen": -1.478649616241455, "logps/rejected": -2.0206961631774902, "loss": 0.4605, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.478649616241455, "rewards/margins": 0.5420466065406799, "rewards/rejected": -2.0206961631774902, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 5.679044942962389, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.12887462973594666, "logits/rejected": 0.05280867964029312, "logps/chosen": -1.53232741355896, "logps/rejected": -2.0618908405303955, "loss": 0.4344, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.53232741355896, "rewards/margins": 0.529563307762146, "rewards/rejected": -2.0618908405303955, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 7.346693456795275, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.06396341323852539, "logits/rejected": -0.003896425012499094, "logps/chosen": -1.544514536857605, "logps/rejected": -1.9655109643936157, "loss": 0.4662, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.544514536857605, "rewards/margins": 0.42099618911743164, "rewards/rejected": -1.9655109643936157, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 8.212842075131002, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.006130790803581476, "logits/rejected": 0.02111983858048916, "logps/chosen": -1.5837961435317993, "logps/rejected": -2.1438395977020264, "loss": 0.4364, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5837961435317993, "rewards/margins": 0.5600436329841614, "rewards/rejected": -2.1438395977020264, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 11.899594195647122, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.13546614348888397, "logits/rejected": -0.02214309573173523, "logps/chosen": -1.683683156967163, "logps/rejected": -2.03389048576355, "loss": 0.5436, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.683683156967163, "rewards/margins": 0.35020750761032104, "rewards/rejected": -2.03389048576355, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 6.733707285911048, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.022555997595191002, "logits/rejected": -0.030495155602693558, "logps/chosen": -1.5471341609954834, "logps/rejected": -1.8503811359405518, "loss": 0.5155, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5471341609954834, "rewards/margins": 0.3032470643520355, "rewards/rejected": -1.8503811359405518, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 6.042259996941106, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.13461582362651825, "logits/rejected": -0.03332845866680145, "logps/chosen": -1.6003786325454712, "logps/rejected": -2.042073965072632, "loss": 0.4869, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6003786325454712, "rewards/margins": 0.44169527292251587, "rewards/rejected": -2.042073965072632, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 9.973983650860966, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.20825931429862976, "logits/rejected": -0.01582668349146843, "logps/chosen": -1.6285450458526611, "logps/rejected": -2.174424648284912, "loss": 0.4719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6285450458526611, "rewards/margins": 0.5458796620368958, "rewards/rejected": -2.174424648284912, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 7.7982333854557995, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.08776336163282394, "logits/rejected": 0.06051754951477051, "logps/chosen": -1.5161792039871216, "logps/rejected": -2.1645047664642334, "loss": 0.4251, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5161792039871216, "rewards/margins": 0.6483253240585327, "rewards/rejected": -2.1645047664642334, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 10.26335281979415, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.1657739132642746, "logits/rejected": 0.021909479051828384, "logps/chosen": -1.4380111694335938, "logps/rejected": -1.9401988983154297, "loss": 0.4538, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4380111694335938, "rewards/margins": 0.5021874904632568, "rewards/rejected": -1.9401988983154297, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 5.094221421896467, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.12217359244823456, "logits/rejected": 0.09328801184892654, "logps/chosen": -1.6901514530181885, "logps/rejected": -2.2851309776306152, "loss": 0.456, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6901514530181885, "rewards/margins": 0.5949796438217163, "rewards/rejected": -2.2851309776306152, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 6.857265715462522, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.19276610016822815, "logits/rejected": -0.018395811319351196, "logps/chosen": -1.5114589929580688, "logps/rejected": -1.9960839748382568, "loss": 0.4798, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5114589929580688, "rewards/margins": 0.4846251606941223, "rewards/rejected": -1.9960839748382568, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 8.00198919500667, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.11626338958740234, "logits/rejected": -0.020674359053373337, "logps/chosen": -1.60494863986969, "logps/rejected": -2.084780216217041, "loss": 0.5042, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.60494863986969, "rewards/margins": 0.4798316955566406, "rewards/rejected": -2.084780216217041, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 10.445303024415079, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.07256245613098145, "logits/rejected": 0.06938094645738602, "logps/chosen": -1.5472546815872192, "logps/rejected": -1.9900295734405518, "loss": 0.4955, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5472546815872192, "rewards/margins": 0.44277462363243103, "rewards/rejected": -1.9900295734405518, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 8.480012382064256, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.09636969864368439, "logits/rejected": 0.05780763551592827, "logps/chosen": -1.7210719585418701, "logps/rejected": -2.297027349472046, "loss": 0.469, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7210719585418701, "rewards/margins": 0.5759555101394653, "rewards/rejected": -2.297027349472046, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 7.3005466466982565, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.0425327867269516, "logits/rejected": 0.25758275389671326, "logps/chosen": -1.5626256465911865, "logps/rejected": -2.154857873916626, "loss": 0.4539, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5626256465911865, "rewards/margins": 0.5922321081161499, "rewards/rejected": -2.154857873916626, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 7.074336486475421, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.06643326580524445, "logits/rejected": 0.13686507940292358, "logps/chosen": -1.6021817922592163, "logps/rejected": -2.190265655517578, "loss": 0.46, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6021817922592163, "rewards/margins": 0.5880836844444275, "rewards/rejected": -2.190265655517578, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 5.650619307447635, "learning_rate": 7.286726973755554e-07, "logits/chosen": 0.042232006788253784, "logits/rejected": 0.07846888154745102, "logps/chosen": -1.5645692348480225, "logps/rejected": -2.041264057159424, "loss": 0.4744, "rewards/accuracies": 0.625, "rewards/chosen": -1.5645692348480225, "rewards/margins": 0.4766944944858551, "rewards/rejected": -2.041264057159424, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 6.764563456661936, "learning_rate": 7.272866090342493e-07, "logits/chosen": 0.07977616786956787, "logits/rejected": 0.17584457993507385, "logps/chosen": -1.672327995300293, "logps/rejected": -2.2080042362213135, "loss": 0.4664, "rewards/accuracies": 0.65625, "rewards/chosen": -1.672327995300293, "rewards/margins": 0.5356762409210205, "rewards/rejected": -2.2080042362213135, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 7.345331728604675, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.13734345138072968, "logits/rejected": -0.010082717053592205, "logps/chosen": -1.5450170040130615, "logps/rejected": -1.9848597049713135, "loss": 0.5006, "rewards/accuracies": 0.625, "rewards/chosen": -1.5450170040130615, "rewards/margins": 0.4398427903652191, "rewards/rejected": -1.9848597049713135, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 6.605194335959755, "learning_rate": 7.245078304138335e-07, "logits/chosen": 0.003159067127853632, "logits/rejected": 0.0774504542350769, "logps/chosen": -1.6681016683578491, "logps/rejected": -2.2301762104034424, "loss": 0.4477, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6681016683578491, "rewards/margins": 0.5620743036270142, "rewards/rejected": -2.2301762104034424, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 6.340833401261232, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.13051895797252655, "logits/rejected": 0.07922609150409698, "logps/chosen": -1.6848726272583008, "logps/rejected": -2.1507742404937744, "loss": 0.4895, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6848726272583008, "rewards/margins": 0.4659012258052826, "rewards/rejected": -2.1507742404937744, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 6.666537068929513, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.06071236729621887, "logits/rejected": 0.13457247614860535, "logps/chosen": -1.615939736366272, "logps/rejected": -2.1229145526885986, "loss": 0.4757, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.615939736366272, "rewards/margins": 0.5069748163223267, "rewards/rejected": -2.1229145526885986, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 6.294177671928379, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.09363870322704315, "logits/rejected": 0.07508295774459839, "logps/chosen": -1.6184641122817993, "logps/rejected": -2.0924735069274902, "loss": 0.4899, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6184641122817993, "rewards/margins": 0.4740094244480133, "rewards/rejected": -2.0924735069274902, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 8.598240710289698, "learning_rate": 7.189242433016852e-07, "logits/chosen": 0.02433309331536293, "logits/rejected": 0.18412891030311584, "logps/chosen": -1.4755879640579224, "logps/rejected": -2.1051886081695557, "loss": 0.4532, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4755879640579224, "rewards/margins": 0.6296008825302124, "rewards/rejected": -2.1051886081695557, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 8.33848339446104, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.04005918651819229, "logits/rejected": 0.12352597713470459, "logps/chosen": -1.6038684844970703, "logps/rejected": -2.270421028137207, "loss": 0.4416, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6038684844970703, "rewards/margins": 0.6665524840354919, "rewards/rejected": -2.270421028137207, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 4.873138111835629, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.0017689854139462113, "logits/rejected": 0.15761753916740417, "logps/chosen": -1.5955610275268555, "logps/rejected": -2.1455841064453125, "loss": 0.4663, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5955610275268555, "rewards/margins": 0.5500231981277466, "rewards/rejected": -2.1455841064453125, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 7.563062680451812, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.005323231220245361, "logits/rejected": 0.14324846863746643, "logps/chosen": -1.6632435321807861, "logps/rejected": -2.1169967651367188, "loss": 0.4731, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6632435321807861, "rewards/margins": 0.4537532329559326, "rewards/rejected": -2.1169967651367188, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 6.249038024702197, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.14584100246429443, "logits/rejected": 0.057858455926179886, "logps/chosen": -1.4833753108978271, "logps/rejected": -1.9312641620635986, "loss": 0.4656, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4833753108978271, "rewards/margins": 0.44788867235183716, "rewards/rejected": -1.9312641620635986, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 9.880654387732907, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.09947670996189117, "logits/rejected": 0.1252444088459015, "logps/chosen": -1.642422080039978, "logps/rejected": -2.2200229167938232, "loss": 0.4798, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.642422080039978, "rewards/margins": 0.5776008367538452, "rewards/rejected": -2.2200229167938232, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 5.75786922091076, "learning_rate": 7.104854155899711e-07, "logits/chosen": 0.014604300260543823, "logits/rejected": 0.13994798064231873, "logps/chosen": -1.617510199546814, "logps/rejected": -2.1078078746795654, "loss": 0.4808, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.617510199546814, "rewards/margins": 0.49029740691185, "rewards/rejected": -2.1078078746795654, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 6.81911620009211, "learning_rate": 7.090717170722817e-07, "logits/chosen": 0.010394556447863579, "logits/rejected": 0.08864818513393402, "logps/chosen": -1.5391380786895752, "logps/rejected": -2.166426181793213, "loss": 0.4176, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5391380786895752, "rewards/margins": 0.6272881627082825, "rewards/rejected": -2.166426181793213, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 6.137852737352751, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.18247434496879578, "logits/rejected": -0.03218594938516617, "logps/chosen": -1.5087926387786865, "logps/rejected": -1.980444312095642, "loss": 0.452, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5087926387786865, "rewards/margins": 0.471651554107666, "rewards/rejected": -1.980444312095642, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 12.187451550011133, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.0425427183508873, "logits/rejected": 0.09998269379138947, "logps/chosen": -1.5334279537200928, "logps/rejected": -1.9492900371551514, "loss": 0.4732, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5334279537200928, "rewards/margins": 0.4158620238304138, "rewards/rejected": -1.9492900371551514, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.2752378582954407, "eval_logits/rejected": 0.38054320216178894, "eval_logps/chosen": -1.6420358419418335, "eval_logps/rejected": -2.162473678588867, "eval_loss": 0.4775308072566986, "eval_rewards/accuracies": 0.6669139266014099, "eval_rewards/chosen": -1.6420358419418335, "eval_rewards/margins": 0.5204380750656128, "eval_rewards/rejected": -2.162473678588867, "eval_runtime": 40.5665, "eval_samples_per_second": 33.155, "eval_steps_per_second": 8.307, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 4.975085834857313, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.05230217054486275, "logits/rejected": 0.11341820657253265, "logps/chosen": -1.6430044174194336, "logps/rejected": -2.2264299392700195, "loss": 0.4903, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6430044174194336, "rewards/margins": 0.5834256410598755, "rewards/rejected": -2.2264299392700195, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 6.589081509343902, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.18361692130565643, "logits/rejected": 0.04731101915240288, "logps/chosen": -1.5621782541275024, "logps/rejected": -1.8810374736785889, "loss": 0.4926, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5621782541275024, "rewards/margins": 0.3188591003417969, "rewards/rejected": -1.8810374736785889, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 5.910429966547782, "learning_rate": 7.019730732632681e-07, "logits/chosen": 0.0019161917734891176, "logits/rejected": 0.09464772045612335, "logps/chosen": -1.5237740278244019, "logps/rejected": -2.1924636363983154, "loss": 0.4426, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5237740278244019, "rewards/margins": 0.6686898469924927, "rewards/rejected": -2.1924636363983154, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 5.217122398017095, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.13239330053329468, "logits/rejected": 0.01928863488137722, "logps/chosen": -1.622183084487915, "logps/rejected": -2.1465840339660645, "loss": 0.437, "rewards/accuracies": 0.71875, "rewards/chosen": -1.622183084487915, "rewards/margins": 0.5244010090827942, "rewards/rejected": -2.1465840339660645, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 5.7139936804363085, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.00784804206341505, "logits/rejected": 0.07794944941997528, "logps/chosen": -1.4888660907745361, "logps/rejected": -1.9114980697631836, "loss": 0.4438, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4888660907745361, "rewards/margins": 0.42263203859329224, "rewards/rejected": -1.9114980697631836, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 9.990735578412384, "learning_rate": 6.976902622196776e-07, "logits/chosen": 0.002607661532238126, "logits/rejected": 0.0830191820859909, "logps/chosen": -1.6700376272201538, "logps/rejected": -2.143716812133789, "loss": 0.4807, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6700376272201538, "rewards/margins": 0.4736790657043457, "rewards/rejected": -2.143716812133789, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 5.248261634439076, "learning_rate": 6.962588040686064e-07, "logits/chosen": 0.01189921610057354, "logits/rejected": 0.16031195223331451, "logps/chosen": -1.5359896421432495, "logps/rejected": -1.884161353111267, "loss": 0.4998, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5359896421432495, "rewards/margins": 0.3481716811656952, "rewards/rejected": -1.884161353111267, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 7.264069942314549, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.11026249080896378, "logits/rejected": 0.024601425975561142, "logps/chosen": -1.5177053213119507, "logps/rejected": -2.0315258502960205, "loss": 0.4692, "rewards/accuracies": 0.625, "rewards/chosen": -1.5177053213119507, "rewards/margins": 0.5138203501701355, "rewards/rejected": -2.0315258502960205, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 6.902261471394395, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.053836990147829056, "logits/rejected": 0.11333946138620377, "logps/chosen": -1.5675827264785767, "logps/rejected": -1.9716116189956665, "loss": 0.499, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5675827264785767, "rewards/margins": 0.4040289521217346, "rewards/rejected": -1.9716116189956665, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 9.524271792959603, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.04093671962618828, "logits/rejected": 0.13618075847625732, "logps/chosen": -1.4637603759765625, "logps/rejected": -2.016995668411255, "loss": 0.4589, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4637603759765625, "rewards/margins": 0.5532349348068237, "rewards/rejected": -2.016995668411255, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 4.850753458201713, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.09337630122900009, "logits/rejected": 0.17799821496009827, "logps/chosen": -1.5855872631072998, "logps/rejected": -2.07099986076355, "loss": 0.4701, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5855872631072998, "rewards/margins": 0.4854126572608948, "rewards/rejected": -2.07099986076355, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 11.71795911428344, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.007785356137901545, "logits/rejected": 0.12040611356496811, "logps/chosen": -1.558915376663208, "logps/rejected": -1.9051589965820312, "loss": 0.5288, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.558915376663208, "rewards/margins": 0.3462437093257904, "rewards/rejected": -1.9051589965820312, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 5.432212599359135, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.04086199402809143, "logits/rejected": 0.05344525724649429, "logps/chosen": -1.5345993041992188, "logps/rejected": -2.148566246032715, "loss": 0.4469, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5345993041992188, "rewards/margins": 0.6139670610427856, "rewards/rejected": -2.148566246032715, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 6.397956097397064, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.18973979353904724, "logits/rejected": -0.03370284289121628, "logps/chosen": -1.5425035953521729, "logps/rejected": -1.8760061264038086, "loss": 0.4986, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5425035953521729, "rewards/margins": 0.33350247144699097, "rewards/rejected": -1.8760061264038086, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 6.195436327194637, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.03406739979982376, "logits/rejected": 0.045273952186107635, "logps/chosen": -1.5450842380523682, "logps/rejected": -1.975246787071228, "loss": 0.4541, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5450842380523682, "rewards/margins": 0.4301624894142151, "rewards/rejected": -1.975246787071228, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 6.518236897664501, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.18124788999557495, "logits/rejected": 0.027191396802663803, "logps/chosen": -1.6215412616729736, "logps/rejected": -2.2047996520996094, "loss": 0.43, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6215412616729736, "rewards/margins": 0.583258867263794, "rewards/rejected": -2.2047996520996094, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 6.060910376073924, "learning_rate": 6.818417974097246e-07, "logits/chosen": 0.013284141197800636, "logits/rejected": 0.21436361968517303, "logps/chosen": -1.6474018096923828, "logps/rejected": -2.298553943634033, "loss": 0.4816, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6474018096923828, "rewards/margins": 0.6511520147323608, "rewards/rejected": -2.298553943634033, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 8.163309040337657, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.11916539818048477, "logits/rejected": -0.05883374810218811, "logps/chosen": -1.685755729675293, "logps/rejected": -2.1753947734832764, "loss": 0.4676, "rewards/accuracies": 0.71875, "rewards/chosen": -1.685755729675293, "rewards/margins": 0.4896390438079834, "rewards/rejected": -2.1753947734832764, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 7.62973145807213, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.01143647264689207, "logits/rejected": 0.001668338431045413, "logps/chosen": -1.5900094509124756, "logps/rejected": -1.9956557750701904, "loss": 0.504, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5900094509124756, "rewards/margins": 0.40564632415771484, "rewards/rejected": -1.9956557750701904, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 5.781556147922516, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.014825952239334583, "logits/rejected": 0.1718948632478714, "logps/chosen": -1.5834308862686157, "logps/rejected": -2.0373787879943848, "loss": 0.4702, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5834308862686157, "rewards/margins": 0.4539477825164795, "rewards/rejected": -2.0373787879943848, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 6.270325721669857, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.032043930143117905, "logits/rejected": 0.1591833531856537, "logps/chosen": -1.4572300910949707, "logps/rejected": -1.9761745929718018, "loss": 0.4617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4572300910949707, "rewards/margins": 0.5189443230628967, "rewards/rejected": -1.9761745929718018, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 7.343646060501299, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.12552650272846222, "logits/rejected": 0.03440629690885544, "logps/chosen": -1.5132838487625122, "logps/rejected": -2.031515598297119, "loss": 0.4528, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5132838487625122, "rewards/margins": 0.5182317495346069, "rewards/rejected": -2.031515598297119, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 6.9278016164192255, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.031079525128006935, "logits/rejected": 0.1037171334028244, "logps/chosen": -1.6150360107421875, "logps/rejected": -2.115753412246704, "loss": 0.4706, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6150360107421875, "rewards/margins": 0.5007173418998718, "rewards/rejected": -2.115753412246704, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 6.502524886725466, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.14155879616737366, "logits/rejected": -0.043433304876089096, "logps/chosen": -1.6286166906356812, "logps/rejected": -2.039634943008423, "loss": 0.4744, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6286166906356812, "rewards/margins": 0.411018043756485, "rewards/rejected": -2.039634943008423, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 8.89443090560977, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.1029941588640213, "logits/rejected": -0.011468647047877312, "logps/chosen": -1.7110602855682373, "logps/rejected": -2.247680187225342, "loss": 0.4608, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7110602855682373, "rewards/margins": 0.5366200804710388, "rewards/rejected": -2.247680187225342, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 10.088011528575402, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.1326943039894104, "logits/rejected": -0.07833883911371231, "logps/chosen": -1.6849720478057861, "logps/rejected": -2.0100650787353516, "loss": 0.546, "rewards/accuracies": 0.625, "rewards/chosen": -1.6849720478057861, "rewards/margins": 0.3250933289527893, "rewards/rejected": -2.0100650787353516, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 4.775381684141014, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.1960236132144928, "logits/rejected": -0.02682407572865486, "logps/chosen": -1.545352578163147, "logps/rejected": -2.066591501235962, "loss": 0.4484, "rewards/accuracies": 0.6875, "rewards/chosen": -1.545352578163147, "rewards/margins": 0.5212386846542358, "rewards/rejected": -2.066591501235962, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 8.067531945588602, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.10053286701440811, "logits/rejected": 0.10157926380634308, "logps/chosen": -1.616670846939087, "logps/rejected": -2.027583360671997, "loss": 0.4922, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.616670846939087, "rewards/margins": 0.4109126925468445, "rewards/rejected": -2.027583360671997, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 5.171473840638131, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.09394155442714691, "logits/rejected": 0.09637672454118729, "logps/chosen": -1.519028902053833, "logps/rejected": -2.0682191848754883, "loss": 0.4579, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.519028902053833, "rewards/margins": 0.5491902828216553, "rewards/rejected": -2.0682191848754883, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 6.295165113445156, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.08885548263788223, "logits/rejected": 0.09714460372924805, "logps/chosen": -1.5601648092269897, "logps/rejected": -1.9622972011566162, "loss": 0.4855, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5601648092269897, "rewards/margins": 0.40213242173194885, "rewards/rejected": -1.9622972011566162, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 9.512547144432135, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.003894433379173279, "logits/rejected": 0.10497350990772247, "logps/chosen": -1.4253404140472412, "logps/rejected": -1.9720462560653687, "loss": 0.4609, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4253404140472412, "rewards/margins": 0.5467058420181274, "rewards/rejected": -1.9720462560653687, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 9.954441679120983, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.05742545798420906, "logits/rejected": 0.08353970944881439, "logps/chosen": -1.5430809259414673, "logps/rejected": -2.005007743835449, "loss": 0.4683, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5430809259414673, "rewards/margins": 0.4619268476963043, "rewards/rejected": -2.005007743835449, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 7.2934166467385975, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.04572281241416931, "logits/rejected": 0.10049648582935333, "logps/chosen": -1.5360782146453857, "logps/rejected": -2.1679091453552246, "loss": 0.4239, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5360782146453857, "rewards/margins": 0.6318305730819702, "rewards/rejected": -2.1679091453552246, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 6.3227937002203065, "learning_rate": 6.569356025551454e-07, "logits/chosen": 0.020348545163869858, "logits/rejected": 0.10466353595256805, "logps/chosen": -1.5404096841812134, "logps/rejected": -2.072561740875244, "loss": 0.4569, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5404096841812134, "rewards/margins": 0.5321521162986755, "rewards/rejected": -2.072561740875244, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 7.597522609925915, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.13122648000717163, "logits/rejected": 0.03882508724927902, "logps/chosen": -1.4942795038223267, "logps/rejected": -1.9945781230926514, "loss": 0.4536, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4942795038223267, "rewards/margins": 0.5002988576889038, "rewards/rejected": -1.9945781230926514, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 6.255054222854732, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.04119990020990372, "logits/rejected": 0.06675471365451813, "logps/chosen": -1.6728417873382568, "logps/rejected": -2.0740911960601807, "loss": 0.4864, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6728417873382568, "rewards/margins": 0.4012494683265686, "rewards/rejected": -2.0740911960601807, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 7.164327659016272, "learning_rate": 6.524927148842602e-07, "logits/chosen": 0.053516268730163574, "logits/rejected": 0.23053336143493652, "logps/chosen": -1.504475474357605, "logps/rejected": -2.0689492225646973, "loss": 0.4126, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.504475474357605, "rewards/margins": 0.5644736289978027, "rewards/rejected": -2.0689492225646973, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 8.408187857496015, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.06427216529846191, "logits/rejected": 0.0924176424741745, "logps/chosen": -1.5690739154815674, "logps/rejected": -2.0274500846862793, "loss": 0.4734, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5690739154815674, "rewards/margins": 0.45837631821632385, "rewards/rejected": -2.0274500846862793, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 7.7877198587785275, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.012520027346909046, "logits/rejected": 0.04297181963920593, "logps/chosen": -1.5927222967147827, "logps/rejected": -1.9718300104141235, "loss": 0.5043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5927222967147827, "rewards/margins": 0.3791077136993408, "rewards/rejected": -1.9718300104141235, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 6.932182435310737, "learning_rate": 6.480365119346011e-07, "logits/chosen": 0.06229390949010849, "logits/rejected": 0.21428605914115906, "logps/chosen": -1.582781195640564, "logps/rejected": -1.9746601581573486, "loss": 0.4877, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.582781195640564, "rewards/margins": 0.3918788433074951, "rewards/rejected": -1.9746601581573486, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 9.947348356216136, "learning_rate": 6.465482192203129e-07, "logits/chosen": 0.08096398413181305, "logits/rejected": 0.15108910202980042, "logps/chosen": -1.578955888748169, "logps/rejected": -2.0038020610809326, "loss": 0.464, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.578955888748169, "rewards/margins": 0.42484623193740845, "rewards/rejected": -2.0038020610809326, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 14.939376867025778, "learning_rate": 6.45058504694559e-07, "logits/chosen": 0.10026098787784576, "logits/rejected": 0.19216075539588928, "logps/chosen": -1.6392631530761719, "logps/rejected": -2.0632503032684326, "loss": 0.5124, "rewards/accuracies": 0.625, "rewards/chosen": -1.6392631530761719, "rewards/margins": 0.42398738861083984, "rewards/rejected": -2.0632503032684326, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 14.806491467065625, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.0561630018055439, "logits/rejected": 0.12168797105550766, "logps/chosen": -1.4993690252304077, "logps/rejected": -2.036639451980591, "loss": 0.4521, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4993690252304077, "rewards/margins": 0.537270188331604, "rewards/rejected": -2.036639451980591, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 7.5108195360329955, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.01612640544772148, "logits/rejected": -0.018476996570825577, "logps/chosen": -1.6292651891708374, "logps/rejected": -1.9080960750579834, "loss": 0.532, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6292651891708374, "rewards/margins": 0.27883079648017883, "rewards/rejected": -1.9080960750579834, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 12.577874211902989, "learning_rate": 6.405809748488032e-07, "logits/chosen": 0.01715278998017311, "logits/rejected": 0.20437569916248322, "logps/chosen": -1.5592197179794312, "logps/rejected": -2.0398223400115967, "loss": 0.4901, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5592197179794312, "rewards/margins": 0.4806024134159088, "rewards/rejected": -2.0398223400115967, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 7.609182815449498, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.13736779987812042, "logits/rejected": 0.08779671043157578, "logps/chosen": -1.6455129384994507, "logps/rejected": -2.095186471939087, "loss": 0.4847, "rewards/accuracies": 0.625, "rewards/chosen": -1.6455129384994507, "rewards/margins": 0.4496735632419586, "rewards/rejected": -2.095186471939087, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 9.85708049936895, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.06550505012273788, "logits/rejected": 0.03518306463956833, "logps/chosen": -1.5420849323272705, "logps/rejected": -2.0127482414245605, "loss": 0.4578, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5420849323272705, "rewards/margins": 0.4706631600856781, "rewards/rejected": -2.0127482414245605, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 5.568263698272433, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.0046273707412183285, "logits/rejected": 0.09794069826602936, "logps/chosen": -1.758061170578003, "logps/rejected": -2.2347893714904785, "loss": 0.4911, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.758061170578003, "rewards/margins": 0.4767279028892517, "rewards/rejected": -2.2347893714904785, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 8.721285973389191, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.03150638937950134, "logits/rejected": 0.0698084607720375, "logps/chosen": -1.5471601486206055, "logps/rejected": -2.0534818172454834, "loss": 0.4799, "rewards/accuracies": 0.625, "rewards/chosen": -1.5471601486206055, "rewards/margins": 0.5063217282295227, "rewards/rejected": -2.0534818172454834, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 6.162825567909797, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.10343219339847565, "logits/rejected": 0.08253543078899384, "logps/chosen": -1.6182587146759033, "logps/rejected": -2.3279426097869873, "loss": 0.444, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6182587146759033, "rewards/margins": 0.7096840143203735, "rewards/rejected": -2.3279426097869873, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 8.036424416709663, "learning_rate": 6.315894816685838e-07, "logits/chosen": 0.002221143338829279, "logits/rejected": 0.18447551131248474, "logps/chosen": -1.510881781578064, "logps/rejected": -2.005204439163208, "loss": 0.4331, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.510881781578064, "rewards/margins": 0.49432268738746643, "rewards/rejected": -2.005204439163208, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 9.415863289448032, "learning_rate": 6.300863461616657e-07, "logits/chosen": 0.04597727954387665, "logits/rejected": 0.10877394676208496, "logps/chosen": -1.4800009727478027, "logps/rejected": -1.9634151458740234, "loss": 0.4774, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4800009727478027, "rewards/margins": 0.48341432213783264, "rewards/rejected": -1.9634151458740234, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 6.099808548894657, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.13056616485118866, "logits/rejected": 0.026355814188718796, "logps/chosen": -1.5744701623916626, "logps/rejected": -2.089160442352295, "loss": 0.4473, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5744701623916626, "rewards/margins": 0.5146902203559875, "rewards/rejected": -2.089160442352295, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 10.643509872743977, "learning_rate": 6.270763034485986e-07, "logits/chosen": 0.025143736973404884, "logits/rejected": 0.1482909917831421, "logps/chosen": -1.7165138721466064, "logps/rejected": -2.0700948238372803, "loss": 0.5151, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7165138721466064, "rewards/margins": 0.35358113050460815, "rewards/rejected": -2.0700948238372803, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 9.808485435818904, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.0500541552901268, "logits/rejected": 0.14869610965251923, "logps/chosen": -1.5963345766067505, "logps/rejected": -1.996419906616211, "loss": 0.528, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5963345766067505, "rewards/margins": 0.40008530020713806, "rewards/rejected": -1.996419906616211, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 7.120655459638015, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.06641460955142975, "logits/rejected": 0.12910157442092896, "logps/chosen": -1.5677170753479004, "logps/rejected": -1.9796056747436523, "loss": 0.4845, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5677170753479004, "rewards/margins": 0.4118885397911072, "rewards/rejected": -1.9796056747436523, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 7.5232044398879, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.05052545666694641, "logits/rejected": 0.20401112735271454, "logps/chosen": -1.500235676765442, "logps/rejected": -1.9518215656280518, "loss": 0.4518, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.500235676765442, "rewards/margins": 0.4515857696533203, "rewards/rejected": -1.9518215656280518, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 12.404101685704234, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.17619284987449646, "logits/rejected": 0.11565271764993668, "logps/chosen": -1.5411489009857178, "logps/rejected": -2.1512503623962402, "loss": 0.4392, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5411489009857178, "rewards/margins": 0.6101009845733643, "rewards/rejected": -2.1512503623962402, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 6.788842119472379, "learning_rate": 6.195298770577415e-07, "logits/chosen": 0.028119731694459915, "logits/rejected": 0.062077395617961884, "logps/chosen": -1.5112144947052002, "logps/rejected": -1.9915170669555664, "loss": 0.4807, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5112144947052002, "rewards/margins": 0.48030251264572144, "rewards/rejected": -1.9915170669555664, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 7.570354683387723, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.05936000868678093, "logits/rejected": 0.15183496475219727, "logps/chosen": -1.5991464853286743, "logps/rejected": -2.1046407222747803, "loss": 0.4622, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5991464853286743, "rewards/margins": 0.5054944157600403, "rewards/rejected": -2.1046407222747803, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 7.695106896946199, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.11789985001087189, "logits/rejected": 0.1413964331150055, "logps/chosen": -1.4773584604263306, "logps/rejected": -2.1341819763183594, "loss": 0.4138, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4773584604263306, "rewards/margins": 0.6568232774734497, "rewards/rejected": -2.1341819763183594, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 9.275425678434653, "learning_rate": 6.149879879003876e-07, "logits/chosen": 0.017136480659246445, "logits/rejected": 0.04668601229786873, "logps/chosen": -1.5412933826446533, "logps/rejected": -2.053201913833618, "loss": 0.4647, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5412933826446533, "rewards/margins": 0.5119085311889648, "rewards/rejected": -2.053201913833618, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 4.5827831828338, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.15883806347846985, "logits/rejected": -0.030060749500989914, "logps/chosen": -1.439509630203247, "logps/rejected": -1.8782564401626587, "loss": 0.4719, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.439509630203247, "rewards/margins": 0.438746839761734, "rewards/rejected": -1.8782564401626587, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 6.639251921511077, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.07841916382312775, "logits/rejected": 0.05972137302160263, "logps/chosen": -1.5232925415039062, "logps/rejected": -1.918605089187622, "loss": 0.482, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5232925415039062, "rewards/margins": 0.39531224966049194, "rewards/rejected": -1.918605089187622, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 6.765204038241718, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.0544058158993721, "logits/rejected": 0.0789538100361824, "logps/chosen": -1.5143784284591675, "logps/rejected": -1.9921993017196655, "loss": 0.4534, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5143784284591675, "rewards/margins": 0.47782102227211, "rewards/rejected": -1.9921993017196655, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 7.643858300726252, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.07833041995763779, "logits/rejected": 0.13936454057693481, "logps/chosen": -1.5700321197509766, "logps/rejected": -2.117766857147217, "loss": 0.4599, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5700321197509766, "rewards/margins": 0.5477348566055298, "rewards/rejected": -2.117766857147217, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 6.151101815821969, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.06800331175327301, "logits/rejected": 0.16644461452960968, "logps/chosen": -1.4196412563323975, "logps/rejected": -2.004312038421631, "loss": 0.4113, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4196412563323975, "rewards/margins": 0.584670901298523, "rewards/rejected": -2.004312038421631, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 7.332616235341915, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.08055730909109116, "logits/rejected": 0.06232626363635063, "logps/chosen": -1.546736717224121, "logps/rejected": -2.2165377140045166, "loss": 0.4305, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.546736717224121, "rewards/margins": 0.6698009371757507, "rewards/rejected": -2.2165377140045166, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 6.179271180333944, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.10861754417419434, "logits/rejected": 0.07308439910411835, "logps/chosen": -1.4773671627044678, "logps/rejected": -1.9273483753204346, "loss": 0.4628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4773671627044678, "rewards/margins": 0.44998103380203247, "rewards/rejected": -1.9273483753204346, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 6.460065083515118, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.057541441172361374, "logits/rejected": 0.0713726133108139, "logps/chosen": -1.6474723815917969, "logps/rejected": -2.2658133506774902, "loss": 0.4296, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6474723815917969, "rewards/margins": 0.618340790271759, "rewards/rejected": -2.2658133506774902, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 8.34749567962867, "learning_rate": 6.013036683579798e-07, "logits/chosen": 0.0012588858371600509, "logits/rejected": 0.17275908589363098, "logps/chosen": -1.578290581703186, "logps/rejected": -2.0697579383850098, "loss": 0.4665, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.578290581703186, "rewards/margins": 0.4914672374725342, "rewards/rejected": -2.0697579383850098, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 7.837454588404637, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.12664416432380676, "logits/rejected": 0.08664029836654663, "logps/chosen": -1.717015266418457, "logps/rejected": -2.3803622722625732, "loss": 0.4196, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.717015266418457, "rewards/margins": 0.6633471250534058, "rewards/rejected": -2.3803622722625732, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 9.999985394269169, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.025986066088080406, "logits/rejected": 0.14024318754673004, "logps/chosen": -1.6303606033325195, "logps/rejected": -2.119910717010498, "loss": 0.4438, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6303606033325195, "rewards/margins": 0.4895502030849457, "rewards/rejected": -2.119910717010498, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 6.849671162764282, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.1280602514743805, "logits/rejected": -0.030441606417298317, "logps/chosen": -1.6340034008026123, "logps/rejected": -2.011007308959961, "loss": 0.4864, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6340034008026123, "rewards/margins": 0.37700384855270386, "rewards/rejected": -2.011007308959961, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 13.134335234938202, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.07396284490823746, "logits/rejected": -0.012590000405907631, "logps/chosen": -1.6476377248764038, "logps/rejected": -2.267047882080078, "loss": 0.4555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6476377248764038, "rewards/margins": 0.6194103360176086, "rewards/rejected": -2.267047882080078, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 6.332088336468796, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.05554728955030441, "logits/rejected": 0.07126756012439728, "logps/chosen": -1.784759521484375, "logps/rejected": -2.227672576904297, "loss": 0.5221, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.784759521484375, "rewards/margins": 0.4429130554199219, "rewards/rejected": -2.227672576904297, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 9.938454676448588, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.11562782526016235, "logits/rejected": 0.055083371698856354, "logps/chosen": -1.7588727474212646, "logps/rejected": -2.2724289894104004, "loss": 0.4967, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7588727474212646, "rewards/margins": 0.5135561227798462, "rewards/rejected": -2.2724289894104004, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 8.925422341844289, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.041163552552461624, "logits/rejected": -0.0646100789308548, "logps/chosen": -1.603329062461853, "logps/rejected": -2.0765597820281982, "loss": 0.4808, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.603329062461853, "rewards/margins": 0.47323089838027954, "rewards/rejected": -2.0765597820281982, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 6.411695234801729, "learning_rate": 5.890726635828919e-07, "logits/chosen": 0.11050862073898315, "logits/rejected": 0.129068523645401, "logps/chosen": -1.4515634775161743, "logps/rejected": -1.9260543584823608, "loss": 0.4513, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4515634775161743, "rewards/margins": 0.47449102997779846, "rewards/rejected": -1.9260543584823608, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 13.654573215114976, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.10376518964767456, "logits/rejected": -0.009432412683963776, "logps/chosen": -1.6354379653930664, "logps/rejected": -2.107036828994751, "loss": 0.5055, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6354379653930664, "rewards/margins": 0.47159862518310547, "rewards/rejected": -2.107036828994751, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.2980523407459259, "eval_logits/rejected": 0.40482062101364136, "eval_logps/chosen": -1.615561842918396, "eval_logps/rejected": -2.1128506660461426, "eval_loss": 0.47550180554389954, "eval_rewards/accuracies": 0.6639465689659119, "eval_rewards/chosen": -1.615561842918396, "eval_rewards/margins": 0.4972890615463257, "eval_rewards/rejected": -2.1128506660461426, "eval_runtime": 40.5489, "eval_samples_per_second": 33.17, "eval_steps_per_second": 8.311, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 4.642751793373204, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.20865106582641602, "logits/rejected": -0.03168492391705513, "logps/chosen": -1.4481909275054932, "logps/rejected": -1.9983524084091187, "loss": 0.4133, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4481909275054932, "rewards/margins": 0.5501615405082703, "rewards/rejected": -1.9983524084091187, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 8.823472231511934, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.04885340854525566, "logits/rejected": 0.024360086768865585, "logps/chosen": -1.5447633266448975, "logps/rejected": -2.0316712856292725, "loss": 0.4554, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5447633266448975, "rewards/margins": 0.48690786957740784, "rewards/rejected": -2.0316712856292725, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 5.8173856570582485, "learning_rate": 5.829359458171714e-07, "logits/chosen": 0.002756330417469144, "logits/rejected": 0.13931605219841003, "logps/chosen": -1.5607579946517944, "logps/rejected": -2.089944362640381, "loss": 0.4608, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5607579946517944, "rewards/margins": 0.5291863679885864, "rewards/rejected": -2.089944362640381, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 5.986720703263821, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.050485432147979736, "logits/rejected": 0.1694364845752716, "logps/chosen": -1.6227829456329346, "logps/rejected": -2.083887815475464, "loss": 0.4914, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6227829456329346, "rewards/margins": 0.46110501885414124, "rewards/rejected": -2.083887815475464, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 6.862378920640574, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.11633233726024628, "logits/rejected": 0.09893203526735306, "logps/chosen": -1.6612581014633179, "logps/rejected": -2.25675368309021, "loss": 0.4565, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6612581014633179, "rewards/margins": 0.5954957008361816, "rewards/rejected": -2.25675368309021, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 8.692637997335018, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.02949836291372776, "logits/rejected": 0.15439757704734802, "logps/chosen": -1.6481574773788452, "logps/rejected": -2.138017416000366, "loss": 0.4687, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6481574773788452, "rewards/margins": 0.4898598790168762, "rewards/rejected": -2.138017416000366, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 7.891946951445509, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.07888110727071762, "logits/rejected": 0.10589386522769928, "logps/chosen": -1.6426537036895752, "logps/rejected": -2.189969301223755, "loss": 0.4605, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6426537036895752, "rewards/margins": 0.5473154783248901, "rewards/rejected": -2.189969301223755, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 7.234053140815082, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.0600334107875824, "logits/rejected": 0.04381268098950386, "logps/chosen": -1.6902275085449219, "logps/rejected": -2.1046910285949707, "loss": 0.5, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6902275085449219, "rewards/margins": 0.4144632816314697, "rewards/rejected": -2.1046910285949707, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 8.688589978192127, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.07859118282794952, "logits/rejected": 0.025086581707000732, "logps/chosen": -1.6646705865859985, "logps/rejected": -2.160888195037842, "loss": 0.5036, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6646705865859985, "rewards/margins": 0.49621763825416565, "rewards/rejected": -2.160888195037842, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 6.57053479687912, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.09414790570735931, "logits/rejected": 0.007630676031112671, "logps/chosen": -1.6637828350067139, "logps/rejected": -2.2037274837493896, "loss": 0.4742, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6637828350067139, "rewards/margins": 0.5399444699287415, "rewards/rejected": -2.2037274837493896, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 7.18691453095042, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.08783832937479019, "logits/rejected": 0.10384579747915268, "logps/chosen": -1.9083540439605713, "logps/rejected": -2.483633041381836, "loss": 0.4557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9083540439605713, "rewards/margins": 0.5752791166305542, "rewards/rejected": -2.483633041381836, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 7.227394474639535, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.16970416903495789, "logits/rejected": 0.039454877376556396, "logps/chosen": -1.589909315109253, "logps/rejected": -2.0500943660736084, "loss": 0.4579, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.589909315109253, "rewards/margins": 0.46018490195274353, "rewards/rejected": -2.0500943660736084, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 7.106690185097418, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.1991889774799347, "logits/rejected": -0.047068722546100616, "logps/chosen": -1.6504377126693726, "logps/rejected": -2.128983736038208, "loss": 0.4529, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6504377126693726, "rewards/margins": 0.4785459041595459, "rewards/rejected": -2.128983736038208, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 11.797590126581767, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.11344132572412491, "logits/rejected": 0.014205547980964184, "logps/chosen": -1.6746965646743774, "logps/rejected": -2.045448064804077, "loss": 0.4881, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6746965646743774, "rewards/margins": 0.3707515299320221, "rewards/rejected": -2.045448064804077, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 7.995492393376911, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.0871611088514328, "logits/rejected": 0.03860270977020264, "logps/chosen": -1.6874269247055054, "logps/rejected": -2.1893951892852783, "loss": 0.44, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6874269247055054, "rewards/margins": 0.501968264579773, "rewards/rejected": -2.1893951892852783, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 7.000858678019708, "learning_rate": 5.629076484188952e-07, "logits/chosen": 0.06356900185346603, "logits/rejected": 0.19630762934684753, "logps/chosen": -1.5413215160369873, "logps/rejected": -2.0865683555603027, "loss": 0.46, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5413215160369873, "rewards/margins": 0.5452467203140259, "rewards/rejected": -2.0865683555603027, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 12.06054606845961, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.07794005423784256, "logits/rejected": 0.05914082005620003, "logps/chosen": -1.7649116516113281, "logps/rejected": -2.320505142211914, "loss": 0.4683, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7649116516113281, "rewards/margins": 0.5555934906005859, "rewards/rejected": -2.320505142211914, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 9.825349603170435, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.15499825775623322, "logits/rejected": 0.0615253672003746, "logps/chosen": -1.7381982803344727, "logps/rejected": -2.36466908454895, "loss": 0.4454, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7381982803344727, "rewards/margins": 0.6264706254005432, "rewards/rejected": -2.36466908454895, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 11.484340736642993, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.10740765184164047, "logits/rejected": 0.06730961799621582, "logps/chosen": -1.5317363739013672, "logps/rejected": -2.077270030975342, "loss": 0.4446, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5317363739013672, "rewards/margins": 0.5455336570739746, "rewards/rejected": -2.077270030975342, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 6.1314440187126475, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.042904384434223175, "logits/rejected": 0.07427171617746353, "logps/chosen": -1.6372734308242798, "logps/rejected": -2.1972196102142334, "loss": 0.4466, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6372734308242798, "rewards/margins": 0.5599461197853088, "rewards/rejected": -2.1972196102142334, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 6.941584069165275, "learning_rate": 5.551751964760838e-07, "logits/chosen": 0.0365503765642643, "logits/rejected": 0.06783448159694672, "logps/chosen": -1.5995454788208008, "logps/rejected": -2.125095844268799, "loss": 0.4771, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5995454788208008, "rewards/margins": 0.525550365447998, "rewards/rejected": -2.125095844268799, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 7.469122731481386, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.08264781534671783, "logits/rejected": 0.03534680977463722, "logps/chosen": -1.6846702098846436, "logps/rejected": -2.132639169692993, "loss": 0.4697, "rewards/accuracies": 0.625, "rewards/chosen": -1.6846702098846436, "rewards/margins": 0.4479687809944153, "rewards/rejected": -2.132639169692993, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 14.654183886796433, "learning_rate": 5.520783634613667e-07, "logits/chosen": 0.0002588361385278404, "logits/rejected": 0.19813410937786102, "logps/chosen": -1.7038036584854126, "logps/rejected": -2.2847676277160645, "loss": 0.4921, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7038036584854126, "rewards/margins": 0.5809639096260071, "rewards/rejected": -2.2847676277160645, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 9.65046284367642, "learning_rate": 5.505291815446082e-07, "logits/chosen": 0.005177915096282959, "logits/rejected": 0.14560845494270325, "logps/chosen": -1.7170436382293701, "logps/rejected": -2.2894134521484375, "loss": 0.4749, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7170436382293701, "rewards/margins": 0.5723696351051331, "rewards/rejected": -2.2894134521484375, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 9.180150407440614, "learning_rate": 5.489795093935089e-07, "logits/chosen": 0.011673715896904469, "logits/rejected": 0.09902342408895493, "logps/chosen": -1.5294486284255981, "logps/rejected": -2.1089706420898438, "loss": 0.4677, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5294486284255981, "rewards/margins": 0.5795220136642456, "rewards/rejected": -2.1089706420898438, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 7.280585060601515, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.16292323172092438, "logits/rejected": 0.04053761437535286, "logps/chosen": -1.6154592037200928, "logps/rejected": -2.433621883392334, "loss": 0.4064, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6154592037200928, "rewards/margins": 0.818162739276886, "rewards/rejected": -2.433621883392334, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 8.018654439363317, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.09710148721933365, "logits/rejected": 0.06715744733810425, "logps/chosen": -1.7047722339630127, "logps/rejected": -2.2279908657073975, "loss": 0.4726, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7047722339630127, "rewards/margins": 0.5232186913490295, "rewards/rejected": -2.2279908657073975, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 8.429693877025763, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.10238828510046005, "logits/rejected": 0.09743017703294754, "logps/chosen": -1.6793454885482788, "logps/rejected": -2.292565107345581, "loss": 0.4626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6793454885482788, "rewards/margins": 0.6132197380065918, "rewards/rejected": -2.292565107345581, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 12.317262555692611, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.10576635599136353, "logits/rejected": 0.05966886132955551, "logps/chosen": -1.5713249444961548, "logps/rejected": -2.0290122032165527, "loss": 0.4736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5713249444961548, "rewards/margins": 0.4576875567436218, "rewards/rejected": -2.0290122032165527, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 7.186816824676625, "learning_rate": 5.41224321503607e-07, "logits/chosen": 0.023516353219747543, "logits/rejected": 0.32052093744277954, "logps/chosen": -1.5313398838043213, "logps/rejected": -2.2353968620300293, "loss": 0.4052, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5313398838043213, "rewards/margins": 0.7040570378303528, "rewards/rejected": -2.2353968620300293, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 10.59361777230133, "learning_rate": 5.396720238361637e-07, "logits/chosen": 0.01968434639275074, "logits/rejected": 0.14163419604301453, "logps/chosen": -1.6637918949127197, "logps/rejected": -2.238799571990967, "loss": 0.4663, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6637918949127197, "rewards/margins": 0.5750076174736023, "rewards/rejected": -2.238799571990967, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 8.789728935637534, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.10512866079807281, "logits/rejected": 0.04714461788535118, "logps/chosen": -1.6435121297836304, "logps/rejected": -2.136634588241577, "loss": 0.471, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6435121297836304, "rewards/margins": 0.4931226670742035, "rewards/rejected": -2.136634588241577, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 6.284192893881936, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.03583573177456856, "logits/rejected": 0.07221022993326187, "logps/chosen": -1.6223284006118774, "logps/rejected": -2.176422119140625, "loss": 0.4714, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6223284006118774, "rewards/margins": 0.5540937185287476, "rewards/rejected": -2.176422119140625, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 19.128797076795287, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.07857382297515869, "logits/rejected": 0.14929823577404022, "logps/chosen": -1.7484052181243896, "logps/rejected": -2.356426954269409, "loss": 0.4963, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7484052181243896, "rewards/margins": 0.6080219745635986, "rewards/rejected": -2.356426954269409, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 9.307904895090152, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.03377872705459595, "logits/rejected": 0.14480581879615784, "logps/chosen": -1.6507179737091064, "logps/rejected": -2.292327404022217, "loss": 0.4495, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6507179737091064, "rewards/margins": 0.6416096687316895, "rewards/rejected": -2.292327404022217, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 5.68683393383205, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.13927766680717468, "logits/rejected": 0.060767192393541336, "logps/chosen": -1.7975196838378906, "logps/rejected": -2.3496346473693848, "loss": 0.4561, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7975196838378906, "rewards/margins": 0.552115261554718, "rewards/rejected": -2.3496346473693848, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 6.827422631894793, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.07638033479452133, "logits/rejected": 0.16232767701148987, "logps/chosen": -1.8714338541030884, "logps/rejected": -2.5276970863342285, "loss": 0.4977, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8714338541030884, "rewards/margins": 0.6562631726264954, "rewards/rejected": -2.5276970863342285, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 13.614202132351284, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.07288020849227905, "logits/rejected": 0.1488359421491623, "logps/chosen": -1.6442277431488037, "logps/rejected": -2.368490695953369, "loss": 0.4358, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6442277431488037, "rewards/margins": 0.7242627739906311, "rewards/rejected": -2.368490695953369, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 15.426323107917295, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.04883218929171562, "logits/rejected": 0.09796526283025742, "logps/chosen": -1.8562647104263306, "logps/rejected": -2.632030963897705, "loss": 0.4482, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8562647104263306, "rewards/margins": 0.7757660150527954, "rewards/rejected": -2.632030963897705, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 4.614911710596609, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.09070388972759247, "logits/rejected": 0.08681733906269073, "logps/chosen": -1.6451680660247803, "logps/rejected": -2.1571407318115234, "loss": 0.4775, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6451680660247803, "rewards/margins": 0.5119727849960327, "rewards/rejected": -2.1571407318115234, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 14.137058785546943, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.1489393413066864, "logits/rejected": 0.025513362139463425, "logps/chosen": -1.5995906591415405, "logps/rejected": -1.9803142547607422, "loss": 0.4791, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5995906591415405, "rewards/margins": 0.3807234764099121, "rewards/rejected": -1.9803142547607422, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 8.589534814107965, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.1262711137533188, "logits/rejected": 0.026388023048639297, "logps/chosen": -1.6019537448883057, "logps/rejected": -2.0012054443359375, "loss": 0.4917, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6019537448883057, "rewards/margins": 0.3992519974708557, "rewards/rejected": -2.0012054443359375, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 11.382052354474013, "learning_rate": 5.210187404905735e-07, "logits/chosen": 0.10268217325210571, "logits/rejected": 0.19708140194416046, "logps/chosen": -1.5778775215148926, "logps/rejected": -1.9784314632415771, "loss": 0.4784, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5778775215148926, "rewards/margins": 0.4005538821220398, "rewards/rejected": -1.9784314632415771, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 8.359397121890353, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.09849467873573303, "logits/rejected": 0.07482314109802246, "logps/chosen": -1.6075853109359741, "logps/rejected": -2.1020407676696777, "loss": 0.4401, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6075853109359741, "rewards/margins": 0.49445539712905884, "rewards/rejected": -2.1020407676696777, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 6.615111517052666, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.06342469155788422, "logits/rejected": 0.12223508208990097, "logps/chosen": -1.552067518234253, "logps/rejected": -2.015472650527954, "loss": 0.4512, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.552067518234253, "rewards/margins": 0.463405042886734, "rewards/rejected": -2.015472650527954, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 8.545125463760215, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.040998633950948715, "logits/rejected": 0.1188499704003334, "logps/chosen": -1.7313768863677979, "logps/rejected": -2.1948814392089844, "loss": 0.489, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7313768863677979, "rewards/margins": 0.4635045528411865, "rewards/rejected": -2.1948814392089844, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 8.71911814578877, "learning_rate": 5.147931662540144e-07, "logits/chosen": 0.048558127135038376, "logits/rejected": 0.1957053691148758, "logps/chosen": -1.7075660228729248, "logps/rejected": -2.1026601791381836, "loss": 0.4878, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7075660228729248, "rewards/margins": 0.39509424567222595, "rewards/rejected": -2.1026601791381836, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 6.418393591319962, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.06575219333171844, "logits/rejected": 0.013626632280647755, "logps/chosen": -1.6094707250595093, "logps/rejected": -2.2943801879882812, "loss": 0.4231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6094707250595093, "rewards/margins": 0.6849097013473511, "rewards/rejected": -2.2943801879882812, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 17.565209390411297, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.047845568507909775, "logits/rejected": 0.07051759213209152, "logps/chosen": -1.4953639507293701, "logps/rejected": -1.9154865741729736, "loss": 0.4748, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4953639507293701, "rewards/margins": 0.4201226234436035, "rewards/rejected": -1.9154865741729736, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 6.642269413731905, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.07968827337026596, "logits/rejected": 0.10386794805526733, "logps/chosen": -1.6966203451156616, "logps/rejected": -2.3019192218780518, "loss": 0.4397, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6966203451156616, "rewards/margins": 0.6052988767623901, "rewards/rejected": -2.3019192218780518, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 8.515151885059254, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.09994269907474518, "logits/rejected": 0.06980907917022705, "logps/chosen": -1.614031434059143, "logps/rejected": -2.128415584564209, "loss": 0.4539, "rewards/accuracies": 0.6875, "rewards/chosen": -1.614031434059143, "rewards/margins": 0.5143840312957764, "rewards/rejected": -2.128415584564209, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 5.8224978800680915, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.07845492660999298, "logits/rejected": 0.13995292782783508, "logps/chosen": -1.6931785345077515, "logps/rejected": -2.201347827911377, "loss": 0.499, "rewards/accuracies": 0.625, "rewards/chosen": -1.6931785345077515, "rewards/margins": 0.5081695318222046, "rewards/rejected": -2.201347827911377, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 9.903527424955339, "learning_rate": 5.0545080135113e-07, "logits/chosen": 0.032994594424963, "logits/rejected": 0.0923147052526474, "logps/chosen": -1.682677984237671, "logps/rejected": -2.2914559841156006, "loss": 0.455, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.682677984237671, "rewards/margins": 0.6087781190872192, "rewards/rejected": -2.2914559841156006, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 11.013408596375006, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.14262424409389496, "logits/rejected": -0.0029101967811584473, "logps/chosen": -1.8412444591522217, "logps/rejected": -2.3819901943206787, "loss": 0.4871, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.8412444591522217, "rewards/margins": 0.5407457947731018, "rewards/rejected": -2.3819901943206787, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 4.633057612165813, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.15010672807693481, "logits/rejected": -0.0696638897061348, "logps/chosen": -1.521460771560669, "logps/rejected": -1.990719199180603, "loss": 0.4479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.521460771560669, "rewards/margins": 0.4692586064338684, "rewards/rejected": -1.990719199180603, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 6.628031082563841, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.20792193710803986, "logits/rejected": 0.012540074996650219, "logps/chosen": -1.4911404848098755, "logps/rejected": -2.033839225769043, "loss": 0.428, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4911404848098755, "rewards/margins": 0.542698860168457, "rewards/rejected": -2.033839225769043, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 5.777925415794297, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.007816017605364323, "logits/rejected": 0.014690647833049297, "logps/chosen": -1.493788480758667, "logps/rejected": -1.9720876216888428, "loss": 0.4532, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.493788480758667, "rewards/margins": 0.4782991409301758, "rewards/rejected": -1.9720876216888428, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 6.719632530995194, "learning_rate": 4.976639045035036e-07, "logits/chosen": 0.016942087560892105, "logits/rejected": 0.10999103635549545, "logps/chosen": -1.5796377658843994, "logps/rejected": -1.9444456100463867, "loss": 0.5238, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5796377658843994, "rewards/margins": 0.36480778455734253, "rewards/rejected": -1.9444456100463867, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 7.740081229460768, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.04360239952802658, "logits/rejected": 0.13280083239078522, "logps/chosen": -1.6954505443572998, "logps/rejected": -2.1920969486236572, "loss": 0.4686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6954505443572998, "rewards/margins": 0.49664634466171265, "rewards/rejected": -2.1920969486236572, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 8.210948234681975, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.17998211085796356, "logits/rejected": -0.01732531748712063, "logps/chosen": -1.6088836193084717, "logps/rejected": -2.197388172149658, "loss": 0.4483, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6088836193084717, "rewards/margins": 0.5885046720504761, "rewards/rejected": -2.197388172149658, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 8.421390842983074, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.14124853909015656, "logits/rejected": 0.07779726386070251, "logps/chosen": -1.6131668090820312, "logps/rejected": -2.02565336227417, "loss": 0.4972, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6131668090820312, "rewards/margins": 0.4124864935874939, "rewards/rejected": -2.02565336227417, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 11.269559019243111, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.06289513409137726, "logits/rejected": 0.06999722123146057, "logps/chosen": -1.57101571559906, "logps/rejected": -2.020738124847412, "loss": 0.493, "rewards/accuracies": 0.6875, "rewards/chosen": -1.57101571559906, "rewards/margins": 0.4497222900390625, "rewards/rejected": -2.020738124847412, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 8.111847543872907, "learning_rate": 4.898775742651013e-07, "logits/chosen": 0.047879137098789215, "logits/rejected": 0.16327807307243347, "logps/chosen": -1.6629081964492798, "logps/rejected": -2.213575839996338, "loss": 0.4521, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6629081964492798, "rewards/margins": 0.5506675243377686, "rewards/rejected": -2.213575839996338, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 9.613242969834843, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.1264229267835617, "logits/rejected": 0.05352715402841568, "logps/chosen": -1.6965415477752686, "logps/rejected": -2.2385268211364746, "loss": 0.5023, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6965415477752686, "rewards/margins": 0.5419851541519165, "rewards/rejected": -2.2385268211364746, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 6.506945951371255, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.01653352752327919, "logits/rejected": 0.12483112514019012, "logps/chosen": -1.6449161767959595, "logps/rejected": -2.1039481163024902, "loss": 0.4903, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6449161767959595, "rewards/margins": 0.459031879901886, "rewards/rejected": -2.1039481163024902, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 7.282655656127116, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.020227957516908646, "logits/rejected": 0.16941022872924805, "logps/chosen": -1.7297897338867188, "logps/rejected": -2.235679864883423, "loss": 0.4574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7297897338867188, "rewards/margins": 0.5058901309967041, "rewards/rejected": -2.235679864883423, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 7.984920523863347, "learning_rate": 4.8365018714728e-07, "logits/chosen": 0.01937662437558174, "logits/rejected": 0.09848417341709137, "logps/chosen": -1.6891262531280518, "logps/rejected": -2.0851426124572754, "loss": 0.5033, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6891262531280518, "rewards/margins": 0.3960162401199341, "rewards/rejected": -2.0851426124572754, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 6.518444829878884, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.2609904110431671, "logits/rejected": -0.0926179513335228, "logps/chosen": -1.4865214824676514, "logps/rejected": -1.9025242328643799, "loss": 0.4668, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4865214824676514, "rewards/margins": 0.41600289940834045, "rewards/rejected": -1.9025242328643799, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 6.233897293440857, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.055072713643312454, "logits/rejected": 0.1119011864066124, "logps/chosen": -1.4963717460632324, "logps/rejected": -2.041358470916748, "loss": 0.4528, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4963717460632324, "rewards/margins": 0.5449866056442261, "rewards/rejected": -2.041358470916748, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 9.13277925473898, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.21542203426361084, "logits/rejected": -0.07814104110002518, "logps/chosen": -1.609500527381897, "logps/rejected": -2.0910818576812744, "loss": 0.4653, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.609500527381897, "rewards/margins": 0.4815812110900879, "rewards/rejected": -2.0910818576812744, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 6.679992812173395, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.21357032656669617, "logits/rejected": -0.07746794074773788, "logps/chosen": -1.4359432458877563, "logps/rejected": -1.9584068059921265, "loss": 0.4491, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4359432458877563, "rewards/margins": 0.5224637985229492, "rewards/rejected": -1.9584068059921265, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 9.908660828039347, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.09149477630853653, "logits/rejected": -0.08310405910015106, "logps/chosen": -1.559985876083374, "logps/rejected": -2.091235876083374, "loss": 0.4569, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.559985876083374, "rewards/margins": 0.53125, "rewards/rejected": -2.091235876083374, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 5.727145813914709, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.20452912151813507, "logits/rejected": -0.08732346445322037, "logps/chosen": -1.6636015176773071, "logps/rejected": -2.0986714363098145, "loss": 0.487, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6636015176773071, "rewards/margins": 0.4350700378417969, "rewards/rejected": -2.0986714363098145, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 7.290909368762703, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.1633075326681137, "logits/rejected": -0.0569547601044178, "logps/chosen": -1.6291965246200562, "logps/rejected": -2.1428394317626953, "loss": 0.4671, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6291965246200562, "rewards/margins": 0.5136427283287048, "rewards/rejected": -2.1428394317626953, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 13.575254524999655, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.20115220546722412, "logits/rejected": -0.07489201426506042, "logps/chosen": -1.684495210647583, "logps/rejected": -2.050915479660034, "loss": 0.5299, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.684495210647583, "rewards/margins": 0.3664206862449646, "rewards/rejected": -2.050915479660034, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 6.519799757604999, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.048026520758867264, "logits/rejected": 0.01066395454108715, "logps/chosen": -1.6414997577667236, "logps/rejected": -2.224869966506958, "loss": 0.4454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6414997577667236, "rewards/margins": 0.5833699107170105, "rewards/rejected": -2.224869966506958, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 9.676781213799002, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.10038290172815323, "logits/rejected": -0.06395160406827927, "logps/chosen": -1.5947024822235107, "logps/rejected": -1.9163787364959717, "loss": 0.5089, "rewards/accuracies": 0.625, "rewards/chosen": -1.5947024822235107, "rewards/margins": 0.321676105260849, "rewards/rejected": -1.9163787364959717, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 6.672816414979572, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.1953904926776886, "logits/rejected": -0.0046152682043612, "logps/chosen": -1.5963404178619385, "logps/rejected": -2.1976380348205566, "loss": 0.4261, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5963404178619385, "rewards/margins": 0.6012973189353943, "rewards/rejected": -2.1976380348205566, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 8.408815719840659, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.1891988217830658, "logits/rejected": -0.06386252492666245, "logps/chosen": -1.5877994298934937, "logps/rejected": -2.1241488456726074, "loss": 0.4612, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5877994298934937, "rewards/margins": 0.5363491773605347, "rewards/rejected": -2.1241488456726074, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 8.4522269146182, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.05092586949467659, "logits/rejected": 0.11532881110906601, "logps/chosen": -1.6418864727020264, "logps/rejected": -2.1992897987365723, "loss": 0.4945, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6418864727020264, "rewards/margins": 0.5574029088020325, "rewards/rejected": -2.1992897987365723, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.28342580795288086, "eval_logits/rejected": 0.39087986946105957, "eval_logps/chosen": -1.5939713716506958, "eval_logps/rejected": -2.0955896377563477, "eval_loss": 0.4737696647644043, "eval_rewards/accuracies": 0.6676557660102844, "eval_rewards/chosen": -1.5939713716506958, "eval_rewards/margins": 0.5016182661056519, "eval_rewards/rejected": -2.0955896377563477, "eval_runtime": 40.3731, "eval_samples_per_second": 33.314, "eval_steps_per_second": 8.347, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 7.294747554464907, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.21884839236736298, "logits/rejected": -0.08587951958179474, "logps/chosen": -1.54911470413208, "logps/rejected": -2.0887200832366943, "loss": 0.4627, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.54911470413208, "rewards/margins": 0.539605438709259, "rewards/rejected": -2.0887200832366943, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 8.054075440680405, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.1680445522069931, "logits/rejected": -0.04861944913864136, "logps/chosen": -1.598713755607605, "logps/rejected": -2.1032581329345703, "loss": 0.4622, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.598713755607605, "rewards/margins": 0.5045443773269653, "rewards/rejected": -2.1032581329345703, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 10.55260819934486, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.16315996646881104, "logits/rejected": -0.020486649125814438, "logps/chosen": -1.553241491317749, "logps/rejected": -2.0550389289855957, "loss": 0.4807, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.553241491317749, "rewards/margins": 0.5017975568771362, "rewards/rejected": -2.0550389289855957, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 8.208102393418207, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.17600563168525696, "logits/rejected": 0.09056280553340912, "logps/chosen": -1.7380603551864624, "logps/rejected": -2.2843708992004395, "loss": 0.5227, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7380603551864624, "rewards/margins": 0.5463104248046875, "rewards/rejected": -2.2843708992004395, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 7.777743271463182, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.1516670286655426, "logits/rejected": -0.007256799843162298, "logps/chosen": -1.4837692975997925, "logps/rejected": -2.108306407928467, "loss": 0.425, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4837692975997925, "rewards/margins": 0.6245372891426086, "rewards/rejected": -2.108306407928467, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 9.81631391221655, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.18306465446949005, "logits/rejected": 0.0014761544298380613, "logps/chosen": -1.6160571575164795, "logps/rejected": -2.394860029220581, "loss": 0.4075, "rewards/accuracies": 0.75, "rewards/chosen": -1.6160571575164795, "rewards/margins": 0.7788026928901672, "rewards/rejected": -2.394860029220581, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 6.452889048944105, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.17065966129302979, "logits/rejected": -0.07790778577327728, "logps/chosen": -1.5877543687820435, "logps/rejected": -2.1768155097961426, "loss": 0.4458, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5877543687820435, "rewards/margins": 0.5890610814094543, "rewards/rejected": -2.1768155097961426, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 9.321988859582989, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.06224856898188591, "logits/rejected": 0.07909824699163437, "logps/chosen": -1.5559574365615845, "logps/rejected": -2.1987593173980713, "loss": 0.4432, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5559574365615845, "rewards/margins": 0.6428017616271973, "rewards/rejected": -2.1987593173980713, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 10.582160918187041, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.25316867232322693, "logits/rejected": -0.08437500149011612, "logps/chosen": -1.5972328186035156, "logps/rejected": -2.167891025543213, "loss": 0.4669, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5972328186035156, "rewards/margins": 0.5706582069396973, "rewards/rejected": -2.167891025543213, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 6.386920358721332, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.034543924033641815, "logits/rejected": 0.14559173583984375, "logps/chosen": -1.6439317464828491, "logps/rejected": -2.2543246746063232, "loss": 0.4517, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6439317464828491, "rewards/margins": 0.6103931665420532, "rewards/rejected": -2.2543246746063232, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 5.94696743237734, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.05548275262117386, "logits/rejected": 0.03375374525785446, "logps/chosen": -1.703317642211914, "logps/rejected": -2.1584787368774414, "loss": 0.498, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.703317642211914, "rewards/margins": 0.45516103506088257, "rewards/rejected": -2.1584787368774414, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 8.161909370724064, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.19456695020198822, "logits/rejected": -0.04071703553199768, "logps/chosen": -1.6626838445663452, "logps/rejected": -2.2138752937316895, "loss": 0.4566, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6626838445663452, "rewards/margins": 0.5511918067932129, "rewards/rejected": -2.2138752937316895, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 10.941090171543163, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.048779528588056564, "logits/rejected": 0.07540126889944077, "logps/chosen": -1.5606415271759033, "logps/rejected": -2.1610100269317627, "loss": 0.4526, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5606415271759033, "rewards/margins": 0.6003684997558594, "rewards/rejected": -2.1610100269317627, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 8.06676476976768, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.06870637834072113, "logits/rejected": 0.08948095142841339, "logps/chosen": -1.589386224746704, "logps/rejected": -2.1881346702575684, "loss": 0.4434, "rewards/accuracies": 0.71875, "rewards/chosen": -1.589386224746704, "rewards/margins": 0.598748505115509, "rewards/rejected": -2.1881346702575684, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 6.56813678065148, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.15020890533924103, "logits/rejected": 0.21405506134033203, "logps/chosen": -1.659999132156372, "logps/rejected": -2.311579942703247, "loss": 0.44, "rewards/accuracies": 0.6875, "rewards/chosen": -1.659999132156372, "rewards/margins": 0.651580810546875, "rewards/rejected": -2.311579942703247, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 8.792619503104397, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.09379534423351288, "logits/rejected": 0.018929172307252884, "logps/chosen": -1.4578596353530884, "logps/rejected": -2.0011513233184814, "loss": 0.436, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4578596353530884, "rewards/margins": 0.5432916879653931, "rewards/rejected": -2.0011513233184814, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 6.3272870492416144, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.13952267169952393, "logits/rejected": 0.1145879253745079, "logps/chosen": -1.5964546203613281, "logps/rejected": -2.1960668563842773, "loss": 0.4395, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5964546203613281, "rewards/margins": 0.5996121168136597, "rewards/rejected": -2.1960668563842773, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 8.622017180278993, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.0669303685426712, "logits/rejected": 0.07052404433488846, "logps/chosen": -1.5545414686203003, "logps/rejected": -2.067183017730713, "loss": 0.4609, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5545414686203003, "rewards/margins": 0.512641429901123, "rewards/rejected": -2.067183017730713, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 7.322388177339634, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.10841120779514313, "logits/rejected": -0.02166413888335228, "logps/chosen": -1.7624595165252686, "logps/rejected": -2.2892837524414062, "loss": 0.503, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7624595165252686, "rewards/margins": 0.5268241167068481, "rewards/rejected": -2.2892837524414062, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 7.897692266543979, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.218820720911026, "logits/rejected": -0.04769423231482506, "logps/chosen": -1.7774940729141235, "logps/rejected": -2.423774242401123, "loss": 0.48, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.7774940729141235, "rewards/margins": 0.6462799906730652, "rewards/rejected": -2.423774242401123, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 9.519332104300528, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.1914810836315155, "logits/rejected": -0.006599991116672754, "logps/chosen": -1.7079054117202759, "logps/rejected": -2.3030059337615967, "loss": 0.4648, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7079054117202759, "rewards/margins": 0.5951007008552551, "rewards/rejected": -2.3030059337615967, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 6.601548809437462, "learning_rate": 4.293751575992455e-07, "logits/chosen": 0.016861936077475548, "logits/rejected": 0.0756213515996933, "logps/chosen": -1.7027183771133423, "logps/rejected": -2.2110085487365723, "loss": 0.4608, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7027183771133423, "rewards/margins": 0.5082901120185852, "rewards/rejected": -2.2110085487365723, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 15.056240566266336, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.21691083908081055, "logits/rejected": -0.03100084699690342, "logps/chosen": -1.6918230056762695, "logps/rejected": -2.2261364459991455, "loss": 0.4796, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6918230056762695, "rewards/margins": 0.5343132019042969, "rewards/rejected": -2.2261364459991455, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 7.411336469645185, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.11473717540502548, "logits/rejected": 0.12198439985513687, "logps/chosen": -1.6622596979141235, "logps/rejected": -2.4046096801757812, "loss": 0.4276, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6622596979141235, "rewards/margins": 0.7423499822616577, "rewards/rejected": -2.4046096801757812, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 7.205859013460629, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.18044662475585938, "logits/rejected": 0.019443240016698837, "logps/chosen": -1.7143522500991821, "logps/rejected": -2.292325735092163, "loss": 0.448, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7143522500991821, "rewards/margins": 0.5779733061790466, "rewards/rejected": -2.292325735092163, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 7.803845463524999, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.06799592822790146, "logits/rejected": 0.06416481733322144, "logps/chosen": -1.5456761121749878, "logps/rejected": -2.1574549674987793, "loss": 0.4363, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5456761121749878, "rewards/margins": 0.6117784976959229, "rewards/rejected": -2.1574549674987793, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 19.342497369517552, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.11853525787591934, "logits/rejected": 0.05644531920552254, "logps/chosen": -1.7124582529067993, "logps/rejected": -2.213517189025879, "loss": 0.4919, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7124582529067993, "rewards/margins": 0.5010589361190796, "rewards/rejected": -2.213517189025879, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 7.567209748331828, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.0797075629234314, "logits/rejected": 0.10979306697845459, "logps/chosen": -1.5406991243362427, "logps/rejected": -2.105896234512329, "loss": 0.455, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5406991243362427, "rewards/margins": 0.5651971101760864, "rewards/rejected": -2.105896234512329, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 7.19770998495049, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.10256737470626831, "logits/rejected": 0.10403762012720108, "logps/chosen": -1.4860495328903198, "logps/rejected": -2.0060622692108154, "loss": 0.451, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4860495328903198, "rewards/margins": 0.5200127959251404, "rewards/rejected": -2.0060622692108154, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 8.47392121776634, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.24208185076713562, "logits/rejected": -0.07218058407306671, "logps/chosen": -1.686939001083374, "logps/rejected": -2.1191635131835938, "loss": 0.4775, "rewards/accuracies": 0.65625, "rewards/chosen": -1.686939001083374, "rewards/margins": 0.4322245121002197, "rewards/rejected": -2.1191635131835938, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 6.5141770485478245, "learning_rate": 4.1552863054229116e-07, "logits/chosen": 0.03692032769322395, "logits/rejected": 0.092523492872715, "logps/chosen": -1.720044732093811, "logps/rejected": -2.1281657218933105, "loss": 0.5405, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.720044732093811, "rewards/margins": 0.408121258020401, "rewards/rejected": -2.1281657218933105, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 7.942683936468173, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.1558951735496521, "logits/rejected": 0.10409434139728546, "logps/chosen": -1.5908839702606201, "logps/rejected": -2.121629238128662, "loss": 0.4622, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5908839702606201, "rewards/margins": 0.5307454466819763, "rewards/rejected": -2.121629238128662, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 7.6816105746107946, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.1612395942211151, "logits/rejected": 0.02495410479605198, "logps/chosen": -1.7327139377593994, "logps/rejected": -2.251650810241699, "loss": 0.4619, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7327139377593994, "rewards/margins": 0.5189366340637207, "rewards/rejected": -2.251650810241699, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 9.512459842728083, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.1760474145412445, "logits/rejected": 0.010972839780151844, "logps/chosen": -1.7028369903564453, "logps/rejected": -2.0960617065429688, "loss": 0.5361, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7028369903564453, "rewards/margins": 0.3932246267795563, "rewards/rejected": -2.0960617065429688, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 10.922935346538326, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.004866510629653931, "logits/rejected": 0.0685608759522438, "logps/chosen": -1.7979533672332764, "logps/rejected": -2.2760138511657715, "loss": 0.5352, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7979533672332764, "rewards/margins": 0.4780604839324951, "rewards/rejected": -2.2760138511657715, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 5.69581796428529, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.21498322486877441, "logits/rejected": -0.03498520702123642, "logps/chosen": -1.6010398864746094, "logps/rejected": -2.136812686920166, "loss": 0.452, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6010398864746094, "rewards/margins": 0.5357726812362671, "rewards/rejected": -2.136812686920166, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 6.006515018235091, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.1718832403421402, "logits/rejected": -0.13623422384262085, "logps/chosen": -1.730212926864624, "logps/rejected": -2.1370067596435547, "loss": 0.5038, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.730212926864624, "rewards/margins": 0.40679341554641724, "rewards/rejected": -2.1370067596435547, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 11.07218107658687, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.09494385868310928, "logits/rejected": 0.061937130987644196, "logps/chosen": -1.6549794673919678, "logps/rejected": -2.041999101638794, "loss": 0.5149, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6549794673919678, "rewards/margins": 0.3870196044445038, "rewards/rejected": -2.041999101638794, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 7.940606467078925, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.1590867042541504, "logits/rejected": -0.021192366257309914, "logps/chosen": -1.52074134349823, "logps/rejected": -2.099182605743408, "loss": 0.4424, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.52074134349823, "rewards/margins": 0.5784412026405334, "rewards/rejected": -2.099182605743408, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 7.610440644814737, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.09842582046985626, "logits/rejected": 0.040536586195230484, "logps/chosen": -1.5329501628875732, "logps/rejected": -1.9897925853729248, "loss": 0.4846, "rewards/accuracies": 0.625, "rewards/chosen": -1.5329501628875732, "rewards/margins": 0.4568423330783844, "rewards/rejected": -1.9897925853729248, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 6.908427037752891, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.1355098932981491, "logits/rejected": 0.04013829678297043, "logps/chosen": -1.5662481784820557, "logps/rejected": -2.051380157470703, "loss": 0.4604, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5662481784820557, "rewards/margins": 0.4851318895816803, "rewards/rejected": -2.051380157470703, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 13.622776370546523, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.15083037316799164, "logits/rejected": 0.12041524797677994, "logps/chosen": -1.7199735641479492, "logps/rejected": -2.1616084575653076, "loss": 0.491, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7199735641479492, "rewards/margins": 0.44163504242897034, "rewards/rejected": -2.1616084575653076, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 10.595501937743313, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.1448957473039627, "logits/rejected": -0.011075102724134922, "logps/chosen": -1.6675622463226318, "logps/rejected": -2.2203192710876465, "loss": 0.4799, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6675622463226318, "rewards/margins": 0.5527569055557251, "rewards/rejected": -2.2203192710876465, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 13.930594176074814, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.08221431076526642, "logits/rejected": 0.051155705004930496, "logps/chosen": -1.7290515899658203, "logps/rejected": -2.320524215698242, "loss": 0.4911, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7290515899658203, "rewards/margins": 0.5914725065231323, "rewards/rejected": -2.320524215698242, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 5.515939199180583, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.1595170646905899, "logits/rejected": -0.1026882752776146, "logps/chosen": -1.563403606414795, "logps/rejected": -2.0864171981811523, "loss": 0.4836, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.563403606414795, "rewards/margins": 0.523013710975647, "rewards/rejected": -2.0864171981811523, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 7.14811328865067, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.16500917077064514, "logits/rejected": -0.01280413568019867, "logps/chosen": -1.6966482400894165, "logps/rejected": -2.3979029655456543, "loss": 0.4502, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6966482400894165, "rewards/margins": 0.7012547254562378, "rewards/rejected": -2.3979029655456543, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 8.963510680250758, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.16919752955436707, "logits/rejected": -0.05342903733253479, "logps/chosen": -1.5686460733413696, "logps/rejected": -2.179141044616699, "loss": 0.4199, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5686460733413696, "rewards/margins": 0.6104949712753296, "rewards/rejected": -2.179141044616699, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 9.148830395819887, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.19685237109661102, "logits/rejected": -0.11372096836566925, "logps/chosen": -1.491294264793396, "logps/rejected": -2.1185173988342285, "loss": 0.4591, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.491294264793396, "rewards/margins": 0.6272231340408325, "rewards/rejected": -2.1185173988342285, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 5.650871199401895, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.1758863776922226, "logits/rejected": 0.08714493364095688, "logps/chosen": -1.6788581609725952, "logps/rejected": -2.230807065963745, "loss": 0.4689, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6788581609725952, "rewards/margins": 0.5519488453865051, "rewards/rejected": -2.230807065963745, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 10.11217423318757, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.299620121717453, "logits/rejected": -0.09718199819326401, "logps/chosen": -1.6020921468734741, "logps/rejected": -2.194240093231201, "loss": 0.4497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6020921468734741, "rewards/margins": 0.5921476483345032, "rewards/rejected": -2.194240093231201, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 6.897071905524454, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.13259075582027435, "logits/rejected": 0.07207486778497696, "logps/chosen": -1.8694835901260376, "logps/rejected": -2.438405990600586, "loss": 0.4942, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8694835901260376, "rewards/margins": 0.568922221660614, "rewards/rejected": -2.438405990600586, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 13.067331222993886, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.017802337184548378, "logits/rejected": 0.12826618552207947, "logps/chosen": -1.5886718034744263, "logps/rejected": -2.158238172531128, "loss": 0.4689, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5886718034744263, "rewards/margins": 0.5695662498474121, "rewards/rejected": -2.158238172531128, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 6.723886484925612, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.19222888350486755, "logits/rejected": -0.04990043863654137, "logps/chosen": -1.5873486995697021, "logps/rejected": -2.1005141735076904, "loss": 0.4407, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5873486995697021, "rewards/margins": 0.5131653547286987, "rewards/rejected": -2.1005141735076904, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 8.090915901720049, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.16646277904510498, "logits/rejected": -0.056137360632419586, "logps/chosen": -1.7569866180419922, "logps/rejected": -2.33811092376709, "loss": 0.4669, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7569866180419922, "rewards/margins": 0.5811241269111633, "rewards/rejected": -2.33811092376709, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 11.127920509847229, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.1740160435438156, "logits/rejected": -0.09970130771398544, "logps/chosen": -1.6219441890716553, "logps/rejected": -2.072610378265381, "loss": 0.4727, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6219441890716553, "rewards/margins": 0.4506661891937256, "rewards/rejected": -2.072610378265381, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 6.103896012538862, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.020819351077079773, "logits/rejected": 0.023052120581269264, "logps/chosen": -1.7564241886138916, "logps/rejected": -2.2695279121398926, "loss": 0.4844, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7564241886138916, "rewards/margins": 0.5131038427352905, "rewards/rejected": -2.2695279121398926, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 5.681022834546637, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.06041911244392395, "logits/rejected": 0.07516376674175262, "logps/chosen": -1.6208909749984741, "logps/rejected": -2.129011392593384, "loss": 0.4638, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6208909749984741, "rewards/margins": 0.5081202983856201, "rewards/rejected": -2.129011392593384, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 7.781775749564612, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.08120614290237427, "logits/rejected": 0.07273395359516144, "logps/chosen": -1.6656090021133423, "logps/rejected": -2.098789691925049, "loss": 0.4748, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6656090021133423, "rewards/margins": 0.43318071961402893, "rewards/rejected": -2.098789691925049, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 9.48353706058087, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.1604049652814865, "logits/rejected": 0.04220633581280708, "logps/chosen": -1.7715049982070923, "logps/rejected": -2.206753730773926, "loss": 0.4725, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7715049982070923, "rewards/margins": 0.4352489113807678, "rewards/rejected": -2.206753730773926, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 6.346724690472188, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.07206880301237106, "logits/rejected": 0.107447549700737, "logps/chosen": -1.6419788599014282, "logps/rejected": -2.2720589637756348, "loss": 0.4538, "rewards/accuracies": 0.75, "rewards/chosen": -1.6419788599014282, "rewards/margins": 0.6300797462463379, "rewards/rejected": -2.2720589637756348, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 9.686977450074954, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.10207635164260864, "logits/rejected": 0.06881222128868103, "logps/chosen": -1.6468238830566406, "logps/rejected": -2.218038558959961, "loss": 0.4496, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6468238830566406, "rewards/margins": 0.5712145566940308, "rewards/rejected": -2.218038558959961, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 11.0937691254103, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.10623686015605927, "logits/rejected": 0.0007482111686840653, "logps/chosen": -1.5784227848052979, "logps/rejected": -2.1373276710510254, "loss": 0.4398, "rewards/accuracies": 0.75, "rewards/chosen": -1.5784227848052979, "rewards/margins": 0.5589048266410828, "rewards/rejected": -2.1373276710510254, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 8.003948898972345, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.13302062451839447, "logits/rejected": 0.0944342166185379, "logps/chosen": -1.6523277759552002, "logps/rejected": -2.26509952545166, "loss": 0.4606, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6523277759552002, "rewards/margins": 0.6127719283103943, "rewards/rejected": -2.26509952545166, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 6.954887182892608, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.17810526490211487, "logits/rejected": -0.001887299120426178, "logps/chosen": -1.6607215404510498, "logps/rejected": -2.2467894554138184, "loss": 0.4583, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6607215404510498, "rewards/margins": 0.5860679745674133, "rewards/rejected": -2.2467894554138184, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 7.156576446881386, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.08237972855567932, "logits/rejected": 0.14814645051956177, "logps/chosen": -1.633213996887207, "logps/rejected": -2.1915502548217773, "loss": 0.4519, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.633213996887207, "rewards/margins": 0.5583363175392151, "rewards/rejected": -2.1915502548217773, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 8.905250369111537, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.07599402964115143, "logits/rejected": 0.010531652718782425, "logps/chosen": -1.6511253118515015, "logps/rejected": -2.209929943084717, "loss": 0.4472, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6511253118515015, "rewards/margins": 0.5588047504425049, "rewards/rejected": -2.209929943084717, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 7.931935498250657, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.19470389187335968, "logits/rejected": -0.037597399204969406, "logps/chosen": -1.6013845205307007, "logps/rejected": -2.1648612022399902, "loss": 0.4558, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6013845205307007, "rewards/margins": 0.5634766221046448, "rewards/rejected": -2.1648612022399902, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 10.148896110408023, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.1410907506942749, "logits/rejected": 0.12587939202785492, "logps/chosen": -1.6886818408966064, "logps/rejected": -2.1082732677459717, "loss": 0.4977, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6886818408966064, "rewards/margins": 0.4195917248725891, "rewards/rejected": -2.1082732677459717, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 6.563079399026398, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.29996955394744873, "logits/rejected": 0.03556128218770027, "logps/chosen": -1.6634353399276733, "logps/rejected": -2.2374510765075684, "loss": 0.4377, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6634353399276733, "rewards/margins": 0.574015736579895, "rewards/rejected": -2.2374510765075684, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 6.881726948531249, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.05344680696725845, "logits/rejected": 0.058273058384656906, "logps/chosen": -1.6721327304840088, "logps/rejected": -2.1411499977111816, "loss": 0.4855, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6721327304840088, "rewards/margins": 0.46901735663414, "rewards/rejected": -2.1411499977111816, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 8.552675104901553, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.21700994670391083, "logits/rejected": -0.0454895906150341, "logps/chosen": -1.5820682048797607, "logps/rejected": -2.132432460784912, "loss": 0.4773, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5820682048797607, "rewards/margins": 0.5503643751144409, "rewards/rejected": -2.132432460784912, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 8.661362826118578, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.09347380697727203, "logits/rejected": 0.019575998187065125, "logps/chosen": -1.6075941324234009, "logps/rejected": -2.1575798988342285, "loss": 0.4474, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6075941324234009, "rewards/margins": 0.5499861836433411, "rewards/rejected": -2.1575798988342285, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 6.270281457883015, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.08457478135824203, "logits/rejected": 0.009884688071906567, "logps/chosen": -1.5520362854003906, "logps/rejected": -2.2914605140686035, "loss": 0.4118, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5520362854003906, "rewards/margins": 0.7394243478775024, "rewards/rejected": -2.2914605140686035, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 9.192362510439247, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.1555635631084442, "logits/rejected": 0.02932533621788025, "logps/chosen": -1.5959014892578125, "logps/rejected": -2.139308452606201, "loss": 0.4402, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5959014892578125, "rewards/margins": 0.5434070825576782, "rewards/rejected": -2.139308452606201, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 5.323861590051058, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.03380977362394333, "logits/rejected": 0.1167793720960617, "logps/chosen": -1.6290137767791748, "logps/rejected": -2.2042770385742188, "loss": 0.4687, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6290137767791748, "rewards/margins": 0.5752629041671753, "rewards/rejected": -2.2042770385742188, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 7.951001927682395, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.11662764847278595, "logits/rejected": -0.04703262448310852, "logps/chosen": -1.6219335794448853, "logps/rejected": -2.2743890285491943, "loss": 0.4099, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6219335794448853, "rewards/margins": 0.6524555087089539, "rewards/rejected": -2.2743890285491943, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 16.212913531796147, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.022108901292085648, "logits/rejected": 0.05072753503918648, "logps/chosen": -1.79641854763031, "logps/rejected": -2.3403162956237793, "loss": 0.4603, "rewards/accuracies": 0.71875, "rewards/chosen": -1.79641854763031, "rewards/margins": 0.5438976287841797, "rewards/rejected": -2.3403162956237793, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 7.614280127339638, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.07231578230857849, "logits/rejected": 0.01361311785876751, "logps/chosen": -1.737091064453125, "logps/rejected": -2.203265905380249, "loss": 0.5005, "rewards/accuracies": 0.65625, "rewards/chosen": -1.737091064453125, "rewards/margins": 0.46617498993873596, "rewards/rejected": -2.203265905380249, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 7.178477449956493, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.23460988700389862, "logits/rejected": 0.002154585672542453, "logps/chosen": -1.6241573095321655, "logps/rejected": -2.1807315349578857, "loss": 0.4599, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6241573095321655, "rewards/margins": 0.5565743446350098, "rewards/rejected": -2.1807315349578857, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 8.297357012287913, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.07165677845478058, "logits/rejected": 0.1221420019865036, "logps/chosen": -1.732013463973999, "logps/rejected": -2.454972982406616, "loss": 0.4619, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.732013463973999, "rewards/margins": 0.722959578037262, "rewards/rejected": -2.454972982406616, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.2472785860300064, "eval_logits/rejected": 0.3536233901977539, "eval_logps/chosen": -1.6913567781448364, "eval_logps/rejected": -2.253035545349121, "eval_loss": 0.46998971700668335, "eval_rewards/accuracies": 0.672848641872406, "eval_rewards/chosen": -1.6913567781448364, "eval_rewards/margins": 0.5616786479949951, "eval_rewards/rejected": -2.253035545349121, "eval_runtime": 40.4643, "eval_samples_per_second": 33.239, "eval_steps_per_second": 8.328, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 8.245177345796815, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.17350052297115326, "logits/rejected": -0.12200416624546051, "logps/chosen": -1.6428953409194946, "logps/rejected": -2.078484296798706, "loss": 0.4875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6428953409194946, "rewards/margins": 0.43558892607688904, "rewards/rejected": -2.078484296798706, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 7.038874077568448, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.0658680647611618, "logits/rejected": 0.08376292139291763, "logps/chosen": -1.6610826253890991, "logps/rejected": -2.2041733264923096, "loss": 0.4565, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6610826253890991, "rewards/margins": 0.5430906414985657, "rewards/rejected": -2.2041733264923096, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 8.929872434919973, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.0324077233672142, "logits/rejected": 0.10529638826847076, "logps/chosen": -1.7204697132110596, "logps/rejected": -2.2614822387695312, "loss": 0.4721, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7204697132110596, "rewards/margins": 0.5410124063491821, "rewards/rejected": -2.2614822387695312, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 11.282153144878993, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.08990032970905304, "logits/rejected": -0.0012932062381878495, "logps/chosen": -1.7916405200958252, "logps/rejected": -2.211171865463257, "loss": 0.5058, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.7916405200958252, "rewards/margins": 0.4195317327976227, "rewards/rejected": -2.211171865463257, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 7.053475496602356, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.026878291741013527, "logits/rejected": -0.00260886549949646, "logps/chosen": -1.6487003564834595, "logps/rejected": -2.1165473461151123, "loss": 0.4781, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6487003564834595, "rewards/margins": 0.46784719824790955, "rewards/rejected": -2.1165473461151123, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 13.407752462405028, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.16567979753017426, "logits/rejected": -0.10723850876092911, "logps/chosen": -1.662584662437439, "logps/rejected": -2.146472454071045, "loss": 0.5173, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.662584662437439, "rewards/margins": 0.4838876724243164, "rewards/rejected": -2.146472454071045, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 6.95183946634287, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.17405828833580017, "logits/rejected": -0.06884922087192535, "logps/chosen": -1.5877101421356201, "logps/rejected": -2.123340368270874, "loss": 0.4493, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5877101421356201, "rewards/margins": 0.5356300473213196, "rewards/rejected": -2.123340368270874, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 6.448471367999517, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.11658506095409393, "logits/rejected": 0.00267019122838974, "logps/chosen": -1.7437679767608643, "logps/rejected": -2.1515276432037354, "loss": 0.4888, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.7437679767608643, "rewards/margins": 0.40775972604751587, "rewards/rejected": -2.1515276432037354, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 6.549249814293363, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.15045621991157532, "logits/rejected": -0.0032274401746690273, "logps/chosen": -1.7561931610107422, "logps/rejected": -2.101285219192505, "loss": 0.5195, "rewards/accuracies": 0.625, "rewards/chosen": -1.7561931610107422, "rewards/margins": 0.34509220719337463, "rewards/rejected": -2.101285219192505, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 14.54735220349112, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.18200799822807312, "logits/rejected": -0.06308607757091522, "logps/chosen": -1.6302919387817383, "logps/rejected": -2.0607686042785645, "loss": 0.4697, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6302919387817383, "rewards/margins": 0.4304766058921814, "rewards/rejected": -2.0607686042785645, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 8.208711093423526, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.16025730967521667, "logits/rejected": -0.017460089176893234, "logps/chosen": -1.7145850658416748, "logps/rejected": -2.2167675495147705, "loss": 0.4741, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7145850658416748, "rewards/margins": 0.5021826028823853, "rewards/rejected": -2.2167675495147705, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 11.158235620662094, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.2111811637878418, "logits/rejected": -0.06510473787784576, "logps/chosen": -1.5707592964172363, "logps/rejected": -2.097867250442505, "loss": 0.4439, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5707592964172363, "rewards/margins": 0.5271077156066895, "rewards/rejected": -2.097867250442505, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 7.496502778596213, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.12833727896213531, "logits/rejected": -0.0004452556313481182, "logps/chosen": -1.4616782665252686, "logps/rejected": -1.99832022190094, "loss": 0.473, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4616782665252686, "rewards/margins": 0.5366418957710266, "rewards/rejected": -1.99832022190094, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 6.788426904036366, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.15903808176517487, "logits/rejected": 0.011359068565070629, "logps/chosen": -1.576718807220459, "logps/rejected": -2.178205728530884, "loss": 0.4339, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.576718807220459, "rewards/margins": 0.60148686170578, "rewards/rejected": -2.178205728530884, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 7.69199344550379, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.1651841551065445, "logits/rejected": 0.0014219998847693205, "logps/chosen": -1.7308480739593506, "logps/rejected": -2.467891216278076, "loss": 0.4581, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7308480739593506, "rewards/margins": 0.7370426654815674, "rewards/rejected": -2.467891216278076, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 10.123508759251004, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.17970092594623566, "logits/rejected": -0.04212270304560661, "logps/chosen": -1.4600975513458252, "logps/rejected": -1.9849493503570557, "loss": 0.4139, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4600975513458252, "rewards/margins": 0.5248516798019409, "rewards/rejected": -1.9849493503570557, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 8.376087176496464, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.19717036187648773, "logits/rejected": -0.09891363233327866, "logps/chosen": -1.6186168193817139, "logps/rejected": -2.2157680988311768, "loss": 0.4593, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6186168193817139, "rewards/margins": 0.5971512794494629, "rewards/rejected": -2.2157680988311768, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 6.7151213257053115, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.15891586244106293, "logits/rejected": 0.02692449651658535, "logps/chosen": -1.6849273443222046, "logps/rejected": -2.221229076385498, "loss": 0.4698, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6849273443222046, "rewards/margins": 0.5363017916679382, "rewards/rejected": -2.221229076385498, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 7.3607045473611725, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.08959382027387619, "logits/rejected": -0.021063124760985374, "logps/chosen": -1.6931654214859009, "logps/rejected": -2.2204737663269043, "loss": 0.4932, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6931654214859009, "rewards/margins": 0.5273082256317139, "rewards/rejected": -2.2204737663269043, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 6.440989845256881, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.09416007995605469, "logits/rejected": 0.036277417093515396, "logps/chosen": -1.5540186166763306, "logps/rejected": -2.0784759521484375, "loss": 0.4553, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5540186166763306, "rewards/margins": 0.5244571566581726, "rewards/rejected": -2.0784759521484375, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 6.480476217777915, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.1344822645187378, "logits/rejected": -0.048076145350933075, "logps/chosen": -1.5913596153259277, "logps/rejected": -2.105093479156494, "loss": 0.4663, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5913596153259277, "rewards/margins": 0.5137335062026978, "rewards/rejected": -2.105093479156494, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 9.18694392555604, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.006093072704970837, "logits/rejected": 0.06943705677986145, "logps/chosen": -1.617544174194336, "logps/rejected": -2.0742499828338623, "loss": 0.4996, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.617544174194336, "rewards/margins": 0.4567059874534607, "rewards/rejected": -2.0742499828338623, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 6.001853808141746, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.06737169623374939, "logits/rejected": 0.012567895464599133, "logps/chosen": -1.683852195739746, "logps/rejected": -2.248185396194458, "loss": 0.467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.683852195739746, "rewards/margins": 0.5643332600593567, "rewards/rejected": -2.248185396194458, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 6.660121910757817, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.12260043621063232, "logits/rejected": -0.02947406843304634, "logps/chosen": -1.5896680355072021, "logps/rejected": -2.036689281463623, "loss": 0.4639, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5896680355072021, "rewards/margins": 0.4470215439796448, "rewards/rejected": -2.036689281463623, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 5.646075825330403, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.18432314693927765, "logits/rejected": -0.06993330270051956, "logps/chosen": -1.6310367584228516, "logps/rejected": -2.1323130130767822, "loss": 0.4929, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6310367584228516, "rewards/margins": 0.5012762546539307, "rewards/rejected": -2.1323130130767822, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 7.168959288446188, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.0913909524679184, "logits/rejected": 0.06132540851831436, "logps/chosen": -1.530108094215393, "logps/rejected": -2.053109645843506, "loss": 0.434, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.530108094215393, "rewards/margins": 0.5230015516281128, "rewards/rejected": -2.053109645843506, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 12.07565112609945, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.04048456996679306, "logits/rejected": 0.054282475262880325, "logps/chosen": -1.7184568643569946, "logps/rejected": -2.0940983295440674, "loss": 0.5673, "rewards/accuracies": 0.625, "rewards/chosen": -1.7184568643569946, "rewards/margins": 0.3756418228149414, "rewards/rejected": -2.0940983295440674, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 8.001519260259931, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.04994543641805649, "logits/rejected": 0.023034943267703056, "logps/chosen": -1.747633934020996, "logps/rejected": -2.1576180458068848, "loss": 0.5144, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.747633934020996, "rewards/margins": 0.4099845886230469, "rewards/rejected": -2.1576180458068848, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 8.026584855248954, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.12715840339660645, "logits/rejected": -0.024956170469522476, "logps/chosen": -1.7176978588104248, "logps/rejected": -2.180222272872925, "loss": 0.4783, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7176978588104248, "rewards/margins": 0.46252402663230896, "rewards/rejected": -2.180222272872925, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 6.793674601749615, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.1619778871536255, "logits/rejected": -0.05693144351243973, "logps/chosen": -1.5077826976776123, "logps/rejected": -2.157355308532715, "loss": 0.4222, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5077826976776123, "rewards/margins": 0.6495726704597473, "rewards/rejected": -2.157355308532715, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 7.570880006272601, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.12011580169200897, "logits/rejected": 0.006220706272870302, "logps/chosen": -1.6163476705551147, "logps/rejected": -2.311784267425537, "loss": 0.4152, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6163476705551147, "rewards/margins": 0.6954367160797119, "rewards/rejected": -2.311784267425537, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 6.113877979888895, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.2035842388868332, "logits/rejected": -0.15713955461978912, "logps/chosen": -1.640262246131897, "logps/rejected": -2.278890609741211, "loss": 0.4493, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.640262246131897, "rewards/margins": 0.638628363609314, "rewards/rejected": -2.278890609741211, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 11.01817799247406, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.10648135095834732, "logits/rejected": 0.10368646681308746, "logps/chosen": -1.670911192893982, "logps/rejected": -2.3182713985443115, "loss": 0.4542, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.670911192893982, "rewards/margins": 0.6473601460456848, "rewards/rejected": -2.3182713985443115, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 14.597479729423563, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.2192239761352539, "logits/rejected": -0.02926032245159149, "logps/chosen": -1.6242482662200928, "logps/rejected": -2.4483726024627686, "loss": 0.3982, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6242482662200928, "rewards/margins": 0.8241241574287415, "rewards/rejected": -2.4483726024627686, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 8.059448510816614, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.17119763791561127, "logits/rejected": -0.10168623924255371, "logps/chosen": -1.660200834274292, "logps/rejected": -2.2011947631835938, "loss": 0.4613, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.660200834274292, "rewards/margins": 0.5409940481185913, "rewards/rejected": -2.2011947631835938, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 7.2963954591757885, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.10525877773761749, "logits/rejected": -0.05921076610684395, "logps/chosen": -1.691656470298767, "logps/rejected": -2.309962272644043, "loss": 0.4504, "rewards/accuracies": 0.65625, "rewards/chosen": -1.691656470298767, "rewards/margins": 0.6183057427406311, "rewards/rejected": -2.309962272644043, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 4.979435573132258, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.14460299909114838, "logits/rejected": 0.013989953324198723, "logps/chosen": -1.7722980976104736, "logps/rejected": -2.4382071495056152, "loss": 0.444, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7722980976104736, "rewards/margins": 0.6659094095230103, "rewards/rejected": -2.4382071495056152, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 4.966574762485079, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.19783806800842285, "logits/rejected": -0.005297267343848944, "logps/chosen": -1.651984453201294, "logps/rejected": -2.33089280128479, "loss": 0.4105, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.651984453201294, "rewards/margins": 0.6789082884788513, "rewards/rejected": -2.33089280128479, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 7.993045273710177, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.09061770141124725, "logits/rejected": -0.06273224204778671, "logps/chosen": -1.5261223316192627, "logps/rejected": -2.009979724884033, "loss": 0.4645, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5261223316192627, "rewards/margins": 0.4838576912879944, "rewards/rejected": -2.009979724884033, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 8.554508222103172, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.10264910757541656, "logits/rejected": 0.047110434621572495, "logps/chosen": -1.7362077236175537, "logps/rejected": -2.3533945083618164, "loss": 0.4245, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7362077236175537, "rewards/margins": 0.6171868443489075, "rewards/rejected": -2.3533945083618164, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 7.654019775346947, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.09349913895130157, "logits/rejected": 0.049702614545822144, "logps/chosen": -1.501654863357544, "logps/rejected": -2.0668740272521973, "loss": 0.4316, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.501654863357544, "rewards/margins": 0.5652189254760742, "rewards/rejected": -2.0668740272521973, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 8.465695598505357, "learning_rate": 2.81075756698315e-07, "logits/chosen": 0.014327213168144226, "logits/rejected": 0.11599723994731903, "logps/chosen": -1.5694503784179688, "logps/rejected": -2.2847278118133545, "loss": 0.3992, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5694503784179688, "rewards/margins": 0.7152775526046753, "rewards/rejected": -2.2847278118133545, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 8.18038642719341, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.23515768349170685, "logits/rejected": -0.09461261332035065, "logps/chosen": -1.6329929828643799, "logps/rejected": -2.2135508060455322, "loss": 0.4419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6329929828643799, "rewards/margins": 0.5805578231811523, "rewards/rejected": -2.2135508060455322, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 9.462354167131917, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.2195102721452713, "logits/rejected": 0.024111080914735794, "logps/chosen": -1.7403284311294556, "logps/rejected": -2.3458123207092285, "loss": 0.4674, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7403284311294556, "rewards/margins": 0.6054843068122864, "rewards/rejected": -2.3458123207092285, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 7.997152046580276, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.16170576214790344, "logits/rejected": -0.0369705855846405, "logps/chosen": -1.5689483880996704, "logps/rejected": -2.193948268890381, "loss": 0.4057, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5689483880996704, "rewards/margins": 0.6249998807907104, "rewards/rejected": -2.193948268890381, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 7.171140555351103, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.2582949995994568, "logits/rejected": -0.08853115141391754, "logps/chosen": -1.7362453937530518, "logps/rejected": -2.5175719261169434, "loss": 0.3961, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7362453937530518, "rewards/margins": 0.7813268303871155, "rewards/rejected": -2.5175719261169434, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 8.302882438854025, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.1695060431957245, "logits/rejected": 0.02151711843907833, "logps/chosen": -1.7512890100479126, "logps/rejected": -2.4019858837127686, "loss": 0.4376, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7512890100479126, "rewards/margins": 0.6506971120834351, "rewards/rejected": -2.4019858837127686, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 5.657041610732123, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.11862162500619888, "logits/rejected": 0.017155099660158157, "logps/chosen": -1.5637755393981934, "logps/rejected": -2.246851921081543, "loss": 0.4166, "rewards/accuracies": 0.75, "rewards/chosen": -1.5637755393981934, "rewards/margins": 0.6830763220787048, "rewards/rejected": -2.246851921081543, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 4.86072731357907, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.3108099102973938, "logits/rejected": -0.04393802955746651, "logps/chosen": -1.7282326221466064, "logps/rejected": -2.3783435821533203, "loss": 0.4276, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7282326221466064, "rewards/margins": 0.6501110196113586, "rewards/rejected": -2.3783435821533203, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 7.720520496471414, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.20143470168113708, "logits/rejected": -0.004190978594124317, "logps/chosen": -1.7718290090560913, "logps/rejected": -2.40324068069458, "loss": 0.4519, "rewards/accuracies": 0.75, "rewards/chosen": -1.7718290090560913, "rewards/margins": 0.6314113736152649, "rewards/rejected": -2.40324068069458, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 9.601851307310778, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.1245606392621994, "logits/rejected": 0.07988785952329636, "logps/chosen": -1.7244341373443604, "logps/rejected": -2.625386953353882, "loss": 0.3784, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7244341373443604, "rewards/margins": 0.9009529948234558, "rewards/rejected": -2.625386953353882, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 9.767239472769736, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.16443488001823425, "logits/rejected": 0.0017104626167565584, "logps/chosen": -1.639521598815918, "logps/rejected": -2.4078800678253174, "loss": 0.4084, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.639521598815918, "rewards/margins": 0.7683584690093994, "rewards/rejected": -2.4078800678253174, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 9.94581893399318, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.2910632789134979, "logits/rejected": -0.08045108616352081, "logps/chosen": -1.730804681777954, "logps/rejected": -2.239010810852051, "loss": 0.4793, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.730804681777954, "rewards/margins": 0.5082062482833862, "rewards/rejected": -2.239010810852051, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 8.515911312537913, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.16657409071922302, "logits/rejected": -0.0868040919303894, "logps/chosen": -1.544026494026184, "logps/rejected": -2.1228535175323486, "loss": 0.4263, "rewards/accuracies": 0.75, "rewards/chosen": -1.544026494026184, "rewards/margins": 0.578826904296875, "rewards/rejected": -2.1228535175323486, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 7.817080001660532, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.15717177093029022, "logits/rejected": -0.039968498051166534, "logps/chosen": -1.7651211023330688, "logps/rejected": -2.4039371013641357, "loss": 0.4084, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7651211023330688, "rewards/margins": 0.6388161778450012, "rewards/rejected": -2.4039371013641357, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 13.234424509954827, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.1902887374162674, "logits/rejected": 0.0007076561450958252, "logps/chosen": -1.4341686964035034, "logps/rejected": -2.1403591632843018, "loss": 0.3835, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4341686964035034, "rewards/margins": 0.7061904668807983, "rewards/rejected": -2.1403591632843018, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 7.665456569964181, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.21201202273368835, "logits/rejected": -0.04142715781927109, "logps/chosen": -1.5459574460983276, "logps/rejected": -2.2568836212158203, "loss": 0.4171, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5459574460983276, "rewards/margins": 0.7109262943267822, "rewards/rejected": -2.2568836212158203, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 7.272943920510569, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.07563682645559311, "logits/rejected": 0.026577282696962357, "logps/chosen": -1.688232421875, "logps/rejected": -2.3195266723632812, "loss": 0.4775, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.688232421875, "rewards/margins": 0.6312945485115051, "rewards/rejected": -2.3195266723632812, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 7.498715250549529, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.19716952741146088, "logits/rejected": 0.06388433277606964, "logps/chosen": -1.6599597930908203, "logps/rejected": -2.2328543663024902, "loss": 0.4604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6599597930908203, "rewards/margins": 0.5728943347930908, "rewards/rejected": -2.2328543663024902, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 9.998024026160307, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.02830003760755062, "logits/rejected": -0.0009842514991760254, "logps/chosen": -1.6166940927505493, "logps/rejected": -2.1754493713378906, "loss": 0.439, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6166940927505493, "rewards/margins": 0.5587553977966309, "rewards/rejected": -2.1754493713378906, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 8.850187266653625, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.1222141832113266, "logits/rejected": -0.015364821068942547, "logps/chosen": -1.6841554641723633, "logps/rejected": -2.229933977127075, "loss": 0.4786, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6841554641723633, "rewards/margins": 0.5457783937454224, "rewards/rejected": -2.229933977127075, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 7.375732225201661, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.09060560166835785, "logits/rejected": 0.0630372017621994, "logps/chosen": -1.4682657718658447, "logps/rejected": -2.068972587585449, "loss": 0.4085, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4682657718658447, "rewards/margins": 0.6007068753242493, "rewards/rejected": -2.068972587585449, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 9.87385188756126, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.18940049409866333, "logits/rejected": 0.005272398702800274, "logps/chosen": -1.6955063343048096, "logps/rejected": -2.3606626987457275, "loss": 0.4472, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6955063343048096, "rewards/margins": 0.6651566624641418, "rewards/rejected": -2.3606626987457275, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 9.19159877208008, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.1610020101070404, "logits/rejected": 0.021506082266569138, "logps/chosen": -1.6752796173095703, "logps/rejected": -2.4602067470550537, "loss": 0.4118, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6752796173095703, "rewards/margins": 0.7849270701408386, "rewards/rejected": -2.4602067470550537, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 7.05176851724037, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.1782703846693039, "logits/rejected": -0.05589856952428818, "logps/chosen": -1.6056280136108398, "logps/rejected": -2.3269519805908203, "loss": 0.4104, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6056280136108398, "rewards/margins": 0.7213238477706909, "rewards/rejected": -2.3269519805908203, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 7.157025331610629, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.26441726088523865, "logits/rejected": -0.09193973988294601, "logps/chosen": -1.6612592935562134, "logps/rejected": -2.3515655994415283, "loss": 0.426, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6612592935562134, "rewards/margins": 0.6903061866760254, "rewards/rejected": -2.3515655994415283, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 6.761547588385043, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.1315656453371048, "logits/rejected": -0.059941601008176804, "logps/chosen": -1.7033334970474243, "logps/rejected": -2.383653163909912, "loss": 0.4538, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7033334970474243, "rewards/margins": 0.6803197860717773, "rewards/rejected": -2.383653163909912, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 8.794493495259983, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.1559073030948639, "logits/rejected": -0.07542288303375244, "logps/chosen": -1.601605772972107, "logps/rejected": -2.398118019104004, "loss": 0.4033, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.601605772972107, "rewards/margins": 0.796512246131897, "rewards/rejected": -2.398118019104004, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 8.024824504969668, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.15106254816055298, "logits/rejected": 0.044697605073451996, "logps/chosen": -1.8633956909179688, "logps/rejected": -2.419623851776123, "loss": 0.4859, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8633956909179688, "rewards/margins": 0.5562279224395752, "rewards/rejected": -2.419623851776123, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 10.607116331852657, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.08442585170269012, "logits/rejected": 0.09216947853565216, "logps/chosen": -1.6179497241973877, "logps/rejected": -2.274871826171875, "loss": 0.4374, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6179497241973877, "rewards/margins": 0.6569223403930664, "rewards/rejected": -2.274871826171875, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 6.946013683766209, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.2144765853881836, "logits/rejected": -0.11225824058055878, "logps/chosen": -1.612540602684021, "logps/rejected": -2.390310764312744, "loss": 0.4267, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.612540602684021, "rewards/margins": 0.7777701616287231, "rewards/rejected": -2.390310764312744, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 9.165909096627239, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.11133971065282822, "logits/rejected": 0.020153874531388283, "logps/chosen": -1.6569511890411377, "logps/rejected": -2.468090534210205, "loss": 0.4179, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6569511890411377, "rewards/margins": 0.8111389875411987, "rewards/rejected": -2.468090534210205, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 7.553351107768989, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.1038537248969078, "logits/rejected": 0.024773940443992615, "logps/chosen": -1.6882301568984985, "logps/rejected": -2.4658374786376953, "loss": 0.4055, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6882301568984985, "rewards/margins": 0.777607262134552, "rewards/rejected": -2.4658374786376953, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 6.56923087844321, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.13287505507469177, "logits/rejected": -0.019622355699539185, "logps/chosen": -1.5616213083267212, "logps/rejected": -2.2908406257629395, "loss": 0.4252, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5616213083267212, "rewards/margins": 0.7292193174362183, "rewards/rejected": -2.2908406257629395, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 9.853996902727078, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.09419572353363037, "logits/rejected": 0.03081243857741356, "logps/chosen": -1.4388527870178223, "logps/rejected": -2.0299038887023926, "loss": 0.3964, "rewards/accuracies": 0.75, "rewards/chosen": -1.4388527870178223, "rewards/margins": 0.591050922870636, "rewards/rejected": -2.0299038887023926, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 6.98774293006464, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.21626611053943634, "logits/rejected": -0.12151708453893661, "logps/chosen": -1.5049701929092407, "logps/rejected": -2.2318670749664307, "loss": 0.4011, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5049701929092407, "rewards/margins": 0.7268967628479004, "rewards/rejected": -2.2318670749664307, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 9.852602997322588, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.22341063618659973, "logits/rejected": -0.09986375272274017, "logps/chosen": -1.6260045766830444, "logps/rejected": -2.306483745574951, "loss": 0.4178, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6260045766830444, "rewards/margins": 0.6804793477058411, "rewards/rejected": -2.306483745574951, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 7.6026603009429286, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.2644169330596924, "logits/rejected": -0.11081578582525253, "logps/chosen": -1.5193434953689575, "logps/rejected": -2.251497983932495, "loss": 0.3891, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5193434953689575, "rewards/margins": 0.7321546673774719, "rewards/rejected": -2.251497983932495, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 16.099922407988785, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.30713725090026855, "logits/rejected": -0.00859877746552229, "logps/chosen": -1.6493667364120483, "logps/rejected": -2.3461811542510986, "loss": 0.4233, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6493667364120483, "rewards/margins": 0.696814239025116, "rewards/rejected": -2.3461811542510986, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 9.238621106928637, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.1565760374069214, "logits/rejected": -0.0390843041241169, "logps/chosen": -1.5411382913589478, "logps/rejected": -2.3162026405334473, "loss": 0.4109, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5411382913589478, "rewards/margins": 0.7750640511512756, "rewards/rejected": -2.3162026405334473, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.25557586550712585, "eval_logits/rejected": 0.36768534779548645, "eval_logps/chosen": -1.7061785459518433, "eval_logps/rejected": -2.288332462310791, "eval_loss": 0.4698818624019623, "eval_rewards/accuracies": 0.6780415177345276, "eval_rewards/chosen": -1.7061785459518433, "eval_rewards/margins": 0.5821537375450134, "eval_rewards/rejected": -2.288332462310791, "eval_runtime": 40.4504, "eval_samples_per_second": 33.251, "eval_steps_per_second": 8.331, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 7.738656307031986, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.18038979172706604, "logits/rejected": 0.0640304833650589, "logps/chosen": -1.874886155128479, "logps/rejected": -2.5268025398254395, "loss": 0.4708, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.874886155128479, "rewards/margins": 0.6519161462783813, "rewards/rejected": -2.5268025398254395, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 14.295234423489921, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.19292163848876953, "logits/rejected": -0.07282300293445587, "logps/chosen": -1.6418721675872803, "logps/rejected": -2.139007568359375, "loss": 0.4741, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6418721675872803, "rewards/margins": 0.49713531136512756, "rewards/rejected": -2.139007568359375, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 7.87403006228256, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.18528911471366882, "logits/rejected": 0.10643212497234344, "logps/chosen": -1.6693623065948486, "logps/rejected": -2.2935757637023926, "loss": 0.4593, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6693623065948486, "rewards/margins": 0.6242133975028992, "rewards/rejected": -2.2935757637023926, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 11.378782397452044, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.3253355622291565, "logits/rejected": -0.15068058669567108, "logps/chosen": -1.793304204940796, "logps/rejected": -2.427961826324463, "loss": 0.4565, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.793304204940796, "rewards/margins": 0.634657621383667, "rewards/rejected": -2.427961826324463, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 7.997683167262913, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.16820070147514343, "logits/rejected": -0.045115403831005096, "logps/chosen": -1.5857900381088257, "logps/rejected": -2.2270374298095703, "loss": 0.4313, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5857900381088257, "rewards/margins": 0.6412474513053894, "rewards/rejected": -2.2270374298095703, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 13.60720211731394, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.20668920874595642, "logits/rejected": -0.10016520321369171, "logps/chosen": -1.5637476444244385, "logps/rejected": -2.244633913040161, "loss": 0.4101, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5637476444244385, "rewards/margins": 0.6808861494064331, "rewards/rejected": -2.244633913040161, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 9.53829030286975, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.19638994336128235, "logits/rejected": -0.1516174077987671, "logps/chosen": -1.5569967031478882, "logps/rejected": -2.03334903717041, "loss": 0.44, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5569967031478882, "rewards/margins": 0.4763523042201996, "rewards/rejected": -2.03334903717041, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 8.950672891738877, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.21002106368541718, "logits/rejected": -0.16512136161327362, "logps/chosen": -1.510585069656372, "logps/rejected": -2.10384202003479, "loss": 0.4427, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.510585069656372, "rewards/margins": 0.5932568907737732, "rewards/rejected": -2.10384202003479, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 7.033551595688128, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.16127708554267883, "logits/rejected": -0.015156283974647522, "logps/chosen": -1.600839614868164, "logps/rejected": -2.1919257640838623, "loss": 0.4526, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.600839614868164, "rewards/margins": 0.591086208820343, "rewards/rejected": -2.1919257640838623, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 7.243081132054538, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.2518821358680725, "logits/rejected": -0.13364256918430328, "logps/chosen": -1.6746279001235962, "logps/rejected": -2.160327911376953, "loss": 0.481, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6746279001235962, "rewards/margins": 0.4857003092765808, "rewards/rejected": -2.160327911376953, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 8.511129972572018, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.10275685787200928, "logits/rejected": -0.044627558439970016, "logps/chosen": -1.6533695459365845, "logps/rejected": -2.1032421588897705, "loss": 0.4613, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6533695459365845, "rewards/margins": 0.44987234473228455, "rewards/rejected": -2.1032421588897705, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 8.889612523865395, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.030911307781934738, "logits/rejected": 0.10164520889520645, "logps/chosen": -1.5643908977508545, "logps/rejected": -2.155663251876831, "loss": 0.4308, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5643908977508545, "rewards/margins": 0.5912724137306213, "rewards/rejected": -2.155663251876831, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 8.820734345944151, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.26366811990737915, "logits/rejected": -0.15506787598133087, "logps/chosen": -1.6945518255233765, "logps/rejected": -2.4360978603363037, "loss": 0.4214, "rewards/accuracies": 0.75, "rewards/chosen": -1.6945518255233765, "rewards/margins": 0.7415462136268616, "rewards/rejected": -2.4360978603363037, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 10.484858877252565, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.13762135803699493, "logits/rejected": 0.09991301596164703, "logps/chosen": -1.7282555103302002, "logps/rejected": -2.553313732147217, "loss": 0.4305, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7282555103302002, "rewards/margins": 0.8250584602355957, "rewards/rejected": -2.553313732147217, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 9.825527977211555, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.2015105038881302, "logits/rejected": 0.02460029162466526, "logps/chosen": -1.7387912273406982, "logps/rejected": -2.3589119911193848, "loss": 0.4431, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7387912273406982, "rewards/margins": 0.6201208233833313, "rewards/rejected": -2.3589119911193848, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 6.31698751938309, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.2081146538257599, "logits/rejected": -0.05770736187696457, "logps/chosen": -1.6647611856460571, "logps/rejected": -2.418663263320923, "loss": 0.4236, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6647611856460571, "rewards/margins": 0.7539018988609314, "rewards/rejected": -2.418663263320923, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 8.879467112472868, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.14782652258872986, "logits/rejected": -0.039372630417346954, "logps/chosen": -1.547098159790039, "logps/rejected": -2.1038870811462402, "loss": 0.4279, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.547098159790039, "rewards/margins": 0.5567886233329773, "rewards/rejected": -2.1038870811462402, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 7.153245070369247, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.2555685043334961, "logits/rejected": -0.028570573776960373, "logps/chosen": -1.6288509368896484, "logps/rejected": -2.311668634414673, "loss": 0.4208, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6288509368896484, "rewards/margins": 0.6828176379203796, "rewards/rejected": -2.311668634414673, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 6.438351966198373, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.23030726611614227, "logits/rejected": -0.12892338633537292, "logps/chosen": -1.7237437963485718, "logps/rejected": -2.274494171142578, "loss": 0.4601, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7237437963485718, "rewards/margins": 0.5507504343986511, "rewards/rejected": -2.274494171142578, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 7.174018179165796, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.14087054133415222, "logits/rejected": 0.025855109095573425, "logps/chosen": -1.5315310955047607, "logps/rejected": -2.288343906402588, "loss": 0.39, "rewards/accuracies": 0.75, "rewards/chosen": -1.5315310955047607, "rewards/margins": 0.7568124532699585, "rewards/rejected": -2.288343906402588, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 10.39496658566991, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.1696988344192505, "logits/rejected": 0.05542017146945, "logps/chosen": -1.8359979391098022, "logps/rejected": -2.4664409160614014, "loss": 0.4665, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8359979391098022, "rewards/margins": 0.6304431557655334, "rewards/rejected": -2.4664409160614014, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 14.382956594618541, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.12038995325565338, "logits/rejected": 0.09711913764476776, "logps/chosen": -1.6766479015350342, "logps/rejected": -2.39607310295105, "loss": 0.4273, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6766479015350342, "rewards/margins": 0.7194253206253052, "rewards/rejected": -2.39607310295105, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 11.65386051792557, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.11076556146144867, "logits/rejected": -0.036900877952575684, "logps/chosen": -1.756870985031128, "logps/rejected": -2.4727931022644043, "loss": 0.4129, "rewards/accuracies": 0.75, "rewards/chosen": -1.756870985031128, "rewards/margins": 0.7159223556518555, "rewards/rejected": -2.4727931022644043, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 10.808232216429293, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.17534366250038147, "logits/rejected": -0.06891094148159027, "logps/chosen": -1.5853203535079956, "logps/rejected": -2.308851957321167, "loss": 0.4139, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5853203535079956, "rewards/margins": 0.7235313653945923, "rewards/rejected": -2.308851957321167, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 7.419944983603745, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.2437528818845749, "logits/rejected": 0.011416694149374962, "logps/chosen": -1.72686767578125, "logps/rejected": -2.577498197555542, "loss": 0.3903, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.72686767578125, "rewards/margins": 0.8506305813789368, "rewards/rejected": -2.577498197555542, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 11.756603013604204, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.13519218564033508, "logits/rejected": -0.03631724789738655, "logps/chosen": -1.734135627746582, "logps/rejected": -2.4316511154174805, "loss": 0.4612, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.734135627746582, "rewards/margins": 0.6975155472755432, "rewards/rejected": -2.4316511154174805, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 8.907419906974667, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.12922216951847076, "logits/rejected": -0.13037076592445374, "logps/chosen": -1.8192694187164307, "logps/rejected": -2.6789464950561523, "loss": 0.4434, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8192694187164307, "rewards/margins": 0.8596771359443665, "rewards/rejected": -2.6789464950561523, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 10.60048566572192, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.30562835931777954, "logits/rejected": -0.21553292870521545, "logps/chosen": -1.6569108963012695, "logps/rejected": -2.319675922393799, "loss": 0.4222, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6569108963012695, "rewards/margins": 0.662765383720398, "rewards/rejected": -2.319675922393799, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 9.047295245745207, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.06593813747167587, "logits/rejected": 0.09143920242786407, "logps/chosen": -1.8557246923446655, "logps/rejected": -2.5162012577056885, "loss": 0.4529, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8557246923446655, "rewards/margins": 0.660476565361023, "rewards/rejected": -2.5162012577056885, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 7.207820931020698, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.15539555251598358, "logits/rejected": -0.040627919137477875, "logps/chosen": -1.5200191736221313, "logps/rejected": -2.178874969482422, "loss": 0.4202, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5200191736221313, "rewards/margins": 0.6588557362556458, "rewards/rejected": -2.178874969482422, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 12.420535872658817, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.1266462206840515, "logits/rejected": -0.135248064994812, "logps/chosen": -1.6348772048950195, "logps/rejected": -2.185096025466919, "loss": 0.4488, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6348772048950195, "rewards/margins": 0.5502188801765442, "rewards/rejected": -2.185096025466919, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 11.840234347039022, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.17417390644550323, "logits/rejected": -0.006153729744255543, "logps/chosen": -1.7832558155059814, "logps/rejected": -2.3553450107574463, "loss": 0.4456, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7832558155059814, "rewards/margins": 0.5720891952514648, "rewards/rejected": -2.3553450107574463, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 9.911593389079012, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.16786035895347595, "logits/rejected": -0.021083027124404907, "logps/chosen": -1.6481859683990479, "logps/rejected": -2.342196464538574, "loss": 0.4415, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6481859683990479, "rewards/margins": 0.6940103769302368, "rewards/rejected": -2.342196464538574, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 9.452476825623888, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.05816710740327835, "logits/rejected": 0.02203897014260292, "logps/chosen": -1.5998636484146118, "logps/rejected": -2.272752523422241, "loss": 0.4182, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5998636484146118, "rewards/margins": 0.6728887557983398, "rewards/rejected": -2.272752523422241, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 11.120085563760442, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.16505321860313416, "logits/rejected": -0.036025237292051315, "logps/chosen": -1.6223564147949219, "logps/rejected": -2.1133782863616943, "loss": 0.4662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6223564147949219, "rewards/margins": 0.49102187156677246, "rewards/rejected": -2.1133782863616943, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 6.636165022988251, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.22749702632427216, "logits/rejected": 0.03257812559604645, "logps/chosen": -1.5895249843597412, "logps/rejected": -2.362736225128174, "loss": 0.3917, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5895249843597412, "rewards/margins": 0.7732115983963013, "rewards/rejected": -2.362736225128174, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 13.690886789836892, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.22622433304786682, "logits/rejected": -0.19854703545570374, "logps/chosen": -1.672340750694275, "logps/rejected": -2.3366639614105225, "loss": 0.4424, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.672340750694275, "rewards/margins": 0.6643232107162476, "rewards/rejected": -2.3366639614105225, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 6.961815746027594, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.07682903856039047, "logits/rejected": 0.043538253754377365, "logps/chosen": -1.5186244249343872, "logps/rejected": -2.1954665184020996, "loss": 0.4131, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5186244249343872, "rewards/margins": 0.676842212677002, "rewards/rejected": -2.1954665184020996, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 6.645919676646359, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.3062294125556946, "logits/rejected": -0.14783944189548492, "logps/chosen": -1.6814686059951782, "logps/rejected": -2.210733652114868, "loss": 0.4731, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6814686059951782, "rewards/margins": 0.5292651057243347, "rewards/rejected": -2.210733652114868, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 8.834660884720558, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.0425289161503315, "logits/rejected": -0.05461583659052849, "logps/chosen": -1.7545950412750244, "logps/rejected": -2.3220114707946777, "loss": 0.4763, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7545950412750244, "rewards/margins": 0.5674163103103638, "rewards/rejected": -2.3220114707946777, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 6.516833985551031, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.24489760398864746, "logits/rejected": -0.16647258400917053, "logps/chosen": -1.5728332996368408, "logps/rejected": -2.230311393737793, "loss": 0.4072, "rewards/accuracies": 0.75, "rewards/chosen": -1.5728332996368408, "rewards/margins": 0.6574779748916626, "rewards/rejected": -2.230311393737793, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 6.672611440306558, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.18464837968349457, "logits/rejected": -0.04435797408223152, "logps/chosen": -1.71502685546875, "logps/rejected": -2.6265625953674316, "loss": 0.4022, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.71502685546875, "rewards/margins": 0.9115356206893921, "rewards/rejected": -2.6265625953674316, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 9.941269672071952, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.18763265013694763, "logits/rejected": -0.019227270036935806, "logps/chosen": -1.620009183883667, "logps/rejected": -2.337023973464966, "loss": 0.4358, "rewards/accuracies": 0.71875, "rewards/chosen": -1.620009183883667, "rewards/margins": 0.7170146703720093, "rewards/rejected": -2.337023973464966, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 8.695803020408617, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.22091105580329895, "logits/rejected": 0.024110516533255577, "logps/chosen": -1.6519966125488281, "logps/rejected": -2.322220802307129, "loss": 0.4164, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6519966125488281, "rewards/margins": 0.6702240705490112, "rewards/rejected": -2.322220802307129, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 7.674075401444611, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.2209376096725464, "logits/rejected": 0.006926730275154114, "logps/chosen": -1.7736784219741821, "logps/rejected": -2.4956259727478027, "loss": 0.4177, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7736784219741821, "rewards/margins": 0.7219477891921997, "rewards/rejected": -2.4956259727478027, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 11.138692204961966, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.24214644730091095, "logits/rejected": -0.13526801764965057, "logps/chosen": -1.6981589794158936, "logps/rejected": -2.360462188720703, "loss": 0.4652, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6981589794158936, "rewards/margins": 0.6623033285140991, "rewards/rejected": -2.360462188720703, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 7.175673723499193, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.16565775871276855, "logits/rejected": -0.04023613780736923, "logps/chosen": -1.709749460220337, "logps/rejected": -2.215182065963745, "loss": 0.4806, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.709749460220337, "rewards/margins": 0.5054327249526978, "rewards/rejected": -2.215182065963745, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 6.962077693988592, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.18331244587898254, "logits/rejected": -0.03809729963541031, "logps/chosen": -1.693153738975525, "logps/rejected": -2.308493137359619, "loss": 0.4247, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.693153738975525, "rewards/margins": 0.6153393387794495, "rewards/rejected": -2.308493137359619, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 9.97281855086906, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.24693885445594788, "logits/rejected": -0.09823521226644516, "logps/chosen": -1.7547004222869873, "logps/rejected": -2.385183572769165, "loss": 0.4526, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7547004222869873, "rewards/margins": 0.630483090877533, "rewards/rejected": -2.385183572769165, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 7.17443115060811, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.16163256764411926, "logits/rejected": 0.059814561158418655, "logps/chosen": -1.7835197448730469, "logps/rejected": -2.472378969192505, "loss": 0.4476, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7835197448730469, "rewards/margins": 0.6888591051101685, "rewards/rejected": -2.472378969192505, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 14.990916100680312, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.2199275940656662, "logits/rejected": 0.05258049815893173, "logps/chosen": -1.7845547199249268, "logps/rejected": -2.5384488105773926, "loss": 0.429, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7845547199249268, "rewards/margins": 0.7538946270942688, "rewards/rejected": -2.5384488105773926, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 9.459319072901412, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.15469150245189667, "logits/rejected": -0.025954579934477806, "logps/chosen": -1.5756384134292603, "logps/rejected": -2.242607593536377, "loss": 0.4142, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5756384134292603, "rewards/margins": 0.6669691205024719, "rewards/rejected": -2.242607593536377, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 7.138117539891217, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.28479233384132385, "logits/rejected": -0.08460494875907898, "logps/chosen": -1.683616280555725, "logps/rejected": -2.506190538406372, "loss": 0.3928, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.683616280555725, "rewards/margins": 0.8225743174552917, "rewards/rejected": -2.506190538406372, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 8.214313412345131, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.09482437372207642, "logits/rejected": 0.18521924316883087, "logps/chosen": -1.7577953338623047, "logps/rejected": -2.603116989135742, "loss": 0.4212, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7577953338623047, "rewards/margins": 0.8453216552734375, "rewards/rejected": -2.603116989135742, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 8.145449759864695, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.14908023178577423, "logits/rejected": -0.035757966339588165, "logps/chosen": -1.7827965021133423, "logps/rejected": -2.42545223236084, "loss": 0.4633, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7827965021133423, "rewards/margins": 0.6426557302474976, "rewards/rejected": -2.42545223236084, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 10.841388079074484, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.22195784747600555, "logits/rejected": -0.1026119738817215, "logps/chosen": -1.6750249862670898, "logps/rejected": -2.3458476066589355, "loss": 0.4198, "rewards/accuracies": 0.75, "rewards/chosen": -1.6750249862670898, "rewards/margins": 0.6708227396011353, "rewards/rejected": -2.3458476066589355, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 10.128891481875401, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.10807888209819794, "logits/rejected": -0.11603794991970062, "logps/chosen": -1.6098880767822266, "logps/rejected": -2.256838083267212, "loss": 0.4296, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6098880767822266, "rewards/margins": 0.6469499468803406, "rewards/rejected": -2.256838083267212, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 10.458622624473222, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.24528522789478302, "logits/rejected": -0.05442124605178833, "logps/chosen": -1.743072509765625, "logps/rejected": -2.402745485305786, "loss": 0.4652, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.743072509765625, "rewards/margins": 0.6596727967262268, "rewards/rejected": -2.402745485305786, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 11.583089629227127, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.20361199975013733, "logits/rejected": -0.13156521320343018, "logps/chosen": -1.6919209957122803, "logps/rejected": -2.283585786819458, "loss": 0.4519, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6919209957122803, "rewards/margins": 0.591664731502533, "rewards/rejected": -2.283585786819458, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 7.704345807112456, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.17123118042945862, "logits/rejected": -0.01826654188334942, "logps/chosen": -1.7015421390533447, "logps/rejected": -2.2401034832000732, "loss": 0.4578, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7015421390533447, "rewards/margins": 0.5385614633560181, "rewards/rejected": -2.2401034832000732, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 12.756966477537189, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.2599652409553528, "logits/rejected": -0.12193713337182999, "logps/chosen": -1.5964255332946777, "logps/rejected": -2.4201161861419678, "loss": 0.4011, "rewards/accuracies": 0.75, "rewards/chosen": -1.5964255332946777, "rewards/margins": 0.8236907720565796, "rewards/rejected": -2.4201161861419678, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 7.965299132866289, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.2101641148328781, "logits/rejected": 0.017744893208146095, "logps/chosen": -1.6455914974212646, "logps/rejected": -2.475620746612549, "loss": 0.3776, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6455914974212646, "rewards/margins": 0.8300293684005737, "rewards/rejected": -2.475620746612549, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 11.799540295115591, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.18859384953975677, "logits/rejected": -0.01610657200217247, "logps/chosen": -1.7314517498016357, "logps/rejected": -2.280651092529297, "loss": 0.4569, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7314517498016357, "rewards/margins": 0.5491991639137268, "rewards/rejected": -2.280651092529297, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 13.262604762258471, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.20682469010353088, "logits/rejected": -0.04748683422803879, "logps/chosen": -1.7773901224136353, "logps/rejected": -2.461301326751709, "loss": 0.4362, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7773901224136353, "rewards/margins": 0.6839113235473633, "rewards/rejected": -2.461301326751709, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 9.462648525525188, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.2025531530380249, "logits/rejected": -0.022411899641156197, "logps/chosen": -1.786272644996643, "logps/rejected": -2.495540142059326, "loss": 0.4694, "rewards/accuracies": 0.6875, "rewards/chosen": -1.786272644996643, "rewards/margins": 0.7092679738998413, "rewards/rejected": -2.495540142059326, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 10.040894698841436, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.18476524949073792, "logits/rejected": -0.046088140457868576, "logps/chosen": -1.8620725870132446, "logps/rejected": -2.556309223175049, "loss": 0.4949, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8620725870132446, "rewards/margins": 0.6942366361618042, "rewards/rejected": -2.556309223175049, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 9.811322938866246, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.15047045052051544, "logits/rejected": -0.06739865243434906, "logps/chosen": -1.699896216392517, "logps/rejected": -2.3415441513061523, "loss": 0.4384, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.699896216392517, "rewards/margins": 0.64164799451828, "rewards/rejected": -2.3415441513061523, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 9.057760884965301, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.20074363052845, "logits/rejected": 0.003097963286563754, "logps/chosen": -1.630014419555664, "logps/rejected": -2.452707529067993, "loss": 0.4013, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.630014419555664, "rewards/margins": 0.8226932287216187, "rewards/rejected": -2.452707529067993, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 22.962078340491008, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.2092522382736206, "logits/rejected": -0.12871553003787994, "logps/chosen": -1.9025121927261353, "logps/rejected": -2.4437849521636963, "loss": 0.5014, "rewards/accuracies": 0.65625, "rewards/chosen": -1.9025121927261353, "rewards/margins": 0.541272759437561, "rewards/rejected": -2.4437849521636963, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 11.030191652434304, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.16363202035427094, "logits/rejected": -0.09014679491519928, "logps/chosen": -1.7650678157806396, "logps/rejected": -2.5314793586730957, "loss": 0.4051, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7650678157806396, "rewards/margins": 0.766411304473877, "rewards/rejected": -2.5314793586730957, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 6.840904996366364, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.22691413760185242, "logits/rejected": -0.09678506851196289, "logps/chosen": -1.7239291667938232, "logps/rejected": -2.358384847640991, "loss": 0.4262, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7239291667938232, "rewards/margins": 0.6344557404518127, "rewards/rejected": -2.358384847640991, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 8.971896065675612, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.2769126296043396, "logits/rejected": -0.030696701258420944, "logps/chosen": -1.7794189453125, "logps/rejected": -2.4832077026367188, "loss": 0.4112, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7794189453125, "rewards/margins": 0.7037889361381531, "rewards/rejected": -2.4832077026367188, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 6.78048924915311, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.07546310126781464, "logits/rejected": -0.07309579849243164, "logps/chosen": -1.6516942977905273, "logps/rejected": -2.230509042739868, "loss": 0.4577, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6516942977905273, "rewards/margins": 0.5788145661354065, "rewards/rejected": -2.230509042739868, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 9.301440053822217, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.14802294969558716, "logits/rejected": 0.006210329942405224, "logps/chosen": -1.5076686143875122, "logps/rejected": -2.2285118103027344, "loss": 0.3922, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5076686143875122, "rewards/margins": 0.7208432555198669, "rewards/rejected": -2.2285118103027344, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 9.201570617329214, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.24860334396362305, "logits/rejected": -0.011888621374964714, "logps/chosen": -1.7496311664581299, "logps/rejected": -2.441511392593384, "loss": 0.4584, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7496311664581299, "rewards/margins": 0.6918801665306091, "rewards/rejected": -2.441511392593384, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 7.388763480562631, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.2144652158021927, "logits/rejected": 0.016422836109995842, "logps/chosen": -1.8240630626678467, "logps/rejected": -2.46561598777771, "loss": 0.4693, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8240630626678467, "rewards/margins": 0.6415529847145081, "rewards/rejected": -2.46561598777771, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 12.918436913835842, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.2397753894329071, "logits/rejected": -0.07036063820123672, "logps/chosen": -1.7827560901641846, "logps/rejected": -2.415404796600342, "loss": 0.449, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7827560901641846, "rewards/margins": 0.6326485872268677, "rewards/rejected": -2.415404796600342, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 6.50963252014801, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.1995238959789276, "logits/rejected": -0.046418797224760056, "logps/chosen": -1.7021472454071045, "logps/rejected": -2.333613872528076, "loss": 0.4339, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7021472454071045, "rewards/margins": 0.6314667463302612, "rewards/rejected": -2.333613872528076, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 9.151513964816928, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.14800867438316345, "logits/rejected": 0.037006258964538574, "logps/chosen": -1.6470239162445068, "logps/rejected": -2.3669276237487793, "loss": 0.4308, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6470239162445068, "rewards/margins": 0.7199036478996277, "rewards/rejected": -2.3669276237487793, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 7.152547477330738, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.24051189422607422, "logits/rejected": -0.06354856491088867, "logps/chosen": -1.7367076873779297, "logps/rejected": -2.4032413959503174, "loss": 0.4282, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7367076873779297, "rewards/margins": 0.6665335893630981, "rewards/rejected": -2.4032413959503174, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.12394216656684875, "eval_logits/rejected": 0.2280324548482895, "eval_logps/chosen": -1.7749197483062744, "eval_logps/rejected": -2.395164966583252, "eval_loss": 0.47065040469169617, "eval_rewards/accuracies": 0.687685489654541, "eval_rewards/chosen": -1.7749197483062744, "eval_rewards/margins": 0.620245099067688, "eval_rewards/rejected": -2.395164966583252, "eval_runtime": 40.5606, "eval_samples_per_second": 33.16, "eval_steps_per_second": 8.309, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 10.284365439675229, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.2604585289955139, "logits/rejected": -0.22023072838783264, "logps/chosen": -1.6455824375152588, "logps/rejected": -2.2338058948516846, "loss": 0.4401, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6455824375152588, "rewards/margins": 0.5882236361503601, "rewards/rejected": -2.2338058948516846, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 10.523433686042667, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.1999916285276413, "logits/rejected": -0.13909205794334412, "logps/chosen": -1.8652708530426025, "logps/rejected": -2.6659092903137207, "loss": 0.4276, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8652708530426025, "rewards/margins": 0.8006383180618286, "rewards/rejected": -2.6659092903137207, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 6.865941875746164, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.17582401633262634, "logits/rejected": 0.028542879968881607, "logps/chosen": -1.7354873418807983, "logps/rejected": -2.398193836212158, "loss": 0.4403, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7354873418807983, "rewards/margins": 0.6627063751220703, "rewards/rejected": -2.398193836212158, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 13.948926966784335, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.2104373425245285, "logits/rejected": -0.1155184730887413, "logps/chosen": -1.7976830005645752, "logps/rejected": -2.317903757095337, "loss": 0.461, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7976830005645752, "rewards/margins": 0.5202207565307617, "rewards/rejected": -2.317903757095337, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 5.7776317825584425, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.207981139421463, "logits/rejected": -0.08955500274896622, "logps/chosen": -1.5379942655563354, "logps/rejected": -2.153710126876831, "loss": 0.3981, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5379942655563354, "rewards/margins": 0.6157158613204956, "rewards/rejected": -2.153710126876831, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 8.752015632036876, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.21747057139873505, "logits/rejected": -0.01809767261147499, "logps/chosen": -1.752576470375061, "logps/rejected": -2.4480502605438232, "loss": 0.4291, "rewards/accuracies": 0.75, "rewards/chosen": -1.752576470375061, "rewards/margins": 0.6954737901687622, "rewards/rejected": -2.4480502605438232, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 5.992253493928554, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.2970042824745178, "logits/rejected": -0.05152718350291252, "logps/chosen": -1.6525604724884033, "logps/rejected": -2.365082263946533, "loss": 0.3937, "rewards/accuracies": 0.75, "rewards/chosen": -1.6525604724884033, "rewards/margins": 0.7125218510627747, "rewards/rejected": -2.365082263946533, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 9.67915284840019, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.3080977201461792, "logits/rejected": -0.0445028617978096, "logps/chosen": -1.7420886754989624, "logps/rejected": -2.402892589569092, "loss": 0.4421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7420886754989624, "rewards/margins": 0.6608040928840637, "rewards/rejected": -2.402892589569092, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 7.346893440151981, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.24900075793266296, "logits/rejected": -0.03005148097872734, "logps/chosen": -1.7489814758300781, "logps/rejected": -2.3010313510894775, "loss": 0.4686, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7489814758300781, "rewards/margins": 0.5520498156547546, "rewards/rejected": -2.3010313510894775, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 14.650238146810429, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.1695578396320343, "logits/rejected": -0.01812020316720009, "logps/chosen": -1.7514022588729858, "logps/rejected": -2.488818883895874, "loss": 0.4664, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7514022588729858, "rewards/margins": 0.7374169230461121, "rewards/rejected": -2.488818883895874, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 12.100617867711481, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.2541901171207428, "logits/rejected": -0.1304183453321457, "logps/chosen": -1.7370179891586304, "logps/rejected": -2.4484238624572754, "loss": 0.4326, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7370179891586304, "rewards/margins": 0.711405873298645, "rewards/rejected": -2.4484238624572754, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 9.03627700611328, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.28452691435813904, "logits/rejected": -0.02641073428094387, "logps/chosen": -1.6944023370742798, "logps/rejected": -2.5199828147888184, "loss": 0.4042, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6944023370742798, "rewards/margins": 0.825580894947052, "rewards/rejected": -2.5199828147888184, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 12.101282091132202, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.2650972902774811, "logits/rejected": -0.037634264677762985, "logps/chosen": -1.9104210138320923, "logps/rejected": -2.584073543548584, "loss": 0.494, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9104210138320923, "rewards/margins": 0.6736525297164917, "rewards/rejected": -2.584073543548584, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 6.970232535907575, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.1036420688033104, "logits/rejected": -0.010314238257706165, "logps/chosen": -1.5654821395874023, "logps/rejected": -2.174424648284912, "loss": 0.4501, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5654821395874023, "rewards/margins": 0.6089423894882202, "rewards/rejected": -2.174424648284912, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 14.073641738839578, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.21148638427257538, "logits/rejected": 0.04061644524335861, "logps/chosen": -1.7384744882583618, "logps/rejected": -2.384366750717163, "loss": 0.4238, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7384744882583618, "rewards/margins": 0.6458922624588013, "rewards/rejected": -2.384366750717163, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 17.01423519963169, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.22068946063518524, "logits/rejected": -0.11195673793554306, "logps/chosen": -1.5533367395401, "logps/rejected": -2.087136745452881, "loss": 0.4433, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5533367395401, "rewards/margins": 0.5338001847267151, "rewards/rejected": -2.087136745452881, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 7.892004093278041, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.1743631809949875, "logits/rejected": -0.07022198289632797, "logps/chosen": -1.7122751474380493, "logps/rejected": -2.408198833465576, "loss": 0.4211, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7122751474380493, "rewards/margins": 0.6959234476089478, "rewards/rejected": -2.408198833465576, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 7.980222415593173, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.25519946217536926, "logits/rejected": -0.007830450311303139, "logps/chosen": -1.6455392837524414, "logps/rejected": -2.4417858123779297, "loss": 0.3954, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6455392837524414, "rewards/margins": 0.7962468266487122, "rewards/rejected": -2.4417858123779297, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 9.466952610754097, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.31417161226272583, "logits/rejected": -0.07406683266162872, "logps/chosen": -1.7574630975723267, "logps/rejected": -2.338456153869629, "loss": 0.4384, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7574630975723267, "rewards/margins": 0.5809930562973022, "rewards/rejected": -2.338456153869629, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 8.548663262526087, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.09195277839899063, "logits/rejected": -0.07713989913463593, "logps/chosen": -1.6739174127578735, "logps/rejected": -2.2990975379943848, "loss": 0.4258, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6739174127578735, "rewards/margins": 0.6251803636550903, "rewards/rejected": -2.2990975379943848, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 7.148511822957127, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.2519226670265198, "logits/rejected": -0.06618736684322357, "logps/chosen": -1.6303861141204834, "logps/rejected": -2.266941785812378, "loss": 0.4232, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6303861141204834, "rewards/margins": 0.6365553736686707, "rewards/rejected": -2.266941785812378, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 7.07588066956926, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.2776569426059723, "logits/rejected": -0.11190319061279297, "logps/chosen": -1.727441430091858, "logps/rejected": -2.3481054306030273, "loss": 0.4471, "rewards/accuracies": 0.6875, "rewards/chosen": -1.727441430091858, "rewards/margins": 0.6206642985343933, "rewards/rejected": -2.3481054306030273, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 7.911638511364907, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.2649892568588257, "logits/rejected": -0.1642163097858429, "logps/chosen": -1.7428703308105469, "logps/rejected": -2.331357479095459, "loss": 0.4437, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7428703308105469, "rewards/margins": 0.5884872674942017, "rewards/rejected": -2.331357479095459, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 7.421240797642687, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.19513782858848572, "logits/rejected": -0.07698577642440796, "logps/chosen": -1.662101149559021, "logps/rejected": -2.501035213470459, "loss": 0.3843, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.662101149559021, "rewards/margins": 0.8389341235160828, "rewards/rejected": -2.501035213470459, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 9.305052953302242, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.2945138216018677, "logits/rejected": -0.08423537015914917, "logps/chosen": -1.799242377281189, "logps/rejected": -2.5951900482177734, "loss": 0.4273, "rewards/accuracies": 0.78125, "rewards/chosen": -1.799242377281189, "rewards/margins": 0.7959474921226501, "rewards/rejected": -2.5951900482177734, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 8.948546350683202, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.28393468260765076, "logits/rejected": -0.18059223890304565, "logps/chosen": -1.7348911762237549, "logps/rejected": -2.226630687713623, "loss": 0.4756, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7348911762237549, "rewards/margins": 0.49173974990844727, "rewards/rejected": -2.226630687713623, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 8.949781294579598, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.10721731185913086, "logits/rejected": 0.020197704434394836, "logps/chosen": -1.6267902851104736, "logps/rejected": -2.240532398223877, "loss": 0.4435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6267902851104736, "rewards/margins": 0.6137421727180481, "rewards/rejected": -2.240532398223877, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 5.985503894783728, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.11945569515228271, "logits/rejected": -0.01734943874180317, "logps/chosen": -1.5271570682525635, "logps/rejected": -2.1526284217834473, "loss": 0.4385, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5271570682525635, "rewards/margins": 0.6254713535308838, "rewards/rejected": -2.1526284217834473, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 8.30747204362492, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.14915531873703003, "logits/rejected": -0.0026716054417192936, "logps/chosen": -1.6203283071517944, "logps/rejected": -2.2397820949554443, "loss": 0.4431, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6203283071517944, "rewards/margins": 0.6194537878036499, "rewards/rejected": -2.2397820949554443, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 9.78927922143477, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.239475280046463, "logits/rejected": -0.11548696458339691, "logps/chosen": -1.6000664234161377, "logps/rejected": -2.232635021209717, "loss": 0.4206, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6000664234161377, "rewards/margins": 0.6325686573982239, "rewards/rejected": -2.232635021209717, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 8.133902395270944, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.16970163583755493, "logits/rejected": -0.08997634053230286, "logps/chosen": -1.8212791681289673, "logps/rejected": -2.3491625785827637, "loss": 0.4573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8212791681289673, "rewards/margins": 0.5278835296630859, "rewards/rejected": -2.3491625785827637, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 6.788162503459246, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.2507859766483307, "logits/rejected": -0.03913618624210358, "logps/chosen": -1.6289186477661133, "logps/rejected": -2.267913579940796, "loss": 0.4719, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6289186477661133, "rewards/margins": 0.6389948725700378, "rewards/rejected": -2.267913579940796, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 9.510003606127237, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.15535804629325867, "logits/rejected": -0.03553268685936928, "logps/chosen": -1.8434889316558838, "logps/rejected": -2.4730923175811768, "loss": 0.4469, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8434889316558838, "rewards/margins": 0.6296036243438721, "rewards/rejected": -2.4730923175811768, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 13.748212597307392, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.09278123080730438, "logits/rejected": 0.08924518525600433, "logps/chosen": -1.6528123617172241, "logps/rejected": -2.389423131942749, "loss": 0.4161, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6528123617172241, "rewards/margins": 0.7366108298301697, "rewards/rejected": -2.389423131942749, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 10.481666490890749, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.12688806653022766, "logits/rejected": -0.0652596652507782, "logps/chosen": -1.6355838775634766, "logps/rejected": -2.2795283794403076, "loss": 0.4436, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6355838775634766, "rewards/margins": 0.6439446210861206, "rewards/rejected": -2.2795283794403076, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 7.93699870937114, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.17919878661632538, "logits/rejected": -0.12317834049463272, "logps/chosen": -1.6119797229766846, "logps/rejected": -2.248685359954834, "loss": 0.4238, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6119797229766846, "rewards/margins": 0.6367056965827942, "rewards/rejected": -2.248685359954834, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 7.448861572070777, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.16641297936439514, "logits/rejected": 0.004331605043262243, "logps/chosen": -1.7562932968139648, "logps/rejected": -2.430765151977539, "loss": 0.4757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7562932968139648, "rewards/margins": 0.6744720339775085, "rewards/rejected": -2.430765151977539, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 8.23736156805862, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.308736115694046, "logits/rejected": -0.11548459529876709, "logps/chosen": -1.5339339971542358, "logps/rejected": -2.124629259109497, "loss": 0.4373, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5339339971542358, "rewards/margins": 0.5906953811645508, "rewards/rejected": -2.124629259109497, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 11.259044724904777, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.24482667446136475, "logits/rejected": -0.10848468542098999, "logps/chosen": -1.6783969402313232, "logps/rejected": -2.3828930854797363, "loss": 0.4405, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6783969402313232, "rewards/margins": 0.7044960856437683, "rewards/rejected": -2.3828930854797363, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 8.95824352539041, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.2777855098247528, "logits/rejected": 0.01349138654768467, "logps/chosen": -1.6804864406585693, "logps/rejected": -2.4321556091308594, "loss": 0.406, "rewards/accuracies": 0.75, "rewards/chosen": -1.6804864406585693, "rewards/margins": 0.7516691088676453, "rewards/rejected": -2.4321556091308594, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 7.017887624295489, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.20901751518249512, "logits/rejected": -0.03126145154237747, "logps/chosen": -1.7034187316894531, "logps/rejected": -2.499802589416504, "loss": 0.4363, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7034187316894531, "rewards/margins": 0.7963839173316956, "rewards/rejected": -2.499802589416504, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 10.418716496482906, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.23555977642536163, "logits/rejected": -0.07622835785150528, "logps/chosen": -1.779353141784668, "logps/rejected": -2.507845401763916, "loss": 0.4508, "rewards/accuracies": 0.71875, "rewards/chosen": -1.779353141784668, "rewards/margins": 0.7284921407699585, "rewards/rejected": -2.507845401763916, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 11.088525052190777, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.16794966161251068, "logits/rejected": 0.0010345608461648226, "logps/chosen": -1.83742356300354, "logps/rejected": -2.6607418060302734, "loss": 0.4278, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.83742356300354, "rewards/margins": 0.8233181834220886, "rewards/rejected": -2.6607418060302734, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 6.915328781914365, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.17060139775276184, "logits/rejected": 0.07329483330249786, "logps/chosen": -1.6219370365142822, "logps/rejected": -2.344425678253174, "loss": 0.405, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6219370365142822, "rewards/margins": 0.7224887013435364, "rewards/rejected": -2.344425678253174, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 7.824471109780126, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.29697659611701965, "logits/rejected": -0.24408411979675293, "logps/chosen": -1.674429178237915, "logps/rejected": -2.281888484954834, "loss": 0.4526, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.674429178237915, "rewards/margins": 0.6074593663215637, "rewards/rejected": -2.281888484954834, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 10.598706076196734, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.14728602766990662, "logits/rejected": -0.04636381193995476, "logps/chosen": -1.7258694171905518, "logps/rejected": -2.4840986728668213, "loss": 0.4206, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7258694171905518, "rewards/margins": 0.7582294344902039, "rewards/rejected": -2.4840986728668213, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 7.500401432870157, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.12214155495166779, "logits/rejected": 0.060303401201963425, "logps/chosen": -1.7117080688476562, "logps/rejected": -2.500868558883667, "loss": 0.4211, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7117080688476562, "rewards/margins": 0.7891607284545898, "rewards/rejected": -2.500868558883667, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 7.568741725300677, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.14792805910110474, "logits/rejected": -0.019823264330625534, "logps/chosen": -1.7667713165283203, "logps/rejected": -2.491652011871338, "loss": 0.3936, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7667713165283203, "rewards/margins": 0.7248806953430176, "rewards/rejected": -2.491652011871338, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 9.072309209937144, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.196926087141037, "logits/rejected": -0.03682239353656769, "logps/chosen": -1.6885616779327393, "logps/rejected": -2.336080312728882, "loss": 0.4289, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6885616779327393, "rewards/margins": 0.6475186944007874, "rewards/rejected": -2.336080312728882, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 7.927140113130535, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.25401538610458374, "logits/rejected": 0.02853848971426487, "logps/chosen": -1.6547572612762451, "logps/rejected": -2.4280033111572266, "loss": 0.4104, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6547572612762451, "rewards/margins": 0.7732460498809814, "rewards/rejected": -2.4280033111572266, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 10.79243501790061, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.2280188798904419, "logits/rejected": -0.050871867686510086, "logps/chosen": -1.8363583087921143, "logps/rejected": -2.577662229537964, "loss": 0.4114, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8363583087921143, "rewards/margins": 0.7413042783737183, "rewards/rejected": -2.577662229537964, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 10.50771856107557, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.19452443718910217, "logits/rejected": -0.05185568332672119, "logps/chosen": -1.7944815158843994, "logps/rejected": -2.4846863746643066, "loss": 0.4233, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7944815158843994, "rewards/margins": 0.6902049779891968, "rewards/rejected": -2.4846863746643066, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 11.539221012296, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.20931319892406464, "logits/rejected": -0.12919676303863525, "logps/chosen": -1.778306245803833, "logps/rejected": -2.331148862838745, "loss": 0.501, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.778306245803833, "rewards/margins": 0.5528425574302673, "rewards/rejected": -2.331148862838745, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 14.609066723457747, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.21181365847587585, "logits/rejected": -0.053109604865312576, "logps/chosen": -1.702825903892517, "logps/rejected": -2.216198205947876, "loss": 0.4558, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.702825903892517, "rewards/margins": 0.5133721828460693, "rewards/rejected": -2.216198205947876, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 12.318140433960503, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.1431012898683548, "logits/rejected": 0.041647423058748245, "logps/chosen": -1.6854658126831055, "logps/rejected": -2.384006977081299, "loss": 0.4257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6854658126831055, "rewards/margins": 0.6985413432121277, "rewards/rejected": -2.384006977081299, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 14.727672873914443, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.13063819706439972, "logits/rejected": -0.010431376285851002, "logps/chosen": -1.6089423894882202, "logps/rejected": -2.3095650672912598, "loss": 0.4368, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6089423894882202, "rewards/margins": 0.7006223797798157, "rewards/rejected": -2.3095650672912598, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 8.541809094446103, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.2590874433517456, "logits/rejected": -0.02011718973517418, "logps/chosen": -1.6655070781707764, "logps/rejected": -2.373628616333008, "loss": 0.4049, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6655070781707764, "rewards/margins": 0.7081214189529419, "rewards/rejected": -2.373628616333008, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 9.992376261886683, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.15074700117111206, "logits/rejected": -0.08499506860971451, "logps/chosen": -1.6664329767227173, "logps/rejected": -2.292686939239502, "loss": 0.4567, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6664329767227173, "rewards/margins": 0.6262542009353638, "rewards/rejected": -2.292686939239502, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 6.980476477656598, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.17244479060173035, "logits/rejected": 0.04675675556063652, "logps/chosen": -1.7327744960784912, "logps/rejected": -2.433210849761963, "loss": 0.4612, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7327744960784912, "rewards/margins": 0.7004357576370239, "rewards/rejected": -2.433210849761963, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 8.386626677873718, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.26770567893981934, "logits/rejected": -0.12101028859615326, "logps/chosen": -1.6683902740478516, "logps/rejected": -2.281280279159546, "loss": 0.4448, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6683902740478516, "rewards/margins": 0.6128900647163391, "rewards/rejected": -2.281280279159546, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 10.788104011185519, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.11966216564178467, "logits/rejected": -0.061551857739686966, "logps/chosen": -1.7001079320907593, "logps/rejected": -2.2997336387634277, "loss": 0.4634, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7001079320907593, "rewards/margins": 0.5996257066726685, "rewards/rejected": -2.2997336387634277, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 10.630485852001726, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.2386687994003296, "logits/rejected": -0.04381350800395012, "logps/chosen": -1.7811568975448608, "logps/rejected": -2.5043911933898926, "loss": 0.4209, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7811568975448608, "rewards/margins": 0.7232342958450317, "rewards/rejected": -2.5043911933898926, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 8.061326714851612, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.2069791853427887, "logits/rejected": -0.1537046730518341, "logps/chosen": -1.8301494121551514, "logps/rejected": -2.5560824871063232, "loss": 0.4227, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8301494121551514, "rewards/margins": 0.7259331345558167, "rewards/rejected": -2.5560824871063232, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 8.814076627942512, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.1765478104352951, "logits/rejected": -0.04154636710882187, "logps/chosen": -1.521730661392212, "logps/rejected": -2.227720022201538, "loss": 0.39, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.521730661392212, "rewards/margins": 0.7059892416000366, "rewards/rejected": -2.227720022201538, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 7.86993087489975, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.22753913700580597, "logits/rejected": -0.1609765738248825, "logps/chosen": -1.720136284828186, "logps/rejected": -2.3282859325408936, "loss": 0.4708, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.720136284828186, "rewards/margins": 0.608149528503418, "rewards/rejected": -2.3282859325408936, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 6.892116752977285, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.16040951013565063, "logits/rejected": -0.01277973037213087, "logps/chosen": -1.6558548212051392, "logps/rejected": -2.333360433578491, "loss": 0.4183, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6558548212051392, "rewards/margins": 0.677505612373352, "rewards/rejected": -2.333360433578491, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 11.947543401064104, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.1341305524110794, "logits/rejected": -0.1876380741596222, "logps/chosen": -1.8088960647583008, "logps/rejected": -2.461027145385742, "loss": 0.4207, "rewards/accuracies": 0.75, "rewards/chosen": -1.8088960647583008, "rewards/margins": 0.6521310806274414, "rewards/rejected": -2.461027145385742, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 6.84659389310529, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.22579798102378845, "logits/rejected": -0.05506812408566475, "logps/chosen": -1.8374704122543335, "logps/rejected": -2.533339738845825, "loss": 0.4302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8374704122543335, "rewards/margins": 0.6958690881729126, "rewards/rejected": -2.533339738845825, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 12.298446261326825, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.17154420912265778, "logits/rejected": -0.11341144889593124, "logps/chosen": -1.8943641185760498, "logps/rejected": -2.5038318634033203, "loss": 0.474, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8943641185760498, "rewards/margins": 0.6094677448272705, "rewards/rejected": -2.5038318634033203, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 6.057847082221732, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.1403556913137436, "logits/rejected": 0.11298413574695587, "logps/chosen": -1.7152535915374756, "logps/rejected": -2.6783206462860107, "loss": 0.3511, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7152535915374756, "rewards/margins": 0.9630670547485352, "rewards/rejected": -2.6783206462860107, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 9.708814618957655, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.2321714609861374, "logits/rejected": -0.03705715388059616, "logps/chosen": -1.6292619705200195, "logps/rejected": -2.4399428367614746, "loss": 0.4215, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6292619705200195, "rewards/margins": 0.8106810450553894, "rewards/rejected": -2.4399428367614746, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 7.518357434541207, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.28486210107803345, "logits/rejected": -0.11866022646427155, "logps/chosen": -1.613390326499939, "logps/rejected": -2.297753095626831, "loss": 0.4139, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.613390326499939, "rewards/margins": 0.6843625903129578, "rewards/rejected": -2.297753095626831, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 7.479029782093494, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.2281685173511505, "logits/rejected": -0.026276081800460815, "logps/chosen": -1.6376514434814453, "logps/rejected": -2.1324105262756348, "loss": 0.5011, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6376514434814453, "rewards/margins": 0.49475932121276855, "rewards/rejected": -2.1324105262756348, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 10.326564164642852, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.17516720294952393, "logits/rejected": -0.03884189575910568, "logps/chosen": -1.818216323852539, "logps/rejected": -2.536815643310547, "loss": 0.4384, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.818216323852539, "rewards/margins": 0.7185991406440735, "rewards/rejected": -2.536815643310547, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 9.90609741900535, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.058928973972797394, "logits/rejected": 0.07119754701852798, "logps/chosen": -1.6986284255981445, "logps/rejected": -2.422569751739502, "loss": 0.4477, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6986284255981445, "rewards/margins": 0.7239412069320679, "rewards/rejected": -2.422569751739502, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 8.575244328435723, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.17524407804012299, "logits/rejected": -0.06111008673906326, "logps/chosen": -1.7385562658309937, "logps/rejected": -2.342148780822754, "loss": 0.447, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7385562658309937, "rewards/margins": 0.603592574596405, "rewards/rejected": -2.342148780822754, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 8.351569188141193, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.1853046715259552, "logits/rejected": -0.11531984806060791, "logps/chosen": -1.6457151174545288, "logps/rejected": -2.1539978981018066, "loss": 0.4611, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6457151174545288, "rewards/margins": 0.5082827210426331, "rewards/rejected": -2.1539978981018066, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 8.303253189789196, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.2767137885093689, "logits/rejected": -0.11489056050777435, "logps/chosen": -1.712721824645996, "logps/rejected": -2.4300382137298584, "loss": 0.4447, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.712721824645996, "rewards/margins": 0.7173165678977966, "rewards/rejected": -2.4300382137298584, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 14.000112288276156, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.16802194714546204, "logits/rejected": -0.07253174483776093, "logps/chosen": -1.7289464473724365, "logps/rejected": -2.3443641662597656, "loss": 0.4584, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7289464473724365, "rewards/margins": 0.6154177784919739, "rewards/rejected": -2.3443641662597656, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 6.589086153630348, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.20465822517871857, "logits/rejected": -0.07182928174734116, "logps/chosen": -1.5968660116195679, "logps/rejected": -2.157181978225708, "loss": 0.4299, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5968660116195679, "rewards/margins": 0.5603160858154297, "rewards/rejected": -2.157181978225708, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.1929057091474533, "eval_logits/rejected": 0.30267134308815, "eval_logps/chosen": -1.7424734830856323, "eval_logps/rejected": -2.3507001399993896, "eval_loss": 0.4704340398311615, "eval_rewards/accuracies": 0.6802670359611511, "eval_rewards/chosen": -1.7424734830856323, "eval_rewards/margins": 0.608226478099823, "eval_rewards/rejected": -2.3507001399993896, "eval_runtime": 40.6903, "eval_samples_per_second": 33.055, "eval_steps_per_second": 8.282, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 9.506965237387389, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.28589242696762085, "logits/rejected": -0.13822674751281738, "logps/chosen": -1.7172784805297852, "logps/rejected": -2.4600110054016113, "loss": 0.4269, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7172784805297852, "rewards/margins": 0.7427327036857605, "rewards/rejected": -2.4600110054016113, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 10.160205598125092, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.16260287165641785, "logits/rejected": -0.013149010017514229, "logps/chosen": -1.799870252609253, "logps/rejected": -2.464735507965088, "loss": 0.4554, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.799870252609253, "rewards/margins": 0.6648651361465454, "rewards/rejected": -2.464735507965088, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 7.891487539581977, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.1684851348400116, "logits/rejected": -0.01601838693022728, "logps/chosen": -1.4977695941925049, "logps/rejected": -2.2498373985290527, "loss": 0.416, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4977695941925049, "rewards/margins": 0.7520676851272583, "rewards/rejected": -2.2498373985290527, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 8.083911786143442, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.15176579356193542, "logits/rejected": -0.04817506670951843, "logps/chosen": -1.7161592245101929, "logps/rejected": -2.260246992111206, "loss": 0.4681, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7161592245101929, "rewards/margins": 0.5440878868103027, "rewards/rejected": -2.260246992111206, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 12.083526808470628, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.16366152465343475, "logits/rejected": -0.014265497215092182, "logps/chosen": -1.7682815790176392, "logps/rejected": -2.3734469413757324, "loss": 0.4362, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7682815790176392, "rewards/margins": 0.6051654815673828, "rewards/rejected": -2.3734469413757324, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 11.866875417613196, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.1984398514032364, "logits/rejected": -0.016267079859972, "logps/chosen": -1.7273082733154297, "logps/rejected": -2.2748215198516846, "loss": 0.47, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7273082733154297, "rewards/margins": 0.5475131869316101, "rewards/rejected": -2.2748215198516846, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 9.104337124230083, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.3283892869949341, "logits/rejected": -0.10221920907497406, "logps/chosen": -1.677991509437561, "logps/rejected": -2.443533420562744, "loss": 0.4066, "rewards/accuracies": 0.6875, "rewards/chosen": -1.677991509437561, "rewards/margins": 0.7655418515205383, "rewards/rejected": -2.443533420562744, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 9.009215677805726, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.27540016174316406, "logits/rejected": -0.04253140836954117, "logps/chosen": -1.6775808334350586, "logps/rejected": -2.4309802055358887, "loss": 0.4058, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6775808334350586, "rewards/margins": 0.7533996105194092, "rewards/rejected": -2.4309802055358887, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 6.025176248247243, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.2122703492641449, "logits/rejected": -0.11150509119033813, "logps/chosen": -1.6645904779434204, "logps/rejected": -2.302550792694092, "loss": 0.4449, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6645904779434204, "rewards/margins": 0.6379603147506714, "rewards/rejected": -2.302550792694092, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 10.46628814860702, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.35457688570022583, "logits/rejected": -0.18448397517204285, "logps/chosen": -1.7287814617156982, "logps/rejected": -2.5128531455993652, "loss": 0.4424, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7287814617156982, "rewards/margins": 0.7840721011161804, "rewards/rejected": -2.5128531455993652, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 8.05806603537157, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.1514482945203781, "logits/rejected": 0.029550720006227493, "logps/chosen": -1.588945746421814, "logps/rejected": -2.3114118576049805, "loss": 0.4204, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.588945746421814, "rewards/margins": 0.7224661111831665, "rewards/rejected": -2.3114118576049805, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 14.38346111592754, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.18528418242931366, "logits/rejected": -0.09897531569004059, "logps/chosen": -1.5963497161865234, "logps/rejected": -2.344438076019287, "loss": 0.3917, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5963497161865234, "rewards/margins": 0.7480884194374084, "rewards/rejected": -2.344438076019287, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 6.924434973056153, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.3078891336917877, "logits/rejected": -0.18980643153190613, "logps/chosen": -1.7192268371582031, "logps/rejected": -2.5096912384033203, "loss": 0.4057, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7192268371582031, "rewards/margins": 0.7904642820358276, "rewards/rejected": -2.5096912384033203, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 20.651470908255863, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.16999760270118713, "logits/rejected": -0.016139687970280647, "logps/chosen": -1.6135499477386475, "logps/rejected": -2.383836269378662, "loss": 0.4085, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6135499477386475, "rewards/margins": 0.7702863812446594, "rewards/rejected": -2.383836269378662, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 8.368642274710242, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.049949321895837784, "logits/rejected": 0.03747277334332466, "logps/chosen": -1.5437110662460327, "logps/rejected": -2.3944454193115234, "loss": 0.3894, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5437110662460327, "rewards/margins": 0.8507342338562012, "rewards/rejected": -2.3944454193115234, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 6.902304102577124, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.12314367294311523, "logits/rejected": 0.01745045743882656, "logps/chosen": -1.5631787776947021, "logps/rejected": -2.292850971221924, "loss": 0.4328, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5631787776947021, "rewards/margins": 0.7296720743179321, "rewards/rejected": -2.292850971221924, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 11.140680692661038, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.13733288645744324, "logits/rejected": -0.050634630024433136, "logps/chosen": -1.7967157363891602, "logps/rejected": -2.544011354446411, "loss": 0.4333, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7967157363891602, "rewards/margins": 0.7472954988479614, "rewards/rejected": -2.544011354446411, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 13.793017001121362, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.2792007029056549, "logits/rejected": -0.08783477544784546, "logps/chosen": -1.7916719913482666, "logps/rejected": -2.5308451652526855, "loss": 0.4293, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7916719913482666, "rewards/margins": 0.7391732335090637, "rewards/rejected": -2.5308451652526855, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 7.446982853454831, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.19906005263328552, "logits/rejected": -0.09800487756729126, "logps/chosen": -1.7950475215911865, "logps/rejected": -2.250649929046631, "loss": 0.5113, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7950475215911865, "rewards/margins": 0.4556023180484772, "rewards/rejected": -2.250649929046631, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 7.3471882051990764, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.10433042049407959, "logits/rejected": 0.05531962588429451, "logps/chosen": -1.650079369544983, "logps/rejected": -2.400200366973877, "loss": 0.4142, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.650079369544983, "rewards/margins": 0.7501211762428284, "rewards/rejected": -2.400200366973877, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 7.128233411938747, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.1827472597360611, "logits/rejected": 0.013630146160721779, "logps/chosen": -1.7648212909698486, "logps/rejected": -2.6109981536865234, "loss": 0.3968, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7648212909698486, "rewards/margins": 0.8461767435073853, "rewards/rejected": -2.6109981536865234, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 7.73628802159265, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.20916962623596191, "logits/rejected": -0.22946925461292267, "logps/chosen": -1.6158320903778076, "logps/rejected": -2.547794818878174, "loss": 0.4059, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6158320903778076, "rewards/margins": 0.9319628477096558, "rewards/rejected": -2.547794818878174, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 10.337967755886673, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.1738714873790741, "logits/rejected": -0.030047958716750145, "logps/chosen": -1.526439905166626, "logps/rejected": -2.260037660598755, "loss": 0.3907, "rewards/accuracies": 0.75, "rewards/chosen": -1.526439905166626, "rewards/margins": 0.733597457408905, "rewards/rejected": -2.260037660598755, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 8.825902849913506, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.16639330983161926, "logits/rejected": -0.05910569429397583, "logps/chosen": -1.6952708959579468, "logps/rejected": -2.3662118911743164, "loss": 0.4426, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6952708959579468, "rewards/margins": 0.6709409356117249, "rewards/rejected": -2.3662118911743164, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 9.322124628007394, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.3515883982181549, "logits/rejected": -0.08304257690906525, "logps/chosen": -1.7526121139526367, "logps/rejected": -2.545325756072998, "loss": 0.3958, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7526121139526367, "rewards/margins": 0.7927138805389404, "rewards/rejected": -2.545325756072998, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 10.162763784071855, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.17064332962036133, "logits/rejected": 0.018415559083223343, "logps/chosen": -1.714175820350647, "logps/rejected": -2.5459108352661133, "loss": 0.4149, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.714175820350647, "rewards/margins": 0.8317351341247559, "rewards/rejected": -2.5459108352661133, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 9.86845197454439, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.24460916221141815, "logits/rejected": -0.057219069451093674, "logps/chosen": -1.6485151052474976, "logps/rejected": -2.260695695877075, "loss": 0.4641, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6485151052474976, "rewards/margins": 0.6121805310249329, "rewards/rejected": -2.260695695877075, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 6.776921481072155, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.17473876476287842, "logits/rejected": -0.06879258900880814, "logps/chosen": -1.8020213842391968, "logps/rejected": -2.2619924545288086, "loss": 0.4674, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8020213842391968, "rewards/margins": 0.45997095108032227, "rewards/rejected": -2.2619924545288086, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 10.519990890910163, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.2124883234500885, "logits/rejected": 0.02032359316945076, "logps/chosen": -1.635719656944275, "logps/rejected": -2.482880115509033, "loss": 0.4076, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.635719656944275, "rewards/margins": 0.8471605181694031, "rewards/rejected": -2.482880115509033, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 6.584133047273501, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.22448861598968506, "logits/rejected": -0.06686008721590042, "logps/chosen": -1.627128005027771, "logps/rejected": -2.2758777141571045, "loss": 0.4329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.627128005027771, "rewards/margins": 0.6487494707107544, "rewards/rejected": -2.2758777141571045, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 8.151691385842993, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.19478999078273773, "logits/rejected": -0.18964624404907227, "logps/chosen": -1.583848476409912, "logps/rejected": -2.2458956241607666, "loss": 0.4238, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.583848476409912, "rewards/margins": 0.662047266960144, "rewards/rejected": -2.2458956241607666, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 12.79326758534214, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.13934996724128723, "logits/rejected": -0.023800883442163467, "logps/chosen": -1.6382545232772827, "logps/rejected": -2.367661952972412, "loss": 0.4171, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6382545232772827, "rewards/margins": 0.7294072508811951, "rewards/rejected": -2.367661952972412, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 8.459323089609414, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.2333543300628662, "logits/rejected": -0.08987903594970703, "logps/chosen": -1.5974574089050293, "logps/rejected": -2.3490512371063232, "loss": 0.4009, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5974574089050293, "rewards/margins": 0.7515941262245178, "rewards/rejected": -2.3490512371063232, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 9.640124003987756, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.13901160657405853, "logits/rejected": -0.0565708763897419, "logps/chosen": -1.5453542470932007, "logps/rejected": -2.279877185821533, "loss": 0.392, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5453542470932007, "rewards/margins": 0.734522819519043, "rewards/rejected": -2.279877185821533, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 8.633826255801882, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.1694817841053009, "logits/rejected": -0.10513631254434586, "logps/chosen": -1.735923409461975, "logps/rejected": -2.360243558883667, "loss": 0.4438, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.735923409461975, "rewards/margins": 0.6243202090263367, "rewards/rejected": -2.360243558883667, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 8.00192839263538, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.10024277865886688, "logits/rejected": -0.020224109292030334, "logps/chosen": -1.8886654376983643, "logps/rejected": -2.5166752338409424, "loss": 0.507, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8886654376983643, "rewards/margins": 0.6280097961425781, "rewards/rejected": -2.5166752338409424, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 11.392266057869156, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.14519762992858887, "logits/rejected": 0.06055606156587601, "logps/chosen": -1.5355899333953857, "logps/rejected": -2.377645492553711, "loss": 0.4192, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5355899333953857, "rewards/margins": 0.8420552015304565, "rewards/rejected": -2.377645492553711, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 10.80194665834703, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.28407737612724304, "logits/rejected": -0.0380951464176178, "logps/chosen": -1.6787121295928955, "logps/rejected": -2.428769588470459, "loss": 0.4072, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6787121295928955, "rewards/margins": 0.7500573992729187, "rewards/rejected": -2.428769588470459, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 9.216976407727914, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.23785607516765594, "logits/rejected": -0.21692650020122528, "logps/chosen": -1.7045984268188477, "logps/rejected": -2.285457134246826, "loss": 0.4431, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7045984268188477, "rewards/margins": 0.5808587074279785, "rewards/rejected": -2.285457134246826, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 5.706235092023475, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.21365030109882355, "logits/rejected": -0.19409283995628357, "logps/chosen": -1.7280330657958984, "logps/rejected": -2.3226683139801025, "loss": 0.4736, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7280330657958984, "rewards/margins": 0.5946353673934937, "rewards/rejected": -2.3226683139801025, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 7.419134655146061, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.12133806943893433, "logits/rejected": -0.06504418700933456, "logps/chosen": -1.6846742630004883, "logps/rejected": -2.334351062774658, "loss": 0.4097, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6846742630004883, "rewards/margins": 0.6496765613555908, "rewards/rejected": -2.334351062774658, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 9.893492971038414, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.09776897728443146, "logits/rejected": 0.016471516340970993, "logps/chosen": -1.8245117664337158, "logps/rejected": -2.5752675533294678, "loss": 0.4542, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8245117664337158, "rewards/margins": 0.7507559657096863, "rewards/rejected": -2.5752675533294678, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 10.386149743791748, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.2415042370557785, "logits/rejected": -0.16942764818668365, "logps/chosen": -1.7485237121582031, "logps/rejected": -2.367062568664551, "loss": 0.4324, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7485237121582031, "rewards/margins": 0.6185387969017029, "rewards/rejected": -2.367062568664551, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 8.321458293669217, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.24038052558898926, "logits/rejected": -0.01406899094581604, "logps/chosen": -1.5961346626281738, "logps/rejected": -2.2222695350646973, "loss": 0.4322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5961346626281738, "rewards/margins": 0.6261348724365234, "rewards/rejected": -2.2222695350646973, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 8.750463784045863, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.2236858606338501, "logits/rejected": -0.062092531472444534, "logps/chosen": -1.7187353372573853, "logps/rejected": -2.4023866653442383, "loss": 0.4534, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7187353372573853, "rewards/margins": 0.6836512684822083, "rewards/rejected": -2.4023866653442383, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 9.77628637318939, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.19271305203437805, "logits/rejected": -0.1166713684797287, "logps/chosen": -1.6275596618652344, "logps/rejected": -2.3112902641296387, "loss": 0.455, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6275596618652344, "rewards/margins": 0.6837307810783386, "rewards/rejected": -2.3112902641296387, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 19.423349894804293, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.24002961814403534, "logits/rejected": -0.10354097187519073, "logps/chosen": -1.7776683568954468, "logps/rejected": -2.401078224182129, "loss": 0.4445, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7776683568954468, "rewards/margins": 0.6234096884727478, "rewards/rejected": -2.401078224182129, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 6.830158894811909, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.036166924983263016, "logits/rejected": -0.072201669216156, "logps/chosen": -1.640226125717163, "logps/rejected": -2.3059463500976562, "loss": 0.4346, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.640226125717163, "rewards/margins": 0.6657201051712036, "rewards/rejected": -2.3059463500976562, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 15.692444813905562, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.1264549195766449, "logits/rejected": -0.0931362509727478, "logps/chosen": -1.6065356731414795, "logps/rejected": -2.18522310256958, "loss": 0.4389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6065356731414795, "rewards/margins": 0.5786874294281006, "rewards/rejected": -2.18522310256958, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 7.249025235334473, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.20184388756752014, "logits/rejected": -0.1268937885761261, "logps/chosen": -1.5003236532211304, "logps/rejected": -2.149989366531372, "loss": 0.3947, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5003236532211304, "rewards/margins": 0.6496654748916626, "rewards/rejected": -2.149989366531372, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 9.346189717494891, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.30416733026504517, "logits/rejected": -0.2145983874797821, "logps/chosen": -1.5059964656829834, "logps/rejected": -2.2037439346313477, "loss": 0.4044, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5059964656829834, "rewards/margins": 0.697747528553009, "rewards/rejected": -2.2037439346313477, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 9.305377466983204, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.15547212958335876, "logits/rejected": -0.08535311371088028, "logps/chosen": -1.6773805618286133, "logps/rejected": -2.294898748397827, "loss": 0.4412, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6773805618286133, "rewards/margins": 0.6175183057785034, "rewards/rejected": -2.294898748397827, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 11.316988803673745, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.24089860916137695, "logits/rejected": 0.022669857367873192, "logps/chosen": -1.6773402690887451, "logps/rejected": -2.3373703956604004, "loss": 0.4163, "rewards/accuracies": 0.75, "rewards/chosen": -1.6773402690887451, "rewards/margins": 0.6600298881530762, "rewards/rejected": -2.3373703956604004, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 12.088257848436255, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.10625454038381577, "logits/rejected": 0.06879651546478271, "logps/chosen": -1.5640321969985962, "logps/rejected": -2.135523557662964, "loss": 0.4256, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5640321969985962, "rewards/margins": 0.571491539478302, "rewards/rejected": -2.135523557662964, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 10.474772921805421, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.18225374817848206, "logits/rejected": -0.09811027348041534, "logps/chosen": -1.6582107543945312, "logps/rejected": -2.1244869232177734, "loss": 0.4837, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6582107543945312, "rewards/margins": 0.4662759304046631, "rewards/rejected": -2.1244869232177734, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 10.585534398682826, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.22543105483055115, "logits/rejected": -0.0672234445810318, "logps/chosen": -1.6462138891220093, "logps/rejected": -2.273482322692871, "loss": 0.4369, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6462138891220093, "rewards/margins": 0.6272685527801514, "rewards/rejected": -2.273482322692871, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 15.786943580098606, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.2511370778083801, "logits/rejected": -0.11307625472545624, "logps/chosen": -1.5162370204925537, "logps/rejected": -2.280825138092041, "loss": 0.404, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5162370204925537, "rewards/margins": 0.7645877003669739, "rewards/rejected": -2.280825138092041, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 11.506706121153146, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.3281300663948059, "logits/rejected": -0.13915562629699707, "logps/chosen": -1.723780870437622, "logps/rejected": -2.4697680473327637, "loss": 0.4207, "rewards/accuracies": 0.75, "rewards/chosen": -1.723780870437622, "rewards/margins": 0.7459869384765625, "rewards/rejected": -2.4697680473327637, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 11.42216537833148, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.33761900663375854, "logits/rejected": -0.10695630311965942, "logps/chosen": -1.6199461221694946, "logps/rejected": -2.4618332386016846, "loss": 0.394, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6199461221694946, "rewards/margins": 0.8418868780136108, "rewards/rejected": -2.4618332386016846, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 7.965258482017517, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.1286105364561081, "logits/rejected": 0.047980327159166336, "logps/chosen": -1.6010692119598389, "logps/rejected": -2.5288405418395996, "loss": 0.3875, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6010692119598389, "rewards/margins": 0.9277715682983398, "rewards/rejected": -2.5288405418395996, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 11.697879193273616, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.1937987208366394, "logits/rejected": -0.005719378590583801, "logps/chosen": -1.7386348247528076, "logps/rejected": -2.330533266067505, "loss": 0.4425, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7386348247528076, "rewards/margins": 0.5918980836868286, "rewards/rejected": -2.330533266067505, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 9.664623271005864, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.27357718348503113, "logits/rejected": -0.12317635864019394, "logps/chosen": -1.6265552043914795, "logps/rejected": -2.257211208343506, "loss": 0.4422, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6265552043914795, "rewards/margins": 0.6306560635566711, "rewards/rejected": -2.257211208343506, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 6.453712673321282, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.2784246802330017, "logits/rejected": -0.02629704214632511, "logps/chosen": -1.662407636642456, "logps/rejected": -2.52083158493042, "loss": 0.3924, "rewards/accuracies": 0.75, "rewards/chosen": -1.662407636642456, "rewards/margins": 0.8584240078926086, "rewards/rejected": -2.52083158493042, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 8.823969285479254, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.22877275943756104, "logits/rejected": -0.15858663618564606, "logps/chosen": -1.750125527381897, "logps/rejected": -2.4829444885253906, "loss": 0.4097, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.750125527381897, "rewards/margins": 0.7328189015388489, "rewards/rejected": -2.4829444885253906, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 22.47916709882074, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.3465072512626648, "logits/rejected": -0.09823520481586456, "logps/chosen": -1.662949800491333, "logps/rejected": -2.3185579776763916, "loss": 0.4228, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.662949800491333, "rewards/margins": 0.6556081175804138, "rewards/rejected": -2.3185579776763916, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 6.755703304245198, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.3737272024154663, "logits/rejected": -0.1568094789981842, "logps/chosen": -1.6700235605239868, "logps/rejected": -2.388484477996826, "loss": 0.4275, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6700235605239868, "rewards/margins": 0.7184609174728394, "rewards/rejected": -2.388484477996826, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 11.283586233605163, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.14451536536216736, "logits/rejected": -0.06124686077237129, "logps/chosen": -1.641452431678772, "logps/rejected": -2.2682013511657715, "loss": 0.4276, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.641452431678772, "rewards/margins": 0.6267486214637756, "rewards/rejected": -2.2682013511657715, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 11.309967624126914, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.1662003993988037, "logits/rejected": -0.009802314452826977, "logps/chosen": -1.7570396661758423, "logps/rejected": -2.3348097801208496, "loss": 0.4601, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7570396661758423, "rewards/margins": 0.577769935131073, "rewards/rejected": -2.3348097801208496, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 7.935694122760897, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.20717056095600128, "logits/rejected": -0.015534961596131325, "logps/chosen": -1.631024956703186, "logps/rejected": -2.2873129844665527, "loss": 0.4449, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.631024956703186, "rewards/margins": 0.6562881469726562, "rewards/rejected": -2.2873129844665527, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 6.789896406396291, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.2975825071334839, "logits/rejected": -0.1573215276002884, "logps/chosen": -1.6466338634490967, "logps/rejected": -2.333155393600464, "loss": 0.4072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6466338634490967, "rewards/margins": 0.6865212321281433, "rewards/rejected": -2.333155393600464, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 7.632908611897957, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.2936624586582184, "logits/rejected": -0.16784432530403137, "logps/chosen": -1.6612141132354736, "logps/rejected": -2.2570176124572754, "loss": 0.4461, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6612141132354736, "rewards/margins": 0.5958033800125122, "rewards/rejected": -2.2570176124572754, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 5.37629029257076, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.2544962763786316, "logits/rejected": -0.17708469927310944, "logps/chosen": -1.7097675800323486, "logps/rejected": -2.532360792160034, "loss": 0.382, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7097675800323486, "rewards/margins": 0.8225935697555542, "rewards/rejected": -2.532360792160034, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 10.21128714989124, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.21555109322071075, "logits/rejected": -0.007090986706316471, "logps/chosen": -1.5289596319198608, "logps/rejected": -2.2108635902404785, "loss": 0.4088, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5289596319198608, "rewards/margins": 0.6819039583206177, "rewards/rejected": -2.2108635902404785, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 6.316176873208817, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.2205251008272171, "logits/rejected": -0.0737658143043518, "logps/chosen": -1.6812431812286377, "logps/rejected": -2.338650941848755, "loss": 0.4493, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6812431812286377, "rewards/margins": 0.6574073433876038, "rewards/rejected": -2.338650941848755, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 8.081729266019122, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.2327534407377243, "logits/rejected": -0.10571527481079102, "logps/chosen": -1.7716243267059326, "logps/rejected": -2.514632225036621, "loss": 0.4104, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7716243267059326, "rewards/margins": 0.7430077195167542, "rewards/rejected": -2.514632225036621, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 9.885762164712364, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.14688795804977417, "logits/rejected": -0.07873876392841339, "logps/chosen": -1.7520370483398438, "logps/rejected": -2.4511260986328125, "loss": 0.4572, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7520370483398438, "rewards/margins": 0.6990889310836792, "rewards/rejected": -2.4511260986328125, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 6.679206194650915, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.16469566524028778, "logits/rejected": -0.03365384787321091, "logps/chosen": -1.5759086608886719, "logps/rejected": -2.3513970375061035, "loss": 0.3977, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5759086608886719, "rewards/margins": 0.775488555431366, "rewards/rejected": -2.3513970375061035, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 12.019463477767852, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.3064512610435486, "logits/rejected": -0.10870430618524551, "logps/chosen": -1.5779739618301392, "logps/rejected": -2.324672222137451, "loss": 0.4113, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5779739618301392, "rewards/margins": 0.7466981410980225, "rewards/rejected": -2.324672222137451, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 15.603000824683756, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.18821445107460022, "logits/rejected": -0.04060869663953781, "logps/chosen": -1.5963757038116455, "logps/rejected": -2.2651546001434326, "loss": 0.4011, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5963757038116455, "rewards/margins": 0.6687790155410767, "rewards/rejected": -2.2651546001434326, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 13.90363872130239, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.28439170122146606, "logits/rejected": -0.0969526395201683, "logps/chosen": -1.693139672279358, "logps/rejected": -2.336012840270996, "loss": 0.4414, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.693139672279358, "rewards/margins": 0.6428731679916382, "rewards/rejected": -2.336012840270996, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.1301593780517578, "eval_logits/rejected": 0.23435340821743011, "eval_logps/chosen": -1.7505549192428589, "eval_logps/rejected": -2.3686470985412598, "eval_loss": 0.46975237131118774, "eval_rewards/accuracies": 0.6847180724143982, "eval_rewards/chosen": -1.7505549192428589, "eval_rewards/margins": 0.6180920004844666, "eval_rewards/rejected": -2.3686470985412598, "eval_runtime": 40.7157, "eval_samples_per_second": 33.034, "eval_steps_per_second": 8.277, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 5.535498604747621, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.13449445366859436, "logits/rejected": -0.08821174502372742, "logps/chosen": -1.637037992477417, "logps/rejected": -2.2171640396118164, "loss": 0.4658, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.637037992477417, "rewards/margins": 0.5801259279251099, "rewards/rejected": -2.2171640396118164, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 12.414259394364596, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.21603801846504211, "logits/rejected": -0.10904371738433838, "logps/chosen": -1.6274759769439697, "logps/rejected": -2.364156723022461, "loss": 0.4048, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6274759769439697, "rewards/margins": 0.7366809844970703, "rewards/rejected": -2.364156723022461, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 5.631453605969291, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.21718236804008484, "logits/rejected": -0.10242237895727158, "logps/chosen": -1.5314159393310547, "logps/rejected": -2.2760169506073, "loss": 0.4095, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5314159393310547, "rewards/margins": 0.7446011900901794, "rewards/rejected": -2.2760169506073, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 8.859875144111584, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.2402385026216507, "logits/rejected": -0.14846864342689514, "logps/chosen": -1.5787250995635986, "logps/rejected": -2.312079668045044, "loss": 0.4207, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5787250995635986, "rewards/margins": 0.7333544492721558, "rewards/rejected": -2.312079668045044, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 11.203861288270799, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.14103104174137115, "logits/rejected": 0.017539968714118004, "logps/chosen": -1.7455532550811768, "logps/rejected": -2.4552829265594482, "loss": 0.446, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7455532550811768, "rewards/margins": 0.7097296714782715, "rewards/rejected": -2.4552829265594482, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 8.819046066284965, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.3244856894016266, "logits/rejected": -0.16100825369358063, "logps/chosen": -1.6469604969024658, "logps/rejected": -2.3386900424957275, "loss": 0.4201, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6469604969024658, "rewards/margins": 0.6917296051979065, "rewards/rejected": -2.3386900424957275, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 9.074723996781755, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.2911549508571625, "logits/rejected": -0.1500391960144043, "logps/chosen": -1.6339752674102783, "logps/rejected": -2.2734057903289795, "loss": 0.4379, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6339752674102783, "rewards/margins": 0.6394303441047668, "rewards/rejected": -2.2734057903289795, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 8.90690786245453, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.2231179028749466, "logits/rejected": 0.012175345793366432, "logps/chosen": -1.7092864513397217, "logps/rejected": -2.4165596961975098, "loss": 0.4404, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7092864513397217, "rewards/margins": 0.7072734832763672, "rewards/rejected": -2.4165596961975098, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 10.681837464847229, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.13506057858467102, "logits/rejected": -0.0782659575343132, "logps/chosen": -1.596719741821289, "logps/rejected": -2.2051239013671875, "loss": 0.4538, "rewards/accuracies": 0.6875, "rewards/chosen": -1.596719741821289, "rewards/margins": 0.6084040403366089, "rewards/rejected": -2.2051239013671875, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 16.61425914624376, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.26859182119369507, "logits/rejected": -0.09962020069360733, "logps/chosen": -1.8071186542510986, "logps/rejected": -2.5454981327056885, "loss": 0.4484, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8071186542510986, "rewards/margins": 0.7383795976638794, "rewards/rejected": -2.5454981327056885, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 10.181927976625705, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.24699559807777405, "logits/rejected": 0.003957188222557306, "logps/chosen": -1.7558863162994385, "logps/rejected": -2.43202543258667, "loss": 0.436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7558863162994385, "rewards/margins": 0.6761394143104553, "rewards/rejected": -2.43202543258667, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 8.88761976851756, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.19093777239322662, "logits/rejected": -0.11458507925271988, "logps/chosen": -1.7045962810516357, "logps/rejected": -2.393123149871826, "loss": 0.4459, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7045962810516357, "rewards/margins": 0.68852698802948, "rewards/rejected": -2.393123149871826, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 12.343163375408466, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.34429964423179626, "logits/rejected": -0.08577932417392731, "logps/chosen": -1.9859247207641602, "logps/rejected": -2.6130828857421875, "loss": 0.4804, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9859247207641602, "rewards/margins": 0.6271580457687378, "rewards/rejected": -2.6130828857421875, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 7.388191354103933, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.1942037045955658, "logits/rejected": -0.029857853427529335, "logps/chosen": -1.708739995956421, "logps/rejected": -2.4387435913085938, "loss": 0.4015, "rewards/accuracies": 0.78125, "rewards/chosen": -1.708739995956421, "rewards/margins": 0.730003297328949, "rewards/rejected": -2.4387435913085938, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 6.983761564577683, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.19474998116493225, "logits/rejected": -0.026011278852820396, "logps/chosen": -1.8304287195205688, "logps/rejected": -2.605928897857666, "loss": 0.392, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8304287195205688, "rewards/margins": 0.7755002975463867, "rewards/rejected": -2.605928897857666, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 9.083889384327287, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.21530011296272278, "logits/rejected": -0.010361668653786182, "logps/chosen": -1.8833993673324585, "logps/rejected": -2.7177321910858154, "loss": 0.4248, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8833993673324585, "rewards/margins": 0.8343328237533569, "rewards/rejected": -2.7177321910858154, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 13.691426338554312, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.2699522078037262, "logits/rejected": -0.14956672489643097, "logps/chosen": -1.5875293016433716, "logps/rejected": -2.379575252532959, "loss": 0.4045, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5875293016433716, "rewards/margins": 0.7920459508895874, "rewards/rejected": -2.379575252532959, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 7.351647183833241, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.2009161412715912, "logits/rejected": -0.030434230342507362, "logps/chosen": -1.6317245960235596, "logps/rejected": -2.280377149581909, "loss": 0.431, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6317245960235596, "rewards/margins": 0.6486524939537048, "rewards/rejected": -2.280377149581909, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 6.750846126532824, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.3043988049030304, "logits/rejected": -0.09166266024112701, "logps/chosen": -1.7909457683563232, "logps/rejected": -2.564138412475586, "loss": 0.4246, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7909457683563232, "rewards/margins": 0.7731926441192627, "rewards/rejected": -2.564138412475586, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 10.055515567490408, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.22735965251922607, "logits/rejected": -0.16883966326713562, "logps/chosen": -1.6752593517303467, "logps/rejected": -2.448578357696533, "loss": 0.4144, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6752593517303467, "rewards/margins": 0.7733188271522522, "rewards/rejected": -2.448578357696533, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 10.964922117203121, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.19676534831523895, "logits/rejected": -0.039084918797016144, "logps/chosen": -1.7762854099273682, "logps/rejected": -2.4887564182281494, "loss": 0.4174, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7762854099273682, "rewards/margins": 0.7124709486961365, "rewards/rejected": -2.4887564182281494, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 8.65504346168159, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.366741418838501, "logits/rejected": -0.0903666764497757, "logps/chosen": -1.633435606956482, "logps/rejected": -2.5082240104675293, "loss": 0.4124, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.633435606956482, "rewards/margins": 0.8747886419296265, "rewards/rejected": -2.5082240104675293, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 10.01101299272854, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.12120343744754791, "logits/rejected": 0.023883363232016563, "logps/chosen": -1.6603565216064453, "logps/rejected": -2.263627529144287, "loss": 0.4216, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6603565216064453, "rewards/margins": 0.6032707095146179, "rewards/rejected": -2.263627529144287, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 7.307934663398553, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.10164584219455719, "logits/rejected": -0.00446722935885191, "logps/chosen": -1.6131635904312134, "logps/rejected": -2.239896535873413, "loss": 0.4288, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6131635904312134, "rewards/margins": 0.6267330646514893, "rewards/rejected": -2.239896535873413, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 6.591973255431241, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.19755670428276062, "logits/rejected": -0.03216441720724106, "logps/chosen": -1.7728755474090576, "logps/rejected": -2.577751398086548, "loss": 0.396, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7728755474090576, "rewards/margins": 0.8048759698867798, "rewards/rejected": -2.577751398086548, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 10.878014297108429, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.15936610102653503, "logits/rejected": -0.06801502406597137, "logps/chosen": -1.5923312902450562, "logps/rejected": -2.389348030090332, "loss": 0.4266, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5923312902450562, "rewards/margins": 0.7970169186592102, "rewards/rejected": -2.389348030090332, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 14.156731048002257, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.07937529683113098, "logits/rejected": -0.10965453088283539, "logps/chosen": -1.6146513223648071, "logps/rejected": -2.0958268642425537, "loss": 0.4975, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6146513223648071, "rewards/margins": 0.48117581009864807, "rewards/rejected": -2.0958268642425537, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 7.052775529865043, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.16289743781089783, "logits/rejected": -0.11550410091876984, "logps/chosen": -1.6701046228408813, "logps/rejected": -2.3369832038879395, "loss": 0.4322, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6701046228408813, "rewards/margins": 0.6668787002563477, "rewards/rejected": -2.3369832038879395, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 5.863683085745865, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.3269725739955902, "logits/rejected": -0.11682265996932983, "logps/chosen": -1.690829873085022, "logps/rejected": -2.434781789779663, "loss": 0.378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.690829873085022, "rewards/margins": 0.7439519762992859, "rewards/rejected": -2.434781789779663, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 8.310591894160734, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.13289813697338104, "logits/rejected": -0.15137526392936707, "logps/chosen": -1.7445909976959229, "logps/rejected": -2.303849697113037, "loss": 0.4651, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7445909976959229, "rewards/margins": 0.5592588186264038, "rewards/rejected": -2.303849697113037, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 11.939265657431324, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.2540941834449768, "logits/rejected": -0.16252782940864563, "logps/chosen": -1.6619617938995361, "logps/rejected": -2.2716026306152344, "loss": 0.4259, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6619617938995361, "rewards/margins": 0.6096407175064087, "rewards/rejected": -2.2716026306152344, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 12.080556629643546, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.2131994515657425, "logits/rejected": 0.0045366049744188786, "logps/chosen": -1.7368589639663696, "logps/rejected": -2.4362616539001465, "loss": 0.4367, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7368589639663696, "rewards/margins": 0.699402928352356, "rewards/rejected": -2.4362616539001465, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 9.784690319832995, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.30134397745132446, "logits/rejected": -0.11042986810207367, "logps/chosen": -1.700526475906372, "logps/rejected": -2.4608824253082275, "loss": 0.4138, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.700526475906372, "rewards/margins": 0.7603558897972107, "rewards/rejected": -2.4608824253082275, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 7.09378219513203, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.140936017036438, "logits/rejected": -0.09773506224155426, "logps/chosen": -1.6609716415405273, "logps/rejected": -2.1493782997131348, "loss": 0.4664, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6609716415405273, "rewards/margins": 0.4884066581726074, "rewards/rejected": -2.1493782997131348, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 8.304954945476187, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.20915377140045166, "logits/rejected": -0.10395795106887817, "logps/chosen": -1.618930459022522, "logps/rejected": -2.469147205352783, "loss": 0.4272, "rewards/accuracies": 0.6875, "rewards/chosen": -1.618930459022522, "rewards/margins": 0.8502169847488403, "rewards/rejected": -2.469147205352783, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 8.890135105525104, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.2684245705604553, "logits/rejected": -0.012992632575333118, "logps/chosen": -1.844985008239746, "logps/rejected": -2.615591287612915, "loss": 0.4062, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.844985008239746, "rewards/margins": 0.7706061005592346, "rewards/rejected": -2.615591287612915, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 6.80458923893997, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.14007353782653809, "logits/rejected": -0.033022623509168625, "logps/chosen": -1.656536340713501, "logps/rejected": -2.2832539081573486, "loss": 0.4148, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.656536340713501, "rewards/margins": 0.6267178654670715, "rewards/rejected": -2.2832539081573486, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 5.693611908269652, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.1865476667881012, "logits/rejected": -0.029537910595536232, "logps/chosen": -1.6774896383285522, "logps/rejected": -2.2488365173339844, "loss": 0.4518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6774896383285522, "rewards/margins": 0.5713469386100769, "rewards/rejected": -2.2488365173339844, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 6.925814130297165, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.33329999446868896, "logits/rejected": -0.1097496747970581, "logps/chosen": -1.6807295083999634, "logps/rejected": -2.353853225708008, "loss": 0.3981, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6807295083999634, "rewards/margins": 0.6731237173080444, "rewards/rejected": -2.353853225708008, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 10.658702699880036, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.2547788619995117, "logits/rejected": 0.02544974349439144, "logps/chosen": -1.6398718357086182, "logps/rejected": -2.614966630935669, "loss": 0.3835, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6398718357086182, "rewards/margins": 0.9750946164131165, "rewards/rejected": -2.614966630935669, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 9.813156361268765, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.13717059791088104, "logits/rejected": -0.07634995877742767, "logps/chosen": -1.8867213726043701, "logps/rejected": -2.499987840652466, "loss": 0.4784, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8867213726043701, "rewards/margins": 0.6132663488388062, "rewards/rejected": -2.499987840652466, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 7.885102336269766, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.15556256473064423, "logits/rejected": -0.005589795298874378, "logps/chosen": -1.7171329259872437, "logps/rejected": -2.6526904106140137, "loss": 0.414, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7171329259872437, "rewards/margins": 0.9355573654174805, "rewards/rejected": -2.6526904106140137, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 6.7913934394682896, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.2413446605205536, "logits/rejected": -0.03337030112743378, "logps/chosen": -1.752310037612915, "logps/rejected": -2.350551128387451, "loss": 0.443, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.752310037612915, "rewards/margins": 0.5982412695884705, "rewards/rejected": -2.350551128387451, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 8.96766547668909, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.14620235562324524, "logits/rejected": -0.06466380506753922, "logps/chosen": -1.7657420635223389, "logps/rejected": -2.3725085258483887, "loss": 0.4553, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7657420635223389, "rewards/margins": 0.6067663431167603, "rewards/rejected": -2.3725085258483887, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 9.024466081858309, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.1780090630054474, "logits/rejected": -0.02283216081559658, "logps/chosen": -1.6792405843734741, "logps/rejected": -2.420431613922119, "loss": 0.4217, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6792405843734741, "rewards/margins": 0.7411910891532898, "rewards/rejected": -2.420431613922119, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 9.340279121614364, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.24418766796588898, "logits/rejected": -0.04588242620229721, "logps/chosen": -1.598848581314087, "logps/rejected": -2.355781078338623, "loss": 0.393, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.598848581314087, "rewards/margins": 0.7569323778152466, "rewards/rejected": -2.355781078338623, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 27.911396922986235, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.338864266872406, "logits/rejected": -0.22151124477386475, "logps/chosen": -1.6586824655532837, "logps/rejected": -2.420889139175415, "loss": 0.4154, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6586824655532837, "rewards/margins": 0.7622066736221313, "rewards/rejected": -2.420889139175415, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 9.047538076261072, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.25779178738594055, "logits/rejected": -0.11255134642124176, "logps/chosen": -1.5996456146240234, "logps/rejected": -2.211871862411499, "loss": 0.4232, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5996456146240234, "rewards/margins": 0.6122261881828308, "rewards/rejected": -2.211871862411499, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 7.7034498846412784, "learning_rate": 2.450761014337888e-09, "logits/chosen": 0.007573113776743412, "logits/rejected": 0.028845876455307007, "logps/chosen": -1.7006845474243164, "logps/rejected": -2.610978126525879, "loss": 0.4199, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7006845474243164, "rewards/margins": 0.9102934002876282, "rewards/rejected": -2.610978126525879, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 9.480187411554859, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.1185058131814003, "logits/rejected": -0.021084267646074295, "logps/chosen": -1.7408926486968994, "logps/rejected": -2.4972469806671143, "loss": 0.4402, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7408926486968994, "rewards/margins": 0.7563542127609253, "rewards/rejected": -2.4972469806671143, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 7.908171683146034, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.23456911742687225, "logits/rejected": -0.04003529995679855, "logps/chosen": -1.7042639255523682, "logps/rejected": -2.2638742923736572, "loss": 0.4737, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7042639255523682, "rewards/margins": 0.5596104264259338, "rewards/rejected": -2.2638742923736572, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 8.220524645447801, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.21456141769886017, "logits/rejected": -0.08591201156377792, "logps/chosen": -1.6611522436141968, "logps/rejected": -2.412694215774536, "loss": 0.4067, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6611522436141968, "rewards/margins": 0.7515419125556946, "rewards/rejected": -2.412694215774536, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 7.695119917552502, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.2998107671737671, "logits/rejected": -0.08768518269062042, "logps/chosen": -1.824004888534546, "logps/rejected": -2.4239258766174316, "loss": 0.446, "rewards/accuracies": 0.6875, "rewards/chosen": -1.824004888534546, "rewards/margins": 0.5999209880828857, "rewards/rejected": -2.4239258766174316, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 9.785775814775247, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.14006975293159485, "logits/rejected": -0.04280511662364006, "logps/chosen": -1.6531875133514404, "logps/rejected": -2.260084629058838, "loss": 0.4524, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6531875133514404, "rewards/margins": 0.6068969964981079, "rewards/rejected": -2.260084629058838, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 9.623335021033906, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.14072854816913605, "logits/rejected": -0.17417272925376892, "logps/chosen": -1.580338478088379, "logps/rejected": -2.2227678298950195, "loss": 0.4096, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.580338478088379, "rewards/margins": 0.6424292325973511, "rewards/rejected": -2.2227678298950195, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 7.62812894800998, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.19472870230674744, "logits/rejected": -0.06156862527132034, "logps/chosen": -1.828049659729004, "logps/rejected": -2.4619317054748535, "loss": 0.4502, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.828049659729004, "rewards/margins": 0.6338822245597839, "rewards/rejected": -2.4619317054748535, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 7.024901816008821, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.14743950963020325, "logits/rejected": 0.03870768845081329, "logps/chosen": -1.574162244796753, "logps/rejected": -2.2261760234832764, "loss": 0.416, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.574162244796753, "rewards/margins": 0.6520137786865234, "rewards/rejected": -2.2261760234832764, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 12.507249720080985, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.22349539399147034, "logits/rejected": -0.16669543087482452, "logps/chosen": -1.6187880039215088, "logps/rejected": -2.254061222076416, "loss": 0.4315, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6187880039215088, "rewards/margins": 0.6352733373641968, "rewards/rejected": -2.254061222076416, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 7.814720084281585, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.26529473066329956, "logits/rejected": -0.04124947637319565, "logps/chosen": -1.6391613483428955, "logps/rejected": -2.416468858718872, "loss": 0.4165, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6391613483428955, "rewards/margins": 0.7773075103759766, "rewards/rejected": -2.416468858718872, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 6.823512057942252, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.17339283227920532, "logits/rejected": -0.0464506521821022, "logps/chosen": -1.7062044143676758, "logps/rejected": -2.2960314750671387, "loss": 0.4635, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7062044143676758, "rewards/margins": 0.5898270606994629, "rewards/rejected": -2.2960314750671387, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 9.36167099673395, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.14240801334381104, "logits/rejected": -0.12498018890619278, "logps/chosen": -1.6624677181243896, "logps/rejected": -2.3642704486846924, "loss": 0.4114, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6624677181243896, "rewards/margins": 0.7018024921417236, "rewards/rejected": -2.3642704486846924, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 11.037570033752342, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.22189101576805115, "logits/rejected": -0.08284217119216919, "logps/chosen": -1.6653811931610107, "logps/rejected": -2.6358067989349365, "loss": 0.3924, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6653811931610107, "rewards/margins": 0.9704257249832153, "rewards/rejected": -2.6358067989349365, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 13.83259578568047, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.315311461687088, "logits/rejected": -0.1589202731847763, "logps/chosen": -1.6414378881454468, "logps/rejected": -2.4022586345672607, "loss": 0.4322, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6414378881454468, "rewards/margins": 0.7608209848403931, "rewards/rejected": -2.4022586345672607, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 11.652165777406163, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.1831158697605133, "logits/rejected": -0.10827866941690445, "logps/chosen": -1.6619945764541626, "logps/rejected": -2.280052900314331, "loss": 0.4311, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6619945764541626, "rewards/margins": 0.6180580854415894, "rewards/rejected": -2.280052900314331, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 13.0770303249055, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.19605092704296112, "logits/rejected": -0.06697191298007965, "logps/chosen": -1.8263505697250366, "logps/rejected": -2.5495944023132324, "loss": 0.4461, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8263505697250366, "rewards/margins": 0.7232438921928406, "rewards/rejected": -2.5495944023132324, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 11.133944749021417, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.25125670433044434, "logits/rejected": 0.01260462123900652, "logps/chosen": -1.7712438106536865, "logps/rejected": -2.35971736907959, "loss": 0.4629, "rewards/accuracies": 0.75, "rewards/chosen": -1.7712438106536865, "rewards/margins": 0.5884734392166138, "rewards/rejected": -2.35971736907959, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 8.78302337537522, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.20127682387828827, "logits/rejected": -0.13745814561843872, "logps/chosen": -1.645218849182129, "logps/rejected": -2.1983642578125, "loss": 0.4747, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.645218849182129, "rewards/margins": 0.5531454086303711, "rewards/rejected": -2.1983642578125, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 7.969651479998755, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.34325718879699707, "logits/rejected": -0.17306803166866302, "logps/chosen": -1.7453396320343018, "logps/rejected": -2.425354242324829, "loss": 0.4521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7453396320343018, "rewards/margins": 0.6800147294998169, "rewards/rejected": -2.425354242324829, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 13.690133091165638, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.22184042632579803, "logits/rejected": -0.08896343410015106, "logps/chosen": -1.560638189315796, "logps/rejected": -2.1766767501831055, "loss": 0.4194, "rewards/accuracies": 0.71875, "rewards/chosen": -1.560638189315796, "rewards/margins": 0.61603844165802, "rewards/rejected": -2.1766767501831055, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 8.457532760321087, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.12201350927352905, "logits/rejected": 0.0005096882814541459, "logps/chosen": -1.7388665676116943, "logps/rejected": -2.4146103858947754, "loss": 0.4273, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7388665676116943, "rewards/margins": 0.675743579864502, "rewards/rejected": -2.4146103858947754, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 10.610160904588634, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.23547551035881042, "logits/rejected": -0.007309481501579285, "logps/chosen": -1.7833993434906006, "logps/rejected": -2.553593873977661, "loss": 0.4275, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7833993434906006, "rewards/margins": 0.770194411277771, "rewards/rejected": -2.553593873977661, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 9.38604074131912, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.15099649131298065, "logits/rejected": -0.1303461492061615, "logps/chosen": -1.6690593957901, "logps/rejected": -2.161623239517212, "loss": 0.4971, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6690593957901, "rewards/margins": 0.4925641417503357, "rewards/rejected": -2.161623239517212, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 8.004811637733011, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.27880245447158813, "logits/rejected": -0.20941570401191711, "logps/chosen": -1.6845579147338867, "logps/rejected": -2.367297410964966, "loss": 0.4343, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6845579147338867, "rewards/margins": 0.6827393770217896, "rewards/rejected": -2.367297410964966, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 12.18722249393703, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.2158760279417038, "logits/rejected": -0.07435666769742966, "logps/chosen": -1.6414493322372437, "logps/rejected": -2.613215684890747, "loss": 0.3736, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6414493322372437, "rewards/margins": 0.9717662930488586, "rewards/rejected": -2.613215684890747, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 8.077645292935205, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.22686132788658142, "logits/rejected": -0.0697961300611496, "logps/chosen": -1.5508238077163696, "logps/rejected": -2.241894006729126, "loss": 0.4139, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5508238077163696, "rewards/margins": 0.6910701990127563, "rewards/rejected": -2.241894006729126, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 7.050127946968985, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.35961657762527466, "logits/rejected": -0.07859481871128082, "logps/chosen": -1.6036380529403687, "logps/rejected": -2.2783265113830566, "loss": 0.4277, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6036380529403687, "rewards/margins": 0.6746885776519775, "rewards/rejected": -2.2783265113830566, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 7.859817957318789, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.29858672618865967, "logits/rejected": -0.11690598726272583, "logps/chosen": -1.7370589971542358, "logps/rejected": -2.5789856910705566, "loss": 0.3938, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7370589971542358, "rewards/margins": 0.8419266939163208, "rewards/rejected": -2.5789856910705566, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 6.411316503728484, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.21430249512195587, "logits/rejected": -0.1492438018321991, "logps/chosen": -1.7832714319229126, "logps/rejected": -2.4604218006134033, "loss": 0.4658, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7832714319229126, "rewards/margins": 0.6771504282951355, "rewards/rejected": -2.4604218006134033, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 8.825848871259147, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.2386697232723236, "logits/rejected": -0.0770302414894104, "logps/chosen": -1.5555530786514282, "logps/rejected": -2.227916955947876, "loss": 0.4239, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5555530786514282, "rewards/margins": 0.6723641157150269, "rewards/rejected": -2.227916955947876, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 8.446497724009374, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.23375597596168518, "logits/rejected": -0.11699078232049942, "logps/chosen": -1.7586185932159424, "logps/rejected": -2.651644229888916, "loss": 0.404, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7586185932159424, "rewards/margins": 0.8930255174636841, "rewards/rejected": -2.651644229888916, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.11797049641609192, "eval_logits/rejected": 0.22131425142288208, "eval_logps/chosen": -1.754404902458191, "eval_logps/rejected": -2.3724052906036377, "eval_loss": 0.4698374271392822, "eval_rewards/accuracies": 0.6839762330055237, "eval_rewards/chosen": -1.754404902458191, "eval_rewards/margins": 0.6180002093315125, "eval_rewards/rejected": -2.3724052906036377, "eval_runtime": 40.6193, "eval_samples_per_second": 33.112, "eval_steps_per_second": 8.297, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.4793110236919071, "train_runtime": 30370.8284, "train_samples_per_second": 5.906, "train_steps_per_second": 0.185 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }