{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010468463752944255, "grad_norm": 1.1945625860018705, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.5192830562591553, "logits/rejected": -2.3547825813293457, "logps/chosen": -297.60443115234375, "logps/rejected": -252.4619903564453, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00020415784092620015, "rewards/margins": -0.0002505290030967444, "rewards/rejected": 4.637122037820518e-05, "step": 1 }, { "epoch": 0.010468463752944255, "grad_norm": 1.1009278086693854, "learning_rate": 5.208333333333334e-07, "logits/chosen": -2.2455766201019287, "logits/rejected": -2.215245008468628, "logps/chosen": -275.6755065917969, "logps/rejected": -254.76722717285156, "loss": 0.6927, "rewards/accuracies": 0.6111111044883728, "rewards/chosen": 0.004448441788554192, "rewards/margins": 0.0008290203404612839, "rewards/rejected": 0.0036194208078086376, "step": 10 }, { "epoch": 0.02093692750588851, "grad_norm": 1.165704750760885, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.2313215732574463, "logits/rejected": -2.114736795425415, "logps/chosen": -277.5883483886719, "logps/rejected": -255.2056427001953, "loss": 0.6907, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.026263630017638206, "rewards/margins": 0.005699009168893099, "rewards/rejected": 0.020564619451761246, "step": 20 }, { "epoch": 0.031405391258832765, "grad_norm": 1.1872789709669673, "learning_rate": 1.5625e-06, "logits/chosen": -2.3138914108276367, "logits/rejected": -2.2109274864196777, "logps/chosen": -281.3846740722656, "logps/rejected": -262.41693115234375, "loss": 0.6858, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04216086491942406, "rewards/margins": 0.014992751181125641, "rewards/rejected": 0.027168119326233864, "step": 30 }, { "epoch": 0.04187385501177702, "grad_norm": 1.1753743538686479, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.307976245880127, "logits/rejected": -2.2140889167785645, "logps/chosen": -268.4030456542969, "logps/rejected": -255.44882202148438, "loss": 0.6814, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.05265005677938461, "rewards/margins": 0.026613134890794754, "rewards/rejected": 0.026036927476525307, "step": 40 }, { "epoch": 0.05234231876472128, "grad_norm": 1.1656515626034387, "learning_rate": 2.604166666666667e-06, "logits/chosen": -2.2787346839904785, "logits/rejected": -2.175128936767578, "logps/chosen": -227.7914581298828, "logps/rejected": -206.5706024169922, "loss": 0.6764, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05774398520588875, "rewards/margins": 0.042526550590991974, "rewards/rejected": 0.015217426232993603, "step": 50 }, { "epoch": 0.06281078251766553, "grad_norm": 1.324103463890143, "learning_rate": 3.125e-06, "logits/chosen": -2.2864975929260254, "logits/rejected": -2.185832977294922, "logps/chosen": -264.6636657714844, "logps/rejected": -228.9823455810547, "loss": 0.6687, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.04733141511678696, "rewards/margins": 0.05985499545931816, "rewards/rejected": -0.012523581273853779, "step": 60 }, { "epoch": 0.07327924627060979, "grad_norm": 1.4908224965531012, "learning_rate": 3.6458333333333333e-06, "logits/chosen": -2.128446102142334, "logits/rejected": -2.080828905105591, "logps/chosen": -256.41363525390625, "logps/rejected": -262.66949462890625, "loss": 0.6557, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0010238643735647202, "rewards/margins": 0.10270833969116211, "rewards/rejected": -0.10373219102621078, "step": 70 }, { "epoch": 0.08374771002355404, "grad_norm": 2.5274509004519445, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.2609763145446777, "logits/rejected": -2.1039209365844727, "logps/chosen": -263.5312194824219, "logps/rejected": -256.4105529785156, "loss": 0.6402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12527017295360565, "rewards/margins": 0.11860889196395874, "rewards/rejected": -0.2438790500164032, "step": 80 }, { "epoch": 0.0942161737764983, "grad_norm": 2.5638356680875156, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -2.1489036083221436, "logits/rejected": -2.081789493560791, "logps/chosen": -270.69091796875, "logps/rejected": -291.17962646484375, "loss": 0.6304, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1973474770784378, "rewards/margins": 0.19165988266468048, "rewards/rejected": -0.3890073299407959, "step": 90 }, { "epoch": 0.10468463752944256, "grad_norm": 2.7331412184768507, "learning_rate": 4.9997324926814375e-06, "logits/chosen": -2.193147659301758, "logits/rejected": -2.0836679935455322, "logps/chosen": -303.36590576171875, "logps/rejected": -320.6068420410156, "loss": 0.6212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.31098657846450806, "rewards/margins": 0.2229224145412445, "rewards/rejected": -0.5339089632034302, "step": 100 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -2.0932729244232178, "eval_logits/rejected": -2.008643627166748, "eval_logps/chosen": -298.0505676269531, "eval_logps/rejected": -299.1472473144531, "eval_loss": 0.6321468353271484, "eval_rewards/accuracies": 0.6944444179534912, "eval_rewards/chosen": -0.3312907814979553, "eval_rewards/margins": 0.213734969496727, "eval_rewards/rejected": -0.5450257658958435, "eval_runtime": 321.7711, "eval_samples_per_second": 6.216, "eval_steps_per_second": 0.196, "step": 100 }, { "epoch": 0.11515310128238682, "grad_norm": 2.3956891782186904, "learning_rate": 4.996723692767927e-06, "logits/chosen": -2.1808857917785645, "logits/rejected": -2.05126690864563, "logps/chosen": -281.98193359375, "logps/rejected": -265.5942687988281, "loss": 0.6236, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3388732373714447, "rewards/margins": 0.2357875108718872, "rewards/rejected": -0.5746607184410095, "step": 110 }, { "epoch": 0.12562156503533106, "grad_norm": 4.49315717191337, "learning_rate": 4.9903757462135984e-06, "logits/chosen": -2.1513657569885254, "logits/rejected": -2.0812501907348633, "logps/chosen": -287.4888610839844, "logps/rejected": -328.67156982421875, "loss": 0.5888, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.41489553451538086, "rewards/margins": 0.31543681025505066, "rewards/rejected": -0.7303323149681091, "step": 120 }, { "epoch": 0.1360900287882753, "grad_norm": 3.426776723546068, "learning_rate": 4.980697142834315e-06, "logits/chosen": -2.1264045238494873, "logits/rejected": -2.0155797004699707, "logps/chosen": -381.717041015625, "logps/rejected": -350.6758728027344, "loss": 0.5946, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8191909790039062, "rewards/margins": 0.3090600371360779, "rewards/rejected": -1.1282509565353394, "step": 130 }, { "epoch": 0.14655849254121958, "grad_norm": 3.004217560772131, "learning_rate": 4.967700826904229e-06, "logits/chosen": -2.0693180561065674, "logits/rejected": -2.010124683380127, "logps/chosen": -363.1732482910156, "logps/rejected": -400.6826171875, "loss": 0.5746, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9419809579849243, "rewards/margins": 0.41463392972946167, "rewards/rejected": -1.3566150665283203, "step": 140 }, { "epoch": 0.15702695629416383, "grad_norm": 4.5725215394059004, "learning_rate": 4.951404179843963e-06, "logits/chosen": -2.154370069503784, "logits/rejected": -2.016098976135254, "logps/chosen": -362.11627197265625, "logps/rejected": -358.12823486328125, "loss": 0.5888, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7362005710601807, "rewards/margins": 0.44253450632095337, "rewards/rejected": -1.1787351369857788, "step": 150 }, { "epoch": 0.16749542004710807, "grad_norm": 7.788576778528557, "learning_rate": 4.931828996974498e-06, "logits/chosen": -2.099804162979126, "logits/rejected": -1.9289453029632568, "logps/chosen": -348.2688903808594, "logps/rejected": -349.8648986816406, "loss": 0.5524, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6180304288864136, "rewards/margins": 0.5966934561729431, "rewards/rejected": -1.2147239446640015, "step": 160 }, { "epoch": 0.17796388380005235, "grad_norm": 3.73369227909531, "learning_rate": 4.909001458367867e-06, "logits/chosen": -2.005589008331299, "logits/rejected": -1.8591816425323486, "logps/chosen": -357.9122619628906, "logps/rejected": -401.5499267578125, "loss": 0.5578, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9827691912651062, "rewards/margins": 0.5925935506820679, "rewards/rejected": -1.5753626823425293, "step": 170 }, { "epoch": 0.1884323475529966, "grad_norm": 5.142786695761387, "learning_rate": 4.882952093833628e-06, "logits/chosen": -1.906904935836792, "logits/rejected": -1.8502197265625, "logps/chosen": -384.1532287597656, "logps/rejected": -441.2289123535156, "loss": 0.5357, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2132524251937866, "rewards/margins": 0.6486458778381348, "rewards/rejected": -1.861898422241211, "step": 180 }, { "epoch": 0.19890081130594087, "grad_norm": 4.970082596077688, "learning_rate": 4.853715742087947e-06, "logits/chosen": -1.7953016757965088, "logits/rejected": -1.749053716659546, "logps/chosen": -331.3048400878906, "logps/rejected": -420.5511779785156, "loss": 0.5344, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8520873188972473, "rewards/margins": 0.6949248313903809, "rewards/rejected": -1.5470120906829834, "step": 190 }, { "epoch": 0.2093692750588851, "grad_norm": 4.380709251136357, "learning_rate": 4.821331504159906e-06, "logits/chosen": -1.931880235671997, "logits/rejected": -1.7673060894012451, "logps/chosen": -395.6458435058594, "logps/rejected": -393.07818603515625, "loss": 0.5618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8311824798583984, "rewards/margins": 0.598331868648529, "rewards/rejected": -1.4295144081115723, "step": 200 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -1.7550567388534546, "eval_logits/rejected": -1.669370174407959, "eval_logps/chosen": -346.90643310546875, "eval_logps/rejected": -381.2445983886719, "eval_loss": 0.5600804686546326, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": -0.8198498487472534, "eval_rewards/margins": 0.5461496114730835, "eval_rewards/rejected": -1.365999460220337, "eval_runtime": 319.6036, "eval_samples_per_second": 6.258, "eval_steps_per_second": 0.197, "step": 200 }, { "epoch": 0.21983773881182936, "grad_norm": 4.390548860079351, "learning_rate": 4.7858426910973435e-06, "logits/chosen": -1.82965886592865, "logits/rejected": -1.7543909549713135, "logps/chosen": -382.2540588378906, "logps/rejected": -415.6708984375, "loss": 0.5746, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0466556549072266, "rewards/margins": 0.42583417892456055, "rewards/rejected": -1.4724897146224976, "step": 210 }, { "epoch": 0.23030620256477363, "grad_norm": 5.154433364870237, "learning_rate": 4.747296766042161e-06, "logits/chosen": -1.791577696800232, "logits/rejected": -1.676790475845337, "logps/chosen": -430.439697265625, "logps/rejected": -446.4627380371094, "loss": 0.5594, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3720312118530273, "rewards/margins": 0.6089810132980347, "rewards/rejected": -1.9810121059417725, "step": 220 }, { "epoch": 0.24077466631771788, "grad_norm": 5.067287557540918, "learning_rate": 4.705745280752586e-06, "logits/chosen": -1.6005672216415405, "logits/rejected": -1.5293024778366089, "logps/chosen": -364.32769775390625, "logps/rejected": -393.8982849121094, "loss": 0.5585, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1689074039459229, "rewards/margins": 0.4595082402229309, "rewards/rejected": -1.6284157037734985, "step": 230 }, { "epoch": 0.2512431300706621, "grad_norm": 4.187057753178847, "learning_rate": 4.661243806657256e-06, "logits/chosen": -1.6649366617202759, "logits/rejected": -1.6055676937103271, "logps/chosen": -377.30517578125, "logps/rejected": -407.87017822265625, "loss": 0.5306, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2085318565368652, "rewards/margins": 0.6119499802589417, "rewards/rejected": -1.8204820156097412, "step": 240 }, { "epoch": 0.26171159382360637, "grad_norm": 4.31385003289797, "learning_rate": 4.613851860533367e-06, "logits/chosen": -1.6592628955841064, "logits/rejected": -1.5831575393676758, "logps/chosen": -392.666259765625, "logps/rejected": -415.00933837890625, "loss": 0.5341, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2613078355789185, "rewards/margins": 0.6352987289428711, "rewards/rejected": -1.896606683731079, "step": 250 }, { "epoch": 0.2721800575765506, "grad_norm": 6.08274430115293, "learning_rate": 4.563632824908252e-06, "logits/chosen": -1.5671354532241821, "logits/rejected": -1.4325566291809082, "logps/chosen": -446.5708923339844, "logps/rejected": -492.8990173339844, "loss": 0.5228, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7604526281356812, "rewards/margins": 0.6336302161216736, "rewards/rejected": -2.394083261489868, "step": 260 }, { "epoch": 0.2826485213294949, "grad_norm": 4.56724778147284, "learning_rate": 4.510653863290871e-06, "logits/chosen": -1.456993579864502, "logits/rejected": -1.3645284175872803, "logps/chosen": -401.00244140625, "logps/rejected": -461.67816162109375, "loss": 0.5277, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2706773281097412, "rewards/margins": 0.8170326352119446, "rewards/rejected": -2.087709903717041, "step": 270 }, { "epoch": 0.29311698508243916, "grad_norm": 3.947817632488332, "learning_rate": 4.454985830346574e-06, "logits/chosen": -1.5088273286819458, "logits/rejected": -1.3900407552719116, "logps/chosen": -385.47772216796875, "logps/rejected": -426.82049560546875, "loss": 0.5485, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9744614362716675, "rewards/margins": 0.75079345703125, "rewards/rejected": -1.725255012512207, "step": 280 }, { "epoch": 0.3035854488353834, "grad_norm": 4.730568661724723, "learning_rate": 4.396703177135262e-06, "logits/chosen": -1.517740249633789, "logits/rejected": -1.351360559463501, "logps/chosen": -388.90789794921875, "logps/rejected": -416.22052001953125, "loss": 0.526, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0588579177856445, "rewards/margins": 0.6205722093582153, "rewards/rejected": -1.6794300079345703, "step": 290 }, { "epoch": 0.31405391258832765, "grad_norm": 5.829501633867066, "learning_rate": 4.335883851539693e-06, "logits/chosen": -1.2372510433197021, "logits/rejected": -1.2240632772445679, "logps/chosen": -386.8778381347656, "logps/rejected": -448.6904296875, "loss": 0.54, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.441900610923767, "rewards/margins": 0.6276523470878601, "rewards/rejected": -2.0695528984069824, "step": 300 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -1.1714633703231812, "eval_logits/rejected": -1.0704221725463867, "eval_logps/chosen": -417.12750244140625, "eval_logps/rejected": -478.07476806640625, "eval_loss": 0.5264545679092407, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": -1.5220601558685303, "eval_rewards/margins": 0.8122406601905823, "eval_rewards/rejected": -2.334300994873047, "eval_runtime": 318.2186, "eval_samples_per_second": 6.285, "eval_steps_per_second": 0.198, "step": 300 }, { "epoch": 0.3245223763412719, "grad_norm": 4.0412766328088585, "learning_rate": 4.2726091940171055e-06, "logits/chosen": -1.3419673442840576, "logits/rejected": -1.1779625415802002, "logps/chosen": -421.1036682128906, "logps/rejected": -466.26275634765625, "loss": 0.5369, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4869836568832397, "rewards/margins": 0.8415184020996094, "rewards/rejected": -2.3285021781921387, "step": 310 }, { "epoch": 0.33499084009421615, "grad_norm": 7.228642390991299, "learning_rate": 4.206963828813555e-06, "logits/chosen": -1.3606576919555664, "logits/rejected": -1.266966700553894, "logps/chosen": -374.7814025878906, "logps/rejected": -450.57489013671875, "loss": 0.5399, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3771132230758667, "rewards/margins": 0.7092502117156982, "rewards/rejected": -2.0863633155822754, "step": 320 }, { "epoch": 0.34545930384716045, "grad_norm": 5.998674043872773, "learning_rate": 4.139035550786495e-06, "logits/chosen": -1.305854082107544, "logits/rejected": -1.2263256311416626, "logps/chosen": -429.99188232421875, "logps/rejected": -497.3521423339844, "loss": 0.5206, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8139232397079468, "rewards/margins": 0.7226920127868652, "rewards/rejected": -2.5366153717041016, "step": 330 }, { "epoch": 0.3559277676001047, "grad_norm": 6.244527338235992, "learning_rate": 4.068915207986931e-06, "logits/chosen": -1.0698177814483643, "logits/rejected": -0.9774864315986633, "logps/chosen": -438.493408203125, "logps/rejected": -487.9413146972656, "loss": 0.5553, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8784738779067993, "rewards/margins": 0.7664145231246948, "rewards/rejected": -2.644888401031494, "step": 340 }, { "epoch": 0.36639623135304894, "grad_norm": 6.05372732646654, "learning_rate": 3.996696580158211e-06, "logits/chosen": -1.419528603553772, "logits/rejected": -1.3026950359344482, "logps/chosen": -399.3412170410156, "logps/rejected": -459.59063720703125, "loss": 0.5144, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4473296403884888, "rewards/margins": 0.7528419494628906, "rewards/rejected": -2.2001712322235107, "step": 350 }, { "epoch": 0.3768646951059932, "grad_norm": 7.477190042107064, "learning_rate": 3.922476253313921e-06, "logits/chosen": -1.3192278146743774, "logits/rejected": -1.2596288919448853, "logps/chosen": -388.9505920410156, "logps/rejected": -450.23748779296875, "loss": 0.476, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3493094444274902, "rewards/margins": 0.7934570908546448, "rewards/rejected": -2.1427664756774902, "step": 360 }, { "epoch": 0.38733315885893743, "grad_norm": 5.411077344670459, "learning_rate": 3.846353490562664e-06, "logits/chosen": -1.3181861639022827, "logits/rejected": -1.2644492387771606, "logps/chosen": -406.28851318359375, "logps/rejected": -484.81646728515625, "loss": 0.5189, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.618009328842163, "rewards/margins": 0.912137508392334, "rewards/rejected": -2.530146837234497, "step": 370 }, { "epoch": 0.39780162261188173, "grad_norm": 6.255769900471538, "learning_rate": 3.768430099352445e-06, "logits/chosen": -1.2576202154159546, "logits/rejected": -1.1985712051391602, "logps/chosen": -426.02978515625, "logps/rejected": -499.4483337402344, "loss": 0.5291, "rewards/accuracies": 0.71875, "rewards/chosen": -1.724285364151001, "rewards/margins": 0.7168464660644531, "rewards/rejected": -2.441131830215454, "step": 380 }, { "epoch": 0.408270086364826, "grad_norm": 5.374401251386735, "learning_rate": 3.6888102953122307e-06, "logits/chosen": -1.3139002323150635, "logits/rejected": -1.2482521533966064, "logps/chosen": -366.7402648925781, "logps/rejected": -402.67144775390625, "loss": 0.5445, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0828481912612915, "rewards/margins": 0.5530051589012146, "rewards/rejected": -1.6358531713485718, "step": 390 }, { "epoch": 0.4187385501177702, "grad_norm": 5.700868192112112, "learning_rate": 3.607600562872785e-06, "logits/chosen": -1.2806731462478638, "logits/rejected": -1.2076570987701416, "logps/chosen": -397.6805725097656, "logps/rejected": -457.99578857421875, "loss": 0.5261, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.534794807434082, "rewards/margins": 0.6010990738868713, "rewards/rejected": -2.1358938217163086, "step": 400 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -1.201329231262207, "eval_logits/rejected": -1.1013988256454468, "eval_logps/chosen": -430.45263671875, "eval_logps/rejected": -497.2759094238281, "eval_loss": 0.5082111954689026, "eval_rewards/accuracies": 0.7539682388305664, "eval_rewards/chosen": -1.6553115844726562, "eval_rewards/margins": 0.8710008859634399, "eval_rewards/rejected": -2.5263123512268066, "eval_runtime": 314.7811, "eval_samples_per_second": 6.354, "eval_steps_per_second": 0.2, "step": 400 }, { "epoch": 0.42920701387071447, "grad_norm": 4.78368697455832, "learning_rate": 3.5249095128531863e-06, "logits/chosen": -1.1789991855621338, "logits/rejected": -1.0596911907196045, "logps/chosen": -449.33709716796875, "logps/rejected": -515.829833984375, "loss": 0.5275, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8276984691619873, "rewards/margins": 0.9437017440795898, "rewards/rejected": -2.7714004516601562, "step": 410 }, { "epoch": 0.4396754776236587, "grad_norm": 5.718886468906387, "learning_rate": 3.4408477372034743e-06, "logits/chosen": -1.3089560270309448, "logits/rejected": -1.111132025718689, "logps/chosen": -451.94354248046875, "logps/rejected": -472.0818786621094, "loss": 0.5212, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6895692348480225, "rewards/margins": 0.77433842420578, "rewards/rejected": -2.4639077186584473, "step": 420 }, { "epoch": 0.45014394137660296, "grad_norm": 5.4739752222845, "learning_rate": 3.355527661097728e-06, "logits/chosen": -1.2457327842712402, "logits/rejected": -1.1701027154922485, "logps/chosen": -408.38629150390625, "logps/rejected": -495.34906005859375, "loss": 0.5228, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8043445348739624, "rewards/margins": 0.8327828645706177, "rewards/rejected": -2.63712739944458, "step": 430 }, { "epoch": 0.46061240512954726, "grad_norm": 6.85581768096026, "learning_rate": 3.269063392575352e-06, "logits/chosen": -1.2453696727752686, "logits/rejected": -1.202413558959961, "logps/chosen": -399.9681091308594, "logps/rejected": -447.87744140625, "loss": 0.5256, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5209221839904785, "rewards/margins": 0.6652868986129761, "rewards/rejected": -2.186208963394165, "step": 440 }, { "epoch": 0.4710808688824915, "grad_norm": 5.776991001419176, "learning_rate": 3.181570569931697e-06, "logits/chosen": -1.3612130880355835, "logits/rejected": -1.2961633205413818, "logps/chosen": -399.5633239746094, "logps/rejected": -510.95513916015625, "loss": 0.5105, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5496407747268677, "rewards/margins": 0.9577849507331848, "rewards/rejected": -2.507425308227539, "step": 450 }, { "epoch": 0.48154933263543576, "grad_norm": 5.051312773178446, "learning_rate": 3.09316620706208e-06, "logits/chosen": -1.2932283878326416, "logits/rejected": -1.2328197956085205, "logps/chosen": -472.6172790527344, "logps/rejected": -538.5615234375, "loss": 0.5286, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.122169256210327, "rewards/margins": 0.7884066104888916, "rewards/rejected": -2.9105758666992188, "step": 460 }, { "epoch": 0.49201779638838, "grad_norm": 4.905312405134056, "learning_rate": 3.0039685369660785e-06, "logits/chosen": -1.299459457397461, "logits/rejected": -1.164813756942749, "logps/chosen": -433.37554931640625, "logps/rejected": -470.384521484375, "loss": 0.4942, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7430375814437866, "rewards/margins": 0.6865721940994263, "rewards/rejected": -2.429609775543213, "step": 470 }, { "epoch": 0.5024862601413242, "grad_norm": 6.146340380005326, "learning_rate": 2.91409685362137e-06, "logits/chosen": -1.2403475046157837, "logits/rejected": -1.0839545726776123, "logps/chosen": -513.665771484375, "logps/rejected": -583.8973388671875, "loss": 0.5046, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4166836738586426, "rewards/margins": 1.000232219696045, "rewards/rejected": -3.4169158935546875, "step": 480 }, { "epoch": 0.5129547238942685, "grad_norm": 5.112445065707855, "learning_rate": 2.8236713524386085e-06, "logits/chosen": -1.2714743614196777, "logits/rejected": -1.0941110849380493, "logps/chosen": -532.9962158203125, "logps/rejected": -588.236328125, "loss": 0.5025, "rewards/accuracies": 0.75, "rewards/chosen": -2.491058826446533, "rewards/margins": 0.900057315826416, "rewards/rejected": -3.39111590385437, "step": 490 }, { "epoch": 0.5234231876472127, "grad_norm": 5.578978194196055, "learning_rate": 2.7328129695107205e-06, "logits/chosen": -1.2111141681671143, "logits/rejected": -1.1049137115478516, "logps/chosen": -476.11932373046875, "logps/rejected": -544.0899658203125, "loss": 0.5107, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1048712730407715, "rewards/margins": 0.9492176175117493, "rewards/rejected": -3.054089069366455, "step": 500 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -1.0955979824066162, "eval_logits/rejected": -0.9851866364479065, "eval_logps/chosen": -509.9847717285156, "eval_logps/rejected": -587.1475830078125, "eval_loss": 0.5058528184890747, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -2.4506328105926514, "eval_rewards/margins": 0.9743964076042175, "eval_rewards/rejected": -3.4250295162200928, "eval_runtime": 306.1826, "eval_samples_per_second": 6.532, "eval_steps_per_second": 0.206, "step": 500 }, { "epoch": 0.533891651400157, "grad_norm": 5.171650655838442, "learning_rate": 2.641643219871597e-06, "logits/chosen": -1.1813862323760986, "logits/rejected": -1.0965768098831177, "logps/chosen": -508.13092041015625, "logps/rejected": -570.53759765625, "loss": 0.4794, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4878056049346924, "rewards/margins": 0.8628519773483276, "rewards/rejected": -3.3506579399108887, "step": 510 }, { "epoch": 0.5443601151531012, "grad_norm": 6.785478355965952, "learning_rate": 2.5502840349805074e-06, "logits/chosen": -1.1492969989776611, "logits/rejected": -1.0285675525665283, "logps/chosen": -577.2451171875, "logps/rejected": -618.67236328125, "loss": 0.536, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.018739938735962, "rewards/margins": 0.8701656460762024, "rewards/rejected": -3.8889052867889404, "step": 520 }, { "epoch": 0.5548285789060455, "grad_norm": 5.9061132192029975, "learning_rate": 2.4588575996495797e-06, "logits/chosen": -1.1598259210586548, "logits/rejected": -1.0490505695343018, "logps/chosen": -515.8734130859375, "logps/rejected": -585.2592163085938, "loss": 0.5089, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.4302353858947754, "rewards/margins": 0.9659484624862671, "rewards/rejected": -3.396183729171753, "step": 530 }, { "epoch": 0.5652970426589898, "grad_norm": 4.6033180245939604, "learning_rate": 2.367486188632446e-06, "logits/chosen": -1.1840062141418457, "logits/rejected": -1.05405592918396, "logps/chosen": -469.9341735839844, "logps/rejected": -503.66156005859375, "loss": 0.5311, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1539134979248047, "rewards/margins": 0.7668038606643677, "rewards/rejected": -2.920717239379883, "step": 540 }, { "epoch": 0.575765506411934, "grad_norm": 4.876993529338895, "learning_rate": 2.276292003092593e-06, "logits/chosen": -1.1908996105194092, "logits/rejected": -1.03009033203125, "logps/chosen": -489.2470703125, "logps/rejected": -519.9906616210938, "loss": 0.5265, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.272523880004883, "rewards/margins": 0.6718829870223999, "rewards/rejected": -2.9444069862365723, "step": 550 }, { "epoch": 0.5862339701648783, "grad_norm": 6.4140589895092734, "learning_rate": 2.1853970071701415e-06, "logits/chosen": -1.0713722705841064, "logits/rejected": -0.9895181655883789, "logps/chosen": -517.431396484375, "logps/rejected": -573.0247802734375, "loss": 0.527, "rewards/accuracies": 0.75, "rewards/chosen": -2.533311605453491, "rewards/margins": 0.8953849077224731, "rewards/rejected": -3.428696393966675, "step": 560 }, { "epoch": 0.5967024339178225, "grad_norm": 6.376011574902463, "learning_rate": 2.0949227648656194e-06, "logits/chosen": -1.129098653793335, "logits/rejected": -1.0245158672332764, "logps/chosen": -559.3487548828125, "logps/rejected": -636.18310546875, "loss": 0.5053, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.986375331878662, "rewards/margins": 0.8951998949050903, "rewards/rejected": -3.8815758228302, "step": 570 }, { "epoch": 0.6071708976707668, "grad_norm": 5.245103814265102, "learning_rate": 2.00499027745888e-06, "logits/chosen": -1.081526517868042, "logits/rejected": -0.9703742861747742, "logps/chosen": -521.400146484375, "logps/rejected": -581.4735107421875, "loss": 0.5167, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8668112754821777, "rewards/margins": 0.8452316522598267, "rewards/rejected": -3.712043046951294, "step": 580 }, { "epoch": 0.6176393614237111, "grad_norm": 5.7527775937649, "learning_rate": 1.915719821680624e-06, "logits/chosen": -1.287595510482788, "logits/rejected": -1.2315866947174072, "logps/chosen": -470.63995361328125, "logps/rejected": -567.0520629882812, "loss": 0.5018, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.246957778930664, "rewards/margins": 0.9269870519638062, "rewards/rejected": -3.1739444732666016, "step": 590 }, { "epoch": 0.6281078251766553, "grad_norm": 4.552980884850473, "learning_rate": 1.8272307888529276e-06, "logits/chosen": -1.1632342338562012, "logits/rejected": -1.0386791229248047, "logps/chosen": -524.4891357421875, "logps/rejected": -568.3989868164062, "loss": 0.4851, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2760519981384277, "rewards/margins": 0.7800448536872864, "rewards/rejected": -3.0560970306396484, "step": 600 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -1.107803463935852, "eval_logits/rejected": -0.9969872832298279, "eval_logps/chosen": -492.1783447265625, "eval_logps/rejected": -567.8048706054688, "eval_loss": 0.5023476481437683, "eval_rewards/accuracies": 0.7678571343421936, "eval_rewards/chosen": -2.272569179534912, "eval_rewards/margins": 0.9590328931808472, "eval_rewards/rejected": -3.231602191925049, "eval_runtime": 309.4155, "eval_samples_per_second": 6.464, "eval_steps_per_second": 0.204, "step": 600 }, { "epoch": 0.6385762889295996, "grad_norm": 5.93434046945835, "learning_rate": 1.739641525213929e-06, "logits/chosen": -1.1899484395980835, "logits/rejected": -1.128395438194275, "logps/chosen": -505.1640625, "logps/rejected": -569.9411010742188, "loss": 0.5131, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3301897048950195, "rewards/margins": 1.009691834449768, "rewards/rejected": -3.339881420135498, "step": 610 }, { "epoch": 0.6490447526825438, "grad_norm": 6.7592991694860345, "learning_rate": 1.6530691736402317e-06, "logits/chosen": -1.3041086196899414, "logits/rejected": -1.1299916505813599, "logps/chosen": -520.1345825195312, "logps/rejected": -570.2188110351562, "loss": 0.4665, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1966605186462402, "rewards/margins": 1.0572277307510376, "rewards/rejected": -3.253887891769409, "step": 620 }, { "epoch": 0.6595132164354881, "grad_norm": 6.939788728853941, "learning_rate": 1.5676295169786864e-06, "logits/chosen": -1.2350585460662842, "logits/rejected": -1.1085925102233887, "logps/chosen": -506.601806640625, "logps/rejected": -558.4306640625, "loss": 0.4883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2913689613342285, "rewards/margins": 0.9455882906913757, "rewards/rejected": -3.23695707321167, "step": 630 }, { "epoch": 0.6699816801884323, "grad_norm": 6.885905876648702, "learning_rate": 1.4834368231970922e-06, "logits/chosen": -1.1120684146881104, "logits/rejected": -0.9865466952323914, "logps/chosen": -484.1168518066406, "logps/rejected": -577.2811279296875, "loss": 0.4707, "rewards/accuracies": 0.8125, "rewards/chosen": -2.489487648010254, "rewards/margins": 1.0948190689086914, "rewards/rejected": -3.5843067169189453, "step": 640 }, { "epoch": 0.6804501439413766, "grad_norm": 5.355709320338061, "learning_rate": 1.4006036925609245e-06, "logits/chosen": -1.2093435525894165, "logits/rejected": -1.1428296566009521, "logps/chosen": -484.78680419921875, "logps/rejected": -564.5701293945312, "loss": 0.5169, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2595038414001465, "rewards/margins": 0.8043657541275024, "rewards/rejected": -3.0638692378997803, "step": 650 }, { "epoch": 0.6909186076943209, "grad_norm": 5.122013631042651, "learning_rate": 1.3192409070404582e-06, "logits/chosen": -1.2926921844482422, "logits/rejected": -1.1912837028503418, "logps/chosen": -462.49700927734375, "logps/rejected": -535.156005859375, "loss": 0.4902, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.105215311050415, "rewards/margins": 0.9164140820503235, "rewards/rejected": -3.0216293334960938, "step": 660 }, { "epoch": 0.7013870714472651, "grad_norm": 7.049199430649973, "learning_rate": 1.2394572821496953e-06, "logits/chosen": -1.144698977470398, "logits/rejected": -0.9912746548652649, "logps/chosen": -518.4602661132812, "logps/rejected": -603.5948486328125, "loss": 0.4773, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6358141899108887, "rewards/margins": 1.113600492477417, "rewards/rejected": -3.7494144439697266, "step": 670 }, { "epoch": 0.7118555352002094, "grad_norm": 7.264843779332727, "learning_rate": 1.1613595214152713e-06, "logits/chosen": -1.1309657096862793, "logits/rejected": -1.0589998960494995, "logps/chosen": -539.7699584960938, "logps/rejected": -620.6878662109375, "loss": 0.5266, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.733842134475708, "rewards/margins": 0.9637987017631531, "rewards/rejected": -3.6976406574249268, "step": 680 }, { "epoch": 0.7223239989531536, "grad_norm": 5.030210764921467, "learning_rate": 1.0850520736699362e-06, "logits/chosen": -1.288641333580017, "logits/rejected": -1.134615182876587, "logps/chosen": -510.186767578125, "logps/rejected": -550.937744140625, "loss": 0.4574, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3800137042999268, "rewards/margins": 0.9033814668655396, "rewards/rejected": -3.2833950519561768, "step": 690 }, { "epoch": 0.7327924627060979, "grad_norm": 5.9328337595528495, "learning_rate": 1.0106369933615043e-06, "logits/chosen": -1.1679879426956177, "logits/rejected": -1.1002038717269897, "logps/chosen": -487.10821533203125, "logps/rejected": -576.1431884765625, "loss": 0.4681, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3741798400878906, "rewards/margins": 0.9901224970817566, "rewards/rejected": -3.364302158355713, "step": 700 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -1.119031548500061, "eval_logits/rejected": -1.006774663925171, "eval_logps/chosen": -496.6231994628906, "eval_logps/rejected": -581.5197143554688, "eval_loss": 0.49932044744491577, "eval_rewards/accuracies": 0.7678571343421936, "eval_rewards/chosen": -2.3170175552368164, "eval_rewards/margins": 1.0517328977584839, "eval_rewards/rejected": -3.3687500953674316, "eval_runtime": 280.6333, "eval_samples_per_second": 7.127, "eval_steps_per_second": 0.224, "step": 700 }, { "epoch": 0.7432609264590422, "grad_norm": 5.3428016079168215, "learning_rate": 9.382138040640714e-07, "logits/chosen": -1.1798994541168213, "logits/rejected": -1.021723985671997, "logps/chosen": -505.2616271972656, "logps/rejected": -571.1856689453125, "loss": 0.547, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.396669864654541, "rewards/margins": 0.9654728174209595, "rewards/rejected": -3.362142562866211, "step": 710 }, { "epoch": 0.7537293902119864, "grad_norm": 5.0438516064442505, "learning_rate": 8.678793653740633e-07, "logits/chosen": -1.3297260999679565, "logits/rejected": -1.1736423969268799, "logps/chosen": -527.0916748046875, "logps/rejected": -579.3074340820312, "loss": 0.478, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.070934534072876, "rewards/margins": 1.0328184366226196, "rewards/rejected": -3.103752851486206, "step": 720 }, { "epoch": 0.7641978539649307, "grad_norm": 5.464567536353577, "learning_rate": 7.997277433690984e-07, "logits/chosen": -1.2094228267669678, "logits/rejected": -1.076755404472351, "logps/chosen": -460.71563720703125, "logps/rejected": -538.8247680664062, "loss": 0.4966, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2227933406829834, "rewards/margins": 0.9530885815620422, "rewards/rejected": -3.175881862640381, "step": 730 }, { "epoch": 0.7746663177178749, "grad_norm": 5.377248875033102, "learning_rate": 7.338500848029603e-07, "logits/chosen": -1.1969387531280518, "logits/rejected": -1.0555990934371948, "logps/chosen": -529.8873291015625, "logps/rejected": -568.1295166015625, "loss": 0.5065, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4915201663970947, "rewards/margins": 0.9319826364517212, "rewards/rejected": -3.4235024452209473, "step": 740 }, { "epoch": 0.7851347814708192, "grad_norm": 5.342695362281337, "learning_rate": 6.70334495204884e-07, "logits/chosen": -1.0425455570220947, "logits/rejected": -0.9723536372184753, "logps/chosen": -487.095947265625, "logps/rejected": -599.2386474609375, "loss": 0.4857, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.589409112930298, "rewards/margins": 1.040583848953247, "rewards/rejected": -3.629992723464966, "step": 750 }, { "epoch": 0.7956032452237635, "grad_norm": 4.96165517698307, "learning_rate": 6.092659210462232e-07, "logits/chosen": -1.172639012336731, "logits/rejected": -1.0221275091171265, "logps/chosen": -510.41253662109375, "logps/rejected": -564.8956909179688, "loss": 0.5034, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.53062105178833, "rewards/margins": 0.8949772119522095, "rewards/rejected": -3.42559814453125, "step": 760 }, { "epoch": 0.8060717089767077, "grad_norm": 4.707914305263311, "learning_rate": 5.507260361320738e-07, "logits/chosen": -1.2693157196044922, "logits/rejected": -1.2365710735321045, "logps/chosen": -522.3062744140625, "logps/rejected": -619.4640502929688, "loss": 0.4736, "rewards/accuracies": 0.75, "rewards/chosen": -2.363852024078369, "rewards/margins": 0.9652940034866333, "rewards/rejected": -3.329145908355713, "step": 770 }, { "epoch": 0.816540172729652, "grad_norm": 5.592342234404946, "learning_rate": 4.947931323697983e-07, "logits/chosen": -1.2886607646942139, "logits/rejected": -1.0684127807617188, "logps/chosen": -534.323486328125, "logps/rejected": -556.2030639648438, "loss": 0.5036, "rewards/accuracies": 0.71875, "rewards/chosen": -2.277695894241333, "rewards/margins": 0.8380621075630188, "rewards/rejected": -3.115757942199707, "step": 780 }, { "epoch": 0.8270086364825961, "grad_norm": 6.076661367759154, "learning_rate": 4.4154201506053985e-07, "logits/chosen": -1.1675662994384766, "logits/rejected": -1.065953254699707, "logps/chosen": -489.9781188964844, "logps/rejected": -578.2179565429688, "loss": 0.5128, "rewards/accuracies": 0.78125, "rewards/chosen": -2.482016086578369, "rewards/margins": 0.9305012822151184, "rewards/rejected": -3.4125168323516846, "step": 790 }, { "epoch": 0.8374771002355405, "grad_norm": 6.131830839970953, "learning_rate": 3.910439028537638e-07, "logits/chosen": -1.1610690355300903, "logits/rejected": -1.1072094440460205, "logps/chosen": -469.524658203125, "logps/rejected": -585.119140625, "loss": 0.4852, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3968334197998047, "rewards/margins": 1.120810866355896, "rewards/rejected": -3.5176444053649902, "step": 800 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -1.1353023052215576, "eval_logits/rejected": -1.0236940383911133, "eval_logps/chosen": -504.6183166503906, "eval_logps/rejected": -585.8155517578125, "eval_loss": 0.49497368931770325, "eval_rewards/accuracies": 0.773809552192688, "eval_rewards/chosen": -2.396967887878418, "eval_rewards/margins": 1.0147408246994019, "eval_rewards/rejected": -3.4117088317871094, "eval_runtime": 274.6399, "eval_samples_per_second": 7.282, "eval_steps_per_second": 0.229, "step": 800 }, { "epoch": 0.8479455639884846, "grad_norm": 4.892138077626307, "learning_rate": 3.4336633249862084e-07, "logits/chosen": -1.209530234336853, "logits/rejected": -1.0349524021148682, "logps/chosen": -532.2420043945312, "logps/rejected": -591.3436279296875, "loss": 0.5065, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4228711128234863, "rewards/margins": 0.9818238019943237, "rewards/rejected": -3.4046947956085205, "step": 810 }, { "epoch": 0.8584140277414289, "grad_norm": 6.133736149907814, "learning_rate": 2.98573068519539e-07, "logits/chosen": -1.2474277019500732, "logits/rejected": -1.1728675365447998, "logps/chosen": -516.3637084960938, "logps/rejected": -589.2227783203125, "loss": 0.5066, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5452017784118652, "rewards/margins": 0.8416748046875, "rewards/rejected": -3.3868765830993652, "step": 820 }, { "epoch": 0.8688824914943732, "grad_norm": 4.448941268784857, "learning_rate": 2.5672401793681854e-07, "logits/chosen": -1.1796165704727173, "logits/rejected": -1.1301778554916382, "logps/chosen": -493.81610107421875, "logps/rejected": -576.2391357421875, "loss": 0.5119, "rewards/accuracies": 0.6875, "rewards/chosen": -2.471508502960205, "rewards/margins": 0.8166677355766296, "rewards/rejected": -3.2881767749786377, "step": 830 }, { "epoch": 0.8793509552473174, "grad_norm": 6.561312283813159, "learning_rate": 2.178751501463036e-07, "logits/chosen": -1.2200143337249756, "logits/rejected": -1.120086431503296, "logps/chosen": -525.2439575195312, "logps/rejected": -629.3662719726562, "loss": 0.4823, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.501779317855835, "rewards/margins": 1.0803827047348022, "rewards/rejected": -3.5821621417999268, "step": 840 }, { "epoch": 0.8898194190002617, "grad_norm": 5.675008200582371, "learning_rate": 1.820784220652766e-07, "logits/chosen": -1.148503065109253, "logits/rejected": -0.9931659698486328, "logps/chosen": -511.488037109375, "logps/rejected": -576.7986450195312, "loss": 0.4672, "rewards/accuracies": 0.8125, "rewards/chosen": -2.456411361694336, "rewards/margins": 0.9856443405151367, "rewards/rejected": -3.4420554637908936, "step": 850 }, { "epoch": 0.9002878827532059, "grad_norm": 5.847430828911881, "learning_rate": 1.4938170864468636e-07, "logits/chosen": -1.2107369899749756, "logits/rejected": -1.0609266757965088, "logps/chosen": -531.3970947265625, "logps/rejected": -590.2992553710938, "loss": 0.4693, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.60589861869812, "rewards/margins": 0.8416641354560852, "rewards/rejected": -3.4475624561309814, "step": 860 }, { "epoch": 0.9107563465061502, "grad_norm": 5.678184676582534, "learning_rate": 1.1982873884064466e-07, "logits/chosen": -1.24862539768219, "logits/rejected": -1.0584386587142944, "logps/chosen": -544.3020629882812, "logps/rejected": -598.3751220703125, "loss": 0.4947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.632056713104248, "rewards/margins": 1.0834977626800537, "rewards/rejected": -3.7155539989471436, "step": 870 }, { "epoch": 0.9212248102590945, "grad_norm": 5.3183871343635944, "learning_rate": 9.345903713082305e-08, "logits/chosen": -1.2423183917999268, "logits/rejected": -1.1057523488998413, "logps/chosen": -514.6922607421875, "logps/rejected": -574.3685302734375, "loss": 0.4801, "rewards/accuracies": 0.75, "rewards/chosen": -2.4431517124176025, "rewards/margins": 0.9935353994369507, "rewards/rejected": -3.4366869926452637, "step": 880 }, { "epoch": 0.9316932740120387, "grad_norm": 5.735081148749753, "learning_rate": 7.030787065396866e-08, "logits/chosen": -1.1413437128067017, "logits/rejected": -1.0328372716903687, "logps/chosen": -508.3773498535156, "logps/rejected": -619.8019409179688, "loss": 0.4938, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5983500480651855, "rewards/margins": 1.2036244869232178, "rewards/rejected": -3.8019745349884033, "step": 890 }, { "epoch": 0.942161737764983, "grad_norm": 6.618746821621782, "learning_rate": 5.0406202043228604e-08, "logits/chosen": -1.045906901359558, "logits/rejected": -0.9974561929702759, "logps/chosen": -517.8667602539062, "logps/rejected": -669.0172729492188, "loss": 0.4907, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.5165352821350098, "rewards/margins": 1.2452183961868286, "rewards/rejected": -3.761753559112549, "step": 900 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -1.1023893356323242, "eval_logits/rejected": -0.9901031255722046, "eval_logps/chosen": -521.706298828125, "eval_logps/rejected": -608.1346435546875, "eval_loss": 0.494513601064682, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": -2.567847490310669, "eval_rewards/margins": 1.0670523643493652, "eval_rewards/rejected": -3.634899854660034, "eval_runtime": 302.7434, "eval_samples_per_second": 6.606, "eval_steps_per_second": 0.208, "step": 900 }, { "epoch": 0.9526302015179272, "grad_norm": 6.10762031606348, "learning_rate": 3.378064801637687e-08, "logits/chosen": -1.1716662645339966, "logits/rejected": -0.9894771575927734, "logps/chosen": -494.2669982910156, "logps/rejected": -552.8900146484375, "loss": 0.4935, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.474764585494995, "rewards/margins": 1.0439238548278809, "rewards/rejected": -3.518688201904297, "step": 910 }, { "epoch": 0.9630986652708715, "grad_norm": 5.683369015946174, "learning_rate": 2.0453443778310766e-08, "logits/chosen": -1.1771481037139893, "logits/rejected": -1.0182334184646606, "logps/chosen": -545.4282836914062, "logps/rejected": -589.4589233398438, "loss": 0.5081, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6610922813415527, "rewards/margins": 0.9007646441459656, "rewards/rejected": -3.561856746673584, "step": 920 }, { "epoch": 0.9735671290238157, "grad_norm": 5.950595947719059, "learning_rate": 1.0442413283435759e-08, "logits/chosen": -1.1201808452606201, "logits/rejected": -0.9143557548522949, "logps/chosen": -556.3572998046875, "logps/rejected": -596.0765380859375, "loss": 0.493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5759315490722656, "rewards/margins": 1.0089161396026611, "rewards/rejected": -3.5848472118377686, "step": 930 }, { "epoch": 0.98403559277676, "grad_norm": 7.5374296673981425, "learning_rate": 3.760945397705828e-09, "logits/chosen": -1.1323813199996948, "logits/rejected": -1.0745770931243896, "logps/chosen": -517.08056640625, "logps/rejected": -605.00146484375, "loss": 0.4829, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5434603691101074, "rewards/margins": 0.8878037333488464, "rewards/rejected": -3.4312641620635986, "step": 940 }, { "epoch": 0.9945040565297043, "grad_norm": 5.8745190839702515, "learning_rate": 4.1797599220405605e-10, "logits/chosen": -1.1730302572250366, "logits/rejected": -0.9989528656005859, "logps/chosen": -524.8547973632812, "logps/rejected": -583.33203125, "loss": 0.4861, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.5831165313720703, "rewards/margins": 0.908293604850769, "rewards/rejected": -3.49141001701355, "step": 950 }, { "epoch": 0.9997382884061764, "step": 955, "total_flos": 0.0, "train_loss": 0.5319095570379527, "train_runtime": 23762.0752, "train_samples_per_second": 2.573, "train_steps_per_second": 0.04 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }