diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,1596 +10,1596 @@ "log_history": [ { "epoch": 0.0010468463752944255, - "grad_norm": 1.1940648296392757, + "grad_norm": 1.1945625860018705, "learning_rate": 5.208333333333333e-08, - "logits/chosen": -2.519019603729248, - "logits/rejected": -2.354379177093506, - "logps/chosen": -297.6008605957031, - "logps/rejected": -252.44248962402344, - "loss": 0.693, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0007321774610318244, - "rewards/margins": 6.297111394815147e-05, - "rewards/rejected": 0.0006692063761875033, + "logits/chosen": -2.5192830562591553, + "logits/rejected": -2.3547825813293457, + "logps/chosen": -297.60443115234375, + "logps/rejected": -252.4619903564453, + "loss": 0.6929, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00020415784092620015, + "rewards/margins": -0.0002505290030967444, + "rewards/rejected": 4.637122037820518e-05, "step": 1 }, { "epoch": 0.010468463752944255, - "grad_norm": 1.106408968948395, + "grad_norm": 1.1009278086693854, "learning_rate": 5.208333333333334e-07, - "logits/chosen": -2.2454488277435303, - "logits/rejected": -2.215104818344116, - "logps/chosen": -275.67840576171875, - "logps/rejected": -254.77935791015625, + "logits/chosen": -2.2455766201019287, + "logits/rejected": -2.215245008468628, + "logps/chosen": -275.6755065917969, + "logps/rejected": -254.76722717285156, "loss": 0.6927, - "rewards/accuracies": 0.5694444179534912, - "rewards/chosen": 0.004385500214993954, - "rewards/margins": 0.0008544763550162315, - "rewards/rejected": 0.0035310229286551476, + "rewards/accuracies": 0.6111111044883728, + "rewards/chosen": 0.004448441788554192, + "rewards/margins": 0.0008290203404612839, + "rewards/rejected": 0.0036194208078086376, "step": 10 }, { "epoch": 0.02093692750588851, - "grad_norm": 1.1723169233645143, + "grad_norm": 1.165704750760885, "learning_rate": 1.0416666666666667e-06, - "logits/chosen": -2.231245517730713, - "logits/rejected": -2.1146271228790283, - "logps/chosen": -277.5929260253906, - "logps/rejected": -255.2114715576172, + "logits/chosen": -2.2313215732574463, + "logits/rejected": -2.114736795425415, + "logps/chosen": -277.5883483886719, + "logps/rejected": -255.2056427001953, "loss": 0.6907, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.026119515299797058, - "rewards/margins": 0.005599636118859053, - "rewards/rejected": 0.020519878715276718, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.026263630017638206, + "rewards/margins": 0.005699009168893099, + "rewards/rejected": 0.020564619451761246, "step": 20 }, { "epoch": 0.031405391258832765, - "grad_norm": 1.1930628386131166, + "grad_norm": 1.1872789709669673, "learning_rate": 1.5625e-06, - "logits/chosen": -2.314044952392578, - "logits/rejected": -2.211050510406494, - "logps/chosen": -281.39801025390625, - "logps/rejected": -262.43841552734375, + "logits/chosen": -2.3138914108276367, + "logits/rejected": -2.2109274864196777, + "logps/chosen": -281.3846740722656, + "logps/rejected": -262.41693115234375, "loss": 0.6858, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.042405955493450165, - "rewards/margins": 0.015279242768883705, - "rewards/rejected": 0.027126718312501907, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04216086491942406, + "rewards/margins": 0.014992751181125641, + "rewards/rejected": 0.027168119326233864, "step": 30 }, { "epoch": 0.04187385501177702, - "grad_norm": 1.1771194457412584, + "grad_norm": 1.1753743538686479, "learning_rate": 2.0833333333333334e-06, - "logits/chosen": -2.3085246086120605, - "logits/rejected": -2.2145581245422363, - "logps/chosen": -268.41571044921875, - "logps/rejected": -255.4882354736328, + "logits/chosen": -2.307976245880127, + "logits/rejected": -2.2140889167785645, + "logps/chosen": -268.4030456542969, + "logps/rejected": -255.44882202148438, "loss": 0.6814, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.05249170586466789, - "rewards/margins": 0.027045782655477524, - "rewards/rejected": 0.025445926934480667, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.05265005677938461, + "rewards/margins": 0.026613134890794754, + "rewards/rejected": 0.026036927476525307, "step": 40 }, { "epoch": 0.05234231876472128, - "grad_norm": 1.1817503689886124, + "grad_norm": 1.1656515626034387, "learning_rate": 2.604166666666667e-06, - "logits/chosen": -2.279360294342041, - "logits/rejected": -2.1755104064941406, - "logps/chosen": -227.83438110351562, - "logps/rejected": -206.697265625, - "loss": 0.6761, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.05746235325932503, - "rewards/margins": 0.04355225712060928, - "rewards/rejected": 0.013910098001360893, + "logits/chosen": -2.2787346839904785, + "logits/rejected": -2.175128936767578, + "logps/chosen": -227.7914581298828, + "logps/rejected": -206.5706024169922, + "loss": 0.6764, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05774398520588875, + "rewards/margins": 0.042526550590991974, + "rewards/rejected": 0.015217426232993603, "step": 50 }, { "epoch": 0.06281078251766553, - "grad_norm": 1.318174585891366, + "grad_norm": 1.324103463890143, "learning_rate": 3.125e-06, - "logits/chosen": -2.287376880645752, - "logits/rejected": -2.186383008956909, - "logps/chosen": -264.8748474121094, - "logps/rejected": -229.34848022460938, - "loss": 0.6681, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.045027900487184525, - "rewards/margins": 0.06142578274011612, - "rewards/rejected": -0.016397882252931595, + "logits/chosen": -2.2864975929260254, + "logits/rejected": -2.185832977294922, + "logps/chosen": -264.6636657714844, + "logps/rejected": -228.9823455810547, + "loss": 0.6687, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.04733141511678696, + "rewards/margins": 0.05985499545931816, + "rewards/rejected": -0.012523581273853779, "step": 60 }, { "epoch": 0.07327924627060979, - "grad_norm": 1.5287003725547763, + "grad_norm": 1.4908224965531012, "learning_rate": 3.6458333333333333e-06, - "logits/chosen": -2.129021406173706, - "logits/rejected": -2.0812830924987793, - "logps/chosen": -257.1100158691406, - "logps/rejected": -263.87359619140625, - "loss": 0.6544, + "logits/chosen": -2.128446102142334, + "logits/rejected": -2.080828905105591, + "logps/chosen": -256.41363525390625, + "logps/rejected": -262.66949462890625, + "loss": 0.6557, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.00797443836927414, - "rewards/margins": 0.10772331804037094, - "rewards/rejected": -0.11569775640964508, + "rewards/chosen": -0.0010238643735647202, + "rewards/margins": 0.10270833969116211, + "rewards/rejected": -0.10373219102621078, "step": 70 }, { "epoch": 0.08374771002355404, - "grad_norm": 2.951307924811214, + "grad_norm": 2.5274509004519445, "learning_rate": 4.166666666666667e-06, - "logits/chosen": -2.2603583335876465, - "logits/rejected": -2.1030356884002686, - "logps/chosen": -265.6451110839844, - "logps/rejected": -258.9366455078125, - "loss": 0.6386, + "logits/chosen": -2.2609763145446777, + "logits/rejected": -2.1039209365844727, + "logps/chosen": -263.5312194824219, + "logps/rejected": -256.4105529785156, + "loss": 0.6402, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.1463477909564972, - "rewards/margins": 0.12257362902164459, - "rewards/rejected": -0.268921434879303, + "rewards/chosen": -0.12527017295360565, + "rewards/margins": 0.11860889196395874, + "rewards/rejected": -0.2438790500164032, "step": 80 }, { "epoch": 0.0942161737764983, - "grad_norm": 2.228941966023797, + "grad_norm": 2.5638356680875156, "learning_rate": 4.6875000000000004e-06, - "logits/chosen": -2.1493210792541504, - "logits/rejected": -2.081740140914917, - "logps/chosen": -267.15631103515625, - "logps/rejected": -286.8982238769531, - "loss": 0.6314, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.1616009771823883, - "rewards/margins": 0.18432030081748962, - "rewards/rejected": -0.34592124819755554, + "logits/chosen": -2.1489036083221436, + "logits/rejected": -2.081789493560791, + "logps/chosen": -270.69091796875, + "logps/rejected": -291.17962646484375, + "loss": 0.6304, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1973474770784378, + "rewards/margins": 0.19165988266468048, + "rewards/rejected": -0.3890073299407959, "step": 90 }, { "epoch": 0.10468463752944256, - "grad_norm": 2.3171658298596007, + "grad_norm": 2.7331412184768507, "learning_rate": 4.9997324926814375e-06, - "logits/chosen": -2.198456287384033, - "logits/rejected": -2.08889102935791, - "logps/chosen": -310.09893798828125, - "logps/rejected": -328.76763916015625, - "loss": 0.6163, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.37811416387557983, - "rewards/margins": 0.23748056590557098, - "rewards/rejected": -0.615594744682312, + "logits/chosen": -2.193147659301758, + "logits/rejected": -2.0836679935455322, + "logps/chosen": -303.36590576171875, + "logps/rejected": -320.6068420410156, + "loss": 0.6212, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.31098657846450806, + "rewards/margins": 0.2229224145412445, + "rewards/rejected": -0.5339089632034302, "step": 100 }, { "epoch": 0.10468463752944256, - "eval_logits/chosen": -2.1005911827087402, - "eval_logits/rejected": -2.0161635875701904, - "eval_logps/chosen": -303.8350524902344, - "eval_logps/rejected": -310.3097229003906, - "eval_loss": 0.6178256869316101, - "eval_rewards/accuracies": 0.6805555820465088, - "eval_rewards/chosen": -0.38932088017463684, - "eval_rewards/margins": 0.26718538999557495, - "eval_rewards/rejected": -0.6565062999725342, - "eval_runtime": 497.4053, - "eval_samples_per_second": 4.021, - "eval_steps_per_second": 0.127, + "eval_logits/chosen": -2.0932729244232178, + "eval_logits/rejected": -2.008643627166748, + "eval_logps/chosen": -298.0505676269531, + "eval_logps/rejected": -299.1472473144531, + "eval_loss": 0.6321468353271484, + "eval_rewards/accuracies": 0.6944444179534912, + "eval_rewards/chosen": -0.3312907814979553, + "eval_rewards/margins": 0.213734969496727, + "eval_rewards/rejected": -0.5450257658958435, + "eval_runtime": 321.7711, + "eval_samples_per_second": 6.216, + "eval_steps_per_second": 0.196, "step": 100 }, { "epoch": 0.11515310128238682, - "grad_norm": 3.2280840758041305, + "grad_norm": 2.3956891782186904, "learning_rate": 4.996723692767927e-06, - "logits/chosen": -2.185472011566162, - "logits/rejected": -2.0565757751464844, - "logps/chosen": -283.398681640625, - "logps/rejected": -271.3462219238281, - "loss": 0.6159, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.35291650891304016, - "rewards/margins": 0.27926865220069885, - "rewards/rejected": -0.632185161113739, + "logits/chosen": -2.1808857917785645, + "logits/rejected": -2.05126690864563, + "logps/chosen": -281.98193359375, + "logps/rejected": -265.5942687988281, + "loss": 0.6236, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3388732373714447, + "rewards/margins": 0.2357875108718872, + "rewards/rejected": -0.5746607184410095, "step": 110 }, { "epoch": 0.12562156503533106, - "grad_norm": 4.321662600064664, + "grad_norm": 4.49315717191337, "learning_rate": 4.9903757462135984e-06, - "logits/chosen": -2.1495065689086914, - "logits/rejected": -2.0793585777282715, - "logps/chosen": -293.3641662597656, - "logps/rejected": -339.5681457519531, - "loss": 0.5751, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.4733458161354065, - "rewards/margins": 0.3658001124858856, - "rewards/rejected": -0.8391459584236145, + "logits/chosen": -2.1513657569885254, + "logits/rejected": -2.0812501907348633, + "logps/chosen": -287.4888610839844, + "logps/rejected": -328.67156982421875, + "loss": 0.5888, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.41489553451538086, + "rewards/margins": 0.31543681025505066, + "rewards/rejected": -0.7303323149681091, "step": 120 }, { "epoch": 0.1360900287882753, - "grad_norm": 5.281496669320805, + "grad_norm": 3.426776723546068, "learning_rate": 4.980697142834315e-06, - "logits/chosen": -2.118959426879883, - "logits/rejected": -2.0077736377716064, - "logps/chosen": -376.8563232421875, - "logps/rejected": -348.50531005859375, - "loss": 0.5843, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.770808756351471, - "rewards/margins": 0.335581511259079, - "rewards/rejected": -1.1063902378082275, + "logits/chosen": -2.1264045238494873, + "logits/rejected": -2.0155797004699707, + "logps/chosen": -381.717041015625, + "logps/rejected": -350.6758728027344, + "loss": 0.5946, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8191909790039062, + "rewards/margins": 0.3090600371360779, + "rewards/rejected": -1.1282509565353394, "step": 130 }, { "epoch": 0.14655849254121958, - "grad_norm": 5.5524041616811255, + "grad_norm": 3.004217560772131, "learning_rate": 4.967700826904229e-06, - "logits/chosen": -2.0535736083984375, - "logits/rejected": -1.9956505298614502, - "logps/chosen": -353.10333251953125, - "logps/rejected": -393.17108154296875, - "loss": 0.5757, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.8411790132522583, - "rewards/margins": 0.44028061628341675, - "rewards/rejected": -1.2814596891403198, + "logits/chosen": -2.0693180561065674, + "logits/rejected": -2.010124683380127, + "logps/chosen": -363.1732482910156, + "logps/rejected": -400.6826171875, + "loss": 0.5746, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9419809579849243, + "rewards/margins": 0.41463392972946167, + "rewards/rejected": -1.3566150665283203, "step": 140 }, { "epoch": 0.15702695629416383, - "grad_norm": 4.851579207894888, + "grad_norm": 4.5725215394059004, "learning_rate": 4.951404179843963e-06, - "logits/chosen": -2.1413304805755615, - "logits/rejected": -2.0062034130096436, - "logps/chosen": -385.7857360839844, - "logps/rejected": -377.45819091796875, - "loss": 0.5887, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9732216000556946, - "rewards/margins": 0.39891117811203003, - "rewards/rejected": -1.3721327781677246, + "logits/chosen": -2.154370069503784, + "logits/rejected": -2.016098976135254, + "logps/chosen": -362.11627197265625, + "logps/rejected": -358.12823486328125, + "loss": 0.5888, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7362005710601807, + "rewards/margins": 0.44253450632095337, + "rewards/rejected": -1.1787351369857788, "step": 150 }, { "epoch": 0.16749542004710807, - "grad_norm": 7.670497596403814, + "grad_norm": 7.788576778528557, "learning_rate": 4.931828996974498e-06, - "logits/chosen": -2.0889430046081543, - "logits/rejected": -1.922328233718872, - "logps/chosen": -347.14544677734375, - "logps/rejected": -342.8069763183594, - "loss": 0.5572, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.606709361076355, - "rewards/margins": 0.5376572608947754, - "rewards/rejected": -1.1443665027618408, + "logits/chosen": -2.099804162979126, + "logits/rejected": -1.9289453029632568, + "logps/chosen": -348.2688903808594, + "logps/rejected": -349.8648986816406, + "loss": 0.5524, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.6180304288864136, + "rewards/margins": 0.5966934561729431, + "rewards/rejected": -1.2147239446640015, "step": 160 }, { "epoch": 0.17796388380005235, - "grad_norm": 5.79406761477676, + "grad_norm": 3.73369227909531, "learning_rate": 4.909001458367867e-06, - "logits/chosen": -1.9932262897491455, - "logits/rejected": -1.8520256280899048, - "logps/chosen": -393.05499267578125, - "logps/rejected": -439.379150390625, - "loss": 0.5534, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3342100381851196, - "rewards/margins": 0.6197064518928528, - "rewards/rejected": -1.9539167881011963, + "logits/chosen": -2.005589008331299, + "logits/rejected": -1.8591816425323486, + "logps/chosen": -357.9122619628906, + "logps/rejected": -401.5499267578125, + "loss": 0.5578, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9827691912651062, + "rewards/margins": 0.5925935506820679, + "rewards/rejected": -1.5753626823425293, "step": 170 }, { "epoch": 0.1884323475529966, - "grad_norm": 5.16143784148925, + "grad_norm": 5.142786695761387, "learning_rate": 4.882952093833628e-06, - "logits/chosen": -1.9763168096542358, - "logits/rejected": -1.9191144704818726, - "logps/chosen": -370.9029235839844, - "logps/rejected": -421.9246520996094, - "loss": 0.5354, + "logits/chosen": -1.906904935836792, + "logits/rejected": -1.8502197265625, + "logps/chosen": -384.1532287597656, + "logps/rejected": -441.2289123535156, + "loss": 0.5357, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0804250240325928, - "rewards/margins": 0.5884348154067993, - "rewards/rejected": -1.668859839439392, + "rewards/chosen": -1.2132524251937866, + "rewards/margins": 0.6486458778381348, + "rewards/rejected": -1.861898422241211, "step": 180 }, { "epoch": 0.19890081130594087, - "grad_norm": 4.841208626728555, + "grad_norm": 4.970082596077688, "learning_rate": 4.853715742087947e-06, - "logits/chosen": -1.8281657695770264, - "logits/rejected": -1.785474419593811, - "logps/chosen": -358.50970458984375, - "logps/rejected": -449.71478271484375, - "loss": 0.5327, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.1242244243621826, - "rewards/margins": 0.7143529653549194, - "rewards/rejected": -1.8385772705078125, + "logits/chosen": -1.7953016757965088, + "logits/rejected": -1.749053716659546, + "logps/chosen": -331.3048400878906, + "logps/rejected": -420.5511779785156, + "loss": 0.5344, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8520873188972473, + "rewards/margins": 0.6949248313903809, + "rewards/rejected": -1.5470120906829834, "step": 190 }, { "epoch": 0.2093692750588851, - "grad_norm": 4.320034048961276, + "grad_norm": 4.380709251136357, "learning_rate": 4.821331504159906e-06, - "logits/chosen": -1.9889659881591797, - "logits/rejected": -1.8328838348388672, - "logps/chosen": -417.7860412597656, - "logps/rejected": -414.47528076171875, - "loss": 0.5679, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.052539587020874, - "rewards/margins": 0.5909037590026855, - "rewards/rejected": -1.6434433460235596, + "logits/chosen": -1.931880235671997, + "logits/rejected": -1.7673060894012451, + "logps/chosen": -395.6458435058594, + "logps/rejected": -393.07818603515625, + "loss": 0.5618, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8311824798583984, + "rewards/margins": 0.598331868648529, + "rewards/rejected": -1.4295144081115723, "step": 200 }, { "epoch": 0.2093692750588851, - "eval_logits/chosen": -1.8226656913757324, - "eval_logits/rejected": -1.73935866355896, - "eval_logps/chosen": -352.2879333496094, - "eval_logps/rejected": -389.65753173828125, - "eval_loss": 0.5566642880439758, - "eval_rewards/accuracies": 0.7400793433189392, - "eval_rewards/chosen": -0.8738502264022827, - "eval_rewards/margins": 0.5761341452598572, - "eval_rewards/rejected": -1.4499843120574951, - "eval_runtime": 495.1318, - "eval_samples_per_second": 4.039, - "eval_steps_per_second": 0.127, + "eval_logits/chosen": -1.7550567388534546, + "eval_logits/rejected": -1.669370174407959, + "eval_logps/chosen": -346.90643310546875, + "eval_logps/rejected": -381.2445983886719, + "eval_loss": 0.5600804686546326, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": -0.8198498487472534, + "eval_rewards/margins": 0.5461496114730835, + "eval_rewards/rejected": -1.365999460220337, + "eval_runtime": 319.6036, + "eval_samples_per_second": 6.258, + "eval_steps_per_second": 0.197, "step": 200 }, { "epoch": 0.21983773881182936, - "grad_norm": 4.600314274109389, + "grad_norm": 4.390548860079351, "learning_rate": 4.7858426910973435e-06, - "logits/chosen": -1.884235143661499, - "logits/rejected": -1.8107131719589233, - "logps/chosen": -395.2080993652344, - "logps/rejected": -432.21160888671875, - "loss": 0.5779, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1761962175369263, - "rewards/margins": 0.46132412552833557, - "rewards/rejected": -1.637520432472229, + "logits/chosen": -1.82965886592865, + "logits/rejected": -1.7543909549713135, + "logps/chosen": -382.2540588378906, + "logps/rejected": -415.6708984375, + "loss": 0.5746, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.0466556549072266, + "rewards/margins": 0.42583417892456055, + "rewards/rejected": -1.4724897146224976, "step": 210 }, { "epoch": 0.23030620256477363, - "grad_norm": 5.190566663970971, + "grad_norm": 5.154433364870237, "learning_rate": 4.747296766042161e-06, - "logits/chosen": -1.8367938995361328, - "logits/rejected": -1.7323997020721436, - "logps/chosen": -448.40631103515625, - "logps/rejected": -463.5052185058594, - "loss": 0.5486, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5517961978912354, - "rewards/margins": 0.5995836853981018, - "rewards/rejected": -2.1513800621032715, + "logits/chosen": -1.791577696800232, + "logits/rejected": -1.676790475845337, + "logps/chosen": -430.439697265625, + "logps/rejected": -446.4627380371094, + "loss": 0.5594, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3720312118530273, + "rewards/margins": 0.6089810132980347, + "rewards/rejected": -1.9810121059417725, "step": 220 }, { "epoch": 0.24077466631771788, - "grad_norm": 6.503627910465957, + "grad_norm": 5.067287557540918, "learning_rate": 4.705745280752586e-06, - "logits/chosen": -1.6796023845672607, - "logits/rejected": -1.613417387008667, - "logps/chosen": -363.4635009765625, - "logps/rejected": -397.3682861328125, - "loss": 0.5564, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1600974798202515, - "rewards/margins": 0.502653956413269, - "rewards/rejected": -1.66275155544281, + "logits/chosen": -1.6005672216415405, + "logits/rejected": -1.5293024778366089, + "logps/chosen": -364.32769775390625, + "logps/rejected": -393.8982849121094, + "loss": 0.5585, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1689074039459229, + "rewards/margins": 0.4595082402229309, + "rewards/rejected": -1.6284157037734985, "step": 230 }, { "epoch": 0.2512431300706621, - "grad_norm": 5.303084146530296, + "grad_norm": 4.187057753178847, "learning_rate": 4.661243806657256e-06, - "logits/chosen": -1.7421451807022095, - "logits/rejected": -1.6884396076202393, - "logps/chosen": -386.07489013671875, - "logps/rejected": -414.7723693847656, - "loss": 0.5355, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2963042259216309, - "rewards/margins": 0.5929707884788513, - "rewards/rejected": -1.8892749547958374, + "logits/chosen": -1.6649366617202759, + "logits/rejected": -1.6055676937103271, + "logps/chosen": -377.30517578125, + "logps/rejected": -407.87017822265625, + "loss": 0.5306, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2085318565368652, + "rewards/margins": 0.6119499802589417, + "rewards/rejected": -1.8204820156097412, "step": 240 }, { "epoch": 0.26171159382360637, - "grad_norm": 4.582341616936657, + "grad_norm": 4.31385003289797, "learning_rate": 4.613851860533367e-06, - "logits/chosen": -1.766645073890686, - "logits/rejected": -1.6948877573013306, - "logps/chosen": -388.61328125, - "logps/rejected": -415.88690185546875, - "loss": 0.5156, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.220664620399475, - "rewards/margins": 0.6846394538879395, - "rewards/rejected": -1.905303955078125, + "logits/chosen": -1.6592628955841064, + "logits/rejected": -1.5831575393676758, + "logps/chosen": -392.666259765625, + "logps/rejected": -415.00933837890625, + "loss": 0.5341, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2613078355789185, + "rewards/margins": 0.6352987289428711, + "rewards/rejected": -1.896606683731079, "step": 250 }, { "epoch": 0.2721800575765506, - "grad_norm": 5.258036005468907, + "grad_norm": 6.08274430115293, "learning_rate": 4.563632824908252e-06, - "logits/chosen": -1.777336835861206, - "logits/rejected": -1.651208519935608, - "logps/chosen": -376.238525390625, - "logps/rejected": -421.6551818847656, - "loss": 0.5197, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.0568403005599976, - "rewards/margins": 0.6248779296875, - "rewards/rejected": -1.6817182302474976, + "logits/chosen": -1.5671354532241821, + "logits/rejected": -1.4325566291809082, + "logps/chosen": -446.5708923339844, + "logps/rejected": -492.8990173339844, + "loss": 0.5228, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7604526281356812, + "rewards/margins": 0.6336302161216736, + "rewards/rejected": -2.394083261489868, "step": 260 }, { "epoch": 0.2826485213294949, - "grad_norm": 4.95113233440466, + "grad_norm": 4.56724778147284, "learning_rate": 4.510653863290871e-06, - "logits/chosen": -1.6241214275360107, - "logits/rejected": -1.5427122116088867, - "logps/chosen": -393.2299499511719, - "logps/rejected": -455.7496032714844, - "loss": 0.5353, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.1928220987319946, - "rewards/margins": 0.8355104327201843, - "rewards/rejected": -2.028332471847534, + "logits/chosen": -1.456993579864502, + "logits/rejected": -1.3645284175872803, + "logps/chosen": -401.00244140625, + "logps/rejected": -461.67816162109375, + "loss": 0.5277, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2706773281097412, + "rewards/margins": 0.8170326352119446, + "rewards/rejected": -2.087709903717041, "step": 270 }, { "epoch": 0.29311698508243916, - "grad_norm": 4.316588759606841, + "grad_norm": 3.947817632488332, "learning_rate": 4.454985830346574e-06, - "logits/chosen": -1.6576063632965088, - "logits/rejected": -1.546891212463379, - "logps/chosen": -402.0686340332031, - "logps/rejected": -449.80694580078125, - "loss": 0.5469, + "logits/chosen": -1.5088273286819458, + "logits/rejected": -1.3900407552719116, + "logps/chosen": -385.47772216796875, + "logps/rejected": -426.82049560546875, + "loss": 0.5485, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.1402246952056885, - "rewards/margins": 0.8147087097167969, - "rewards/rejected": -1.9549334049224854, + "rewards/chosen": -0.9744614362716675, + "rewards/margins": 0.75079345703125, + "rewards/rejected": -1.725255012512207, "step": 280 }, { "epoch": 0.3035854488353834, - "grad_norm": 4.334299410781991, + "grad_norm": 4.730568661724723, "learning_rate": 4.396703177135262e-06, - "logits/chosen": -1.7125823497772217, - "logits/rejected": -1.547957181930542, - "logps/chosen": -367.20501708984375, - "logps/rejected": -389.2141418457031, - "loss": 0.5292, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8416954278945923, - "rewards/margins": 0.5679039359092712, - "rewards/rejected": -1.4095993041992188, + "logits/chosen": -1.517740249633789, + "logits/rejected": -1.351360559463501, + "logps/chosen": -388.90789794921875, + "logps/rejected": -416.22052001953125, + "loss": 0.526, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0588579177856445, + "rewards/margins": 0.6205722093582153, + "rewards/rejected": -1.6794300079345703, "step": 290 }, { "epoch": 0.31405391258832765, - "grad_norm": 5.689892772495371, + "grad_norm": 5.829501633867066, "learning_rate": 4.335883851539693e-06, - "logits/chosen": -1.4305754899978638, - "logits/rejected": -1.4178600311279297, - "logps/chosen": -366.5882263183594, - "logps/rejected": -427.017333984375, - "loss": 0.5412, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.239133596420288, - "rewards/margins": 0.6136574149131775, - "rewards/rejected": -1.8527911901474, + "logits/chosen": -1.2372510433197021, + "logits/rejected": -1.2240632772445679, + "logps/chosen": -386.8778381347656, + "logps/rejected": -448.6904296875, + "loss": 0.54, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.441900610923767, + "rewards/margins": 0.6276523470878601, + "rewards/rejected": -2.0695528984069824, "step": 300 }, { "epoch": 0.31405391258832765, - "eval_logits/chosen": -1.3111425638198853, - "eval_logits/rejected": -1.218103051185608, - "eval_logps/chosen": -421.32574462890625, - "eval_logps/rejected": -483.0422668457031, - "eval_loss": 0.5305107831954956, + "eval_logits/chosen": -1.1714633703231812, + "eval_logits/rejected": -1.0704221725463867, + "eval_logps/chosen": -417.12750244140625, + "eval_logps/rejected": -478.07476806640625, + "eval_loss": 0.5264545679092407, "eval_rewards/accuracies": 0.7460317611694336, - "eval_rewards/chosen": -1.5642281770706177, - "eval_rewards/margins": 0.819603681564331, - "eval_rewards/rejected": -2.3838319778442383, - "eval_runtime": 495.2081, - "eval_samples_per_second": 4.039, - "eval_steps_per_second": 0.127, + "eval_rewards/chosen": -1.5220601558685303, + "eval_rewards/margins": 0.8122406601905823, + "eval_rewards/rejected": -2.334300994873047, + "eval_runtime": 318.2186, + "eval_samples_per_second": 6.285, + "eval_steps_per_second": 0.198, "step": 300 }, { "epoch": 0.3245223763412719, - "grad_norm": 5.39322804571776, + "grad_norm": 4.0412766328088585, "learning_rate": 4.2726091940171055e-06, - "logits/chosen": -1.4802881479263306, - "logits/rejected": -1.3236931562423706, - "logps/chosen": -424.968505859375, - "logps/rejected": -472.8589782714844, - "loss": 0.5441, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5254794359207153, - "rewards/margins": 0.8688099980354309, - "rewards/rejected": -2.394289493560791, + "logits/chosen": -1.3419673442840576, + "logits/rejected": -1.1779625415802002, + "logps/chosen": -421.1036682128906, + "logps/rejected": -466.26275634765625, + "loss": 0.5369, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4869836568832397, + "rewards/margins": 0.8415184020996094, + "rewards/rejected": -2.3285021781921387, "step": 310 }, { "epoch": 0.33499084009421615, - "grad_norm": 5.55708768926168, + "grad_norm": 7.228642390991299, "learning_rate": 4.206963828813555e-06, - "logits/chosen": -1.5147395133972168, - "logits/rejected": -1.4260919094085693, - "logps/chosen": -356.251953125, - "logps/rejected": -424.2235412597656, - "loss": 0.5537, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1914732456207275, - "rewards/margins": 0.6315604448318481, - "rewards/rejected": -1.8230335712432861, + "logits/chosen": -1.3606576919555664, + "logits/rejected": -1.266966700553894, + "logps/chosen": -374.7814025878906, + "logps/rejected": -450.57489013671875, + "loss": 0.5399, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3771132230758667, + "rewards/margins": 0.7092502117156982, + "rewards/rejected": -2.0863633155822754, "step": 320 }, { "epoch": 0.34545930384716045, - "grad_norm": 5.004482442410713, + "grad_norm": 5.998674043872773, "learning_rate": 4.139035550786495e-06, - "logits/chosen": -1.504990577697754, - "logits/rejected": -1.4216346740722656, - "logps/chosen": -406.04473876953125, - "logps/rejected": -461.72283935546875, - "loss": 0.5321, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5744965076446533, - "rewards/margins": 0.6061158776283264, - "rewards/rejected": -2.180612325668335, + "logits/chosen": -1.305854082107544, + "logits/rejected": -1.2263256311416626, + "logps/chosen": -429.99188232421875, + "logps/rejected": -497.3521423339844, + "loss": 0.5206, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8139232397079468, + "rewards/margins": 0.7226920127868652, + "rewards/rejected": -2.5366153717041016, "step": 330 }, { "epoch": 0.3559277676001047, - "grad_norm": 5.1331172845370165, + "grad_norm": 6.244527338235992, "learning_rate": 4.068915207986931e-06, - "logits/chosen": -1.2623932361602783, - "logits/rejected": -1.1655136346817017, - "logps/chosen": -411.1018981933594, - "logps/rejected": -456.156982421875, - "loss": 0.5393, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.6048648357391357, - "rewards/margins": 0.7221436500549316, - "rewards/rejected": -2.3270087242126465, + "logits/chosen": -1.0698177814483643, + "logits/rejected": -0.9774864315986633, + "logps/chosen": -438.493408203125, + "logps/rejected": -487.9413146972656, + "loss": 0.5553, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8784738779067993, + "rewards/margins": 0.7664145231246948, + "rewards/rejected": -2.644888401031494, "step": 340 }, { "epoch": 0.36639623135304894, - "grad_norm": 6.204494142283613, + "grad_norm": 6.05372732646654, "learning_rate": 3.996696580158211e-06, - "logits/chosen": -1.4823843240737915, - "logits/rejected": -1.3702657222747803, - "logps/chosen": -387.9534606933594, - "logps/rejected": -448.01373291015625, - "loss": 0.5072, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3335731029510498, - "rewards/margins": 0.7507054805755615, - "rewards/rejected": -2.0842788219451904, + "logits/chosen": -1.419528603553772, + "logits/rejected": -1.3026950359344482, + "logps/chosen": -399.3412170410156, + "logps/rejected": -459.59063720703125, + "loss": 0.5144, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4473296403884888, + "rewards/margins": 0.7528419494628906, + "rewards/rejected": -2.2001712322235107, "step": 350 }, { "epoch": 0.3768646951059932, - "grad_norm": 8.049482757420606, + "grad_norm": 7.477190042107064, "learning_rate": 3.922476253313921e-06, - "logits/chosen": -1.307775855064392, - "logits/rejected": -1.24771249294281, - "logps/chosen": -400.76910400390625, - "logps/rejected": -469.23651123046875, - "loss": 0.4704, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.4676017761230469, - "rewards/margins": 0.8654597997665405, - "rewards/rejected": -2.333061456680298, + "logits/chosen": -1.3192278146743774, + "logits/rejected": -1.2596288919448853, + "logps/chosen": -388.9505920410156, + "logps/rejected": -450.23748779296875, + "loss": 0.476, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3493094444274902, + "rewards/margins": 0.7934570908546448, + "rewards/rejected": -2.1427664756774902, "step": 360 }, { "epoch": 0.38733315885893743, - "grad_norm": 5.519216288621626, + "grad_norm": 5.411077344670459, "learning_rate": 3.846353490562664e-06, - "logits/chosen": -1.2273913621902466, - "logits/rejected": -1.1667524576187134, - "logps/chosen": -445.7137756347656, - "logps/rejected": -534.3553466796875, - "loss": 0.5271, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.012014865875244, - "rewards/margins": 1.0135688781738281, - "rewards/rejected": -3.0255837440490723, + "logits/chosen": -1.3181861639022827, + "logits/rejected": -1.2644492387771606, + "logps/chosen": -406.28851318359375, + "logps/rejected": -484.81646728515625, + "loss": 0.5189, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.618009328842163, + "rewards/margins": 0.912137508392334, + "rewards/rejected": -2.530146837234497, "step": 370 }, { "epoch": 0.39780162261188173, - "grad_norm": 6.046235636435534, + "grad_norm": 6.255769900471538, "learning_rate": 3.768430099352445e-06, - "logits/chosen": -1.2952733039855957, - "logits/rejected": -1.2378871440887451, - "logps/chosen": -393.07000732421875, - "logps/rejected": -465.0540466308594, - "loss": 0.5257, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3947498798370361, - "rewards/margins": 0.7023818492889404, - "rewards/rejected": -2.0971317291259766, + "logits/chosen": -1.2576202154159546, + "logits/rejected": -1.1985712051391602, + "logps/chosen": -426.02978515625, + "logps/rejected": -499.4483337402344, + "loss": 0.5291, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.724285364151001, + "rewards/margins": 0.7168464660644531, + "rewards/rejected": -2.441131830215454, "step": 380 }, { "epoch": 0.408270086364826, - "grad_norm": 3.5923613903948133, + "grad_norm": 5.374401251386735, "learning_rate": 3.6888102953122307e-06, - "logits/chosen": -1.3613311052322388, - "logits/rejected": -1.2925641536712646, - "logps/chosen": -355.1478576660156, - "logps/rejected": -389.9283752441406, - "loss": 0.5452, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.9670610427856445, - "rewards/margins": 0.5416972637176514, - "rewards/rejected": -1.5087581872940063, + "logits/chosen": -1.3139002323150635, + "logits/rejected": -1.2482521533966064, + "logps/chosen": -366.7402648925781, + "logps/rejected": -402.67144775390625, + "loss": 0.5445, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0828481912612915, + "rewards/margins": 0.5530051589012146, + "rewards/rejected": -1.6358531713485718, "step": 390 }, { "epoch": 0.4187385501177702, - "grad_norm": 4.7487137488013484, + "grad_norm": 5.700868192112112, "learning_rate": 3.607600562872785e-06, - "logits/chosen": -1.3237230777740479, - "logits/rejected": -1.2487279176712036, - "logps/chosen": -384.8719177246094, - "logps/rejected": -442.74957275390625, - "loss": 0.5364, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4063746929168701, - "rewards/margins": 0.5768125057220459, - "rewards/rejected": -1.9831870794296265, + "logits/chosen": -1.2806731462478638, + "logits/rejected": -1.2076570987701416, + "logps/chosen": -397.6805725097656, + "logps/rejected": -457.99578857421875, + "loss": 0.5261, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.534794807434082, + "rewards/margins": 0.6010990738868713, + "rewards/rejected": -2.1358938217163086, "step": 400 }, { "epoch": 0.4187385501177702, - "eval_logits/chosen": -1.2333532571792603, - "eval_logits/rejected": -1.1332385540008545, - "eval_logps/chosen": -416.69793701171875, - "eval_logps/rejected": -476.3457946777344, - "eval_loss": 0.5142984986305237, - "eval_rewards/accuracies": 0.7579365372657776, - "eval_rewards/chosen": -1.5179502964019775, - "eval_rewards/margins": 0.7989169359207153, - "eval_rewards/rejected": -2.3168673515319824, - "eval_runtime": 495.3694, - "eval_samples_per_second": 4.037, - "eval_steps_per_second": 0.127, + "eval_logits/chosen": -1.201329231262207, + "eval_logits/rejected": -1.1013988256454468, + "eval_logps/chosen": -430.45263671875, + "eval_logps/rejected": -497.2759094238281, + "eval_loss": 0.5082111954689026, + "eval_rewards/accuracies": 0.7539682388305664, + "eval_rewards/chosen": -1.6553115844726562, + "eval_rewards/margins": 0.8710008859634399, + "eval_rewards/rejected": -2.5263123512268066, + "eval_runtime": 314.7811, + "eval_samples_per_second": 6.354, + "eval_steps_per_second": 0.2, "step": 400 }, { "epoch": 0.42920701387071447, - "grad_norm": 4.606627296879372, + "grad_norm": 4.78368697455832, "learning_rate": 3.5249095128531863e-06, - "logits/chosen": -1.2254225015640259, - "logits/rejected": -1.1056945323944092, - "logps/chosen": -440.85369873046875, - "logps/rejected": -507.37774658203125, - "loss": 0.5273, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7427661418914795, - "rewards/margins": 0.9440004229545593, - "rewards/rejected": -2.6867663860321045, + "logits/chosen": -1.1789991855621338, + "logits/rejected": -1.0596911907196045, + "logps/chosen": -449.33709716796875, + "logps/rejected": -515.829833984375, + "loss": 0.5275, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.8276984691619873, + "rewards/margins": 0.9437017440795898, + "rewards/rejected": -2.7714004516601562, "step": 410 }, { "epoch": 0.4396754776236587, - "grad_norm": 7.32723857385592, + "grad_norm": 5.718886468906387, "learning_rate": 3.4408477372034743e-06, - "logits/chosen": -1.3876991271972656, - "logits/rejected": -1.1844953298568726, - "logps/chosen": -440.2545471191406, - "logps/rejected": -458.7481384277344, - "loss": 0.5242, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.5724732875823975, - "rewards/margins": 0.7581513524055481, - "rewards/rejected": -2.330624580383301, + "logits/chosen": -1.3089560270309448, + "logits/rejected": -1.111132025718689, + "logps/chosen": -451.94354248046875, + "logps/rejected": -472.0818786621094, + "loss": 0.5212, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6895692348480225, + "rewards/margins": 0.77433842420578, + "rewards/rejected": -2.4639077186584473, "step": 420 }, { "epoch": 0.45014394137660296, - "grad_norm": 4.83507866233375, + "grad_norm": 5.4739752222845, "learning_rate": 3.355527661097728e-06, - "logits/chosen": -1.3145965337753296, - "logits/rejected": -1.2404569387435913, - "logps/chosen": -391.62640380859375, - "logps/rejected": -483.40020751953125, - "loss": 0.5203, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6367257833480835, - "rewards/margins": 0.8808576464653015, - "rewards/rejected": -2.5175833702087402, + "logits/chosen": -1.2457327842712402, + "logits/rejected": -1.1701027154922485, + "logps/chosen": -408.38629150390625, + "logps/rejected": -495.34906005859375, + "loss": 0.5228, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8043445348739624, + "rewards/margins": 0.8327828645706177, + "rewards/rejected": -2.63712739944458, "step": 430 }, { "epoch": 0.46061240512954726, - "grad_norm": 6.295429995521184, + "grad_norm": 6.85581768096026, "learning_rate": 3.269063392575352e-06, - "logits/chosen": -1.246955156326294, - "logits/rejected": -1.2044976949691772, - "logps/chosen": -412.8096618652344, - "logps/rejected": -462.73175048828125, - "loss": 0.5309, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.6489166021347046, - "rewards/margins": 0.686037540435791, - "rewards/rejected": -2.334954261779785, + "logits/chosen": -1.2453696727752686, + "logits/rejected": -1.202413558959961, + "logps/chosen": -399.9681091308594, + "logps/rejected": -447.87744140625, + "loss": 0.5256, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5209221839904785, + "rewards/margins": 0.6652868986129761, + "rewards/rejected": -2.186208963394165, "step": 440 }, { "epoch": 0.4710808688824915, - "grad_norm": 8.155653541939053, + "grad_norm": 5.776991001419176, "learning_rate": 3.181570569931697e-06, - "logits/chosen": -1.3565101623535156, - "logits/rejected": -1.2913014888763428, - "logps/chosen": -404.30938720703125, - "logps/rejected": -522.0363159179688, - "loss": 0.5151, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.5972925424575806, - "rewards/margins": 1.02085280418396, - "rewards/rejected": -2.61814546585083, + "logits/chosen": -1.3612130880355835, + "logits/rejected": -1.2961633205413818, + "logps/chosen": -399.5633239746094, + "logps/rejected": -510.95513916015625, + "loss": 0.5105, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5496407747268677, + "rewards/margins": 0.9577849507331848, + "rewards/rejected": -2.507425308227539, "step": 450 }, { "epoch": 0.48154933263543576, - "grad_norm": 5.153431243466889, + "grad_norm": 5.051312773178446, "learning_rate": 3.09316620706208e-06, - "logits/chosen": -1.3433706760406494, - "logits/rejected": -1.2827776670455933, - "logps/chosen": -448.34283447265625, - "logps/rejected": -517.4175415039062, - "loss": 0.5305, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.879359245300293, - "rewards/margins": 0.8195317983627319, - "rewards/rejected": -2.6988909244537354, + "logits/chosen": -1.2932283878326416, + "logits/rejected": -1.2328197956085205, + "logps/chosen": -472.6172790527344, + "logps/rejected": -538.5615234375, + "loss": 0.5286, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.122169256210327, + "rewards/margins": 0.7884066104888916, + "rewards/rejected": -2.9105758666992188, "step": 460 }, { "epoch": 0.49201779638838, - "grad_norm": 5.610862161934833, + "grad_norm": 4.905312405134056, "learning_rate": 3.0039685369660785e-06, - "logits/chosen": -1.3113436698913574, - "logits/rejected": -1.173460602760315, - "logps/chosen": -466.1058044433594, - "logps/rejected": -511.7659606933594, - "loss": 0.4856, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.0702614784240723, - "rewards/margins": 0.7728853821754456, - "rewards/rejected": -2.843147039413452, + "logits/chosen": -1.299459457397461, + "logits/rejected": -1.164813756942749, + "logps/chosen": -433.37554931640625, + "logps/rejected": -470.384521484375, + "loss": 0.4942, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7430375814437866, + "rewards/margins": 0.6865721940994263, + "rewards/rejected": -2.429609775543213, "step": 470 }, { "epoch": 0.5024862601413242, - "grad_norm": 6.26181575439223, + "grad_norm": 6.146340380005326, "learning_rate": 2.91409685362137e-06, - "logits/chosen": -1.2794568538665771, - "logits/rejected": -1.123780608177185, - "logps/chosen": -523.4374389648438, - "logps/rejected": -596.9427490234375, + "logits/chosen": -1.2403475046157837, + "logits/rejected": -1.0839545726776123, + "logps/chosen": -513.665771484375, + "logps/rejected": -583.8973388671875, "loss": 0.5046, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.514650583267212, - "rewards/margins": 1.0325826406478882, - "rewards/rejected": -3.5472328662872314, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4166836738586426, + "rewards/margins": 1.000232219696045, + "rewards/rejected": -3.4169158935546875, "step": 480 }, { "epoch": 0.5129547238942685, - "grad_norm": 4.885030756434318, + "grad_norm": 5.112445065707855, "learning_rate": 2.8236713524386085e-06, - "logits/chosen": -1.2807716131210327, - "logits/rejected": -1.107301950454712, - "logps/chosen": -547.37890625, - "logps/rejected": -605.0728149414062, - "loss": 0.4989, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.634721040725708, - "rewards/margins": 0.92457515001297, - "rewards/rejected": -3.5592963695526123, + "logits/chosen": -1.2714743614196777, + "logits/rejected": -1.0941110849380493, + "logps/chosen": -532.9962158203125, + "logps/rejected": -588.236328125, + "loss": 0.5025, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.491058826446533, + "rewards/margins": 0.900057315826416, + "rewards/rejected": -3.39111590385437, "step": 490 }, { "epoch": 0.5234231876472127, - "grad_norm": 4.812586396006598, + "grad_norm": 5.578978194196055, "learning_rate": 2.7328129695107205e-06, - "logits/chosen": -1.2112505435943604, - "logits/rejected": -1.1074721813201904, - "logps/chosen": -523.0760498046875, - "logps/rejected": -595.048095703125, - "loss": 0.5046, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.574467420578003, - "rewards/margins": 0.9888777732849121, - "rewards/rejected": -3.563344955444336, + "logits/chosen": -1.2111141681671143, + "logits/rejected": -1.1049137115478516, + "logps/chosen": -476.11932373046875, + "logps/rejected": -544.0899658203125, + "loss": 0.5107, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.1048712730407715, + "rewards/margins": 0.9492176175117493, + "rewards/rejected": -3.054089069366455, "step": 500 }, { "epoch": 0.5234231876472127, - "eval_logits/chosen": -1.1373296976089478, - "eval_logits/rejected": -1.0301768779754639, - "eval_logps/chosen": -529.9542236328125, - "eval_logps/rejected": -605.2976684570312, - "eval_loss": 0.5062148571014404, - "eval_rewards/accuracies": 0.7579365372657776, - "eval_rewards/chosen": -2.650512456893921, - "eval_rewards/margins": 0.9558730721473694, - "eval_rewards/rejected": -3.6063857078552246, - "eval_runtime": 494.8767, - "eval_samples_per_second": 4.041, - "eval_steps_per_second": 0.127, + "eval_logits/chosen": -1.0955979824066162, + "eval_logits/rejected": -0.9851866364479065, + "eval_logps/chosen": -509.9847717285156, + "eval_logps/rejected": -587.1475830078125, + "eval_loss": 0.5058528184890747, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -2.4506328105926514, + "eval_rewards/margins": 0.9743964076042175, + "eval_rewards/rejected": -3.4250295162200928, + "eval_runtime": 306.1826, + "eval_samples_per_second": 6.532, + "eval_steps_per_second": 0.206, "step": 500 }, { "epoch": 0.533891651400157, - "grad_norm": 4.595490575025387, + "grad_norm": 5.171650655838442, "learning_rate": 2.641643219871597e-06, - "logits/chosen": -1.2453548908233643, - "logits/rejected": -1.1635602712631226, - "logps/chosen": -500.3102111816406, - "logps/rejected": -559.625244140625, - "loss": 0.4867, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.409621000289917, - "rewards/margins": 0.8319376707077026, - "rewards/rejected": -3.24155855178833, + "logits/chosen": -1.1813862323760986, + "logits/rejected": -1.0965768098831177, + "logps/chosen": -508.13092041015625, + "logps/rejected": -570.53759765625, + "loss": 0.4794, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.4878056049346924, + "rewards/margins": 0.8628519773483276, + "rewards/rejected": -3.3506579399108887, "step": 510 }, { "epoch": 0.5443601151531012, - "grad_norm": 7.198268767989711, + "grad_norm": 6.785478355965952, "learning_rate": 2.5502840349805074e-06, - "logits/chosen": -1.2010407447814941, - "logits/rejected": -1.0801866054534912, - "logps/chosen": -578.692138671875, - "logps/rejected": -627.0946044921875, - "loss": 0.5249, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -3.033255100250244, - "rewards/margins": 0.9397614598274231, - "rewards/rejected": -3.9730167388916016, + "logits/chosen": -1.1492969989776611, + "logits/rejected": -1.0285675525665283, + "logps/chosen": -577.2451171875, + "logps/rejected": -618.67236328125, + "loss": 0.536, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.018739938735962, + "rewards/margins": 0.8701656460762024, + "rewards/rejected": -3.8889052867889404, "step": 520 }, { "epoch": 0.5548285789060455, - "grad_norm": 6.519693692708999, + "grad_norm": 5.9061132192029975, "learning_rate": 2.4588575996495797e-06, - "logits/chosen": -1.169585943222046, - "logits/rejected": -1.0577750205993652, - "logps/chosen": -533.5675048828125, - "logps/rejected": -615.3276977539062, - "loss": 0.4947, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.606905460357666, - "rewards/margins": 1.0898813009262085, - "rewards/rejected": -3.696786880493164, + "logits/chosen": -1.1598259210586548, + "logits/rejected": -1.0490505695343018, + "logps/chosen": -515.8734130859375, + "logps/rejected": -585.2592163085938, + "loss": 0.5089, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.4302353858947754, + "rewards/margins": 0.9659484624862671, + "rewards/rejected": -3.396183729171753, "step": 530 }, { "epoch": 0.5652970426589898, - "grad_norm": 4.681072379969835, + "grad_norm": 4.6033180245939604, "learning_rate": 2.367486188632446e-06, - "logits/chosen": -1.2149909734725952, - "logits/rejected": -1.0861246585845947, - "logps/chosen": -495.14093017578125, - "logps/rejected": -530.830810546875, - "loss": 0.5288, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.406106472015381, - "rewards/margins": 0.7864419221878052, - "rewards/rejected": -3.1925485134124756, + "logits/chosen": -1.1840062141418457, + "logits/rejected": -1.05405592918396, + "logps/chosen": -469.9341735839844, + "logps/rejected": -503.66156005859375, + "loss": 0.5311, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1539134979248047, + "rewards/margins": 0.7668038606643677, + "rewards/rejected": -2.920717239379883, "step": 540 }, { "epoch": 0.575765506411934, - "grad_norm": 4.605359920599186, + "grad_norm": 4.876993529338895, "learning_rate": 2.276292003092593e-06, - "logits/chosen": -1.25673508644104, - "logits/rejected": -1.0914690494537354, - "logps/chosen": -509.0771484375, - "logps/rejected": -543.2958374023438, - "loss": 0.512, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.4712395668029785, - "rewards/margins": 0.7063032388687134, - "rewards/rejected": -3.1775424480438232, + "logits/chosen": -1.1908996105194092, + "logits/rejected": -1.03009033203125, + "logps/chosen": -489.2470703125, + "logps/rejected": -519.9906616210938, + "loss": 0.5265, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.272523880004883, + "rewards/margins": 0.6718829870223999, + "rewards/rejected": -2.9444069862365723, "step": 550 }, { "epoch": 0.5862339701648783, - "grad_norm": 6.363842169113803, + "grad_norm": 6.4140589895092734, "learning_rate": 2.1853970071701415e-06, - "logits/chosen": -1.153988242149353, - "logits/rejected": -1.0725597143173218, - "logps/chosen": -513.7321166992188, - "logps/rejected": -567.1982421875, - "loss": 0.5248, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.496264934539795, - "rewards/margins": 0.8739057779312134, - "rewards/rejected": -3.370171070098877, + "logits/chosen": -1.0713722705841064, + "logits/rejected": -0.9895181655883789, + "logps/chosen": -517.431396484375, + "logps/rejected": -573.0247802734375, + "loss": 0.527, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.533311605453491, + "rewards/margins": 0.8953849077224731, + "rewards/rejected": -3.428696393966675, "step": 560 }, { "epoch": 0.5967024339178225, - "grad_norm": 6.958473109368914, + "grad_norm": 6.376011574902463, "learning_rate": 2.0949227648656194e-06, - "logits/chosen": -1.243544578552246, - "logits/rejected": -1.138840913772583, - "logps/chosen": -514.3547973632812, - "logps/rejected": -589.1139526367188, - "loss": 0.5081, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.536358594894409, - "rewards/margins": 0.8745514154434204, - "rewards/rejected": -3.410910129547119, + "logits/chosen": -1.129098653793335, + "logits/rejected": -1.0245158672332764, + "logps/chosen": -559.3487548828125, + "logps/rejected": -636.18310546875, + "loss": 0.5053, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.986375331878662, + "rewards/margins": 0.8951998949050903, + "rewards/rejected": -3.8815758228302, "step": 570 }, { "epoch": 0.6071708976707668, - "grad_norm": 5.30154815627735, + "grad_norm": 5.245103814265102, "learning_rate": 2.00499027745888e-06, - "logits/chosen": -1.1577163934707642, - "logits/rejected": -1.0448474884033203, - "logps/chosen": -500.58599853515625, - "logps/rejected": -561.0791015625, - "loss": 0.5112, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.6586689949035645, - "rewards/margins": 0.8494758605957031, - "rewards/rejected": -3.5081450939178467, + "logits/chosen": -1.081526517868042, + "logits/rejected": -0.9703742861747742, + "logps/chosen": -521.400146484375, + "logps/rejected": -581.4735107421875, + "loss": 0.5167, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8668112754821777, + "rewards/margins": 0.8452316522598267, + "rewards/rejected": -3.712043046951294, "step": 580 }, { "epoch": 0.6176393614237111, - "grad_norm": 5.957700119060022, + "grad_norm": 5.7527775937649, "learning_rate": 1.915719821680624e-06, - "logits/chosen": -1.342710256576538, - "logits/rejected": -1.2864643335342407, - "logps/chosen": -480.73272705078125, - "logps/rejected": -578.4927978515625, - "loss": 0.4987, + "logits/chosen": -1.287595510482788, + "logits/rejected": -1.2315866947174072, + "logps/chosen": -470.63995361328125, + "logps/rejected": -567.0520629882812, + "loss": 0.5018, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.3478407859802246, - "rewards/margins": 0.940704345703125, - "rewards/rejected": -3.2885451316833496, + "rewards/chosen": -2.246957778930664, + "rewards/margins": 0.9269870519638062, + "rewards/rejected": -3.1739444732666016, "step": 590 }, { "epoch": 0.6281078251766553, - "grad_norm": 5.805746259122187, + "grad_norm": 4.552980884850473, "learning_rate": 1.8272307888529276e-06, - "logits/chosen": -1.1955945491790771, - "logits/rejected": -1.0686155557632446, - "logps/chosen": -551.1502685546875, - "logps/rejected": -601.6345825195312, - "loss": 0.4736, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.542466640472412, - "rewards/margins": 0.8464245796203613, - "rewards/rejected": -3.3888916969299316, + "logits/chosen": -1.1632342338562012, + "logits/rejected": -1.0386791229248047, + "logps/chosen": -524.4891357421875, + "logps/rejected": -568.3989868164062, + "loss": 0.4851, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2760519981384277, + "rewards/margins": 0.7800448536872864, + "rewards/rejected": -3.0560970306396484, "step": 600 }, { "epoch": 0.6281078251766553, - "eval_logits/chosen": -1.1252615451812744, - "eval_logits/rejected": -1.0135337114334106, - "eval_logps/chosen": -537.3406372070312, - "eval_logps/rejected": -621.1549072265625, - "eval_loss": 0.5059433579444885, - "eval_rewards/accuracies": 0.7638888955116272, - "eval_rewards/chosen": -2.724376678466797, - "eval_rewards/margins": 1.0405809879302979, - "eval_rewards/rejected": -3.7649576663970947, - "eval_runtime": 496.2742, - "eval_samples_per_second": 4.03, - "eval_steps_per_second": 0.127, + "eval_logits/chosen": -1.107803463935852, + "eval_logits/rejected": -0.9969872832298279, + "eval_logps/chosen": -492.1783447265625, + "eval_logps/rejected": -567.8048706054688, + "eval_loss": 0.5023476481437683, + "eval_rewards/accuracies": 0.7678571343421936, + "eval_rewards/chosen": -2.272569179534912, + "eval_rewards/margins": 0.9590328931808472, + "eval_rewards/rejected": -3.231602191925049, + "eval_runtime": 309.4155, + "eval_samples_per_second": 6.464, + "eval_steps_per_second": 0.204, "step": 600 }, { "epoch": 0.6385762889295996, - "grad_norm": 5.793634680662145, + "grad_norm": 5.93434046945835, "learning_rate": 1.739641525213929e-06, - "logits/chosen": -1.216796636581421, - "logits/rejected": -1.1526730060577393, - "logps/chosen": -538.6376953125, - "logps/rejected": -615.1124877929688, - "loss": 0.5134, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.6652350425720215, - "rewards/margins": 1.1263973712921143, - "rewards/rejected": -3.7916324138641357, + "logits/chosen": -1.1899484395980835, + "logits/rejected": -1.128395438194275, + "logps/chosen": -505.1640625, + "logps/rejected": -569.9411010742188, + "loss": 0.5131, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3301897048950195, + "rewards/margins": 1.009691834449768, + "rewards/rejected": -3.339881420135498, "step": 610 }, { "epoch": 0.6490447526825438, - "grad_norm": 5.8551862397886, + "grad_norm": 6.7592991694860345, "learning_rate": 1.6530691736402317e-06, - "logits/chosen": -1.3594751358032227, - "logits/rejected": -1.1856592893600464, - "logps/chosen": -541.5281372070312, - "logps/rejected": -591.7969970703125, - "loss": 0.4667, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.4107346534729004, - "rewards/margins": 1.0584288835525513, - "rewards/rejected": -3.4691638946533203, + "logits/chosen": -1.3041086196899414, + "logits/rejected": -1.1299916505813599, + "logps/chosen": -520.1345825195312, + "logps/rejected": -570.2188110351562, + "loss": 0.4665, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1966605186462402, + "rewards/margins": 1.0572277307510376, + "rewards/rejected": -3.253887891769409, "step": 620 }, { "epoch": 0.6595132164354881, - "grad_norm": 7.40407247870995, + "grad_norm": 6.939788728853941, "learning_rate": 1.5676295169786864e-06, - "logits/chosen": -1.2886357307434082, - "logits/rejected": -1.160259485244751, - "logps/chosen": -528.3416137695312, - "logps/rejected": -582.4341430664062, - "loss": 0.4807, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.508863925933838, - "rewards/margins": 0.9682027697563171, - "rewards/rejected": -3.477067232131958, + "logits/chosen": -1.2350585460662842, + "logits/rejected": -1.1085925102233887, + "logps/chosen": -506.601806640625, + "logps/rejected": -558.4306640625, + "loss": 0.4883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2913689613342285, + "rewards/margins": 0.9455882906913757, + "rewards/rejected": -3.23695707321167, "step": 630 }, { "epoch": 0.6699816801884323, - "grad_norm": 6.664520102647163, + "grad_norm": 6.885905876648702, "learning_rate": 1.4834368231970922e-06, - "logits/chosen": -1.1566966772079468, - "logits/rejected": -1.0316822528839111, - "logps/chosen": -519.1185913085938, - "logps/rejected": -613.6243896484375, - "loss": 0.4678, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.8396458625793457, - "rewards/margins": 1.1082502603530884, - "rewards/rejected": -3.9478962421417236, + "logits/chosen": -1.1120684146881104, + "logits/rejected": -0.9865466952323914, + "logps/chosen": -484.1168518066406, + "logps/rejected": -577.2811279296875, + "loss": 0.4707, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.489487648010254, + "rewards/margins": 1.0948190689086914, + "rewards/rejected": -3.5843067169189453, "step": 640 }, { "epoch": 0.6804501439413766, - "grad_norm": 6.202576057234435, + "grad_norm": 5.355709320338061, "learning_rate": 1.4006036925609245e-06, - "logits/chosen": -1.2698938846588135, - "logits/rejected": -1.2037663459777832, - "logps/chosen": -512.4635009765625, - "logps/rejected": -599.8485107421875, - "loss": 0.5124, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.5361390113830566, - "rewards/margins": 0.8807609677314758, - "rewards/rejected": -3.416900157928467, + "logits/chosen": -1.2093435525894165, + "logits/rejected": -1.1428296566009521, + "logps/chosen": -484.78680419921875, + "logps/rejected": -564.5701293945312, + "loss": 0.5169, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2595038414001465, + "rewards/margins": 0.8043657541275024, + "rewards/rejected": -3.0638692378997803, "step": 650 }, { "epoch": 0.6909186076943209, - "grad_norm": 5.411195685464347, + "grad_norm": 5.122013631042651, "learning_rate": 1.3192409070404582e-06, - "logits/chosen": -1.33925461769104, - "logits/rejected": -1.2339675426483154, - "logps/chosen": -507.4242248535156, - "logps/rejected": -586.1668701171875, - "loss": 0.4971, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.554523468017578, - "rewards/margins": 0.9770146608352661, - "rewards/rejected": -3.5315380096435547, + "logits/chosen": -1.2926921844482422, + "logits/rejected": -1.1912837028503418, + "logps/chosen": -462.49700927734375, + "logps/rejected": -535.156005859375, + "loss": 0.4902, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.105215311050415, + "rewards/margins": 0.9164140820503235, + "rewards/rejected": -3.0216293334960938, "step": 660 }, { "epoch": 0.7013870714472651, - "grad_norm": 7.306078973174853, + "grad_norm": 7.049199430649973, "learning_rate": 1.2394572821496953e-06, - "logits/chosen": -1.1949710845947266, - "logits/rejected": -1.0358936786651611, - "logps/chosen": -556.1329956054688, - "logps/rejected": -644.8235473632812, - "loss": 0.4783, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -3.012876510620117, - "rewards/margins": 1.1485122442245483, - "rewards/rejected": -4.161388397216797, + "logits/chosen": -1.144698977470398, + "logits/rejected": -0.9912746548652649, + "logps/chosen": -518.4602661132812, + "logps/rejected": -603.5948486328125, + "loss": 0.4773, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6358141899108887, + "rewards/margins": 1.113600492477417, + "rewards/rejected": -3.7494144439697266, "step": 670 }, { "epoch": 0.7118555352002094, - "grad_norm": 6.569348565798864, + "grad_norm": 7.264843779332727, "learning_rate": 1.1613595214152713e-06, - "logits/chosen": -1.1749083995819092, - "logits/rejected": -1.0992681980133057, - "logps/chosen": -572.3611450195312, - "logps/rejected": -654.3881225585938, - "loss": 0.5193, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -3.059669017791748, - "rewards/margins": 0.9747347831726074, - "rewards/rejected": -4.0344038009643555, + "logits/chosen": -1.1309657096862793, + "logits/rejected": -1.0589998960494995, + "logps/chosen": -539.7699584960938, + "logps/rejected": -620.6878662109375, + "loss": 0.5266, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.733842134475708, + "rewards/margins": 0.9637987017631531, + "rewards/rejected": -3.6976406574249268, "step": 680 }, { "epoch": 0.7223239989531536, - "grad_norm": 4.877247655613693, + "grad_norm": 5.030210764921467, "learning_rate": 1.0850520736699362e-06, - "logits/chosen": -1.3139179944992065, - "logits/rejected": -1.1570199728012085, - "logps/chosen": -550.0722045898438, - "logps/rejected": -594.5676879882812, - "loss": 0.4496, + "logits/chosen": -1.288641333580017, + "logits/rejected": -1.134615182876587, + "logps/chosen": -510.186767578125, + "logps/rejected": -550.937744140625, + "loss": 0.4574, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.778721570968628, - "rewards/margins": 0.9407702684402466, - "rewards/rejected": -3.719491958618164, + "rewards/chosen": -2.3800137042999268, + "rewards/margins": 0.9033814668655396, + "rewards/rejected": -3.2833950519561768, "step": 690 }, { "epoch": 0.7327924627060979, - "grad_norm": 5.850084839342618, + "grad_norm": 5.9328337595528495, "learning_rate": 1.0106369933615043e-06, - "logits/chosen": -1.1720659732818604, - "logits/rejected": -1.1006288528442383, - "logps/chosen": -543.8446044921875, - "logps/rejected": -637.2833862304688, - "loss": 0.4619, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.941668748855591, - "rewards/margins": 1.034071922302246, - "rewards/rejected": -3.975741147994995, + "logits/chosen": -1.1679879426956177, + "logits/rejected": -1.1002038717269897, + "logps/chosen": -487.10821533203125, + "logps/rejected": -576.1431884765625, + "loss": 0.4681, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3741798400878906, + "rewards/margins": 0.9901224970817566, + "rewards/rejected": -3.364302158355713, "step": 700 }, { "epoch": 0.7327924627060979, - "eval_logits/chosen": -1.119376301765442, - "eval_logits/rejected": -1.006381869316101, - "eval_logps/chosen": -557.3041381835938, - "eval_logps/rejected": -644.5651245117188, - "eval_loss": 0.4993818998336792, - "eval_rewards/accuracies": 0.761904776096344, - "eval_rewards/chosen": -2.9240119457244873, - "eval_rewards/margins": 1.0750477313995361, - "eval_rewards/rejected": -3.9990594387054443, - "eval_runtime": 494.8985, - "eval_samples_per_second": 4.041, - "eval_steps_per_second": 0.127, + "eval_logits/chosen": -1.119031548500061, + "eval_logits/rejected": -1.006774663925171, + "eval_logps/chosen": -496.6231994628906, + "eval_logps/rejected": -581.5197143554688, + "eval_loss": 0.49932044744491577, + "eval_rewards/accuracies": 0.7678571343421936, + "eval_rewards/chosen": -2.3170175552368164, + "eval_rewards/margins": 1.0517328977584839, + "eval_rewards/rejected": -3.3687500953674316, + "eval_runtime": 280.6333, + "eval_samples_per_second": 7.127, + "eval_steps_per_second": 0.224, "step": 700 }, { "epoch": 0.7432609264590422, - "grad_norm": 5.940990677339228, + "grad_norm": 5.3428016079168215, "learning_rate": 9.382138040640714e-07, - "logits/chosen": -1.175462245941162, - "logits/rejected": -1.0119060277938843, - "logps/chosen": -566.7076416015625, - "logps/rejected": -634.9506225585938, - "loss": 0.5486, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -3.01143217086792, - "rewards/margins": 0.9881938695907593, - "rewards/rejected": -3.9996261596679688, + "logits/chosen": -1.1798994541168213, + "logits/rejected": -1.021723985671997, + "logps/chosen": -505.2616271972656, + "logps/rejected": -571.1856689453125, + "loss": 0.547, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.396669864654541, + "rewards/margins": 0.9654728174209595, + "rewards/rejected": -3.362142562866211, "step": 710 }, { "epoch": 0.7537293902119864, - "grad_norm": 5.5111666858195125, + "grad_norm": 5.0438516064442505, "learning_rate": 8.678793653740633e-07, - "logits/chosen": -1.3316354751586914, - "logits/rejected": -1.171438455581665, - "logps/chosen": -583.8369140625, - "logps/rejected": -640.4646606445312, - "loss": 0.4747, + "logits/chosen": -1.3297260999679565, + "logits/rejected": -1.1736423969268799, + "logps/chosen": -527.0916748046875, + "logps/rejected": -579.3074340820312, + "loss": 0.478, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.6381282806396484, - "rewards/margins": 1.0775749683380127, - "rewards/rejected": -3.715702772140503, + "rewards/chosen": -2.070934534072876, + "rewards/margins": 1.0328184366226196, + "rewards/rejected": -3.103752851486206, "step": 720 }, { "epoch": 0.7641978539649307, - "grad_norm": 5.993566017988564, + "grad_norm": 5.464567536353577, "learning_rate": 7.997277433690984e-07, - "logits/chosen": -1.208320140838623, - "logits/rejected": -1.071702480316162, - "logps/chosen": -516.3609619140625, - "logps/rejected": -598.6437377929688, - "loss": 0.5075, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.7794041633605957, - "rewards/margins": 0.9944884181022644, - "rewards/rejected": -3.773892879486084, + "logits/chosen": -1.2094228267669678, + "logits/rejected": -1.076755404472351, + "logps/chosen": -460.71563720703125, + "logps/rejected": -538.8247680664062, + "loss": 0.4966, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2227933406829834, + "rewards/margins": 0.9530885815620422, + "rewards/rejected": -3.175881862640381, "step": 730 }, { "epoch": 0.7746663177178749, - "grad_norm": 5.60538349675765, + "grad_norm": 5.377248875033102, "learning_rate": 7.338500848029603e-07, - "logits/chosen": -1.2300490140914917, - "logits/rejected": -1.0854498147964478, - "logps/chosen": -557.65234375, - "logps/rejected": -595.0419921875, - "loss": 0.5081, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.7690441608428955, - "rewards/margins": 0.9237399101257324, - "rewards/rejected": -3.692783832550049, + "logits/chosen": -1.1969387531280518, + "logits/rejected": -1.0555990934371948, + "logps/chosen": -529.8873291015625, + "logps/rejected": -568.1295166015625, + "loss": 0.5065, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.4915201663970947, + "rewards/margins": 0.9319826364517212, + "rewards/rejected": -3.4235024452209473, "step": 740 }, { "epoch": 0.7851347814708192, - "grad_norm": 5.061641492930568, + "grad_norm": 5.342695362281337, "learning_rate": 6.70334495204884e-07, - "logits/chosen": -1.0873275995254517, - "logits/rejected": -1.0135056972503662, - "logps/chosen": -510.3736877441406, - "logps/rejected": -621.0284423828125, - "loss": 0.4814, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.822464942932129, - "rewards/margins": 1.0254597663879395, - "rewards/rejected": -3.8479247093200684, + "logits/chosen": -1.0425455570220947, + "logits/rejected": -0.9723536372184753, + "logps/chosen": -487.095947265625, + "logps/rejected": -599.2386474609375, + "loss": 0.4857, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.589409112930298, + "rewards/margins": 1.040583848953247, + "rewards/rejected": -3.629992723464966, "step": 750 }, { "epoch": 0.7956032452237635, - "grad_norm": 4.889175666405806, + "grad_norm": 4.96165517698307, "learning_rate": 6.092659210462232e-07, - "logits/chosen": -1.2110772132873535, - "logits/rejected": -1.0558052062988281, - "logps/chosen": -540.3411254882812, - "logps/rejected": -596.0307006835938, - "loss": 0.5076, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.8302786350250244, - "rewards/margins": 0.9066953659057617, - "rewards/rejected": -3.736973524093628, + "logits/chosen": -1.172639012336731, + "logits/rejected": -1.0221275091171265, + "logps/chosen": -510.41253662109375, + "logps/rejected": -564.8956909179688, + "loss": 0.5034, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.53062105178833, + "rewards/margins": 0.8949772119522095, + "rewards/rejected": -3.42559814453125, "step": 760 }, { "epoch": 0.8060717089767077, - "grad_norm": 4.4612176619325075, + "grad_norm": 4.707914305263311, "learning_rate": 5.507260361320738e-07, - "logits/chosen": -1.2954254150390625, - "logits/rejected": -1.2630584239959717, - "logps/chosen": -560.3121337890625, - "logps/rejected": -663.4389038085938, - "loss": 0.4681, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.743813991546631, - "rewards/margins": 1.0250587463378906, - "rewards/rejected": -3.7688724994659424, + "logits/chosen": -1.2693157196044922, + "logits/rejected": -1.2365710735321045, + "logps/chosen": -522.3062744140625, + "logps/rejected": -619.4640502929688, + "loss": 0.4736, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.363852024078369, + "rewards/margins": 0.9652940034866333, + "rewards/rejected": -3.329145908355713, "step": 770 }, { "epoch": 0.816540172729652, - "grad_norm": 5.524604671684533, + "grad_norm": 5.592342234404946, "learning_rate": 4.947931323697983e-07, - "logits/chosen": -1.3126929998397827, - "logits/rejected": -1.088275671005249, - "logps/chosen": -570.6515502929688, - "logps/rejected": -593.5574951171875, - "loss": 0.4959, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.6409595012664795, - "rewards/margins": 0.848545253276825, - "rewards/rejected": -3.48950457572937, + "logits/chosen": -1.2886607646942139, + "logits/rejected": -1.0684127807617188, + "logps/chosen": -534.323486328125, + "logps/rejected": -556.2030639648438, + "loss": 0.5036, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.277695894241333, + "rewards/margins": 0.8380621075630188, + "rewards/rejected": -3.115757942199707, "step": 780 }, { "epoch": 0.8270086364825961, - "grad_norm": 5.9693834124907585, + "grad_norm": 6.076661367759154, "learning_rate": 4.4154201506053985e-07, - "logits/chosen": -1.1836011409759521, - "logits/rejected": -1.0795371532440186, - "logps/chosen": -529.63134765625, - "logps/rejected": -619.8529663085938, - "loss": 0.523, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.8782217502593994, - "rewards/margins": 0.950698971748352, - "rewards/rejected": -3.828920841217041, + "logits/chosen": -1.1675662994384766, + "logits/rejected": -1.065953254699707, + "logps/chosen": -489.9781188964844, + "logps/rejected": -578.2179565429688, + "loss": 0.5128, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.482016086578369, + "rewards/margins": 0.9305012822151184, + "rewards/rejected": -3.4125168323516846, "step": 790 }, { "epoch": 0.8374771002355405, - "grad_norm": 6.22817743126936, + "grad_norm": 6.131830839970953, "learning_rate": 3.910439028537638e-07, - "logits/chosen": -1.1832584142684937, - "logits/rejected": -1.1257516145706177, - "logps/chosen": -505.93316650390625, - "logps/rejected": -627.0811157226562, - "loss": 0.4926, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.7608401775360107, - "rewards/margins": 1.1763994693756104, - "rewards/rejected": -3.937239408493042, + "logits/chosen": -1.1610690355300903, + "logits/rejected": -1.1072094440460205, + "logps/chosen": -469.524658203125, + "logps/rejected": -585.119140625, + "loss": 0.4852, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.3968334197998047, + "rewards/margins": 1.120810866355896, + "rewards/rejected": -3.5176444053649902, "step": 800 }, { "epoch": 0.8374771002355405, - "eval_logits/chosen": -1.1640794277191162, - "eval_logits/rejected": -1.0516477823257446, - "eval_logps/chosen": -537.3770141601562, - "eval_logps/rejected": -619.2051391601562, - "eval_loss": 0.49621155858039856, - "eval_rewards/accuracies": 0.7658730149269104, - "eval_rewards/chosen": -2.724740743637085, - "eval_rewards/margins": 1.0207195281982422, - "eval_rewards/rejected": -3.745460033416748, - "eval_runtime": 494.3028, - "eval_samples_per_second": 4.046, - "eval_steps_per_second": 0.127, + "eval_logits/chosen": -1.1353023052215576, + "eval_logits/rejected": -1.0236940383911133, + "eval_logps/chosen": -504.6183166503906, + "eval_logps/rejected": -585.8155517578125, + "eval_loss": 0.49497368931770325, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -2.396967887878418, + "eval_rewards/margins": 1.0147408246994019, + "eval_rewards/rejected": -3.4117088317871094, + "eval_runtime": 274.6399, + "eval_samples_per_second": 7.282, + "eval_steps_per_second": 0.229, "step": 800 }, { "epoch": 0.8479455639884846, - "grad_norm": 4.972938912679825, + "grad_norm": 4.892138077626307, "learning_rate": 3.4336633249862084e-07, - "logits/chosen": -1.2425668239593506, - "logits/rejected": -1.0636166334152222, - "logps/chosen": -561.7000732421875, - "logps/rejected": -619.0046997070312, - "loss": 0.508, + "logits/chosen": -1.209530234336853, + "logits/rejected": -1.0349524021148682, + "logps/chosen": -532.2420043945312, + "logps/rejected": -591.3436279296875, + "loss": 0.5065, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.7175962924957275, - "rewards/margins": 0.9634302854537964, - "rewards/rejected": -3.6810269355773926, + "rewards/chosen": -2.4228711128234863, + "rewards/margins": 0.9818238019943237, + "rewards/rejected": -3.4046947956085205, "step": 810 }, { "epoch": 0.8584140277414289, - "grad_norm": 5.09720991480253, + "grad_norm": 6.133736149907814, "learning_rate": 2.98573068519539e-07, - "logits/chosen": -1.2827835083007812, - "logits/rejected": -1.2060225009918213, - "logps/chosen": -539.7214965820312, - "logps/rejected": -615.7735595703125, - "loss": 0.4994, + "logits/chosen": -1.2474277019500732, + "logits/rejected": -1.1728675365447998, + "logps/chosen": -516.3637084960938, + "logps/rejected": -589.2227783203125, + "loss": 0.5066, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.778557538986206, - "rewards/margins": 0.8738244771957397, - "rewards/rejected": -3.6523823738098145, + "rewards/chosen": -2.5452017784118652, + "rewards/margins": 0.8416748046875, + "rewards/rejected": -3.3868765830993652, "step": 820 }, { "epoch": 0.8688824914943732, - "grad_norm": 5.782887668669889, + "grad_norm": 4.448941268784857, "learning_rate": 2.5672401793681854e-07, - "logits/chosen": -1.2188454866409302, - "logits/rejected": -1.1675770282745361, - "logps/chosen": -514.8630981445312, - "logps/rejected": -599.4465942382812, - "loss": 0.5131, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.682121753692627, - "rewards/margins": 0.837975025177002, - "rewards/rejected": -3.520097017288208, + "logits/chosen": -1.1796165704727173, + "logits/rejected": -1.1301778554916382, + "logps/chosen": -493.81610107421875, + "logps/rejected": -576.2391357421875, + "loss": 0.5119, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.471508502960205, + "rewards/margins": 0.8166677355766296, + "rewards/rejected": -3.2881767749786377, "step": 830 }, { "epoch": 0.8793509552473174, - "grad_norm": 6.123094674224966, + "grad_norm": 6.561312283813159, "learning_rate": 2.178751501463036e-07, - "logits/chosen": -1.2655466794967651, - "logits/rejected": -1.162320613861084, - "logps/chosen": -548.4150390625, - "logps/rejected": -654.3787231445312, - "loss": 0.4696, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.7333908081054688, - "rewards/margins": 1.0990099906921387, - "rewards/rejected": -3.8324007987976074, + "logits/chosen": -1.2200143337249756, + "logits/rejected": -1.120086431503296, + "logps/chosen": -525.2439575195312, + "logps/rejected": -629.3662719726562, + "loss": 0.4823, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.501779317855835, + "rewards/margins": 1.0803827047348022, + "rewards/rejected": -3.5821621417999268, "step": 840 }, { "epoch": 0.8898194190002617, - "grad_norm": 5.66840809357227, + "grad_norm": 5.675008200582371, "learning_rate": 1.820784220652766e-07, - "logits/chosen": -1.1958177089691162, - "logits/rejected": -1.0371004343032837, - "logps/chosen": -533.7821044921875, - "logps/rejected": -595.7824096679688, - "loss": 0.4643, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.6792378425598145, - "rewards/margins": 0.9526662826538086, - "rewards/rejected": -3.631904125213623, + "logits/chosen": -1.148503065109253, + "logits/rejected": -0.9931659698486328, + "logps/chosen": -511.488037109375, + "logps/rejected": -576.7986450195312, + "loss": 0.4672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.456411361694336, + "rewards/margins": 0.9856443405151367, + "rewards/rejected": -3.4420554637908936, "step": 850 }, { "epoch": 0.9002878827532059, - "grad_norm": 4.99425900843833, + "grad_norm": 5.847430828911881, "learning_rate": 1.4938170864468636e-07, - "logits/chosen": -1.257932424545288, - "logits/rejected": -1.1045788526535034, - "logps/chosen": -551.285888671875, - "logps/rejected": -612.8128662109375, - "loss": 0.4681, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.804962396621704, - "rewards/margins": 0.8677828907966614, - "rewards/rejected": -3.6727447509765625, + "logits/chosen": -1.2107369899749756, + "logits/rejected": -1.0609266757965088, + "logps/chosen": -531.3970947265625, + "logps/rejected": -590.2992553710938, + "loss": 0.4693, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.60589861869812, + "rewards/margins": 0.8416641354560852, + "rewards/rejected": -3.4475624561309814, "step": 860 }, { "epoch": 0.9107563465061502, - "grad_norm": 4.991076961519154, + "grad_norm": 5.678184676582534, "learning_rate": 1.1982873884064466e-07, - "logits/chosen": -1.294597864151001, - "logits/rejected": -1.1019765138626099, - "logps/chosen": -572.5621337890625, - "logps/rejected": -625.5676879882812, - "loss": 0.5032, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.914654493331909, - "rewards/margins": 1.0726633071899414, - "rewards/rejected": -3.9873173236846924, + "logits/chosen": -1.24862539768219, + "logits/rejected": -1.0584386587142944, + "logps/chosen": -544.3020629882812, + "logps/rejected": -598.3751220703125, + "loss": 0.4947, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.632056713104248, + "rewards/margins": 1.0834977626800537, + "rewards/rejected": -3.7155539989471436, "step": 870 }, { "epoch": 0.9212248102590945, - "grad_norm": 5.0189704749647746, + "grad_norm": 5.3183871343635944, "learning_rate": 9.345903713082305e-08, - "logits/chosen": -1.2915667295455933, - "logits/rejected": -1.1525356769561768, - "logps/chosen": -541.0573120117188, - "logps/rejected": -600.8675537109375, - "loss": 0.4752, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.7068932056427, - "rewards/margins": 0.9946308135986328, - "rewards/rejected": -3.701524019241333, + "logits/chosen": -1.2423183917999268, + "logits/rejected": -1.1057523488998413, + "logps/chosen": -514.6922607421875, + "logps/rejected": -574.3685302734375, + "loss": 0.4801, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4431517124176025, + "rewards/margins": 0.9935353994369507, + "rewards/rejected": -3.4366869926452637, "step": 880 }, { "epoch": 0.9316932740120387, - "grad_norm": 5.260823538009292, + "grad_norm": 5.735081148749753, "learning_rate": 7.030787065396866e-08, - "logits/chosen": -1.188293218612671, - "logits/rejected": -1.0763866901397705, - "logps/chosen": -527.5496215820312, - "logps/rejected": -644.0205078125, - "loss": 0.4845, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.790327548980713, - "rewards/margins": 1.2539947032928467, - "rewards/rejected": -4.0443220138549805, + "logits/chosen": -1.1413437128067017, + "logits/rejected": -1.0328372716903687, + "logps/chosen": -508.3773498535156, + "logps/rejected": -619.8019409179688, + "loss": 0.4938, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5983500480651855, + "rewards/margins": 1.2036244869232178, + "rewards/rejected": -3.8019745349884033, "step": 890 }, { "epoch": 0.942161737764983, - "grad_norm": 6.861506991670971, + "grad_norm": 6.618746821621782, "learning_rate": 5.0406202043228604e-08, - "logits/chosen": -1.0917075872421265, - "logits/rejected": -1.0418908596038818, - "logps/chosen": -544.6124267578125, - "logps/rejected": -697.5197143554688, - "loss": 0.4856, + "logits/chosen": -1.045906901359558, + "logits/rejected": -0.9974561929702759, + "logps/chosen": -517.8667602539062, + "logps/rejected": -669.0172729492188, + "loss": 0.4907, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.783902406692505, - "rewards/margins": 1.2628685235977173, - "rewards/rejected": -4.0467705726623535, + "rewards/chosen": -2.5165352821350098, + "rewards/margins": 1.2452183961868286, + "rewards/rejected": -3.761753559112549, "step": 900 }, { "epoch": 0.942161737764983, - "eval_logits/chosen": -1.1508536338806152, - "eval_logits/rejected": -1.03853178024292, - "eval_logps/chosen": -545.9743041992188, - "eval_logps/rejected": -631.738525390625, - "eval_loss": 0.49515971541404724, - "eval_rewards/accuracies": 0.77182537317276, - "eval_rewards/chosen": -2.810713768005371, - "eval_rewards/margins": 1.0600804090499878, - "eval_rewards/rejected": -3.8707938194274902, - "eval_runtime": 496.7331, - "eval_samples_per_second": 4.026, - "eval_steps_per_second": 0.127, + "eval_logits/chosen": -1.1023893356323242, + "eval_logits/rejected": -0.9901031255722046, + "eval_logps/chosen": -521.706298828125, + "eval_logps/rejected": -608.1346435546875, + "eval_loss": 0.494513601064682, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -2.567847490310669, + "eval_rewards/margins": 1.0670523643493652, + "eval_rewards/rejected": -3.634899854660034, + "eval_runtime": 302.7434, + "eval_samples_per_second": 6.606, + "eval_steps_per_second": 0.208, "step": 900 }, { "epoch": 0.9526302015179272, - "grad_norm": 7.396195290341192, + "grad_norm": 6.10762031606348, "learning_rate": 3.378064801637687e-08, - "logits/chosen": -1.2156776189804077, - "logits/rejected": -1.0312784910202026, - "logps/chosen": -519.9219360351562, - "logps/rejected": -579.5977172851562, - "loss": 0.4987, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.731100559234619, - "rewards/margins": 1.0545246601104736, - "rewards/rejected": -3.785625457763672, + "logits/chosen": -1.1716662645339966, + "logits/rejected": -0.9894771575927734, + "logps/chosen": -494.2669982910156, + "logps/rejected": -552.8900146484375, + "loss": 0.4935, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.474764585494995, + "rewards/margins": 1.0439238548278809, + "rewards/rejected": -3.518688201904297, "step": 910 }, { "epoch": 0.9630986652708715, - "grad_norm": 5.785542711133717, + "grad_norm": 5.683369015946174, "learning_rate": 2.0453443778310766e-08, - "logits/chosen": -1.2227718830108643, - "logits/rejected": -1.0613749027252197, - "logps/chosen": -563.939453125, - "logps/rejected": -609.8938598632812, - "loss": 0.4994, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.846004009246826, - "rewards/margins": 0.9199835062026978, - "rewards/rejected": -3.7659878730773926, + "logits/chosen": -1.1771481037139893, + "logits/rejected": -1.0182334184646606, + "logps/chosen": -545.4282836914062, + "logps/rejected": -589.4589233398438, + "loss": 0.5081, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6610922813415527, + "rewards/margins": 0.9007646441459656, + "rewards/rejected": -3.561856746673584, "step": 920 }, { "epoch": 0.9735671290238157, - "grad_norm": 5.845655612266485, + "grad_norm": 5.950595947719059, "learning_rate": 1.0442413283435759e-08, - "logits/chosen": -1.1646806001663208, - "logits/rejected": -0.9581974744796753, - "logps/chosen": -586.3766479492188, - "logps/rejected": -622.99072265625, - "loss": 0.5017, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.876279354095459, - "rewards/margins": 0.9776785969734192, - "rewards/rejected": -3.8539581298828125, + "logits/chosen": -1.1201808452606201, + "logits/rejected": -0.9143557548522949, + "logps/chosen": -556.3572998046875, + "logps/rejected": -596.0765380859375, + "loss": 0.493, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5759315490722656, + "rewards/margins": 1.0089161396026611, + "rewards/rejected": -3.5848472118377686, "step": 930 }, { "epoch": 0.98403559277676, - "grad_norm": 6.202963993103904, + "grad_norm": 7.5374296673981425, "learning_rate": 3.760945397705828e-09, - "logits/chosen": -1.1803127527236938, - "logits/rejected": -1.1215112209320068, - "logps/chosen": -542.6771850585938, - "logps/rejected": -630.9061279296875, - "loss": 0.4831, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.7995152473449707, - "rewards/margins": 0.8906386494636536, - "rewards/rejected": -3.6901535987854004, + "logits/chosen": -1.1323813199996948, + "logits/rejected": -1.0745770931243896, + "logps/chosen": -517.08056640625, + "logps/rejected": -605.00146484375, + "loss": 0.4829, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5434603691101074, + "rewards/margins": 0.8878037333488464, + "rewards/rejected": -3.4312641620635986, "step": 940 }, { "epoch": 0.9945040565297043, - "grad_norm": 6.026230259479486, + "grad_norm": 5.8745190839702515, "learning_rate": 4.1797599220405605e-10, - "logits/chosen": -1.2184993028640747, - "logits/rejected": -1.041572093963623, - "logps/chosen": -546.837890625, - "logps/rejected": -604.7627563476562, - "loss": 0.489, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.8025598526000977, - "rewards/margins": 0.9029384851455688, - "rewards/rejected": -3.705498218536377, + "logits/chosen": -1.1730302572250366, + "logits/rejected": -0.9989528656005859, + "logps/chosen": -524.8547973632812, + "logps/rejected": -583.33203125, + "loss": 0.4861, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.5831165313720703, + "rewards/margins": 0.908293604850769, + "rewards/rejected": -3.49141001701355, "step": 950 }, { "epoch": 0.9997382884061764, "step": 955, "total_flos": 0.0, - "train_loss": 0.2358125359600127, - "train_runtime": 19082.7985, - "train_samples_per_second": 3.204, - "train_steps_per_second": 0.05 + "train_loss": 0.5319095570379527, + "train_runtime": 23762.0752, + "train_samples_per_second": 2.573, + "train_steps_per_second": 0.04 } ], "logging_steps": 10,