zephyr-7b-dpo-qlora / trainer_state.json
taicheng's picture
Model save
0993a3c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 100,
"global_step": 955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010468463752944255,
"grad_norm": 1.1945625860018705,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -2.5192830562591553,
"logits/rejected": -2.3547825813293457,
"logps/chosen": -297.60443115234375,
"logps/rejected": -252.4619903564453,
"loss": 0.6929,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.00020415784092620015,
"rewards/margins": -0.0002505290030967444,
"rewards/rejected": 4.637122037820518e-05,
"step": 1
},
{
"epoch": 0.010468463752944255,
"grad_norm": 1.1009278086693854,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": -2.2455766201019287,
"logits/rejected": -2.215245008468628,
"logps/chosen": -275.6755065917969,
"logps/rejected": -254.76722717285156,
"loss": 0.6927,
"rewards/accuracies": 0.6111111044883728,
"rewards/chosen": 0.004448441788554192,
"rewards/margins": 0.0008290203404612839,
"rewards/rejected": 0.0036194208078086376,
"step": 10
},
{
"epoch": 0.02093692750588851,
"grad_norm": 1.165704750760885,
"learning_rate": 1.0416666666666667e-06,
"logits/chosen": -2.2313215732574463,
"logits/rejected": -2.114736795425415,
"logps/chosen": -277.5883483886719,
"logps/rejected": -255.2056427001953,
"loss": 0.6907,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.026263630017638206,
"rewards/margins": 0.005699009168893099,
"rewards/rejected": 0.020564619451761246,
"step": 20
},
{
"epoch": 0.031405391258832765,
"grad_norm": 1.1872789709669673,
"learning_rate": 1.5625e-06,
"logits/chosen": -2.3138914108276367,
"logits/rejected": -2.2109274864196777,
"logps/chosen": -281.3846740722656,
"logps/rejected": -262.41693115234375,
"loss": 0.6858,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.04216086491942406,
"rewards/margins": 0.014992751181125641,
"rewards/rejected": 0.027168119326233864,
"step": 30
},
{
"epoch": 0.04187385501177702,
"grad_norm": 1.1753743538686479,
"learning_rate": 2.0833333333333334e-06,
"logits/chosen": -2.307976245880127,
"logits/rejected": -2.2140889167785645,
"logps/chosen": -268.4030456542969,
"logps/rejected": -255.44882202148438,
"loss": 0.6814,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.05265005677938461,
"rewards/margins": 0.026613134890794754,
"rewards/rejected": 0.026036927476525307,
"step": 40
},
{
"epoch": 0.05234231876472128,
"grad_norm": 1.1656515626034387,
"learning_rate": 2.604166666666667e-06,
"logits/chosen": -2.2787346839904785,
"logits/rejected": -2.175128936767578,
"logps/chosen": -227.7914581298828,
"logps/rejected": -206.5706024169922,
"loss": 0.6764,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.05774398520588875,
"rewards/margins": 0.042526550590991974,
"rewards/rejected": 0.015217426232993603,
"step": 50
},
{
"epoch": 0.06281078251766553,
"grad_norm": 1.324103463890143,
"learning_rate": 3.125e-06,
"logits/chosen": -2.2864975929260254,
"logits/rejected": -2.185832977294922,
"logps/chosen": -264.6636657714844,
"logps/rejected": -228.9823455810547,
"loss": 0.6687,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.04733141511678696,
"rewards/margins": 0.05985499545931816,
"rewards/rejected": -0.012523581273853779,
"step": 60
},
{
"epoch": 0.07327924627060979,
"grad_norm": 1.4908224965531012,
"learning_rate": 3.6458333333333333e-06,
"logits/chosen": -2.128446102142334,
"logits/rejected": -2.080828905105591,
"logps/chosen": -256.41363525390625,
"logps/rejected": -262.66949462890625,
"loss": 0.6557,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.0010238643735647202,
"rewards/margins": 0.10270833969116211,
"rewards/rejected": -0.10373219102621078,
"step": 70
},
{
"epoch": 0.08374771002355404,
"grad_norm": 2.5274509004519445,
"learning_rate": 4.166666666666667e-06,
"logits/chosen": -2.2609763145446777,
"logits/rejected": -2.1039209365844727,
"logps/chosen": -263.5312194824219,
"logps/rejected": -256.4105529785156,
"loss": 0.6402,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.12527017295360565,
"rewards/margins": 0.11860889196395874,
"rewards/rejected": -0.2438790500164032,
"step": 80
},
{
"epoch": 0.0942161737764983,
"grad_norm": 2.5638356680875156,
"learning_rate": 4.6875000000000004e-06,
"logits/chosen": -2.1489036083221436,
"logits/rejected": -2.081789493560791,
"logps/chosen": -270.69091796875,
"logps/rejected": -291.17962646484375,
"loss": 0.6304,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.1973474770784378,
"rewards/margins": 0.19165988266468048,
"rewards/rejected": -0.3890073299407959,
"step": 90
},
{
"epoch": 0.10468463752944256,
"grad_norm": 2.7331412184768507,
"learning_rate": 4.9997324926814375e-06,
"logits/chosen": -2.193147659301758,
"logits/rejected": -2.0836679935455322,
"logps/chosen": -303.36590576171875,
"logps/rejected": -320.6068420410156,
"loss": 0.6212,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.31098657846450806,
"rewards/margins": 0.2229224145412445,
"rewards/rejected": -0.5339089632034302,
"step": 100
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -2.0932729244232178,
"eval_logits/rejected": -2.008643627166748,
"eval_logps/chosen": -298.0505676269531,
"eval_logps/rejected": -299.1472473144531,
"eval_loss": 0.6321468353271484,
"eval_rewards/accuracies": 0.6944444179534912,
"eval_rewards/chosen": -0.3312907814979553,
"eval_rewards/margins": 0.213734969496727,
"eval_rewards/rejected": -0.5450257658958435,
"eval_runtime": 321.7711,
"eval_samples_per_second": 6.216,
"eval_steps_per_second": 0.196,
"step": 100
},
{
"epoch": 0.11515310128238682,
"grad_norm": 2.3956891782186904,
"learning_rate": 4.996723692767927e-06,
"logits/chosen": -2.1808857917785645,
"logits/rejected": -2.05126690864563,
"logps/chosen": -281.98193359375,
"logps/rejected": -265.5942687988281,
"loss": 0.6236,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.3388732373714447,
"rewards/margins": 0.2357875108718872,
"rewards/rejected": -0.5746607184410095,
"step": 110
},
{
"epoch": 0.12562156503533106,
"grad_norm": 4.49315717191337,
"learning_rate": 4.9903757462135984e-06,
"logits/chosen": -2.1513657569885254,
"logits/rejected": -2.0812501907348633,
"logps/chosen": -287.4888610839844,
"logps/rejected": -328.67156982421875,
"loss": 0.5888,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.41489553451538086,
"rewards/margins": 0.31543681025505066,
"rewards/rejected": -0.7303323149681091,
"step": 120
},
{
"epoch": 0.1360900287882753,
"grad_norm": 3.426776723546068,
"learning_rate": 4.980697142834315e-06,
"logits/chosen": -2.1264045238494873,
"logits/rejected": -2.0155797004699707,
"logps/chosen": -381.717041015625,
"logps/rejected": -350.6758728027344,
"loss": 0.5946,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8191909790039062,
"rewards/margins": 0.3090600371360779,
"rewards/rejected": -1.1282509565353394,
"step": 130
},
{
"epoch": 0.14655849254121958,
"grad_norm": 3.004217560772131,
"learning_rate": 4.967700826904229e-06,
"logits/chosen": -2.0693180561065674,
"logits/rejected": -2.010124683380127,
"logps/chosen": -363.1732482910156,
"logps/rejected": -400.6826171875,
"loss": 0.5746,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9419809579849243,
"rewards/margins": 0.41463392972946167,
"rewards/rejected": -1.3566150665283203,
"step": 140
},
{
"epoch": 0.15702695629416383,
"grad_norm": 4.5725215394059004,
"learning_rate": 4.951404179843963e-06,
"logits/chosen": -2.154370069503784,
"logits/rejected": -2.016098976135254,
"logps/chosen": -362.11627197265625,
"logps/rejected": -358.12823486328125,
"loss": 0.5888,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.7362005710601807,
"rewards/margins": 0.44253450632095337,
"rewards/rejected": -1.1787351369857788,
"step": 150
},
{
"epoch": 0.16749542004710807,
"grad_norm": 7.788576778528557,
"learning_rate": 4.931828996974498e-06,
"logits/chosen": -2.099804162979126,
"logits/rejected": -1.9289453029632568,
"logps/chosen": -348.2688903808594,
"logps/rejected": -349.8648986816406,
"loss": 0.5524,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.6180304288864136,
"rewards/margins": 0.5966934561729431,
"rewards/rejected": -1.2147239446640015,
"step": 160
},
{
"epoch": 0.17796388380005235,
"grad_norm": 3.73369227909531,
"learning_rate": 4.909001458367867e-06,
"logits/chosen": -2.005589008331299,
"logits/rejected": -1.8591816425323486,
"logps/chosen": -357.9122619628906,
"logps/rejected": -401.5499267578125,
"loss": 0.5578,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.9827691912651062,
"rewards/margins": 0.5925935506820679,
"rewards/rejected": -1.5753626823425293,
"step": 170
},
{
"epoch": 0.1884323475529966,
"grad_norm": 5.142786695761387,
"learning_rate": 4.882952093833628e-06,
"logits/chosen": -1.906904935836792,
"logits/rejected": -1.8502197265625,
"logps/chosen": -384.1532287597656,
"logps/rejected": -441.2289123535156,
"loss": 0.5357,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2132524251937866,
"rewards/margins": 0.6486458778381348,
"rewards/rejected": -1.861898422241211,
"step": 180
},
{
"epoch": 0.19890081130594087,
"grad_norm": 4.970082596077688,
"learning_rate": 4.853715742087947e-06,
"logits/chosen": -1.7953016757965088,
"logits/rejected": -1.749053716659546,
"logps/chosen": -331.3048400878906,
"logps/rejected": -420.5511779785156,
"loss": 0.5344,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.8520873188972473,
"rewards/margins": 0.6949248313903809,
"rewards/rejected": -1.5470120906829834,
"step": 190
},
{
"epoch": 0.2093692750588851,
"grad_norm": 4.380709251136357,
"learning_rate": 4.821331504159906e-06,
"logits/chosen": -1.931880235671997,
"logits/rejected": -1.7673060894012451,
"logps/chosen": -395.6458435058594,
"logps/rejected": -393.07818603515625,
"loss": 0.5618,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8311824798583984,
"rewards/margins": 0.598331868648529,
"rewards/rejected": -1.4295144081115723,
"step": 200
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -1.7550567388534546,
"eval_logits/rejected": -1.669370174407959,
"eval_logps/chosen": -346.90643310546875,
"eval_logps/rejected": -381.2445983886719,
"eval_loss": 0.5600804686546326,
"eval_rewards/accuracies": 0.7222222089767456,
"eval_rewards/chosen": -0.8198498487472534,
"eval_rewards/margins": 0.5461496114730835,
"eval_rewards/rejected": -1.365999460220337,
"eval_runtime": 319.6036,
"eval_samples_per_second": 6.258,
"eval_steps_per_second": 0.197,
"step": 200
},
{
"epoch": 0.21983773881182936,
"grad_norm": 4.390548860079351,
"learning_rate": 4.7858426910973435e-06,
"logits/chosen": -1.82965886592865,
"logits/rejected": -1.7543909549713135,
"logps/chosen": -382.2540588378906,
"logps/rejected": -415.6708984375,
"loss": 0.5746,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.0466556549072266,
"rewards/margins": 0.42583417892456055,
"rewards/rejected": -1.4724897146224976,
"step": 210
},
{
"epoch": 0.23030620256477363,
"grad_norm": 5.154433364870237,
"learning_rate": 4.747296766042161e-06,
"logits/chosen": -1.791577696800232,
"logits/rejected": -1.676790475845337,
"logps/chosen": -430.439697265625,
"logps/rejected": -446.4627380371094,
"loss": 0.5594,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.3720312118530273,
"rewards/margins": 0.6089810132980347,
"rewards/rejected": -1.9810121059417725,
"step": 220
},
{
"epoch": 0.24077466631771788,
"grad_norm": 5.067287557540918,
"learning_rate": 4.705745280752586e-06,
"logits/chosen": -1.6005672216415405,
"logits/rejected": -1.5293024778366089,
"logps/chosen": -364.32769775390625,
"logps/rejected": -393.8982849121094,
"loss": 0.5585,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1689074039459229,
"rewards/margins": 0.4595082402229309,
"rewards/rejected": -1.6284157037734985,
"step": 230
},
{
"epoch": 0.2512431300706621,
"grad_norm": 4.187057753178847,
"learning_rate": 4.661243806657256e-06,
"logits/chosen": -1.6649366617202759,
"logits/rejected": -1.6055676937103271,
"logps/chosen": -377.30517578125,
"logps/rejected": -407.87017822265625,
"loss": 0.5306,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.2085318565368652,
"rewards/margins": 0.6119499802589417,
"rewards/rejected": -1.8204820156097412,
"step": 240
},
{
"epoch": 0.26171159382360637,
"grad_norm": 4.31385003289797,
"learning_rate": 4.613851860533367e-06,
"logits/chosen": -1.6592628955841064,
"logits/rejected": -1.5831575393676758,
"logps/chosen": -392.666259765625,
"logps/rejected": -415.00933837890625,
"loss": 0.5341,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.2613078355789185,
"rewards/margins": 0.6352987289428711,
"rewards/rejected": -1.896606683731079,
"step": 250
},
{
"epoch": 0.2721800575765506,
"grad_norm": 6.08274430115293,
"learning_rate": 4.563632824908252e-06,
"logits/chosen": -1.5671354532241821,
"logits/rejected": -1.4325566291809082,
"logps/chosen": -446.5708923339844,
"logps/rejected": -492.8990173339844,
"loss": 0.5228,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.7604526281356812,
"rewards/margins": 0.6336302161216736,
"rewards/rejected": -2.394083261489868,
"step": 260
},
{
"epoch": 0.2826485213294949,
"grad_norm": 4.56724778147284,
"learning_rate": 4.510653863290871e-06,
"logits/chosen": -1.456993579864502,
"logits/rejected": -1.3645284175872803,
"logps/chosen": -401.00244140625,
"logps/rejected": -461.67816162109375,
"loss": 0.5277,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2706773281097412,
"rewards/margins": 0.8170326352119446,
"rewards/rejected": -2.087709903717041,
"step": 270
},
{
"epoch": 0.29311698508243916,
"grad_norm": 3.947817632488332,
"learning_rate": 4.454985830346574e-06,
"logits/chosen": -1.5088273286819458,
"logits/rejected": -1.3900407552719116,
"logps/chosen": -385.47772216796875,
"logps/rejected": -426.82049560546875,
"loss": 0.5485,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9744614362716675,
"rewards/margins": 0.75079345703125,
"rewards/rejected": -1.725255012512207,
"step": 280
},
{
"epoch": 0.3035854488353834,
"grad_norm": 4.730568661724723,
"learning_rate": 4.396703177135262e-06,
"logits/chosen": -1.517740249633789,
"logits/rejected": -1.351360559463501,
"logps/chosen": -388.90789794921875,
"logps/rejected": -416.22052001953125,
"loss": 0.526,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.0588579177856445,
"rewards/margins": 0.6205722093582153,
"rewards/rejected": -1.6794300079345703,
"step": 290
},
{
"epoch": 0.31405391258832765,
"grad_norm": 5.829501633867066,
"learning_rate": 4.335883851539693e-06,
"logits/chosen": -1.2372510433197021,
"logits/rejected": -1.2240632772445679,
"logps/chosen": -386.8778381347656,
"logps/rejected": -448.6904296875,
"loss": 0.54,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.441900610923767,
"rewards/margins": 0.6276523470878601,
"rewards/rejected": -2.0695528984069824,
"step": 300
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -1.1714633703231812,
"eval_logits/rejected": -1.0704221725463867,
"eval_logps/chosen": -417.12750244140625,
"eval_logps/rejected": -478.07476806640625,
"eval_loss": 0.5264545679092407,
"eval_rewards/accuracies": 0.7460317611694336,
"eval_rewards/chosen": -1.5220601558685303,
"eval_rewards/margins": 0.8122406601905823,
"eval_rewards/rejected": -2.334300994873047,
"eval_runtime": 318.2186,
"eval_samples_per_second": 6.285,
"eval_steps_per_second": 0.198,
"step": 300
},
{
"epoch": 0.3245223763412719,
"grad_norm": 4.0412766328088585,
"learning_rate": 4.2726091940171055e-06,
"logits/chosen": -1.3419673442840576,
"logits/rejected": -1.1779625415802002,
"logps/chosen": -421.1036682128906,
"logps/rejected": -466.26275634765625,
"loss": 0.5369,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.4869836568832397,
"rewards/margins": 0.8415184020996094,
"rewards/rejected": -2.3285021781921387,
"step": 310
},
{
"epoch": 0.33499084009421615,
"grad_norm": 7.228642390991299,
"learning_rate": 4.206963828813555e-06,
"logits/chosen": -1.3606576919555664,
"logits/rejected": -1.266966700553894,
"logps/chosen": -374.7814025878906,
"logps/rejected": -450.57489013671875,
"loss": 0.5399,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3771132230758667,
"rewards/margins": 0.7092502117156982,
"rewards/rejected": -2.0863633155822754,
"step": 320
},
{
"epoch": 0.34545930384716045,
"grad_norm": 5.998674043872773,
"learning_rate": 4.139035550786495e-06,
"logits/chosen": -1.305854082107544,
"logits/rejected": -1.2263256311416626,
"logps/chosen": -429.99188232421875,
"logps/rejected": -497.3521423339844,
"loss": 0.5206,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8139232397079468,
"rewards/margins": 0.7226920127868652,
"rewards/rejected": -2.5366153717041016,
"step": 330
},
{
"epoch": 0.3559277676001047,
"grad_norm": 6.244527338235992,
"learning_rate": 4.068915207986931e-06,
"logits/chosen": -1.0698177814483643,
"logits/rejected": -0.9774864315986633,
"logps/chosen": -438.493408203125,
"logps/rejected": -487.9413146972656,
"loss": 0.5553,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.8784738779067993,
"rewards/margins": 0.7664145231246948,
"rewards/rejected": -2.644888401031494,
"step": 340
},
{
"epoch": 0.36639623135304894,
"grad_norm": 6.05372732646654,
"learning_rate": 3.996696580158211e-06,
"logits/chosen": -1.419528603553772,
"logits/rejected": -1.3026950359344482,
"logps/chosen": -399.3412170410156,
"logps/rejected": -459.59063720703125,
"loss": 0.5144,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4473296403884888,
"rewards/margins": 0.7528419494628906,
"rewards/rejected": -2.2001712322235107,
"step": 350
},
{
"epoch": 0.3768646951059932,
"grad_norm": 7.477190042107064,
"learning_rate": 3.922476253313921e-06,
"logits/chosen": -1.3192278146743774,
"logits/rejected": -1.2596288919448853,
"logps/chosen": -388.9505920410156,
"logps/rejected": -450.23748779296875,
"loss": 0.476,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3493094444274902,
"rewards/margins": 0.7934570908546448,
"rewards/rejected": -2.1427664756774902,
"step": 360
},
{
"epoch": 0.38733315885893743,
"grad_norm": 5.411077344670459,
"learning_rate": 3.846353490562664e-06,
"logits/chosen": -1.3181861639022827,
"logits/rejected": -1.2644492387771606,
"logps/chosen": -406.28851318359375,
"logps/rejected": -484.81646728515625,
"loss": 0.5189,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.618009328842163,
"rewards/margins": 0.912137508392334,
"rewards/rejected": -2.530146837234497,
"step": 370
},
{
"epoch": 0.39780162261188173,
"grad_norm": 6.255769900471538,
"learning_rate": 3.768430099352445e-06,
"logits/chosen": -1.2576202154159546,
"logits/rejected": -1.1985712051391602,
"logps/chosen": -426.02978515625,
"logps/rejected": -499.4483337402344,
"loss": 0.5291,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.724285364151001,
"rewards/margins": 0.7168464660644531,
"rewards/rejected": -2.441131830215454,
"step": 380
},
{
"epoch": 0.408270086364826,
"grad_norm": 5.374401251386735,
"learning_rate": 3.6888102953122307e-06,
"logits/chosen": -1.3139002323150635,
"logits/rejected": -1.2482521533966064,
"logps/chosen": -366.7402648925781,
"logps/rejected": -402.67144775390625,
"loss": 0.5445,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0828481912612915,
"rewards/margins": 0.5530051589012146,
"rewards/rejected": -1.6358531713485718,
"step": 390
},
{
"epoch": 0.4187385501177702,
"grad_norm": 5.700868192112112,
"learning_rate": 3.607600562872785e-06,
"logits/chosen": -1.2806731462478638,
"logits/rejected": -1.2076570987701416,
"logps/chosen": -397.6805725097656,
"logps/rejected": -457.99578857421875,
"loss": 0.5261,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.534794807434082,
"rewards/margins": 0.6010990738868713,
"rewards/rejected": -2.1358938217163086,
"step": 400
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -1.201329231262207,
"eval_logits/rejected": -1.1013988256454468,
"eval_logps/chosen": -430.45263671875,
"eval_logps/rejected": -497.2759094238281,
"eval_loss": 0.5082111954689026,
"eval_rewards/accuracies": 0.7539682388305664,
"eval_rewards/chosen": -1.6553115844726562,
"eval_rewards/margins": 0.8710008859634399,
"eval_rewards/rejected": -2.5263123512268066,
"eval_runtime": 314.7811,
"eval_samples_per_second": 6.354,
"eval_steps_per_second": 0.2,
"step": 400
},
{
"epoch": 0.42920701387071447,
"grad_norm": 4.78368697455832,
"learning_rate": 3.5249095128531863e-06,
"logits/chosen": -1.1789991855621338,
"logits/rejected": -1.0596911907196045,
"logps/chosen": -449.33709716796875,
"logps/rejected": -515.829833984375,
"loss": 0.5275,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8276984691619873,
"rewards/margins": 0.9437017440795898,
"rewards/rejected": -2.7714004516601562,
"step": 410
},
{
"epoch": 0.4396754776236587,
"grad_norm": 5.718886468906387,
"learning_rate": 3.4408477372034743e-06,
"logits/chosen": -1.3089560270309448,
"logits/rejected": -1.111132025718689,
"logps/chosen": -451.94354248046875,
"logps/rejected": -472.0818786621094,
"loss": 0.5212,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6895692348480225,
"rewards/margins": 0.77433842420578,
"rewards/rejected": -2.4639077186584473,
"step": 420
},
{
"epoch": 0.45014394137660296,
"grad_norm": 5.4739752222845,
"learning_rate": 3.355527661097728e-06,
"logits/chosen": -1.2457327842712402,
"logits/rejected": -1.1701027154922485,
"logps/chosen": -408.38629150390625,
"logps/rejected": -495.34906005859375,
"loss": 0.5228,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8043445348739624,
"rewards/margins": 0.8327828645706177,
"rewards/rejected": -2.63712739944458,
"step": 430
},
{
"epoch": 0.46061240512954726,
"grad_norm": 6.85581768096026,
"learning_rate": 3.269063392575352e-06,
"logits/chosen": -1.2453696727752686,
"logits/rejected": -1.202413558959961,
"logps/chosen": -399.9681091308594,
"logps/rejected": -447.87744140625,
"loss": 0.5256,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5209221839904785,
"rewards/margins": 0.6652868986129761,
"rewards/rejected": -2.186208963394165,
"step": 440
},
{
"epoch": 0.4710808688824915,
"grad_norm": 5.776991001419176,
"learning_rate": 3.181570569931697e-06,
"logits/chosen": -1.3612130880355835,
"logits/rejected": -1.2961633205413818,
"logps/chosen": -399.5633239746094,
"logps/rejected": -510.95513916015625,
"loss": 0.5105,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.5496407747268677,
"rewards/margins": 0.9577849507331848,
"rewards/rejected": -2.507425308227539,
"step": 450
},
{
"epoch": 0.48154933263543576,
"grad_norm": 5.051312773178446,
"learning_rate": 3.09316620706208e-06,
"logits/chosen": -1.2932283878326416,
"logits/rejected": -1.2328197956085205,
"logps/chosen": -472.6172790527344,
"logps/rejected": -538.5615234375,
"loss": 0.5286,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.122169256210327,
"rewards/margins": 0.7884066104888916,
"rewards/rejected": -2.9105758666992188,
"step": 460
},
{
"epoch": 0.49201779638838,
"grad_norm": 4.905312405134056,
"learning_rate": 3.0039685369660785e-06,
"logits/chosen": -1.299459457397461,
"logits/rejected": -1.164813756942749,
"logps/chosen": -433.37554931640625,
"logps/rejected": -470.384521484375,
"loss": 0.4942,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7430375814437866,
"rewards/margins": 0.6865721940994263,
"rewards/rejected": -2.429609775543213,
"step": 470
},
{
"epoch": 0.5024862601413242,
"grad_norm": 6.146340380005326,
"learning_rate": 2.91409685362137e-06,
"logits/chosen": -1.2403475046157837,
"logits/rejected": -1.0839545726776123,
"logps/chosen": -513.665771484375,
"logps/rejected": -583.8973388671875,
"loss": 0.5046,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.4166836738586426,
"rewards/margins": 1.000232219696045,
"rewards/rejected": -3.4169158935546875,
"step": 480
},
{
"epoch": 0.5129547238942685,
"grad_norm": 5.112445065707855,
"learning_rate": 2.8236713524386085e-06,
"logits/chosen": -1.2714743614196777,
"logits/rejected": -1.0941110849380493,
"logps/chosen": -532.9962158203125,
"logps/rejected": -588.236328125,
"loss": 0.5025,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.491058826446533,
"rewards/margins": 0.900057315826416,
"rewards/rejected": -3.39111590385437,
"step": 490
},
{
"epoch": 0.5234231876472127,
"grad_norm": 5.578978194196055,
"learning_rate": 2.7328129695107205e-06,
"logits/chosen": -1.2111141681671143,
"logits/rejected": -1.1049137115478516,
"logps/chosen": -476.11932373046875,
"logps/rejected": -544.0899658203125,
"loss": 0.5107,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.1048712730407715,
"rewards/margins": 0.9492176175117493,
"rewards/rejected": -3.054089069366455,
"step": 500
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -1.0955979824066162,
"eval_logits/rejected": -0.9851866364479065,
"eval_logps/chosen": -509.9847717285156,
"eval_logps/rejected": -587.1475830078125,
"eval_loss": 0.5058528184890747,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -2.4506328105926514,
"eval_rewards/margins": 0.9743964076042175,
"eval_rewards/rejected": -3.4250295162200928,
"eval_runtime": 306.1826,
"eval_samples_per_second": 6.532,
"eval_steps_per_second": 0.206,
"step": 500
},
{
"epoch": 0.533891651400157,
"grad_norm": 5.171650655838442,
"learning_rate": 2.641643219871597e-06,
"logits/chosen": -1.1813862323760986,
"logits/rejected": -1.0965768098831177,
"logps/chosen": -508.13092041015625,
"logps/rejected": -570.53759765625,
"loss": 0.4794,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.4878056049346924,
"rewards/margins": 0.8628519773483276,
"rewards/rejected": -3.3506579399108887,
"step": 510
},
{
"epoch": 0.5443601151531012,
"grad_norm": 6.785478355965952,
"learning_rate": 2.5502840349805074e-06,
"logits/chosen": -1.1492969989776611,
"logits/rejected": -1.0285675525665283,
"logps/chosen": -577.2451171875,
"logps/rejected": -618.67236328125,
"loss": 0.536,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.018739938735962,
"rewards/margins": 0.8701656460762024,
"rewards/rejected": -3.8889052867889404,
"step": 520
},
{
"epoch": 0.5548285789060455,
"grad_norm": 5.9061132192029975,
"learning_rate": 2.4588575996495797e-06,
"logits/chosen": -1.1598259210586548,
"logits/rejected": -1.0490505695343018,
"logps/chosen": -515.8734130859375,
"logps/rejected": -585.2592163085938,
"loss": 0.5089,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.4302353858947754,
"rewards/margins": 0.9659484624862671,
"rewards/rejected": -3.396183729171753,
"step": 530
},
{
"epoch": 0.5652970426589898,
"grad_norm": 4.6033180245939604,
"learning_rate": 2.367486188632446e-06,
"logits/chosen": -1.1840062141418457,
"logits/rejected": -1.05405592918396,
"logps/chosen": -469.9341735839844,
"logps/rejected": -503.66156005859375,
"loss": 0.5311,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.1539134979248047,
"rewards/margins": 0.7668038606643677,
"rewards/rejected": -2.920717239379883,
"step": 540
},
{
"epoch": 0.575765506411934,
"grad_norm": 4.876993529338895,
"learning_rate": 2.276292003092593e-06,
"logits/chosen": -1.1908996105194092,
"logits/rejected": -1.03009033203125,
"logps/chosen": -489.2470703125,
"logps/rejected": -519.9906616210938,
"loss": 0.5265,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.272523880004883,
"rewards/margins": 0.6718829870223999,
"rewards/rejected": -2.9444069862365723,
"step": 550
},
{
"epoch": 0.5862339701648783,
"grad_norm": 6.4140589895092734,
"learning_rate": 2.1853970071701415e-06,
"logits/chosen": -1.0713722705841064,
"logits/rejected": -0.9895181655883789,
"logps/chosen": -517.431396484375,
"logps/rejected": -573.0247802734375,
"loss": 0.527,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.533311605453491,
"rewards/margins": 0.8953849077224731,
"rewards/rejected": -3.428696393966675,
"step": 560
},
{
"epoch": 0.5967024339178225,
"grad_norm": 6.376011574902463,
"learning_rate": 2.0949227648656194e-06,
"logits/chosen": -1.129098653793335,
"logits/rejected": -1.0245158672332764,
"logps/chosen": -559.3487548828125,
"logps/rejected": -636.18310546875,
"loss": 0.5053,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.986375331878662,
"rewards/margins": 0.8951998949050903,
"rewards/rejected": -3.8815758228302,
"step": 570
},
{
"epoch": 0.6071708976707668,
"grad_norm": 5.245103814265102,
"learning_rate": 2.00499027745888e-06,
"logits/chosen": -1.081526517868042,
"logits/rejected": -0.9703742861747742,
"logps/chosen": -521.400146484375,
"logps/rejected": -581.4735107421875,
"loss": 0.5167,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.8668112754821777,
"rewards/margins": 0.8452316522598267,
"rewards/rejected": -3.712043046951294,
"step": 580
},
{
"epoch": 0.6176393614237111,
"grad_norm": 5.7527775937649,
"learning_rate": 1.915719821680624e-06,
"logits/chosen": -1.287595510482788,
"logits/rejected": -1.2315866947174072,
"logps/chosen": -470.63995361328125,
"logps/rejected": -567.0520629882812,
"loss": 0.5018,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.246957778930664,
"rewards/margins": 0.9269870519638062,
"rewards/rejected": -3.1739444732666016,
"step": 590
},
{
"epoch": 0.6281078251766553,
"grad_norm": 4.552980884850473,
"learning_rate": 1.8272307888529276e-06,
"logits/chosen": -1.1632342338562012,
"logits/rejected": -1.0386791229248047,
"logps/chosen": -524.4891357421875,
"logps/rejected": -568.3989868164062,
"loss": 0.4851,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.2760519981384277,
"rewards/margins": 0.7800448536872864,
"rewards/rejected": -3.0560970306396484,
"step": 600
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -1.107803463935852,
"eval_logits/rejected": -0.9969872832298279,
"eval_logps/chosen": -492.1783447265625,
"eval_logps/rejected": -567.8048706054688,
"eval_loss": 0.5023476481437683,
"eval_rewards/accuracies": 0.7678571343421936,
"eval_rewards/chosen": -2.272569179534912,
"eval_rewards/margins": 0.9590328931808472,
"eval_rewards/rejected": -3.231602191925049,
"eval_runtime": 309.4155,
"eval_samples_per_second": 6.464,
"eval_steps_per_second": 0.204,
"step": 600
},
{
"epoch": 0.6385762889295996,
"grad_norm": 5.93434046945835,
"learning_rate": 1.739641525213929e-06,
"logits/chosen": -1.1899484395980835,
"logits/rejected": -1.128395438194275,
"logps/chosen": -505.1640625,
"logps/rejected": -569.9411010742188,
"loss": 0.5131,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.3301897048950195,
"rewards/margins": 1.009691834449768,
"rewards/rejected": -3.339881420135498,
"step": 610
},
{
"epoch": 0.6490447526825438,
"grad_norm": 6.7592991694860345,
"learning_rate": 1.6530691736402317e-06,
"logits/chosen": -1.3041086196899414,
"logits/rejected": -1.1299916505813599,
"logps/chosen": -520.1345825195312,
"logps/rejected": -570.2188110351562,
"loss": 0.4665,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.1966605186462402,
"rewards/margins": 1.0572277307510376,
"rewards/rejected": -3.253887891769409,
"step": 620
},
{
"epoch": 0.6595132164354881,
"grad_norm": 6.939788728853941,
"learning_rate": 1.5676295169786864e-06,
"logits/chosen": -1.2350585460662842,
"logits/rejected": -1.1085925102233887,
"logps/chosen": -506.601806640625,
"logps/rejected": -558.4306640625,
"loss": 0.4883,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.2913689613342285,
"rewards/margins": 0.9455882906913757,
"rewards/rejected": -3.23695707321167,
"step": 630
},
{
"epoch": 0.6699816801884323,
"grad_norm": 6.885905876648702,
"learning_rate": 1.4834368231970922e-06,
"logits/chosen": -1.1120684146881104,
"logits/rejected": -0.9865466952323914,
"logps/chosen": -484.1168518066406,
"logps/rejected": -577.2811279296875,
"loss": 0.4707,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.489487648010254,
"rewards/margins": 1.0948190689086914,
"rewards/rejected": -3.5843067169189453,
"step": 640
},
{
"epoch": 0.6804501439413766,
"grad_norm": 5.355709320338061,
"learning_rate": 1.4006036925609245e-06,
"logits/chosen": -1.2093435525894165,
"logits/rejected": -1.1428296566009521,
"logps/chosen": -484.78680419921875,
"logps/rejected": -564.5701293945312,
"loss": 0.5169,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2595038414001465,
"rewards/margins": 0.8043657541275024,
"rewards/rejected": -3.0638692378997803,
"step": 650
},
{
"epoch": 0.6909186076943209,
"grad_norm": 5.122013631042651,
"learning_rate": 1.3192409070404582e-06,
"logits/chosen": -1.2926921844482422,
"logits/rejected": -1.1912837028503418,
"logps/chosen": -462.49700927734375,
"logps/rejected": -535.156005859375,
"loss": 0.4902,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.105215311050415,
"rewards/margins": 0.9164140820503235,
"rewards/rejected": -3.0216293334960938,
"step": 660
},
{
"epoch": 0.7013870714472651,
"grad_norm": 7.049199430649973,
"learning_rate": 1.2394572821496953e-06,
"logits/chosen": -1.144698977470398,
"logits/rejected": -0.9912746548652649,
"logps/chosen": -518.4602661132812,
"logps/rejected": -603.5948486328125,
"loss": 0.4773,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.6358141899108887,
"rewards/margins": 1.113600492477417,
"rewards/rejected": -3.7494144439697266,
"step": 670
},
{
"epoch": 0.7118555352002094,
"grad_norm": 7.264843779332727,
"learning_rate": 1.1613595214152713e-06,
"logits/chosen": -1.1309657096862793,
"logits/rejected": -1.0589998960494995,
"logps/chosen": -539.7699584960938,
"logps/rejected": -620.6878662109375,
"loss": 0.5266,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.733842134475708,
"rewards/margins": 0.9637987017631531,
"rewards/rejected": -3.6976406574249268,
"step": 680
},
{
"epoch": 0.7223239989531536,
"grad_norm": 5.030210764921467,
"learning_rate": 1.0850520736699362e-06,
"logits/chosen": -1.288641333580017,
"logits/rejected": -1.134615182876587,
"logps/chosen": -510.186767578125,
"logps/rejected": -550.937744140625,
"loss": 0.4574,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3800137042999268,
"rewards/margins": 0.9033814668655396,
"rewards/rejected": -3.2833950519561768,
"step": 690
},
{
"epoch": 0.7327924627060979,
"grad_norm": 5.9328337595528495,
"learning_rate": 1.0106369933615043e-06,
"logits/chosen": -1.1679879426956177,
"logits/rejected": -1.1002038717269897,
"logps/chosen": -487.10821533203125,
"logps/rejected": -576.1431884765625,
"loss": 0.4681,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3741798400878906,
"rewards/margins": 0.9901224970817566,
"rewards/rejected": -3.364302158355713,
"step": 700
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -1.119031548500061,
"eval_logits/rejected": -1.006774663925171,
"eval_logps/chosen": -496.6231994628906,
"eval_logps/rejected": -581.5197143554688,
"eval_loss": 0.49932044744491577,
"eval_rewards/accuracies": 0.7678571343421936,
"eval_rewards/chosen": -2.3170175552368164,
"eval_rewards/margins": 1.0517328977584839,
"eval_rewards/rejected": -3.3687500953674316,
"eval_runtime": 280.6333,
"eval_samples_per_second": 7.127,
"eval_steps_per_second": 0.224,
"step": 700
},
{
"epoch": 0.7432609264590422,
"grad_norm": 5.3428016079168215,
"learning_rate": 9.382138040640714e-07,
"logits/chosen": -1.1798994541168213,
"logits/rejected": -1.021723985671997,
"logps/chosen": -505.2616271972656,
"logps/rejected": -571.1856689453125,
"loss": 0.547,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.396669864654541,
"rewards/margins": 0.9654728174209595,
"rewards/rejected": -3.362142562866211,
"step": 710
},
{
"epoch": 0.7537293902119864,
"grad_norm": 5.0438516064442505,
"learning_rate": 8.678793653740633e-07,
"logits/chosen": -1.3297260999679565,
"logits/rejected": -1.1736423969268799,
"logps/chosen": -527.0916748046875,
"logps/rejected": -579.3074340820312,
"loss": 0.478,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.070934534072876,
"rewards/margins": 1.0328184366226196,
"rewards/rejected": -3.103752851486206,
"step": 720
},
{
"epoch": 0.7641978539649307,
"grad_norm": 5.464567536353577,
"learning_rate": 7.997277433690984e-07,
"logits/chosen": -1.2094228267669678,
"logits/rejected": -1.076755404472351,
"logps/chosen": -460.71563720703125,
"logps/rejected": -538.8247680664062,
"loss": 0.4966,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.2227933406829834,
"rewards/margins": 0.9530885815620422,
"rewards/rejected": -3.175881862640381,
"step": 730
},
{
"epoch": 0.7746663177178749,
"grad_norm": 5.377248875033102,
"learning_rate": 7.338500848029603e-07,
"logits/chosen": -1.1969387531280518,
"logits/rejected": -1.0555990934371948,
"logps/chosen": -529.8873291015625,
"logps/rejected": -568.1295166015625,
"loss": 0.5065,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.4915201663970947,
"rewards/margins": 0.9319826364517212,
"rewards/rejected": -3.4235024452209473,
"step": 740
},
{
"epoch": 0.7851347814708192,
"grad_norm": 5.342695362281337,
"learning_rate": 6.70334495204884e-07,
"logits/chosen": -1.0425455570220947,
"logits/rejected": -0.9723536372184753,
"logps/chosen": -487.095947265625,
"logps/rejected": -599.2386474609375,
"loss": 0.4857,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.589409112930298,
"rewards/margins": 1.040583848953247,
"rewards/rejected": -3.629992723464966,
"step": 750
},
{
"epoch": 0.7956032452237635,
"grad_norm": 4.96165517698307,
"learning_rate": 6.092659210462232e-07,
"logits/chosen": -1.172639012336731,
"logits/rejected": -1.0221275091171265,
"logps/chosen": -510.41253662109375,
"logps/rejected": -564.8956909179688,
"loss": 0.5034,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.53062105178833,
"rewards/margins": 0.8949772119522095,
"rewards/rejected": -3.42559814453125,
"step": 760
},
{
"epoch": 0.8060717089767077,
"grad_norm": 4.707914305263311,
"learning_rate": 5.507260361320738e-07,
"logits/chosen": -1.2693157196044922,
"logits/rejected": -1.2365710735321045,
"logps/chosen": -522.3062744140625,
"logps/rejected": -619.4640502929688,
"loss": 0.4736,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.363852024078369,
"rewards/margins": 0.9652940034866333,
"rewards/rejected": -3.329145908355713,
"step": 770
},
{
"epoch": 0.816540172729652,
"grad_norm": 5.592342234404946,
"learning_rate": 4.947931323697983e-07,
"logits/chosen": -1.2886607646942139,
"logits/rejected": -1.0684127807617188,
"logps/chosen": -534.323486328125,
"logps/rejected": -556.2030639648438,
"loss": 0.5036,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.277695894241333,
"rewards/margins": 0.8380621075630188,
"rewards/rejected": -3.115757942199707,
"step": 780
},
{
"epoch": 0.8270086364825961,
"grad_norm": 6.076661367759154,
"learning_rate": 4.4154201506053985e-07,
"logits/chosen": -1.1675662994384766,
"logits/rejected": -1.065953254699707,
"logps/chosen": -489.9781188964844,
"logps/rejected": -578.2179565429688,
"loss": 0.5128,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.482016086578369,
"rewards/margins": 0.9305012822151184,
"rewards/rejected": -3.4125168323516846,
"step": 790
},
{
"epoch": 0.8374771002355405,
"grad_norm": 6.131830839970953,
"learning_rate": 3.910439028537638e-07,
"logits/chosen": -1.1610690355300903,
"logits/rejected": -1.1072094440460205,
"logps/chosen": -469.524658203125,
"logps/rejected": -585.119140625,
"loss": 0.4852,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.3968334197998047,
"rewards/margins": 1.120810866355896,
"rewards/rejected": -3.5176444053649902,
"step": 800
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -1.1353023052215576,
"eval_logits/rejected": -1.0236940383911133,
"eval_logps/chosen": -504.6183166503906,
"eval_logps/rejected": -585.8155517578125,
"eval_loss": 0.49497368931770325,
"eval_rewards/accuracies": 0.773809552192688,
"eval_rewards/chosen": -2.396967887878418,
"eval_rewards/margins": 1.0147408246994019,
"eval_rewards/rejected": -3.4117088317871094,
"eval_runtime": 274.6399,
"eval_samples_per_second": 7.282,
"eval_steps_per_second": 0.229,
"step": 800
},
{
"epoch": 0.8479455639884846,
"grad_norm": 4.892138077626307,
"learning_rate": 3.4336633249862084e-07,
"logits/chosen": -1.209530234336853,
"logits/rejected": -1.0349524021148682,
"logps/chosen": -532.2420043945312,
"logps/rejected": -591.3436279296875,
"loss": 0.5065,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.4228711128234863,
"rewards/margins": 0.9818238019943237,
"rewards/rejected": -3.4046947956085205,
"step": 810
},
{
"epoch": 0.8584140277414289,
"grad_norm": 6.133736149907814,
"learning_rate": 2.98573068519539e-07,
"logits/chosen": -1.2474277019500732,
"logits/rejected": -1.1728675365447998,
"logps/chosen": -516.3637084960938,
"logps/rejected": -589.2227783203125,
"loss": 0.5066,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.5452017784118652,
"rewards/margins": 0.8416748046875,
"rewards/rejected": -3.3868765830993652,
"step": 820
},
{
"epoch": 0.8688824914943732,
"grad_norm": 4.448941268784857,
"learning_rate": 2.5672401793681854e-07,
"logits/chosen": -1.1796165704727173,
"logits/rejected": -1.1301778554916382,
"logps/chosen": -493.81610107421875,
"logps/rejected": -576.2391357421875,
"loss": 0.5119,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.471508502960205,
"rewards/margins": 0.8166677355766296,
"rewards/rejected": -3.2881767749786377,
"step": 830
},
{
"epoch": 0.8793509552473174,
"grad_norm": 6.561312283813159,
"learning_rate": 2.178751501463036e-07,
"logits/chosen": -1.2200143337249756,
"logits/rejected": -1.120086431503296,
"logps/chosen": -525.2439575195312,
"logps/rejected": -629.3662719726562,
"loss": 0.4823,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.501779317855835,
"rewards/margins": 1.0803827047348022,
"rewards/rejected": -3.5821621417999268,
"step": 840
},
{
"epoch": 0.8898194190002617,
"grad_norm": 5.675008200582371,
"learning_rate": 1.820784220652766e-07,
"logits/chosen": -1.148503065109253,
"logits/rejected": -0.9931659698486328,
"logps/chosen": -511.488037109375,
"logps/rejected": -576.7986450195312,
"loss": 0.4672,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.456411361694336,
"rewards/margins": 0.9856443405151367,
"rewards/rejected": -3.4420554637908936,
"step": 850
},
{
"epoch": 0.9002878827532059,
"grad_norm": 5.847430828911881,
"learning_rate": 1.4938170864468636e-07,
"logits/chosen": -1.2107369899749756,
"logits/rejected": -1.0609266757965088,
"logps/chosen": -531.3970947265625,
"logps/rejected": -590.2992553710938,
"loss": 0.4693,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.60589861869812,
"rewards/margins": 0.8416641354560852,
"rewards/rejected": -3.4475624561309814,
"step": 860
},
{
"epoch": 0.9107563465061502,
"grad_norm": 5.678184676582534,
"learning_rate": 1.1982873884064466e-07,
"logits/chosen": -1.24862539768219,
"logits/rejected": -1.0584386587142944,
"logps/chosen": -544.3020629882812,
"logps/rejected": -598.3751220703125,
"loss": 0.4947,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.632056713104248,
"rewards/margins": 1.0834977626800537,
"rewards/rejected": -3.7155539989471436,
"step": 870
},
{
"epoch": 0.9212248102590945,
"grad_norm": 5.3183871343635944,
"learning_rate": 9.345903713082305e-08,
"logits/chosen": -1.2423183917999268,
"logits/rejected": -1.1057523488998413,
"logps/chosen": -514.6922607421875,
"logps/rejected": -574.3685302734375,
"loss": 0.4801,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.4431517124176025,
"rewards/margins": 0.9935353994369507,
"rewards/rejected": -3.4366869926452637,
"step": 880
},
{
"epoch": 0.9316932740120387,
"grad_norm": 5.735081148749753,
"learning_rate": 7.030787065396866e-08,
"logits/chosen": -1.1413437128067017,
"logits/rejected": -1.0328372716903687,
"logps/chosen": -508.3773498535156,
"logps/rejected": -619.8019409179688,
"loss": 0.4938,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.5983500480651855,
"rewards/margins": 1.2036244869232178,
"rewards/rejected": -3.8019745349884033,
"step": 890
},
{
"epoch": 0.942161737764983,
"grad_norm": 6.618746821621782,
"learning_rate": 5.0406202043228604e-08,
"logits/chosen": -1.045906901359558,
"logits/rejected": -0.9974561929702759,
"logps/chosen": -517.8667602539062,
"logps/rejected": -669.0172729492188,
"loss": 0.4907,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.5165352821350098,
"rewards/margins": 1.2452183961868286,
"rewards/rejected": -3.761753559112549,
"step": 900
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -1.1023893356323242,
"eval_logits/rejected": -0.9901031255722046,
"eval_logps/chosen": -521.706298828125,
"eval_logps/rejected": -608.1346435546875,
"eval_loss": 0.494513601064682,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -2.567847490310669,
"eval_rewards/margins": 1.0670523643493652,
"eval_rewards/rejected": -3.634899854660034,
"eval_runtime": 302.7434,
"eval_samples_per_second": 6.606,
"eval_steps_per_second": 0.208,
"step": 900
},
{
"epoch": 0.9526302015179272,
"grad_norm": 6.10762031606348,
"learning_rate": 3.378064801637687e-08,
"logits/chosen": -1.1716662645339966,
"logits/rejected": -0.9894771575927734,
"logps/chosen": -494.2669982910156,
"logps/rejected": -552.8900146484375,
"loss": 0.4935,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.474764585494995,
"rewards/margins": 1.0439238548278809,
"rewards/rejected": -3.518688201904297,
"step": 910
},
{
"epoch": 0.9630986652708715,
"grad_norm": 5.683369015946174,
"learning_rate": 2.0453443778310766e-08,
"logits/chosen": -1.1771481037139893,
"logits/rejected": -1.0182334184646606,
"logps/chosen": -545.4282836914062,
"logps/rejected": -589.4589233398438,
"loss": 0.5081,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.6610922813415527,
"rewards/margins": 0.9007646441459656,
"rewards/rejected": -3.561856746673584,
"step": 920
},
{
"epoch": 0.9735671290238157,
"grad_norm": 5.950595947719059,
"learning_rate": 1.0442413283435759e-08,
"logits/chosen": -1.1201808452606201,
"logits/rejected": -0.9143557548522949,
"logps/chosen": -556.3572998046875,
"logps/rejected": -596.0765380859375,
"loss": 0.493,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.5759315490722656,
"rewards/margins": 1.0089161396026611,
"rewards/rejected": -3.5848472118377686,
"step": 930
},
{
"epoch": 0.98403559277676,
"grad_norm": 7.5374296673981425,
"learning_rate": 3.760945397705828e-09,
"logits/chosen": -1.1323813199996948,
"logits/rejected": -1.0745770931243896,
"logps/chosen": -517.08056640625,
"logps/rejected": -605.00146484375,
"loss": 0.4829,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.5434603691101074,
"rewards/margins": 0.8878037333488464,
"rewards/rejected": -3.4312641620635986,
"step": 940
},
{
"epoch": 0.9945040565297043,
"grad_norm": 5.8745190839702515,
"learning_rate": 4.1797599220405605e-10,
"logits/chosen": -1.1730302572250366,
"logits/rejected": -0.9989528656005859,
"logps/chosen": -524.8547973632812,
"logps/rejected": -583.33203125,
"loss": 0.4861,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.5831165313720703,
"rewards/margins": 0.908293604850769,
"rewards/rejected": -3.49141001701355,
"step": 950
},
{
"epoch": 0.9997382884061764,
"step": 955,
"total_flos": 0.0,
"train_loss": 0.5319095570379527,
"train_runtime": 23762.0752,
"train_samples_per_second": 2.573,
"train_steps_per_second": 0.04
}
],
"logging_steps": 10,
"max_steps": 955,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}