Delta-Vector's picture
Training in progress, step 289, checkpoint
0d69938 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5002163565556036,
"eval_steps": 500,
"global_step": 289,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017308524448290783,
"grad_norm": 46.5,
"kl": 0.0,
"learning_rate": 1.4285714285714287e-07,
"logits/chosen": -6239313.454545454,
"logits/rejected": -4940240.761904762,
"logps/chosen": -236.17436079545453,
"logps/rejected": -209.70107886904762,
"loss": 0.5,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0034617048896581565,
"grad_norm": 38.25,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"logits/chosen": -2665428.3076923075,
"logits/rejected": -1073632.5263157894,
"logps/chosen": -155.0839562049279,
"logps/rejected": -255.23524876644737,
"loss": 0.5,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.005192557334487235,
"grad_norm": 39.25,
"kl": 0.11506986618041992,
"learning_rate": 4.285714285714286e-07,
"logits/chosen": -1627763.2,
"logits/rejected": -1337906.8235294118,
"logps/chosen": -214.50914713541667,
"logps/rejected": -210.7369887408088,
"loss": 0.5011,
"rewards/chosen": -0.005750782291094462,
"rewards/margins": -0.006572681913773219,
"rewards/rejected": 0.0008218996226787567,
"step": 3
},
{
"epoch": 0.006923409779316313,
"grad_norm": 33.5,
"kl": 0.13516950607299805,
"learning_rate": 5.714285714285715e-07,
"logits/chosen": -9900144.0,
"logits/rejected": 2390790.4,
"logps/chosen": -236.11850873161765,
"logps/rejected": -187.40989583333334,
"loss": 0.5052,
"rewards/chosen": 0.00019769019940320184,
"rewards/margins": -0.03652356716932035,
"rewards/rejected": 0.03672125736872355,
"step": 4
},
{
"epoch": 0.00865426222414539,
"grad_norm": 39.0,
"kl": 0.16583895683288574,
"learning_rate": 7.142857142857143e-07,
"logits/chosen": 273089.5,
"logits/rejected": -8537414.0,
"logps/chosen": -191.99412536621094,
"logps/rejected": -250.5272216796875,
"loss": 0.5,
"rewards/chosen": -0.018333029001951218,
"rewards/margins": -0.014739224454388022,
"rewards/rejected": -0.0035938045475631952,
"step": 5
},
{
"epoch": 0.01038511466897447,
"grad_norm": 37.0,
"kl": 0.1269383430480957,
"learning_rate": 8.571428571428572e-07,
"logits/chosen": 625734.125,
"logits/rejected": -3864760.5,
"logps/chosen": -130.1186065673828,
"logps/rejected": -263.1868591308594,
"loss": 0.4974,
"rewards/chosen": 0.005134785547852516,
"rewards/margins": 0.02763364464044571,
"rewards/rejected": -0.022498859092593193,
"step": 6
},
{
"epoch": 0.012115967113803548,
"grad_norm": 45.5,
"kl": 0.058301448822021484,
"learning_rate": 1.0000000000000002e-06,
"logits/chosen": 11427891.2,
"logits/rejected": -8692000.94117647,
"logps/chosen": -247.42198893229167,
"logps/rejected": -318.3026769301471,
"loss": 0.4879,
"rewards/chosen": 0.020061949888865154,
"rewards/margins": 0.09507267031015135,
"rewards/rejected": -0.07501072042128619,
"step": 7
},
{
"epoch": 0.013846819558632626,
"grad_norm": 34.75,
"kl": 0.10779595375061035,
"learning_rate": 1.142857142857143e-06,
"logits/chosen": 9745310.315789474,
"logits/rejected": 4968272.0,
"logps/chosen": -266.39432565789474,
"logps/rejected": -200.4144568810096,
"loss": 0.4926,
"rewards/chosen": 0.03307872383218063,
"rewards/margins": 0.08410431619597833,
"rewards/rejected": -0.0510255923637977,
"step": 8
},
{
"epoch": 0.015577672003461706,
"grad_norm": 35.75,
"kl": 0.0004693269729614258,
"learning_rate": 1.2857142857142856e-06,
"logits/chosen": 2292229.3333333335,
"logits/rejected": -5866576.571428572,
"logps/chosen": -164.06934950086804,
"logps/rejected": -255.37636021205358,
"loss": 0.4857,
"rewards/chosen": 0.017767790291044448,
"rewards/margins": 0.12696768035964362,
"rewards/rejected": -0.10919989006859916,
"step": 9
},
{
"epoch": 0.01730852444829078,
"grad_norm": 58.0,
"kl": 0.08134031295776367,
"learning_rate": 1.4285714285714286e-06,
"logits/chosen": -2206551.1428571427,
"logits/rejected": -2327785.3333333335,
"logps/chosen": -210.65478515625,
"logps/rejected": -399.8186848958333,
"loss": 0.4595,
"rewards/chosen": 0.041608184576034546,
"rewards/margins": 0.30469969577259487,
"rewards/rejected": -0.2630915111965603,
"step": 10
},
{
"epoch": 0.019039376893119863,
"grad_norm": 33.0,
"kl": 0.06869983673095703,
"learning_rate": 1.5714285714285714e-06,
"logits/chosen": 5004582.315789473,
"logits/rejected": 15390077.538461538,
"logps/chosen": -168.22392835115133,
"logps/rejected": -250.3277869591346,
"loss": 0.4842,
"rewards/chosen": 0.025214639149214093,
"rewards/margins": 0.15980646617499442,
"rewards/rejected": -0.13459182702578032,
"step": 11
},
{
"epoch": 0.02077022933794894,
"grad_norm": 35.25,
"kl": 0.03198128938674927,
"learning_rate": 1.7142857142857145e-06,
"logits/chosen": -6019438.5,
"logits/rejected": -12351150.0,
"logps/chosen": -203.1639404296875,
"logps/rejected": -248.1376495361328,
"loss": 0.4481,
"rewards/chosen": 0.022723043337464333,
"rewards/margins": 0.46131726540625095,
"rewards/rejected": -0.4385942220687866,
"step": 12
},
{
"epoch": 0.02250108178277802,
"grad_norm": 27.375,
"kl": 0.03030562400817871,
"learning_rate": 1.8571428571428573e-06,
"logits/chosen": -1771344.705882353,
"logits/rejected": 4848613.333333333,
"logps/chosen": -156.79733455882354,
"logps/rejected": -143.33746744791668,
"loss": 0.4641,
"rewards/chosen": 0.040735574329600614,
"rewards/margins": 0.31227434055477965,
"rewards/rejected": -0.27153876622517903,
"step": 13
},
{
"epoch": 0.024231934227607096,
"grad_norm": 31.75,
"kl": 0.0,
"learning_rate": 2.0000000000000003e-06,
"logits/chosen": -402277.15789473685,
"logits/rejected": 8271367.384615385,
"logps/chosen": -283.14478824013156,
"logps/rejected": -204.32831280048077,
"loss": 0.4584,
"rewards/chosen": 0.008756919126761587,
"rewards/margins": 0.42188657892619064,
"rewards/rejected": -0.41312965979942906,
"step": 14
},
{
"epoch": 0.025962786672436174,
"grad_norm": 36.75,
"kl": 0.0032711029052734375,
"learning_rate": 2.1428571428571427e-06,
"logits/chosen": -811063.5882352941,
"logits/rejected": -12282100.266666668,
"logps/chosen": -176.79848345588235,
"logps/rejected": -325.2669270833333,
"loss": 0.3948,
"rewards/chosen": 0.08565068244934082,
"rewards/margins": 0.9919464588165283,
"rewards/rejected": -0.9062957763671875,
"step": 15
},
{
"epoch": 0.027693639117265252,
"grad_norm": 30.375,
"kl": 0.007568359375,
"learning_rate": 2.285714285714286e-06,
"logits/chosen": -1084260.0,
"logits/rejected": 31534450.666666668,
"logps/chosen": -210.231787109375,
"logps/rejected": -276.17873128255206,
"loss": 0.4251,
"rewards/chosen": 0.08407727479934693,
"rewards/margins": 0.8169409394264221,
"rewards/rejected": -0.7328636646270752,
"step": 16
},
{
"epoch": 0.02942449156209433,
"grad_norm": 30.0,
"kl": 0.0,
"learning_rate": 2.428571428571429e-06,
"logits/chosen": -1036163.6923076923,
"logits/rejected": 961232.6315789474,
"logps/chosen": -268.2616624098558,
"logps/rejected": -225.40373149671052,
"loss": 0.4054,
"rewards/chosen": 0.010549396276473999,
"rewards/margins": 0.7161211230252919,
"rewards/rejected": -0.7055717267488179,
"step": 17
},
{
"epoch": 0.03115534400692341,
"grad_norm": 26.25,
"kl": 0.0,
"learning_rate": 2.571428571428571e-06,
"logits/chosen": 4818987.555555556,
"logits/rejected": -827468.8571428572,
"logps/chosen": -251.50027126736111,
"logps/rejected": -213.26834542410714,
"loss": 0.4032,
"rewards/chosen": 0.03222567505306668,
"rewards/margins": 1.113093238028269,
"rewards/rejected": -1.0808675629752023,
"step": 18
},
{
"epoch": 0.03288619645175249,
"grad_norm": 24.625,
"kl": 0.0,
"learning_rate": 2.7142857142857144e-06,
"logits/chosen": -1698852.380952381,
"logits/rejected": 2662217.6363636362,
"logps/chosen": -172.64027622767858,
"logps/rejected": -205.12626509232953,
"loss": 0.4404,
"rewards/chosen": -0.05192979744502476,
"rewards/margins": 0.9354158980505807,
"rewards/rejected": -0.9873456954956055,
"step": 19
},
{
"epoch": 0.03461704889658156,
"grad_norm": 22.375,
"kl": 0.0,
"learning_rate": 2.8571428571428573e-06,
"logits/chosen": 6553660.0,
"logits/rejected": -5781368.444444444,
"logps/chosen": -143.31090436662947,
"logps/rejected": -192.53050401475696,
"loss": 0.3749,
"rewards/chosen": 0.09408829041889735,
"rewards/margins": 1.171789647094787,
"rewards/rejected": -1.0777013566758897,
"step": 20
},
{
"epoch": 0.036347901341410645,
"grad_norm": 20.375,
"kl": 0.012153387069702148,
"learning_rate": 3e-06,
"logits/chosen": 1939333.8666666667,
"logits/rejected": 1052395.0588235294,
"logps/chosen": -177.10651041666668,
"logps/rejected": -190.28341854319854,
"loss": 0.3776,
"rewards/chosen": 0.045921965440114336,
"rewards/margins": 1.3893623017797283,
"rewards/rejected": -1.343440336339614,
"step": 21
},
{
"epoch": 0.038078753786239726,
"grad_norm": 21.0,
"kl": 0.0,
"learning_rate": 3.142857142857143e-06,
"logits/chosen": 3900064.5,
"logits/rejected": 2436417.0,
"logps/chosen": -188.06832885742188,
"logps/rejected": -307.90692138671875,
"loss": 0.3542,
"rewards/chosen": -0.08758784085512161,
"rewards/margins": 1.7587207481265068,
"rewards/rejected": -1.8463085889816284,
"step": 22
},
{
"epoch": 0.0398096062310688,
"grad_norm": 22.625,
"kl": 0.000914454460144043,
"learning_rate": 3.285714285714286e-06,
"logits/chosen": -42746.86666666667,
"logits/rejected": 1372338.8235294118,
"logps/chosen": -226.461865234375,
"logps/rejected": -258.68396714154414,
"loss": 0.3754,
"rewards/chosen": -0.14824188550313314,
"rewards/margins": 1.6312208166309432,
"rewards/rejected": -1.7794627021340763,
"step": 23
},
{
"epoch": 0.04154045867589788,
"grad_norm": 19.75,
"kl": 0.0,
"learning_rate": 3.428571428571429e-06,
"logits/chosen": -1758418.3333333333,
"logits/rejected": 7633656.0,
"logps/chosen": -143.17743598090277,
"logps/rejected": -136.41956438337053,
"loss": 0.4083,
"rewards/chosen": -0.15943604045444065,
"rewards/margins": 1.379212019935487,
"rewards/rejected": -1.5386480603899275,
"step": 24
},
{
"epoch": 0.043271311120726956,
"grad_norm": 22.375,
"kl": 0.0,
"learning_rate": 3.5714285714285718e-06,
"logits/chosen": -3627161.777777778,
"logits/rejected": -1282915.142857143,
"logps/chosen": -189.07590060763889,
"logps/rejected": -282.23025948660717,
"loss": 0.3667,
"rewards/chosen": -0.05861267778608534,
"rewards/margins": 2.27708803850507,
"rewards/rejected": -2.335700716291155,
"step": 25
},
{
"epoch": 0.04500216356555604,
"grad_norm": 20.125,
"kl": 0.0,
"learning_rate": 3.7142857142857146e-06,
"logits/chosen": 3197816.8571428573,
"logits/rejected": -24990.666666666668,
"logps/chosen": -133.38133893694197,
"logps/rejected": -252.71739366319446,
"loss": 0.3729,
"rewards/chosen": -0.2894209793635777,
"rewards/margins": 1.5688878127506802,
"rewards/rejected": -1.8583087921142578,
"step": 26
},
{
"epoch": 0.04673301601038511,
"grad_norm": 22.0,
"kl": 0.0,
"learning_rate": 3.857142857142858e-06,
"logits/chosen": 8530238.857142856,
"logits/rejected": -6356856.888888889,
"logps/chosen": -228.37095424107142,
"logps/rejected": -290.1540256076389,
"loss": 0.3187,
"rewards/chosen": -0.04233703442982265,
"rewards/margins": 2.3515855594286843,
"rewards/rejected": -2.393922593858507,
"step": 27
},
{
"epoch": 0.04846386845521419,
"grad_norm": 17.125,
"kl": 0.0,
"learning_rate": 4.000000000000001e-06,
"logits/chosen": 11541931.294117646,
"logits/rejected": -18899741.866666667,
"logps/chosen": -161.64244887408088,
"logps/rejected": -298.1225911458333,
"loss": 0.3221,
"rewards/chosen": -0.20178667236776912,
"rewards/margins": 3.408641692703845,
"rewards/rejected": -3.6104283650716145,
"step": 28
},
{
"epoch": 0.050194720900043274,
"grad_norm": 21.125,
"kl": 0.0,
"learning_rate": 4.1428571428571435e-06,
"logits/chosen": -8176106.0,
"logits/rejected": 8892046.0,
"logps/chosen": -202.91030883789062,
"logps/rejected": -320.7770690917969,
"loss": 0.3603,
"rewards/chosen": -0.17716556787490845,
"rewards/margins": 2.8113109469413757,
"rewards/rejected": -2.988476514816284,
"step": 29
},
{
"epoch": 0.05192557334487235,
"grad_norm": 20.75,
"kl": 0.0,
"learning_rate": 4.2857142857142855e-06,
"logits/chosen": 1024021.4736842106,
"logits/rejected": 4154760.0,
"logps/chosen": -166.43669048108552,
"logps/rejected": -148.49478853665866,
"loss": 0.4417,
"rewards/chosen": -0.38098611329731186,
"rewards/margins": 1.4392063704579465,
"rewards/rejected": -1.8201924837552583,
"step": 30
},
{
"epoch": 0.05365642578970143,
"grad_norm": 19.375,
"kl": 0.0,
"learning_rate": 4.428571428571429e-06,
"logits/chosen": 12327768.727272727,
"logits/rejected": -4335401.904761905,
"logps/chosen": -141.8250732421875,
"logps/rejected": -224.83528645833334,
"loss": 0.3034,
"rewards/chosen": -0.04076832803812894,
"rewards/margins": 2.3152393841898284,
"rewards/rejected": -2.3560077122279575,
"step": 31
},
{
"epoch": 0.055387278234530504,
"grad_norm": 17.125,
"kl": 0.0,
"learning_rate": 4.571428571428572e-06,
"logits/chosen": -7621606.545454546,
"logits/rejected": -5030670.857142857,
"logps/chosen": -168.98353160511363,
"logps/rejected": -240.65597098214286,
"loss": 0.294,
"rewards/chosen": -0.1550229029221968,
"rewards/margins": 2.5555503843150613,
"rewards/rejected": -2.710573287237258,
"step": 32
},
{
"epoch": 0.057118130679359586,
"grad_norm": 20.75,
"kl": 0.0,
"learning_rate": 4.714285714285715e-06,
"logits/chosen": -4837878.153846154,
"logits/rejected": -2727320.4210526315,
"logps/chosen": -260.5615985576923,
"logps/rejected": -309.63633326480266,
"loss": 0.3038,
"rewards/chosen": -0.1790018998659574,
"rewards/margins": 3.2555804590464605,
"rewards/rejected": -3.434582358912418,
"step": 33
},
{
"epoch": 0.05884898312418866,
"grad_norm": 16.75,
"kl": 0.0,
"learning_rate": 4.857142857142858e-06,
"logits/chosen": 334761.73333333334,
"logits/rejected": -6532100.705882353,
"logps/chosen": -111.3039794921875,
"logps/rejected": -291.9061638327206,
"loss": 0.3181,
"rewards/chosen": -0.061692579587300615,
"rewards/margins": 3.2174934447980394,
"rewards/rejected": -3.27918602438534,
"step": 34
},
{
"epoch": 0.06057983556901774,
"grad_norm": 19.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5367810.0,
"logits/rejected": -11132618.0,
"logps/chosen": -157.48095703125,
"logps/rejected": -343.7468566894531,
"loss": 0.3208,
"rewards/chosen": -0.24039308726787567,
"rewards/margins": 3.439171150326729,
"rewards/rejected": -3.6795642375946045,
"step": 35
},
{
"epoch": 0.06231068801384682,
"grad_norm": 21.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4363456.888888889,
"logits/rejected": -3689830.285714286,
"logps/chosen": -200.380126953125,
"logps/rejected": -326.18729073660717,
"loss": 0.3453,
"rewards/chosen": -0.15904908710055882,
"rewards/margins": 3.3486854840838722,
"rewards/rejected": -3.507734571184431,
"step": 36
},
{
"epoch": 0.0640415404586759,
"grad_norm": 24.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -14376078.4,
"logits/rejected": -13662346.666666666,
"logps/chosen": -232.297265625,
"logps/rejected": -266.85874430338544,
"loss": 0.3776,
"rewards/chosen": -0.15787798166275024,
"rewards/margins": 3.1688521107037864,
"rewards/rejected": -3.3267300923665366,
"step": 37
},
{
"epoch": 0.06577239290350498,
"grad_norm": 19.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 9234006.4,
"logits/rejected": 8716803.764705881,
"logps/chosen": -195.22076822916668,
"logps/rejected": -223.59991096047793,
"loss": 0.3266,
"rewards/chosen": 0.033547862370808916,
"rewards/margins": 2.3299624059714525,
"rewards/rejected": -2.2964145436006436,
"step": 38
},
{
"epoch": 0.06750324534833406,
"grad_norm": 23.5,
"kl": 0.22034478187561035,
"learning_rate": 5e-06,
"logits/chosen": -10664284.0,
"logits/rejected": -6822504.5,
"logps/chosen": -254.58663940429688,
"logps/rejected": -303.74578857421875,
"loss": 0.3069,
"rewards/chosen": -0.025931095704436302,
"rewards/margins": 2.927818799391389,
"rewards/rejected": -2.953749895095825,
"step": 39
},
{
"epoch": 0.06923409779316313,
"grad_norm": 19.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3069599.6666666665,
"logits/rejected": -6284954.8,
"logps/chosen": -277.9341634114583,
"logps/rejected": -314.748974609375,
"loss": 0.2888,
"rewards/chosen": -0.13368964195251465,
"rewards/margins": 2.755869913101196,
"rewards/rejected": -2.8895595550537108,
"step": 40
},
{
"epoch": 0.07096495023799221,
"grad_norm": 22.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 946472.0,
"logits/rejected": 9102588.235294119,
"logps/chosen": -248.21149088541668,
"logps/rejected": -162.85635914522058,
"loss": 0.3098,
"rewards/chosen": 0.07899113496144612,
"rewards/margins": 2.07641756067089,
"rewards/rejected": -1.9974264257094438,
"step": 41
},
{
"epoch": 0.07269580268282129,
"grad_norm": 21.0,
"kl": 0.07075059413909912,
"learning_rate": 5e-06,
"logits/chosen": 7388380.19047619,
"logits/rejected": -89609.45454545454,
"logps/chosen": -191.81854538690476,
"logps/rejected": -390.0617009943182,
"loss": 0.3575,
"rewards/chosen": 0.03016080175127302,
"rewards/margins": 3.0582882986440287,
"rewards/rejected": -3.028127496892756,
"step": 42
},
{
"epoch": 0.07442665512765037,
"grad_norm": 22.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6876745.6,
"logits/rejected": -1067452.5,
"logps/chosen": -200.07301025390626,
"logps/rejected": -290.6153564453125,
"loss": 0.3613,
"rewards/chosen": -0.03498818874359131,
"rewards/margins": 2.9018725315729776,
"rewards/rejected": -2.936860720316569,
"step": 43
},
{
"epoch": 0.07615750757247945,
"grad_norm": 20.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 3785227.111111111,
"logits/rejected": 1574901.0,
"logps/chosen": -188.12406412760416,
"logps/rejected": -150.00552804129464,
"loss": 0.3361,
"rewards/chosen": 0.11941173341539171,
"rewards/margins": 2.0707845612177773,
"rewards/rejected": -1.9513728278023856,
"step": 44
},
{
"epoch": 0.07788836001730852,
"grad_norm": 18.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3288104.0,
"logits/rejected": -1550288.125,
"logps/chosen": -169.7630157470703,
"logps/rejected": -244.53619384765625,
"loss": 0.3335,
"rewards/chosen": -0.1161470040678978,
"rewards/margins": 2.0714645758271217,
"rewards/rejected": -2.1876115798950195,
"step": 45
},
{
"epoch": 0.0796192124621376,
"grad_norm": 22.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2523540.222222222,
"logits/rejected": 807282.2142857143,
"logps/chosen": -206.785400390625,
"logps/rejected": -197.57388741629464,
"loss": 0.3727,
"rewards/chosen": -0.07473884688483344,
"rewards/margins": 1.7757124862973652,
"rewards/rejected": -1.8504513331821986,
"step": 46
},
{
"epoch": 0.08135006490696668,
"grad_norm": 22.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3043348.533333333,
"logits/rejected": 10325686.588235294,
"logps/chosen": -231.226220703125,
"logps/rejected": -169.99207261029412,
"loss": 0.3148,
"rewards/chosen": 0.14197413126627603,
"rewards/margins": 2.014080571193321,
"rewards/rejected": -1.872106439927045,
"step": 47
},
{
"epoch": 0.08308091735179576,
"grad_norm": 18.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1849486.9411764706,
"logits/rejected": -2160865.3333333335,
"logps/chosen": -169.53436638327207,
"logps/rejected": -239.082568359375,
"loss": 0.3451,
"rewards/chosen": -0.1786177158355713,
"rewards/margins": 2.253294515609741,
"rewards/rejected": -2.4319122314453123,
"step": 48
},
{
"epoch": 0.08481176979662484,
"grad_norm": 21.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 900298.5714285715,
"logits/rejected": -6812352.7272727275,
"logps/chosen": -166.2104259672619,
"logps/rejected": -353.5106312144886,
"loss": 0.3726,
"rewards/chosen": -0.07294606594812303,
"rewards/margins": 2.6174716572740895,
"rewards/rejected": -2.6904177232222124,
"step": 49
},
{
"epoch": 0.08654262224145391,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9324354.461538462,
"logits/rejected": -14040514.52631579,
"logps/chosen": -223.0580115685096,
"logps/rejected": -286.9508634868421,
"loss": 0.2674,
"rewards/chosen": 0.13786140772012564,
"rewards/margins": 2.67845962452985,
"rewards/rejected": -2.5405982168097245,
"step": 50
},
{
"epoch": 0.088273474686283,
"grad_norm": 20.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 930246.5,
"logits/rejected": -7422215.0,
"logps/chosen": -238.2467803955078,
"logps/rejected": -259.29217529296875,
"loss": 0.294,
"rewards/chosen": 0.032952681183815,
"rewards/margins": 3.1361082941293716,
"rewards/rejected": -3.1031556129455566,
"step": 51
},
{
"epoch": 0.09000432713111207,
"grad_norm": 20.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2028204.625,
"logits/rejected": -1362418.25,
"logps/chosen": -190.04937744140625,
"logps/rejected": -216.1826171875,
"loss": 0.3165,
"rewards/chosen": -0.050155334174633026,
"rewards/margins": 2.5483616068959236,
"rewards/rejected": -2.5985169410705566,
"step": 52
},
{
"epoch": 0.09173517957594116,
"grad_norm": 26.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1491913.3333333333,
"logits/rejected": 17979700.363636363,
"logps/chosen": -285.51971726190476,
"logps/rejected": -230.9169256036932,
"loss": 0.3878,
"rewards/chosen": -0.13681457156226748,
"rewards/margins": 2.993390062670687,
"rewards/rejected": -3.1302046342329546,
"step": 53
},
{
"epoch": 0.09346603202077022,
"grad_norm": 16.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5621796.571428572,
"logits/rejected": -3721314.222222222,
"logps/chosen": -154.634521484375,
"logps/rejected": -144.23682996961804,
"loss": 0.3166,
"rewards/chosen": 0.15212011337280273,
"rewards/margins": 2.1092937787373858,
"rewards/rejected": -1.9571736653645833,
"step": 54
},
{
"epoch": 0.0951968844655993,
"grad_norm": 20.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13213840.0,
"logits/rejected": -2418853.6,
"logps/chosen": -264.71543375651044,
"logps/rejected": -299.3638916015625,
"loss": 0.2498,
"rewards/chosen": 0.169629176457723,
"rewards/margins": 2.8914440949757894,
"rewards/rejected": -2.7218149185180662,
"step": 55
},
{
"epoch": 0.09692773691042839,
"grad_norm": 20.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1270494.2222222222,
"logits/rejected": 3427853.1428571427,
"logps/chosen": -218.35677083333334,
"logps/rejected": -151.46371023995536,
"loss": 0.3539,
"rewards/chosen": 0.18590817186567518,
"rewards/margins": 1.9470306029395452,
"rewards/rejected": -1.76112243107387,
"step": 56
},
{
"epoch": 0.09865858935525747,
"grad_norm": 19.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 960138.5,
"logits/rejected": -11440672.0,
"logps/chosen": -194.89010620117188,
"logps/rejected": -279.1370544433594,
"loss": 0.3033,
"rewards/chosen": 0.042164143174886703,
"rewards/margins": 2.7914009653031826,
"rewards/rejected": -2.749236822128296,
"step": 57
},
{
"epoch": 0.10038944180008655,
"grad_norm": 17.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3341974.933333333,
"logits/rejected": -3835131.7647058824,
"logps/chosen": -99.36764322916666,
"logps/rejected": -268.3916015625,
"loss": 0.3016,
"rewards/chosen": -0.05830394426981608,
"rewards/margins": 2.6891182179544484,
"rewards/rejected": -2.7474221622242645,
"step": 58
},
{
"epoch": 0.10212029424491562,
"grad_norm": 20.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5338708.2105263155,
"logits/rejected": -522996.92307692306,
"logps/chosen": -177.21720805921052,
"logps/rejected": -242.44989483173077,
"loss": 0.3425,
"rewards/chosen": -0.03957033157348633,
"rewards/margins": 3.2665699812082143,
"rewards/rejected": -3.3061403127817006,
"step": 59
},
{
"epoch": 0.1038511466897447,
"grad_norm": 17.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4021004.5714285714,
"logits/rejected": 3649617.3333333335,
"logps/chosen": -150.47998046875,
"logps/rejected": -242.73616536458334,
"loss": 0.2824,
"rewards/chosen": 0.04531372019222805,
"rewards/margins": 2.6572553756691164,
"rewards/rejected": -2.611941655476888,
"step": 60
},
{
"epoch": 0.10558199913457378,
"grad_norm": 20.0,
"kl": 0.0002346038818359375,
"learning_rate": 5e-06,
"logits/chosen": -4345515.0,
"logits/rejected": -8544926.0,
"logps/chosen": -253.4796142578125,
"logps/rejected": -246.94094848632812,
"loss": 0.2793,
"rewards/chosen": 0.15215471386909485,
"rewards/margins": 3.54859259724617,
"rewards/rejected": -3.396437883377075,
"step": 61
},
{
"epoch": 0.10731285157940286,
"grad_norm": 18.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1196661.3333333333,
"logits/rejected": -3318421.1428571427,
"logps/chosen": -154.80669487847223,
"logps/rejected": -289.3799525669643,
"loss": 0.3092,
"rewards/chosen": 0.0665718052122328,
"rewards/margins": 3.497844584404476,
"rewards/rejected": -3.4312727791922435,
"step": 62
},
{
"epoch": 0.10904370402423194,
"grad_norm": 20.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3965800.4444444445,
"logits/rejected": -11330329.142857144,
"logps/chosen": -182.16238064236111,
"logps/rejected": -249.26039341517858,
"loss": 0.3376,
"rewards/chosen": -0.1392565303378635,
"rewards/margins": 3.219647899506584,
"rewards/rejected": -3.3589044298444475,
"step": 63
},
{
"epoch": 0.11077455646906101,
"grad_norm": 19.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -2474531.25,
"logits/rejected": -5974583.5,
"logps/chosen": -209.27244567871094,
"logps/rejected": -302.0538635253906,
"loss": 0.325,
"rewards/chosen": 0.020520292222499847,
"rewards/margins": 3.1491325721144676,
"rewards/rejected": -3.1286122798919678,
"step": 64
},
{
"epoch": 0.11250540891389009,
"grad_norm": 14.5625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5467615.384615385,
"logits/rejected": -3365480.0,
"logps/chosen": -144.33649151141827,
"logps/rejected": -279.4246761924342,
"loss": 0.2657,
"rewards/chosen": -0.11599624156951904,
"rewards/margins": 3.2443882477910897,
"rewards/rejected": -3.3603844893606087,
"step": 65
},
{
"epoch": 0.11423626135871917,
"grad_norm": 22.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1735104.4,
"logits/rejected": 5294354.666666667,
"logps/chosen": -211.195068359375,
"logps/rejected": -247.84977213541666,
"loss": 0.3939,
"rewards/chosen": -0.25041675567626953,
"rewards/margins": 3.1013142267862954,
"rewards/rejected": -3.351730982462565,
"step": 66
},
{
"epoch": 0.11596711380354825,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5322431.555555556,
"logits/rejected": -17504265.14285714,
"logps/chosen": -156.61607530381946,
"logps/rejected": -485.01771763392856,
"loss": 0.3061,
"rewards/chosen": 0.008888012833065458,
"rewards/margins": 3.940254797065069,
"rewards/rejected": -3.9313667842320035,
"step": 67
},
{
"epoch": 0.11769796624837732,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2996452.2666666666,
"logits/rejected": 11716244.705882354,
"logps/chosen": -192.19383138020834,
"logps/rejected": -302.7353515625,
"loss": 0.2934,
"rewards/chosen": 0.03241715629895528,
"rewards/margins": 2.8701628228028615,
"rewards/rejected": -2.8377456665039062,
"step": 68
},
{
"epoch": 0.1194288186932064,
"grad_norm": 17.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 14927777.142857144,
"logits/rejected": -4720057.777777778,
"logps/chosen": -149.42860630580358,
"logps/rejected": -217.70494249131946,
"loss": 0.2898,
"rewards/chosen": -0.16934810365949357,
"rewards/margins": 3.1008769973875983,
"rewards/rejected": -3.270225101047092,
"step": 69
},
{
"epoch": 0.12115967113803548,
"grad_norm": 20.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 44536797.538461536,
"logits/rejected": -7564164.2105263155,
"logps/chosen": -682.1751802884615,
"logps/rejected": -270.47216796875,
"loss": 0.2777,
"rewards/chosen": -0.24122038254371056,
"rewards/margins": 2.748665248816795,
"rewards/rejected": -2.989885631360506,
"step": 70
},
{
"epoch": 0.12289052358286456,
"grad_norm": 24.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8335642.105263158,
"logits/rejected": -9525784.615384616,
"logps/chosen": -303.1348170230263,
"logps/rejected": -280.5024601862981,
"loss": 0.3256,
"rewards/chosen": 0.11083748466090153,
"rewards/margins": 2.885276489412254,
"rewards/rejected": -2.7744390047513523,
"step": 71
},
{
"epoch": 0.12462137602769364,
"grad_norm": 12.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8330821.333333333,
"logits/rejected": -3045587.4,
"logps/chosen": -137.8349812825521,
"logps/rejected": -273.801513671875,
"loss": 0.1949,
"rewards/chosen": 0.5404347976048788,
"rewards/margins": 3.689098318417867,
"rewards/rejected": -3.1486635208129883,
"step": 72
},
{
"epoch": 0.12635222847252273,
"grad_norm": 21.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 9142004.444444444,
"logits/rejected": -3207848.8571428573,
"logps/chosen": -188.33430989583334,
"logps/rejected": -207.32901436941964,
"loss": 0.3074,
"rewards/chosen": 0.039681686295403376,
"rewards/margins": 3.252643155673194,
"rewards/rejected": -3.2129614693777904,
"step": 73
},
{
"epoch": 0.1280830809173518,
"grad_norm": 22.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6030457.263157895,
"logits/rejected": -3718584.0,
"logps/chosen": -238.16568153782896,
"logps/rejected": -195.5281700721154,
"loss": 0.3596,
"rewards/chosen": 0.006416631372351395,
"rewards/margins": 2.2222877086898096,
"rewards/rejected": -2.215871077317458,
"step": 74
},
{
"epoch": 0.12981393336218086,
"grad_norm": 19.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3802380.705882353,
"logits/rejected": 7472531.2,
"logps/chosen": -168.80958467371323,
"logps/rejected": -285.90989583333334,
"loss": 0.3219,
"rewards/chosen": -0.224185635061825,
"rewards/margins": 2.7274849480273673,
"rewards/rejected": -2.9516705830891925,
"step": 75
},
{
"epoch": 0.13154478580700996,
"grad_norm": 21.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4579526.5,
"logits/rejected": 6935348.0,
"logps/chosen": -246.0178985595703,
"logps/rejected": -326.98834228515625,
"loss": 0.2722,
"rewards/chosen": 0.10884374380111694,
"rewards/margins": 3.900286853313446,
"rewards/rejected": -3.791443109512329,
"step": 76
},
{
"epoch": 0.13327563825183902,
"grad_norm": 23.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2948165.6,
"logits/rejected": -10498065.88235294,
"logps/chosen": -323.5631510416667,
"logps/rejected": -268.2412683823529,
"loss": 0.2814,
"rewards/chosen": -0.016830217838287354,
"rewards/margins": 3.3728826207273146,
"rewards/rejected": -3.389712838565602,
"step": 77
},
{
"epoch": 0.13500649069666812,
"grad_norm": 19.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 3724001.263157895,
"logits/rejected": -4905061.230769231,
"logps/chosen": -196.80241313733552,
"logps/rejected": -246.7076697716346,
"loss": 0.3493,
"rewards/chosen": 0.13470386203966642,
"rewards/margins": 2.7459867975489813,
"rewards/rejected": -2.611282935509315,
"step": 78
},
{
"epoch": 0.1367373431414972,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 882787.5,
"logits/rejected": -2752756.8333333335,
"logps/chosen": -152.87762451171875,
"logps/rejected": -143.17495727539062,
"loss": 0.3448,
"rewards/chosen": -0.04928714632987976,
"rewards/margins": 3.1339206834634146,
"rewards/rejected": -3.1832078297932944,
"step": 79
},
{
"epoch": 0.13846819558632625,
"grad_norm": 16.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10046429.6,
"logits/rejected": -9446046.545454545,
"logps/chosen": -211.123828125,
"logps/rejected": -224.4654873934659,
"loss": 0.2231,
"rewards/chosen": -0.09817437529563904,
"rewards/margins": 3.1476648303595454,
"rewards/rejected": -3.2458392056551846,
"step": 80
},
{
"epoch": 0.14019904803115535,
"grad_norm": 21.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1529026.8235294118,
"logits/rejected": -18009732.266666666,
"logps/chosen": -215.9851505055147,
"logps/rejected": -285.50341796875,
"loss": 0.3038,
"rewards/chosen": 0.07493850062875186,
"rewards/margins": 3.3579501278260175,
"rewards/rejected": -3.283011627197266,
"step": 81
},
{
"epoch": 0.14192990047598442,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2760504.3076923075,
"logits/rejected": -6295609.684210527,
"logps/chosen": -247.9722618689904,
"logps/rejected": -289.9179173519737,
"loss": 0.2071,
"rewards/chosen": 0.2483532978938176,
"rewards/margins": 3.7582735185198453,
"rewards/rejected": -3.509920220626028,
"step": 82
},
{
"epoch": 0.1436607529208135,
"grad_norm": 18.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12023732.923076924,
"logits/rejected": -5987361.263157895,
"logps/chosen": -222.31482872596155,
"logps/rejected": -229.85079152960526,
"loss": 0.2564,
"rewards/chosen": -0.01725879082312951,
"rewards/margins": 3.232607895546114,
"rewards/rejected": -3.2498666863692436,
"step": 83
},
{
"epoch": 0.14539160536564258,
"grad_norm": 17.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 612477.6842105263,
"logits/rejected": 610641.0769230769,
"logps/chosen": -134.88838918585526,
"logps/rejected": -210.28667743389423,
"loss": 0.3361,
"rewards/chosen": -0.02787588772020842,
"rewards/margins": 2.743228191306234,
"rewards/rejected": -2.7711040790264425,
"step": 84
},
{
"epoch": 0.14712245781047165,
"grad_norm": 19.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8578859.294117646,
"logits/rejected": -4098941.8666666667,
"logps/chosen": -180.28768382352942,
"logps/rejected": -149.85576171875,
"loss": 0.3247,
"rewards/chosen": 0.19388238121481502,
"rewards/margins": 2.1001341614068725,
"rewards/rejected": -1.9062517801920573,
"step": 85
},
{
"epoch": 0.14885331025530074,
"grad_norm": 20.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1573394.5882352942,
"logits/rejected": 10871098.666666666,
"logps/chosen": -191.07115981158088,
"logps/rejected": -244.60188802083334,
"loss": 0.3206,
"rewards/chosen": 0.2755900551291073,
"rewards/margins": 2.1674577563416726,
"rewards/rejected": -1.891867701212565,
"step": 86
},
{
"epoch": 0.1505841627001298,
"grad_norm": 21.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1688797.142857143,
"logits/rejected": -1483586.6666666667,
"logps/chosen": -255.131103515625,
"logps/rejected": -332.4247233072917,
"loss": 0.2465,
"rewards/chosen": 0.21271177700587682,
"rewards/margins": 3.036478909235152,
"rewards/rejected": -2.8237671322292752,
"step": 87
},
{
"epoch": 0.1523150151449589,
"grad_norm": 19.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15565679.0,
"logits/rejected": -2823755.25,
"logps/chosen": -226.8282012939453,
"logps/rejected": -287.0397644042969,
"loss": 0.2921,
"rewards/chosen": 0.2816123962402344,
"rewards/margins": 2.9265496730804443,
"rewards/rejected": -2.64493727684021,
"step": 88
},
{
"epoch": 0.15404586758978797,
"grad_norm": 20.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -413132.4,
"logits/rejected": -10101343.05882353,
"logps/chosen": -230.32145182291666,
"logps/rejected": -227.8623764935662,
"loss": 0.3061,
"rewards/chosen": 0.1068873405456543,
"rewards/margins": 2.7386797456180347,
"rewards/rejected": -2.6317924050723804,
"step": 89
},
{
"epoch": 0.15577672003461704,
"grad_norm": 16.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -2141461.846153846,
"logits/rejected": -5844247.157894737,
"logps/chosen": -115.6215350811298,
"logps/rejected": -274.47286184210526,
"loss": 0.2861,
"rewards/chosen": -0.02882493459261381,
"rewards/margins": 2.667185855780536,
"rewards/rejected": -2.69601079037315,
"step": 90
},
{
"epoch": 0.15750757247944613,
"grad_norm": 25.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12380478.545454545,
"logits/rejected": -8278526.4,
"logps/chosen": -197.9210759943182,
"logps/rejected": -223.8676513671875,
"loss": 0.3626,
"rewards/chosen": 0.14342746951363303,
"rewards/margins": 3.160957529328086,
"rewards/rejected": -3.017530059814453,
"step": 91
},
{
"epoch": 0.1592384249242752,
"grad_norm": 20.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13449718.588235294,
"logits/rejected": -4683606.933333334,
"logps/chosen": -317.87795840992646,
"logps/rejected": -267.261279296875,
"loss": 0.2726,
"rewards/chosen": 0.3811823059530819,
"rewards/margins": 3.948626662235634,
"rewards/rejected": -3.567444356282552,
"step": 92
},
{
"epoch": 0.1609692773691043,
"grad_norm": 16.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -543169.375,
"logits/rejected": -1371199.125,
"logps/chosen": -150.7501678466797,
"logps/rejected": -181.0672149658203,
"loss": 0.3291,
"rewards/chosen": -0.053351566195487976,
"rewards/margins": 2.6354714184999466,
"rewards/rejected": -2.6888229846954346,
"step": 93
},
{
"epoch": 0.16270012981393336,
"grad_norm": 18.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2523244.0,
"logits/rejected": 2864016.380952381,
"logps/chosen": -234.7637606534091,
"logps/rejected": -178.11604817708334,
"loss": 0.2795,
"rewards/chosen": -0.16940477761355313,
"rewards/margins": 2.3981288062545643,
"rewards/rejected": -2.5675335838681175,
"step": 94
},
{
"epoch": 0.16443098225876243,
"grad_norm": 21.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 3101979.2,
"logits/rejected": -5256099.0,
"logps/chosen": -156.21822509765624,
"logps/rejected": -249.0777791341146,
"loss": 0.3557,
"rewards/chosen": -0.0824066936969757,
"rewards/margins": 3.3445211907227836,
"rewards/rejected": -3.4269278844197593,
"step": 95
},
{
"epoch": 0.16616183470359153,
"grad_norm": 16.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 7612368.7272727275,
"logits/rejected": -5602336.761904762,
"logps/chosen": -176.8388338955966,
"logps/rejected": -257.1624348958333,
"loss": 0.2211,
"rewards/chosen": 0.0035542053255167875,
"rewards/margins": 3.556209743248694,
"rewards/rejected": -3.5526555379231772,
"step": 96
},
{
"epoch": 0.1678926871484206,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1705522.0,
"logits/rejected": -19220438.0,
"logps/chosen": -229.12918090820312,
"logps/rejected": -301.3127746582031,
"loss": 0.2716,
"rewards/chosen": 0.30286985635757446,
"rewards/margins": 3.395688831806183,
"rewards/rejected": -3.0928189754486084,
"step": 97
},
{
"epoch": 0.1696235395932497,
"grad_norm": 24.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 12111708.307692308,
"logits/rejected": -993389.2631578947,
"logps/chosen": -285.0191180889423,
"logps/rejected": -165.50485711348685,
"loss": 0.2834,
"rewards/chosen": 0.09072128626016471,
"rewards/margins": 2.5426856144237133,
"rewards/rejected": -2.4519643281635486,
"step": 98
},
{
"epoch": 0.17135439203807876,
"grad_norm": 22.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6123332.705882353,
"logits/rejected": -4620013.866666666,
"logps/chosen": -249.7477596507353,
"logps/rejected": -251.340185546875,
"loss": 0.3179,
"rewards/chosen": 0.006986297228757073,
"rewards/margins": 2.8065402319618302,
"rewards/rejected": -2.799553934733073,
"step": 99
},
{
"epoch": 0.17308524448290782,
"grad_norm": 17.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3662578.0,
"logits/rejected": -3268372.75,
"logps/chosen": -159.33383178710938,
"logps/rejected": -199.64552307128906,
"loss": 0.3467,
"rewards/chosen": -0.1916092038154602,
"rewards/margins": 2.2172593474388123,
"rewards/rejected": -2.4088685512542725,
"step": 100
},
{
"epoch": 0.17481609692773692,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11227556.0,
"logits/rejected": -4947570.4,
"logps/chosen": -226.8167521158854,
"logps/rejected": -230.9011474609375,
"loss": 0.2496,
"rewards/chosen": 0.0410018265247345,
"rewards/margins": 3.1503684341907503,
"rewards/rejected": -3.109366607666016,
"step": 101
},
{
"epoch": 0.176546949372566,
"grad_norm": 21.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4914299.368421053,
"logits/rejected": -4945337.846153846,
"logps/chosen": -197.87137643914474,
"logps/rejected": -258.1477614182692,
"loss": 0.3485,
"rewards/chosen": -0.031160028357254833,
"rewards/margins": 3.5723934501771506,
"rewards/rejected": -3.603553478534405,
"step": 102
},
{
"epoch": 0.17827780181739505,
"grad_norm": 20.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7837846.222222222,
"logits/rejected": -11910576.0,
"logps/chosen": -174.4716796875,
"logps/rejected": -307.66469029017856,
"loss": 0.3231,
"rewards/chosen": -0.013415685130490197,
"rewards/margins": 2.68944691819331,
"rewards/rejected": -2.7028626033238004,
"step": 103
},
{
"epoch": 0.18000865426222415,
"grad_norm": 20.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -2126388.0,
"logits/rejected": 277817.7894736842,
"logps/chosen": -284.61337515024036,
"logps/rejected": -307.56527549342104,
"loss": 0.2237,
"rewards/chosen": 0.27964045451237607,
"rewards/margins": 3.5338759731184615,
"rewards/rejected": -3.2542355186060856,
"step": 104
},
{
"epoch": 0.18173950670705322,
"grad_norm": 21.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1334305.875,
"logits/rejected": -5412106.0,
"logps/chosen": -228.7758331298828,
"logps/rejected": -258.7738342285156,
"loss": 0.3184,
"rewards/chosen": -0.1122078001499176,
"rewards/margins": 2.8912404477596283,
"rewards/rejected": -3.003448247909546,
"step": 105
},
{
"epoch": 0.1834703591518823,
"grad_norm": 18.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10178373.05263158,
"logits/rejected": -8054584.615384615,
"logps/chosen": -173.59982781661185,
"logps/rejected": -278.5968674879808,
"loss": 0.3082,
"rewards/chosen": 0.23594951629638672,
"rewards/margins": 3.9558092997624326,
"rewards/rejected": -3.719859783466046,
"step": 106
},
{
"epoch": 0.18520121159671138,
"grad_norm": 20.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7455461.647058823,
"logits/rejected": -10272849.066666666,
"logps/chosen": -190.27541934742646,
"logps/rejected": -201.00078125,
"loss": 0.3043,
"rewards/chosen": 0.15676203896017635,
"rewards/margins": 2.838399357889213,
"rewards/rejected": -2.6816373189290363,
"step": 107
},
{
"epoch": 0.18693206404154045,
"grad_norm": 21.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8889729.6,
"logits/rejected": -3655384.0,
"logps/chosen": -190.6529541015625,
"logps/rejected": -243.38692220052084,
"loss": 0.3275,
"rewards/chosen": 0.024685271084308624,
"rewards/margins": 3.879149484137694,
"rewards/rejected": -3.8544642130533853,
"step": 108
},
{
"epoch": 0.18866291648636954,
"grad_norm": 24.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1406335.5789473683,
"logits/rejected": -5790012.307692308,
"logps/chosen": -160.40576171875,
"logps/rejected": -207.287353515625,
"loss": 0.3181,
"rewards/chosen": -0.008714937850048668,
"rewards/margins": 3.360927466559507,
"rewards/rejected": -3.3696424044095554,
"step": 109
},
{
"epoch": 0.1903937689311986,
"grad_norm": 17.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1734898.1333333333,
"logits/rejected": -7056189.176470588,
"logps/chosen": -146.405810546875,
"logps/rejected": -183.34127987132354,
"loss": 0.3017,
"rewards/chosen": 0.023581977685292563,
"rewards/margins": 2.700872501438739,
"rewards/rejected": -2.6772905237534466,
"step": 110
},
{
"epoch": 0.1921246213760277,
"grad_norm": 18.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 531886.4615384615,
"logits/rejected": -6064568.421052632,
"logps/chosen": -133.6158728966346,
"logps/rejected": -266.82632606907896,
"loss": 0.2827,
"rewards/chosen": -0.16753161870516264,
"rewards/margins": 2.9463925081708653,
"rewards/rejected": -3.113924126876028,
"step": 111
},
{
"epoch": 0.19385547382085677,
"grad_norm": 15.9375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7024642.4,
"logits/rejected": -11717272.727272727,
"logps/chosen": -179.82689208984374,
"logps/rejected": -256.05178000710225,
"loss": 0.1916,
"rewards/chosen": 0.08213082551956177,
"rewards/margins": 3.862654645876451,
"rewards/rejected": -3.780523820356889,
"step": 112
},
{
"epoch": 0.19558632626568584,
"grad_norm": 18.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7312199.5,
"logits/rejected": -9691646.0,
"logps/chosen": -190.84982299804688,
"logps/rejected": -189.02178955078125,
"loss": 0.2895,
"rewards/chosen": -0.05902346968650818,
"rewards/margins": 3.4700850546360016,
"rewards/rejected": -3.5291085243225098,
"step": 113
},
{
"epoch": 0.19731717871051493,
"grad_norm": 17.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8977872.0,
"logits/rejected": -2104818.5,
"logps/chosen": -153.08140563964844,
"logps/rejected": -362.2707824707031,
"loss": 0.2608,
"rewards/chosen": 0.1350196748971939,
"rewards/margins": 4.345773592591286,
"rewards/rejected": -4.210753917694092,
"step": 114
},
{
"epoch": 0.199048031155344,
"grad_norm": 20.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15021820.444444444,
"logits/rejected": 3075554.8571428573,
"logps/chosen": -254.52723524305554,
"logps/rejected": -324.88724190848217,
"loss": 0.2895,
"rewards/chosen": 0.18979620933532715,
"rewards/margins": 4.077897787094116,
"rewards/rejected": -3.888101577758789,
"step": 115
},
{
"epoch": 0.2007788836001731,
"grad_norm": 19.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8322480.0,
"logits/rejected": 2723888.4210526315,
"logps/chosen": -225.67595027043268,
"logps/rejected": -165.18586811266448,
"loss": 0.2731,
"rewards/chosen": 0.10532364478478065,
"rewards/margins": 3.168428977008773,
"rewards/rejected": -3.0631053322239925,
"step": 116
},
{
"epoch": 0.20250973604500216,
"grad_norm": 13.4375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3385337.6,
"logits/rejected": -4593669.454545454,
"logps/chosen": -155.44390869140625,
"logps/rejected": -335.2124689275568,
"loss": 0.2003,
"rewards/chosen": -0.034807294607162476,
"rewards/margins": 4.202970068563115,
"rewards/rejected": -4.237777363170277,
"step": 117
},
{
"epoch": 0.20424058848983123,
"grad_norm": 18.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -2332099.8181818184,
"logits/rejected": -2151819.8,
"logps/chosen": -175.2345525568182,
"logps/rejected": -296.2167724609375,
"loss": 0.3551,
"rewards/chosen": 0.0834602876143022,
"rewards/margins": 4.01979642347856,
"rewards/rejected": -3.936336135864258,
"step": 118
},
{
"epoch": 0.20597144093466033,
"grad_norm": 19.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1813728.4210526317,
"logits/rejected": -1663898.1538461538,
"logps/chosen": -162.3670076069079,
"logps/rejected": -307.88326322115387,
"loss": 0.2952,
"rewards/chosen": 0.22818475020559212,
"rewards/margins": 3.5458337111994322,
"rewards/rejected": -3.31764896099384,
"step": 119
},
{
"epoch": 0.2077022933794894,
"grad_norm": 17.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 5888752.533333333,
"logits/rejected": -2605564.705882353,
"logps/chosen": -156.10784505208332,
"logps/rejected": -253.71030560661765,
"loss": 0.3016,
"rewards/chosen": -0.10760652224222819,
"rewards/margins": 3.660627281899546,
"rewards/rejected": -3.768233804141774,
"step": 120
},
{
"epoch": 0.2094331458243185,
"grad_norm": 15.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7869675.076923077,
"logits/rejected": -4793528.842105263,
"logps/chosen": -159.2930626502404,
"logps/rejected": -251.59606291118422,
"loss": 0.2563,
"rewards/chosen": -0.1771384019118089,
"rewards/margins": 3.1404216936242726,
"rewards/rejected": -3.3175600955360816,
"step": 121
},
{
"epoch": 0.21116399826914756,
"grad_norm": 18.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3852856.0,
"logits/rejected": 1017586.2352941176,
"logps/chosen": -130.758642578125,
"logps/rejected": -310.29397403492646,
"loss": 0.2913,
"rewards/chosen": -0.17760810852050782,
"rewards/margins": 3.5648737963508155,
"rewards/rejected": -3.7424819048713234,
"step": 122
},
{
"epoch": 0.21289485071397662,
"grad_norm": 21.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 394461.8947368421,
"logits/rejected": -11013158.153846154,
"logps/chosen": -211.87534693667763,
"logps/rejected": -266.10584435096155,
"loss": 0.3519,
"rewards/chosen": -0.09623796061465614,
"rewards/margins": 3.0594592017200792,
"rewards/rejected": -3.1556971623347354,
"step": 123
},
{
"epoch": 0.21462570315880572,
"grad_norm": 20.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -375859.125,
"logits/rejected": -18902716.0,
"logps/chosen": -144.1176513671875,
"logps/rejected": -328.43532307942706,
"loss": 0.3587,
"rewards/chosen": -0.10335218906402588,
"rewards/margins": 3.1057602961858115,
"rewards/rejected": -3.2091124852498374,
"step": 124
},
{
"epoch": 0.2163565556036348,
"grad_norm": 19.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9888419.2,
"logits/rejected": -13417634.823529411,
"logps/chosen": -207.53665364583333,
"logps/rejected": -305.8712373621324,
"loss": 0.2801,
"rewards/chosen": 0.16173944473266602,
"rewards/margins": 3.5778553738313565,
"rewards/rejected": -3.4161159290986904,
"step": 125
},
{
"epoch": 0.21808740804846388,
"grad_norm": 15.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12050750.4,
"logits/rejected": -13769786.181818182,
"logps/chosen": -183.52490234375,
"logps/rejected": -278.15651633522725,
"loss": 0.2183,
"rewards/chosen": 0.14027655124664307,
"rewards/margins": 3.1395226283506914,
"rewards/rejected": -2.9992460771040483,
"step": 126
},
{
"epoch": 0.21981826049329295,
"grad_norm": 17.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12270530.666666666,
"logits/rejected": -8869286.4,
"logps/chosen": -190.94978841145834,
"logps/rejected": -284.20654296875,
"loss": 0.233,
"rewards/chosen": 0.16113528609275818,
"rewards/margins": 3.4381788194179537,
"rewards/rejected": -3.2770435333251955,
"step": 127
},
{
"epoch": 0.22154911293812202,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15397617.066666666,
"logits/rejected": -13504640.94117647,
"logps/chosen": -223.714453125,
"logps/rejected": -284.49543313419116,
"loss": 0.2465,
"rewards/chosen": 0.2658435821533203,
"rewards/margins": 3.566493337294635,
"rewards/rejected": -3.3006497551413143,
"step": 128
},
{
"epoch": 0.2232799653829511,
"grad_norm": 18.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13344241.333333334,
"logits/rejected": 3126395.2,
"logps/chosen": -236.7048543294271,
"logps/rejected": -158.71007080078124,
"loss": 0.2397,
"rewards/chosen": 0.05072679618994395,
"rewards/margins": 3.053556347886721,
"rewards/rejected": -3.0028295516967773,
"step": 129
},
{
"epoch": 0.22501081782778018,
"grad_norm": 18.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1188826.4,
"logits/rejected": -7702007.529411765,
"logps/chosen": -194.06878255208332,
"logps/rejected": -285.5570714613971,
"loss": 0.287,
"rewards/chosen": -0.050044012069702146,
"rewards/margins": 2.950371789932251,
"rewards/rejected": -3.000415802001953,
"step": 130
},
{
"epoch": 0.22674167027260925,
"grad_norm": 24.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9090264.421052631,
"logits/rejected": -13554491.076923076,
"logps/chosen": -228.48311574835526,
"logps/rejected": -317.5834209735577,
"loss": 0.3322,
"rewards/chosen": 0.17829758242556923,
"rewards/margins": 2.7610747524601247,
"rewards/rejected": -2.5827771700345554,
"step": 131
},
{
"epoch": 0.22847252271743834,
"grad_norm": 20.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11592303.111111112,
"logits/rejected": -3384686.285714286,
"logps/chosen": -236.07706705729166,
"logps/rejected": -231.01377650669642,
"loss": 0.3204,
"rewards/chosen": 0.08443025747934978,
"rewards/margins": 3.5145191181273687,
"rewards/rejected": -3.430088860648019,
"step": 132
},
{
"epoch": 0.2302033751622674,
"grad_norm": 18.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4109541.0,
"logits/rejected": -10256295.0,
"logps/chosen": -150.5631561279297,
"logps/rejected": -251.85006713867188,
"loss": 0.3414,
"rewards/chosen": -0.03023519366979599,
"rewards/margins": 2.841710902750492,
"rewards/rejected": -2.871946096420288,
"step": 133
},
{
"epoch": 0.2319342276070965,
"grad_norm": 19.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -17335990.153846152,
"logits/rejected": -11810266.94736842,
"logps/chosen": -255.87794846754807,
"logps/rejected": -292.57041529605266,
"loss": 0.2403,
"rewards/chosen": -0.009397160548430223,
"rewards/margins": 3.8956148698021043,
"rewards/rejected": -3.9050120303505347,
"step": 134
},
{
"epoch": 0.23366508005192557,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 3816658.933333333,
"logits/rejected": -21506733.17647059,
"logps/chosen": -192.03406575520833,
"logps/rejected": -326.34670840992646,
"loss": 0.2583,
"rewards/chosen": -0.08073126475016276,
"rewards/margins": 4.162721368378285,
"rewards/rejected": -4.243452633128447,
"step": 135
},
{
"epoch": 0.23539593249675464,
"grad_norm": 23.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5209688.421052632,
"logits/rejected": 4241887.076923077,
"logps/chosen": -241.80124383223685,
"logps/rejected": -267.38955453725964,
"loss": 0.3287,
"rewards/chosen": 0.1192607001254433,
"rewards/margins": 3.0787412792082254,
"rewards/rejected": -2.9594805790827823,
"step": 136
},
{
"epoch": 0.23712678494158373,
"grad_norm": 16.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13466715.733333332,
"logits/rejected": -6725747.764705882,
"logps/chosen": -169.23546549479167,
"logps/rejected": -301.47449448529414,
"loss": 0.2489,
"rewards/chosen": 0.17315847078959148,
"rewards/margins": 4.150024351419187,
"rewards/rejected": -3.9768658806295956,
"step": 137
},
{
"epoch": 0.2388576373864128,
"grad_norm": 20.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7070817.5,
"logits/rejected": -5078294.5,
"logps/chosen": -224.52960205078125,
"logps/rejected": -250.922119140625,
"loss": 0.299,
"rewards/chosen": -0.062333978712558746,
"rewards/margins": 3.91761764138937,
"rewards/rejected": -3.9799516201019287,
"step": 138
},
{
"epoch": 0.2405884898312419,
"grad_norm": 20.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10853539.555555556,
"logits/rejected": -8516350.857142856,
"logps/chosen": -245.61515299479166,
"logps/rejected": -311.77559988839283,
"loss": 0.2919,
"rewards/chosen": 0.143819702996148,
"rewards/margins": 4.062012430221316,
"rewards/rejected": -3.9181927272251675,
"step": 139
},
{
"epoch": 0.24231934227607096,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -23256322.666666668,
"logits/rejected": -14564356.8,
"logps/chosen": -252.69757080078125,
"logps/rejected": -324.65302734375,
"loss": 0.1926,
"rewards/chosen": 0.15854175885518393,
"rewards/margins": 4.718555339177449,
"rewards/rejected": -4.5600135803222654,
"step": 140
},
{
"epoch": 0.24405019472090003,
"grad_norm": 18.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13540496.0,
"logits/rejected": -14330663.111111112,
"logps/chosen": -261.66441127232144,
"logps/rejected": -279.0700954861111,
"loss": 0.2759,
"rewards/chosen": -0.16050028800964355,
"rewards/margins": 3.2266637219323053,
"rewards/rejected": -3.387164009941949,
"step": 141
},
{
"epoch": 0.24578104716572913,
"grad_norm": 22.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -16567258.352941176,
"logits/rejected": -9630573.866666667,
"logps/chosen": -248.67431640625,
"logps/rejected": -195.8955078125,
"loss": 0.2962,
"rewards/chosen": 0.18976323744829962,
"rewards/margins": 3.4638149037080654,
"rewards/rejected": -3.2740516662597656,
"step": 142
},
{
"epoch": 0.2475118996105582,
"grad_norm": 16.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12402342.153846154,
"logits/rejected": -7839504.842105263,
"logps/chosen": -210.70182917668268,
"logps/rejected": -379.2096525493421,
"loss": 0.2114,
"rewards/chosen": 0.16687591259296125,
"rewards/margins": 5.201281671099335,
"rewards/rejected": -5.034405758506374,
"step": 143
},
{
"epoch": 0.2492427520553873,
"grad_norm": 19.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4255770.285714285,
"logits/rejected": -3999491.111111111,
"logps/chosen": -195.87995256696428,
"logps/rejected": -214.72549099392361,
"loss": 0.2982,
"rewards/chosen": -0.21987019266401017,
"rewards/margins": 2.6090391514793274,
"rewards/rejected": -2.8289093441433377,
"step": 144
},
{
"epoch": 0.25097360450021633,
"grad_norm": 22.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4979902.666666667,
"logits/rejected": -11014548.57142857,
"logps/chosen": -255.74172634548611,
"logps/rejected": -255.93235560825892,
"loss": 0.3164,
"rewards/chosen": -0.09737168418036567,
"rewards/margins": 3.5333697076827764,
"rewards/rejected": -3.630741391863142,
"step": 145
},
{
"epoch": 0.25270445694504545,
"grad_norm": 17.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7553311.5,
"logits/rejected": -12855062.0,
"logps/chosen": -181.3147735595703,
"logps/rejected": -319.3401184082031,
"loss": 0.2509,
"rewards/chosen": 0.28162485361099243,
"rewards/margins": 4.752773344516754,
"rewards/rejected": -4.471148490905762,
"step": 146
},
{
"epoch": 0.2544353093898745,
"grad_norm": 16.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -18317777.6,
"logits/rejected": -8850414.545454545,
"logps/chosen": -208.5265380859375,
"logps/rejected": -253.0011319247159,
"loss": 0.2135,
"rewards/chosen": 0.22829954624176024,
"rewards/margins": 3.561434630914168,
"rewards/rejected": -3.3331350846724077,
"step": 147
},
{
"epoch": 0.2561661618347036,
"grad_norm": 22.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 7776780.0,
"logits/rejected": -13990768.0,
"logps/chosen": -222.059326171875,
"logps/rejected": -333.6552734375,
"loss": 0.3596,
"rewards/chosen": -0.05462043881416321,
"rewards/margins": 2.7295163333415986,
"rewards/rejected": -2.7841367721557617,
"step": 148
},
{
"epoch": 0.25789701427953265,
"grad_norm": 20.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -501711.76470588235,
"logits/rejected": -4720805.333333333,
"logps/chosen": -165.69982192095588,
"logps/rejected": -258.313623046875,
"loss": 0.3126,
"rewards/chosen": -0.00996632085126989,
"rewards/margins": 2.9709932535302404,
"rewards/rejected": -2.9809595743815103,
"step": 149
},
{
"epoch": 0.2596278667243617,
"grad_norm": 16.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11498374.153846154,
"logits/rejected": -12492654.315789474,
"logps/chosen": -136.2483191856971,
"logps/rejected": -243.18302837171052,
"loss": 0.27,
"rewards/chosen": -0.3298172950744629,
"rewards/margins": 2.9719581854970833,
"rewards/rejected": -3.3017754805715462,
"step": 150
},
{
"epoch": 0.26135871916919085,
"grad_norm": 20.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -18241306.0,
"logits/rejected": -5504833.5,
"logps/chosen": -285.53369140625,
"logps/rejected": -346.5982360839844,
"loss": 0.2638,
"rewards/chosen": 0.2393263578414917,
"rewards/margins": 3.530009150505066,
"rewards/rejected": -3.290682792663574,
"step": 151
},
{
"epoch": 0.2630895716140199,
"grad_norm": 21.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9077254.588235294,
"logits/rejected": -16832766.933333334,
"logps/chosen": -218.34030330882354,
"logps/rejected": -284.46845703125,
"loss": 0.3062,
"rewards/chosen": 0.03087810558431289,
"rewards/margins": 3.274915225131839,
"rewards/rejected": -3.244037119547526,
"step": 152
},
{
"epoch": 0.264820424058849,
"grad_norm": 17.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 3784820.0,
"logits/rejected": -27976580.266666666,
"logps/chosen": -176.8842342601103,
"logps/rejected": -371.0309244791667,
"loss": 0.2938,
"rewards/chosen": 0.037597624694599825,
"rewards/margins": 3.3108126006874383,
"rewards/rejected": -3.2732149759928384,
"step": 153
},
{
"epoch": 0.26655127650367805,
"grad_norm": 19.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 8110033.066666666,
"logits/rejected": -6098681.411764706,
"logps/chosen": -224.60833333333332,
"logps/rejected": -270.91673368566177,
"loss": 0.2667,
"rewards/chosen": 0.19785807927449545,
"rewards/margins": 3.2095348676045736,
"rewards/rejected": -3.011676788330078,
"step": 154
},
{
"epoch": 0.2682821289485071,
"grad_norm": 24.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10915676.16,
"logits/rejected": -12048051.42857143,
"logps/chosen": -200.02255859375,
"logps/rejected": -363.83028738839283,
"loss": 0.4141,
"rewards/chosen": -0.20076709747314453,
"rewards/margins": 4.246241580418179,
"rewards/rejected": -4.447008677891323,
"step": 155
},
{
"epoch": 0.27001298139333624,
"grad_norm": 21.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 4039928.5,
"logits/rejected": -297019.3125,
"logps/chosen": -122.26431274414062,
"logps/rejected": -205.24542236328125,
"loss": 0.2976,
"rewards/chosen": 0.13761137425899506,
"rewards/margins": 2.9543447345495224,
"rewards/rejected": -2.8167333602905273,
"step": 156
},
{
"epoch": 0.2717438338381653,
"grad_norm": 24.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11950380.444444444,
"logits/rejected": -14319524.57142857,
"logps/chosen": -316.365234375,
"logps/rejected": -238.67583356584822,
"loss": 0.3578,
"rewards/chosen": -0.22798464033338758,
"rewards/margins": 2.9639355038839676,
"rewards/rejected": -3.191920144217355,
"step": 157
},
{
"epoch": 0.2734746862829944,
"grad_norm": 16.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7220196.923076923,
"logits/rejected": -7274026.105263158,
"logps/chosen": -168.70252403846155,
"logps/rejected": -272.20877878289474,
"loss": 0.2521,
"rewards/chosen": 0.034194111824035645,
"rewards/margins": 3.5594172916914286,
"rewards/rejected": -3.525223179867393,
"step": 158
},
{
"epoch": 0.27520553872782344,
"grad_norm": 19.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7790591.111111111,
"logits/rejected": -4348877.714285715,
"logps/chosen": -150.17240397135416,
"logps/rejected": -257.23592703683033,
"loss": 0.2867,
"rewards/chosen": 0.2716523011525472,
"rewards/margins": 3.2814045747121177,
"rewards/rejected": -3.0097522735595703,
"step": 159
},
{
"epoch": 0.2769363911726525,
"grad_norm": 21.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6992461.0,
"logits/rejected": -9803589.0,
"logps/chosen": -197.62159729003906,
"logps/rejected": -200.63839721679688,
"loss": 0.3119,
"rewards/chosen": 0.2022910714149475,
"rewards/margins": 2.61891371011734,
"rewards/rejected": -2.4166226387023926,
"step": 160
},
{
"epoch": 0.27866724361748163,
"grad_norm": 15.5625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2316156.0,
"logits/rejected": -6923803.294117647,
"logps/chosen": -157.97198893229168,
"logps/rejected": -226.3733340992647,
"loss": 0.2941,
"rewards/chosen": -0.181062380472819,
"rewards/margins": 3.137106035269943,
"rewards/rejected": -3.318168415742762,
"step": 161
},
{
"epoch": 0.2803980960623107,
"grad_norm": 19.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1817514.6666666667,
"logits/rejected": -4328779.428571428,
"logps/chosen": -125.52289496527777,
"logps/rejected": -229.05801827566964,
"loss": 0.3201,
"rewards/chosen": 0.1051819192038642,
"rewards/margins": 2.722233724972558,
"rewards/rejected": -2.617051805768694,
"step": 162
},
{
"epoch": 0.28212894850713977,
"grad_norm": 21.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7507249.411764706,
"logits/rejected": 2358673.6,
"logps/chosen": -203.8069278492647,
"logps/rejected": -239.440380859375,
"loss": 0.2975,
"rewards/chosen": 0.04498655655804802,
"rewards/margins": 3.3732679591459385,
"rewards/rejected": -3.3282814025878906,
"step": 163
},
{
"epoch": 0.28385980095196883,
"grad_norm": 18.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -17538291.2,
"logits/rejected": -8524349.176470589,
"logps/chosen": -203.443017578125,
"logps/rejected": -309.54041245404414,
"loss": 0.2556,
"rewards/chosen": 0.23375027974446613,
"rewards/margins": 3.705425703759287,
"rewards/rejected": -3.471675424014821,
"step": 164
},
{
"epoch": 0.2855906533967979,
"grad_norm": 15.5625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11055366.857142856,
"logits/rejected": -1121562.3333333333,
"logps/chosen": -159.264892578125,
"logps/rejected": -286.91015625,
"loss": 0.2449,
"rewards/chosen": 0.12227598258427211,
"rewards/margins": 4.091070063530452,
"rewards/rejected": -3.9687940809461804,
"step": 165
},
{
"epoch": 0.287321505841627,
"grad_norm": 23.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 657442.6,
"logits/rejected": -9425391.333333334,
"logps/chosen": -220.0943603515625,
"logps/rejected": -196.21675618489584,
"loss": 0.3849,
"rewards/chosen": -0.32888593673706057,
"rewards/margins": 2.8519148190816246,
"rewards/rejected": -3.180800755818685,
"step": 166
},
{
"epoch": 0.2890523582864561,
"grad_norm": 22.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4699375.555555556,
"logits/rejected": -3600353.1428571427,
"logps/chosen": -265.4497341579861,
"logps/rejected": -267.37051827566967,
"loss": 0.2924,
"rewards/chosen": 0.23302984237670898,
"rewards/margins": 3.618199280330113,
"rewards/rejected": -3.385169437953404,
"step": 167
},
{
"epoch": 0.29078321073128516,
"grad_norm": 24.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15453594.666666666,
"logits/rejected": -7304598.5,
"logps/chosen": -211.82305908203125,
"logps/rejected": -305.252685546875,
"loss": 0.4196,
"rewards/chosen": -0.15757346153259277,
"rewards/margins": 3.7481679916381836,
"rewards/rejected": -3.9057414531707764,
"step": 168
},
{
"epoch": 0.2925140631761142,
"grad_norm": 19.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 917019.2941176471,
"logits/rejected": -11583381.333333334,
"logps/chosen": -187.60915958180146,
"logps/rejected": -256.85833333333335,
"loss": 0.2921,
"rewards/chosen": -0.004517814692328958,
"rewards/margins": 3.6537013320361864,
"rewards/rejected": -3.6582191467285154,
"step": 169
},
{
"epoch": 0.2942449156209433,
"grad_norm": 20.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5889556.705882353,
"logits/rejected": -12821309.866666667,
"logps/chosen": -256.27088120404414,
"logps/rejected": -308.8744791666667,
"loss": 0.2513,
"rewards/chosen": 0.38170385360717773,
"rewards/margins": 4.123530483245849,
"rewards/rejected": -3.741826629638672,
"step": 170
},
{
"epoch": 0.2959757680657724,
"grad_norm": 17.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 16747204.0,
"logits/rejected": 620209.6,
"logps/chosen": -243.82991536458334,
"logps/rejected": -253.942626953125,
"loss": 0.2081,
"rewards/chosen": 0.4542102813720703,
"rewards/margins": 3.8508056640625,
"rewards/rejected": -3.3965953826904296,
"step": 171
},
{
"epoch": 0.2977066205106015,
"grad_norm": 17.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12614680.615384616,
"logits/rejected": -7096439.578947368,
"logps/chosen": -130.17988469050482,
"logps/rejected": -229.95723684210526,
"loss": 0.2725,
"rewards/chosen": 0.0035039232327387882,
"rewards/margins": 2.607151255675173,
"rewards/rejected": -2.603647332442434,
"step": 172
},
{
"epoch": 0.29943747295543055,
"grad_norm": 16.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3154612.0,
"logits/rejected": -9195659.2,
"logps/chosen": -173.6625773111979,
"logps/rejected": -270.675537109375,
"loss": 0.2182,
"rewards/chosen": 0.14845428864161173,
"rewards/margins": 3.5584659616152443,
"rewards/rejected": -3.4100116729736327,
"step": 173
},
{
"epoch": 0.3011683254002596,
"grad_norm": 19.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9196400.0,
"logits/rejected": -15175718.0,
"logps/chosen": -157.8928680419922,
"logps/rejected": -259.74560546875,
"loss": 0.3143,
"rewards/chosen": -0.17662523686885834,
"rewards/margins": 3.229040876030922,
"rewards/rejected": -3.4056661128997803,
"step": 174
},
{
"epoch": 0.3028991778450887,
"grad_norm": 18.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15626394.352941176,
"logits/rejected": -7392939.2,
"logps/chosen": -241.25695082720588,
"logps/rejected": -236.45672200520832,
"loss": 0.2589,
"rewards/chosen": 0.28153758890488567,
"rewards/margins": 4.230858064165302,
"rewards/rejected": -3.9493204752604165,
"step": 175
},
{
"epoch": 0.3046300302899178,
"grad_norm": 22.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9918681.6,
"logits/rejected": -10010532.0,
"logps/chosen": -202.339111328125,
"logps/rejected": -184.87152099609375,
"loss": 0.3503,
"rewards/chosen": -0.07323684692382812,
"rewards/margins": 3.2869134902954102,
"rewards/rejected": -3.3601503372192383,
"step": 176
},
{
"epoch": 0.3063608827347469,
"grad_norm": 17.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9519585.0,
"logits/rejected": -17195120.0,
"logps/chosen": -151.671875,
"logps/rejected": -259.06878662109375,
"loss": 0.2859,
"rewards/chosen": 0.010412598960101604,
"rewards/margins": 3.337350751273334,
"rewards/rejected": -3.3269381523132324,
"step": 177
},
{
"epoch": 0.30809173517957594,
"grad_norm": 21.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -16214270.11764706,
"logits/rejected": -9008196.266666668,
"logps/chosen": -273.4817899816176,
"logps/rejected": -351.15856119791664,
"loss": 0.2819,
"rewards/chosen": 0.03458939930971931,
"rewards/margins": 3.560055555315579,
"rewards/rejected": -3.5254661560058596,
"step": 178
},
{
"epoch": 0.309822587624405,
"grad_norm": 18.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4070659.5555555555,
"logits/rejected": 564305.2857142857,
"logps/chosen": -159.47158474392361,
"logps/rejected": -243.50922502790178,
"loss": 0.3114,
"rewards/chosen": 0.13283884525299072,
"rewards/margins": 3.35727219922202,
"rewards/rejected": -3.224433353969029,
"step": 179
},
{
"epoch": 0.3115534400692341,
"grad_norm": 18.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6579336.888888889,
"logits/rejected": -11798860.57142857,
"logps/chosen": -194.06880696614584,
"logps/rejected": -265.37472098214283,
"loss": 0.3398,
"rewards/chosen": -0.14026531908247206,
"rewards/margins": 2.981163579320151,
"rewards/rejected": -3.121428898402623,
"step": 180
},
{
"epoch": 0.3132842925140632,
"grad_norm": 17.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5290882.0,
"logits/rejected": -21157156.0,
"logps/chosen": -148.17227172851562,
"logps/rejected": -296.1889953613281,
"loss": 0.3044,
"rewards/chosen": -0.1293964684009552,
"rewards/margins": 3.0376605689525604,
"rewards/rejected": -3.1670570373535156,
"step": 181
},
{
"epoch": 0.31501514495889227,
"grad_norm": 21.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9015065.263157895,
"logits/rejected": -6190756.923076923,
"logps/chosen": -208.37291837993422,
"logps/rejected": -260.5161884014423,
"loss": 0.3085,
"rewards/chosen": 0.10386697869551809,
"rewards/margins": 3.3943309397832584,
"rewards/rejected": -3.2904639610877404,
"step": 182
},
{
"epoch": 0.31674599740372134,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10990801.066666666,
"logits/rejected": -6380602.352941177,
"logps/chosen": -166.50836588541668,
"logps/rejected": -261.99778837316177,
"loss": 0.2828,
"rewards/chosen": -0.09178520043690999,
"rewards/margins": 3.105228430149602,
"rewards/rejected": -3.197013630586512,
"step": 183
},
{
"epoch": 0.3184768498485504,
"grad_norm": 22.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15325334.857142856,
"logits/rejected": -24043242.181818184,
"logps/chosen": -213.09461030505952,
"logps/rejected": -345.95725319602275,
"loss": 0.3397,
"rewards/chosen": 0.22786199478876024,
"rewards/margins": 2.5508950675204716,
"rewards/rejected": -2.3230330727317114,
"step": 184
},
{
"epoch": 0.32020770229337947,
"grad_norm": 21.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11346512.94117647,
"logits/rejected": -2746510.4,
"logps/chosen": -264.31959443933823,
"logps/rejected": -207.83736979166667,
"loss": 0.2974,
"rewards/chosen": 0.12100423083585851,
"rewards/margins": 2.763645679810468,
"rewards/rejected": -2.6426414489746093,
"step": 185
},
{
"epoch": 0.3219385547382086,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10335120.0,
"logits/rejected": -11680630.0,
"logps/chosen": -169.1227264404297,
"logps/rejected": -242.72560119628906,
"loss": 0.3009,
"rewards/chosen": 0.12381087243556976,
"rewards/margins": 3.1314540952444077,
"rewards/rejected": -3.007643222808838,
"step": 186
},
{
"epoch": 0.32366940718303766,
"grad_norm": 18.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15741749.714285715,
"logits/rejected": -5845185.333333333,
"logps/chosen": -198.52054268973214,
"logps/rejected": -295.60541449652777,
"loss": 0.2393,
"rewards/chosen": 0.4524484021323068,
"rewards/margins": 3.021870806103661,
"rewards/rejected": -2.569422403971354,
"step": 187
},
{
"epoch": 0.32540025962786673,
"grad_norm": 19.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9898615.578947369,
"logits/rejected": -2790789.846153846,
"logps/chosen": -154.5710320723684,
"logps/rejected": -181.50324894831732,
"loss": 0.3752,
"rewards/chosen": -0.13492245423166374,
"rewards/margins": 2.425167830849466,
"rewards/rejected": -2.56009028508113,
"step": 188
},
{
"epoch": 0.3271311120726958,
"grad_norm": 19.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12387616.842105264,
"logits/rejected": -8763428.923076924,
"logps/chosen": -166.91708213404604,
"logps/rejected": -238.5693359375,
"loss": 0.3205,
"rewards/chosen": 0.12465482009084601,
"rewards/margins": 2.8608485503717955,
"rewards/rejected": -2.7361937302809496,
"step": 189
},
{
"epoch": 0.32886196451752486,
"grad_norm": 20.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11473281.454545455,
"logits/rejected": -16561514.666666666,
"logps/chosen": -248.29432262073863,
"logps/rejected": -292.7100074404762,
"loss": 0.2314,
"rewards/chosen": 0.17398832061073996,
"rewards/margins": 3.0009191387143486,
"rewards/rejected": -2.826930818103609,
"step": 190
},
{
"epoch": 0.330592816962354,
"grad_norm": 16.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8514306.823529411,
"logits/rejected": -15071434.666666666,
"logps/chosen": -131.8534725413603,
"logps/rejected": -270.87555338541665,
"loss": 0.3024,
"rewards/chosen": 0.18799910825841568,
"rewards/margins": 3.0057781406477386,
"rewards/rejected": -2.817779032389323,
"step": 191
},
{
"epoch": 0.33232366940718305,
"grad_norm": 19.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1210433.894736842,
"logits/rejected": -8759064.0,
"logps/chosen": -148.5827765213816,
"logps/rejected": -183.75860126201923,
"loss": 0.3729,
"rewards/chosen": -0.010686732436481276,
"rewards/margins": 2.6109067429053154,
"rewards/rejected": -2.621593475341797,
"step": 192
},
{
"epoch": 0.3340545218520121,
"grad_norm": 18.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1876100.6153846155,
"logits/rejected": -12178267.789473685,
"logps/chosen": -158.33451021634616,
"logps/rejected": -272.60916940789474,
"loss": 0.2534,
"rewards/chosen": 0.049141957209660456,
"rewards/margins": 3.0719871752657872,
"rewards/rejected": -3.0228452180561267,
"step": 193
},
{
"epoch": 0.3357853742968412,
"grad_norm": 18.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9391846.0,
"logits/rejected": -7076478.5,
"logps/chosen": -197.68260192871094,
"logps/rejected": -201.0494384765625,
"loss": 0.2827,
"rewards/chosen": 0.280254989862442,
"rewards/margins": 3.4145003855228424,
"rewards/rejected": -3.1342453956604004,
"step": 194
},
{
"epoch": 0.33751622674167026,
"grad_norm": 18.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -19537841.066666666,
"logits/rejected": -4787949.176470588,
"logps/chosen": -203.43776041666666,
"logps/rejected": -216.7041015625,
"loss": 0.3368,
"rewards/chosen": 0.03912758032480876,
"rewards/margins": 2.348604769332736,
"rewards/rejected": -2.3094771890079273,
"step": 195
},
{
"epoch": 0.3392470791864994,
"grad_norm": 16.625,
"kl": 0.16342926025390625,
"learning_rate": 5e-06,
"logits/chosen": -3170068.8421052634,
"logits/rejected": -18859544.615384616,
"logps/chosen": -124.50954718338816,
"logps/rejected": -328.8591120793269,
"loss": 0.2909,
"rewards/chosen": 0.2587398478859349,
"rewards/margins": 3.9374563607127078,
"rewards/rejected": -3.6787165128267727,
"step": 196
},
{
"epoch": 0.34097793163132845,
"grad_norm": 21.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 454123.25,
"logits/rejected": -4452693.0,
"logps/chosen": -233.21722412109375,
"logps/rejected": -264.53143310546875,
"loss": 0.2989,
"rewards/chosen": -0.011935576796531677,
"rewards/margins": 3.860817089676857,
"rewards/rejected": -3.8727526664733887,
"step": 197
},
{
"epoch": 0.3427087840761575,
"grad_norm": 16.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3420107.466666667,
"logits/rejected": -14321411.764705881,
"logps/chosen": -135.79720052083334,
"logps/rejected": -363.07223690257354,
"loss": 0.2492,
"rewards/chosen": 0.19408594767252604,
"rewards/margins": 3.735214442832797,
"rewards/rejected": -3.541128495160271,
"step": 198
},
{
"epoch": 0.3444396365209866,
"grad_norm": 21.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5829539.6,
"logits/rejected": -15839922.666666666,
"logps/chosen": -191.86925048828124,
"logps/rejected": -276.8868408203125,
"loss": 0.3512,
"rewards/chosen": 0.14528814554214478,
"rewards/margins": 2.9095884919166566,
"rewards/rejected": -2.7643003463745117,
"step": 199
},
{
"epoch": 0.34617048896581565,
"grad_norm": 17.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3732642.0,
"logits/rejected": -6055160.0,
"logps/chosen": -145.93505859375,
"logps/rejected": -213.4742889404297,
"loss": 0.2826,
"rewards/chosen": 0.03191981464624405,
"rewards/margins": 3.5864234939217567,
"rewards/rejected": -3.5545036792755127,
"step": 200
},
{
"epoch": 0.3479013414106447,
"grad_norm": 17.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10997382.666666666,
"logits/rejected": -13361217.6,
"logps/chosen": -214.18802897135416,
"logps/rejected": -290.353857421875,
"loss": 0.2108,
"rewards/chosen": 0.08698128660519917,
"rewards/margins": 3.8983208556969964,
"rewards/rejected": -3.811339569091797,
"step": 201
},
{
"epoch": 0.34963219385547384,
"grad_norm": 15.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -16647024.0,
"logits/rejected": -16219051.789473685,
"logps/chosen": -266.0360576923077,
"logps/rejected": -278.2851305509868,
"loss": 0.2141,
"rewards/chosen": 0.3400090290949895,
"rewards/margins": 4.1297148075180985,
"rewards/rejected": -3.7897057784231087,
"step": 202
},
{
"epoch": 0.3513630463003029,
"grad_norm": 20.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -14588352.0,
"logits/rejected": -4774448.842105263,
"logps/chosen": -257.1218825120192,
"logps/rejected": -202.72636975740133,
"loss": 0.2567,
"rewards/chosen": -0.031910451558920055,
"rewards/margins": 3.413597200322248,
"rewards/rejected": -3.445507651881168,
"step": 203
},
{
"epoch": 0.353093898745132,
"grad_norm": 21.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3514067.5,
"logits/rejected": -21243462.0,
"logps/chosen": -222.9899139404297,
"logps/rejected": -302.18115234375,
"loss": 0.2852,
"rewards/chosen": -0.013363361358642578,
"rewards/margins": 3.822953462600708,
"rewards/rejected": -3.8363168239593506,
"step": 204
},
{
"epoch": 0.35482475118996104,
"grad_norm": 23.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 10442356.444444444,
"logits/rejected": -5046877.142857143,
"logps/chosen": -239.98092990451389,
"logps/rejected": -214.3353271484375,
"loss": 0.3217,
"rewards/chosen": 0.006140223807758755,
"rewards/margins": 3.9656730977788803,
"rewards/rejected": -3.9595328739711215,
"step": 205
},
{
"epoch": 0.3565556036347901,
"grad_norm": 17.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3316098.5714285714,
"logits/rejected": -3345895.5555555555,
"logps/chosen": -141.17440359933036,
"logps/rejected": -268.8661838107639,
"loss": 0.2669,
"rewards/chosen": -0.06388027327401298,
"rewards/margins": 3.7650589526645724,
"rewards/rejected": -3.8289392259385853,
"step": 206
},
{
"epoch": 0.35828645607961923,
"grad_norm": 24.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10579598.476190476,
"logits/rejected": -7490005.818181818,
"logps/chosen": -215.40415736607142,
"logps/rejected": -398.5939275568182,
"loss": 0.3157,
"rewards/chosen": 0.11144000007992699,
"rewards/margins": 5.1400641011985355,
"rewards/rejected": -5.028624101118608,
"step": 207
},
{
"epoch": 0.3600173085244483,
"grad_norm": 18.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -2552588.470588235,
"logits/rejected": 11076235.733333332,
"logps/chosen": -175.71145450367646,
"logps/rejected": -340.4918619791667,
"loss": 0.293,
"rewards/chosen": 0.09705781235414393,
"rewards/margins": 3.936987011572894,
"rewards/rejected": -3.83992919921875,
"step": 208
},
{
"epoch": 0.36174816096927737,
"grad_norm": 17.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8181181.0,
"logits/rejected": -4103885.5,
"logps/chosen": -175.54034423828125,
"logps/rejected": -195.105712890625,
"loss": 0.3223,
"rewards/chosen": 0.006002817302942276,
"rewards/margins": 2.481421146541834,
"rewards/rejected": -2.4754183292388916,
"step": 209
},
{
"epoch": 0.36347901341410643,
"grad_norm": 21.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6023152.0,
"logits/rejected": -6930216.615384615,
"logps/chosen": -220.12386924342104,
"logps/rejected": -259.09130859375,
"loss": 0.3206,
"rewards/chosen": 0.02514595577591344,
"rewards/margins": 3.885440004378678,
"rewards/rejected": -3.8602940486027646,
"step": 210
},
{
"epoch": 0.3652098658589355,
"grad_norm": 17.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8873345.846153846,
"logits/rejected": -4841296.421052632,
"logps/chosen": -222.28448016826923,
"logps/rejected": -257.08958675986844,
"loss": 0.2588,
"rewards/chosen": -0.17216739287743202,
"rewards/margins": 3.402585816286836,
"rewards/rejected": -3.574753209164268,
"step": 211
},
{
"epoch": 0.3669407183037646,
"grad_norm": 18.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 1087914.2666666666,
"logits/rejected": -13504388.705882354,
"logps/chosen": -207.50533854166667,
"logps/rejected": -327.5465303308824,
"loss": 0.2455,
"rewards/chosen": 0.09607280890146891,
"rewards/margins": 4.1472624559028475,
"rewards/rejected": -4.051189647001379,
"step": 212
},
{
"epoch": 0.3686715707485937,
"grad_norm": 18.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12526222.933333334,
"logits/rejected": -2303273.882352941,
"logps/chosen": -217.412255859375,
"logps/rejected": -159.63197954963235,
"loss": 0.2619,
"rewards/chosen": 0.2408916155497233,
"rewards/margins": 3.14007298151652,
"rewards/rejected": -2.899181365966797,
"step": 213
},
{
"epoch": 0.37040242319342276,
"grad_norm": 21.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6768866.105263158,
"logits/rejected": -11241085.538461538,
"logps/chosen": -209.8044305098684,
"logps/rejected": -294.76034780649036,
"loss": 0.3237,
"rewards/chosen": 0.22350662632992394,
"rewards/margins": 2.894371611869287,
"rewards/rejected": -2.670864985539363,
"step": 214
},
{
"epoch": 0.3721332756382518,
"grad_norm": 20.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15369493.333333334,
"logits/rejected": 2045517.2857142857,
"logps/chosen": -224.15772840711804,
"logps/rejected": -162.84868512834822,
"loss": 0.2959,
"rewards/chosen": 0.2034378316667345,
"rewards/margins": 3.11308999667092,
"rewards/rejected": -2.9096521650041853,
"step": 215
},
{
"epoch": 0.3738641280830809,
"grad_norm": 20.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -20242884.0,
"logits/rejected": -7449432.0,
"logps/chosen": -279.4679260253906,
"logps/rejected": -204.05104064941406,
"loss": 0.2813,
"rewards/chosen": 0.30105486512184143,
"rewards/margins": 3.196340948343277,
"rewards/rejected": -2.8952860832214355,
"step": 216
},
{
"epoch": 0.37559498052791,
"grad_norm": 17.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5433502.909090909,
"logits/rejected": 500345.14285714284,
"logps/chosen": -189.3955078125,
"logps/rejected": -303.5468982514881,
"loss": 0.2097,
"rewards/chosen": 0.4285439144481312,
"rewards/margins": 3.622101653705944,
"rewards/rejected": -3.1935577392578125,
"step": 217
},
{
"epoch": 0.3773258329727391,
"grad_norm": 18.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8591011.333333334,
"logits/rejected": -6802850.4,
"logps/chosen": -284.0384928385417,
"logps/rejected": -196.76927490234374,
"loss": 0.2485,
"rewards/chosen": 0.3856252034505208,
"rewards/margins": 3.309361775716146,
"rewards/rejected": -2.923736572265625,
"step": 218
},
{
"epoch": 0.37905668541756815,
"grad_norm": 14.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 4748231.2,
"logits/rejected": -13977383.272727273,
"logps/chosen": -120.3367431640625,
"logps/rejected": -259.1859685724432,
"loss": 0.2385,
"rewards/chosen": -0.07536518573760986,
"rewards/margins": 3.3064329515803945,
"rewards/rejected": -3.3817981373180044,
"step": 219
},
{
"epoch": 0.3807875378623972,
"grad_norm": 18.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -2363141.230769231,
"logits/rejected": -15810723.368421054,
"logps/chosen": -170.66443810096155,
"logps/rejected": -376.7901675575658,
"loss": 0.2529,
"rewards/chosen": 0.12850810931279108,
"rewards/margins": 3.2996522936261132,
"rewards/rejected": -3.1711441843133223,
"step": 220
},
{
"epoch": 0.3825183903072263,
"grad_norm": 19.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11173596.8,
"logits/rejected": -14623304.470588235,
"logps/chosen": -228.55849609375,
"logps/rejected": -248.14662798713235,
"loss": 0.2426,
"rewards/chosen": 0.3422792116800944,
"rewards/margins": 3.8824749011619417,
"rewards/rejected": -3.5401956894818474,
"step": 221
},
{
"epoch": 0.3842492427520554,
"grad_norm": 17.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -20544094.769230768,
"logits/rejected": -15617290.105263159,
"logps/chosen": -238.24759615384616,
"logps/rejected": -270.76454564144734,
"loss": 0.2101,
"rewards/chosen": 0.37209848257211536,
"rewards/margins": 4.252398904035931,
"rewards/rejected": -3.880300421463816,
"step": 222
},
{
"epoch": 0.3859800951968845,
"grad_norm": 18.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2339170.5,
"logits/rejected": -14668691.0,
"logps/chosen": -168.8272247314453,
"logps/rejected": -316.6446228027344,
"loss": 0.2904,
"rewards/chosen": 0.05972611904144287,
"rewards/margins": 3.1063586473464966,
"rewards/rejected": -3.0466325283050537,
"step": 223
},
{
"epoch": 0.38771094764171354,
"grad_norm": 21.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10121347.42857143,
"logits/rejected": -9601993.777777778,
"logps/chosen": -243.92325265066964,
"logps/rejected": -305.10259331597223,
"loss": 0.2482,
"rewards/chosen": 0.2614833116531372,
"rewards/margins": 3.4309277137120566,
"rewards/rejected": -3.1694444020589194,
"step": 224
},
{
"epoch": 0.3894418000865426,
"grad_norm": 18.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 8372479.157894737,
"logits/rejected": -18436361.846153848,
"logps/chosen": -128.38247841282896,
"logps/rejected": -282.94106820913464,
"loss": 0.353,
"rewards/chosen": -0.023535085351843583,
"rewards/margins": 3.7766199674200913,
"rewards/rejected": -3.800155052771935,
"step": 225
},
{
"epoch": 0.3911726525313717,
"grad_norm": 21.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -1354891.6842105263,
"logits/rejected": -14058054.153846154,
"logps/chosen": -161.87158203125,
"logps/rejected": -339.60730919471155,
"loss": 0.304,
"rewards/chosen": 0.12897560470982602,
"rewards/margins": 4.465609140241677,
"rewards/rejected": -4.336633535531851,
"step": 226
},
{
"epoch": 0.3929035049762008,
"grad_norm": 21.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10190957.176470589,
"logits/rejected": -10005033.6,
"logps/chosen": -191.16943359375,
"logps/rejected": -214.92547200520832,
"loss": 0.3543,
"rewards/chosen": -0.27459220325245576,
"rewards/margins": 2.7234729822944193,
"rewards/rejected": -2.998065185546875,
"step": 227
},
{
"epoch": 0.39463435742102987,
"grad_norm": 16.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12754082.133333333,
"logits/rejected": 9428894.11764706,
"logps/chosen": -160.12646484375,
"logps/rejected": -193.08636833639707,
"loss": 0.2904,
"rewards/chosen": 0.34527934392293297,
"rewards/margins": 2.9829243921766095,
"rewards/rejected": -2.6376450482536766,
"step": 228
},
{
"epoch": 0.39636520986585894,
"grad_norm": 19.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11040588.631578946,
"logits/rejected": -28768635.076923076,
"logps/chosen": -202.25439453125,
"logps/rejected": -298.80213341346155,
"loss": 0.2818,
"rewards/chosen": 0.2841626719424599,
"rewards/margins": 4.040860400026144,
"rewards/rejected": -3.756697728083684,
"step": 229
},
{
"epoch": 0.398096062310688,
"grad_norm": 18.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -16696243.2,
"logits/rejected": 782789.7058823529,
"logps/chosen": -209.39425455729167,
"logps/rejected": -243.5696518841912,
"loss": 0.2405,
"rewards/chosen": 0.21385353406270344,
"rewards/margins": 3.5233376792832916,
"rewards/rejected": -3.3094841452205883,
"step": 230
},
{
"epoch": 0.39982691475551707,
"grad_norm": 17.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6847610.857142857,
"logits/rejected": -341155.47222222225,
"logps/chosen": -145.1298566545759,
"logps/rejected": -159.03776041666666,
"loss": 0.3048,
"rewards/chosen": 0.034968899829047065,
"rewards/margins": 2.7618577683728835,
"rewards/rejected": -2.7268888685438366,
"step": 231
},
{
"epoch": 0.4015577672003462,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -14881117.0,
"logits/rejected": -15492494.0,
"logps/chosen": -207.72711181640625,
"logps/rejected": -320.58319091796875,
"loss": 0.2664,
"rewards/chosen": 0.16509190201759338,
"rewards/margins": 4.239029794931412,
"rewards/rejected": -4.073937892913818,
"step": 232
},
{
"epoch": 0.40328861964517526,
"grad_norm": 20.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5929799.529411765,
"logits/rejected": -12552819.2,
"logps/chosen": -215.2191664751838,
"logps/rejected": -290.16845703125,
"loss": 0.2607,
"rewards/chosen": 0.19065551196827607,
"rewards/margins": 4.717087295008641,
"rewards/rejected": -4.526431783040365,
"step": 233
},
{
"epoch": 0.40501947209000433,
"grad_norm": 17.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 8185308.8,
"logits/rejected": -14831882.352941176,
"logps/chosen": -165.01144205729167,
"logps/rejected": -273.51809512867646,
"loss": 0.2833,
"rewards/chosen": -0.0994392474492391,
"rewards/margins": 3.8827245745004393,
"rewards/rejected": -3.9821638219496784,
"step": 234
},
{
"epoch": 0.4067503245348334,
"grad_norm": 21.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12612798.315789474,
"logits/rejected": -19476348.307692308,
"logps/chosen": -231.91385690789474,
"logps/rejected": -316.7388446514423,
"loss": 0.3007,
"rewards/chosen": 0.1857273955094187,
"rewards/margins": 5.110347803787664,
"rewards/rejected": -4.924620408278245,
"step": 235
},
{
"epoch": 0.40848117697966246,
"grad_norm": 14.1875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13798296.0,
"logits/rejected": -13150624.0,
"logps/chosen": -184.28253173828125,
"logps/rejected": -293.38729580965907,
"loss": 0.2181,
"rewards/chosen": -0.08902863264083863,
"rewards/margins": 3.767916405200958,
"rewards/rejected": -3.856945037841797,
"step": 236
},
{
"epoch": 0.4102120294244916,
"grad_norm": 19.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -5185518.933333334,
"logits/rejected": -10506853.647058824,
"logps/chosen": -195.71438802083333,
"logps/rejected": -266.6700080422794,
"loss": 0.2492,
"rewards/chosen": 0.16841630935668944,
"rewards/margins": 4.66889471727259,
"rewards/rejected": -4.5004784079159,
"step": 237
},
{
"epoch": 0.41194288186932065,
"grad_norm": 17.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -120490.13333333333,
"logits/rejected": -4157334.588235294,
"logps/chosen": -165.81774088541667,
"logps/rejected": -338.9480985753676,
"loss": 0.2444,
"rewards/chosen": 0.16940480868021648,
"rewards/margins": 3.9526734567156026,
"rewards/rejected": -3.783268648035386,
"step": 238
},
{
"epoch": 0.4136737343141497,
"grad_norm": 21.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13063847.466666667,
"logits/rejected": -7603120.0,
"logps/chosen": -217.052294921875,
"logps/rejected": -281.8909696691176,
"loss": 0.2602,
"rewards/chosen": 0.0732549508412679,
"rewards/margins": 3.9795507272084554,
"rewards/rejected": -3.9062957763671875,
"step": 239
},
{
"epoch": 0.4154045867589788,
"grad_norm": 19.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13696624.0,
"logits/rejected": 21262386.82352941,
"logps/chosen": -188.62125651041666,
"logps/rejected": -326.68488625919116,
"loss": 0.2636,
"rewards/chosen": 0.14170858065287273,
"rewards/margins": 3.8223358425439575,
"rewards/rejected": -3.6806272618910847,
"step": 240
},
{
"epoch": 0.41713543920380786,
"grad_norm": 15.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11527736.615384616,
"logits/rejected": -13245899.789473685,
"logps/chosen": -171.26971905048077,
"logps/rejected": -316.2771638569079,
"loss": 0.2359,
"rewards/chosen": 0.039726394873399004,
"rewards/margins": 4.057520953749838,
"rewards/rejected": -4.017794558876439,
"step": 241
},
{
"epoch": 0.418866291648637,
"grad_norm": 20.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10542560.0,
"logits/rejected": -15128954.0,
"logps/chosen": -197.97857666015625,
"logps/rejected": -249.15786743164062,
"loss": 0.2893,
"rewards/chosen": -0.0119645856320858,
"rewards/margins": 3.6342065073549747,
"rewards/rejected": -3.6461710929870605,
"step": 242
},
{
"epoch": 0.42059714409346605,
"grad_norm": 22.5,
"kl": 0.13220763206481934,
"learning_rate": 5e-06,
"logits/chosen": -17448896.0,
"logits/rejected": -10536552.533333333,
"logps/chosen": -242.55230353860293,
"logps/rejected": -180.8974609375,
"loss": 0.3269,
"rewards/chosen": 0.07533069217906278,
"rewards/margins": 3.0032341854245055,
"rewards/rejected": -2.927903493245443,
"step": 243
},
{
"epoch": 0.4223279965382951,
"grad_norm": 16.875,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15767290.181818182,
"logits/rejected": -11785589.333333334,
"logps/chosen": -199.07419655539772,
"logps/rejected": -267.2129371279762,
"loss": 0.1908,
"rewards/chosen": 0.3069478381763805,
"rewards/margins": 3.4843519693845275,
"rewards/rejected": -3.177404131208147,
"step": 244
},
{
"epoch": 0.4240588489831242,
"grad_norm": 16.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9023009.777777778,
"logits/rejected": -8307068.0,
"logps/chosen": -138.5511474609375,
"logps/rejected": -191.49358258928572,
"loss": 0.2808,
"rewards/chosen": 0.27963558832804364,
"rewards/margins": 3.714873745327904,
"rewards/rejected": -3.4352381569998607,
"step": 245
},
{
"epoch": 0.42578970142795325,
"grad_norm": 14.8125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12051193.6,
"logits/rejected": -10616396.235294119,
"logps/chosen": -171.37771809895833,
"logps/rejected": -222.20760569852942,
"loss": 0.2502,
"rewards/chosen": 0.1699681282043457,
"rewards/margins": 4.40653590595021,
"rewards/rejected": -4.236567777745864,
"step": 246
},
{
"epoch": 0.42752055387278237,
"grad_norm": 19.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12088685.714285715,
"logits/rejected": -5290852.0,
"logps/chosen": -208.72258649553572,
"logps/rejected": -252.75640190972223,
"loss": 0.2659,
"rewards/chosen": 0.08243453502655029,
"rewards/margins": 3.2210644483566284,
"rewards/rejected": -3.138629913330078,
"step": 247
},
{
"epoch": 0.42925140631761144,
"grad_norm": 16.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -18058203.2,
"logits/rejected": -10416639.272727273,
"logps/chosen": -224.68359375,
"logps/rejected": -289.28861860795456,
"loss": 0.2142,
"rewards/chosen": -0.14810900688171386,
"rewards/margins": 3.584905880147761,
"rewards/rejected": -3.7330148870294746,
"step": 248
},
{
"epoch": 0.4309822587624405,
"grad_norm": 18.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -15116648.533333333,
"logits/rejected": -3318948.0,
"logps/chosen": -177.97216796875,
"logps/rejected": -264.22144990808823,
"loss": 0.2523,
"rewards/chosen": 0.038343381881713864,
"rewards/margins": 4.054038841584149,
"rewards/rejected": -4.015695459702435,
"step": 249
},
{
"epoch": 0.4327131112072696,
"grad_norm": 22.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11157197.47368421,
"logits/rejected": 2493580.3076923075,
"logps/chosen": -177.07334498355263,
"logps/rejected": -290.3555438701923,
"loss": 0.3557,
"rewards/chosen": -0.19027650983710037,
"rewards/margins": 3.0934826806489273,
"rewards/rejected": -3.2837591904860277,
"step": 250
},
{
"epoch": 0.43444396365209864,
"grad_norm": 20.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -16668712.533333333,
"logits/rejected": -3976555.294117647,
"logps/chosen": -244.20847981770834,
"logps/rejected": -327.9715935202206,
"loss": 0.2648,
"rewards/chosen": 0.03703808784484863,
"rewards/margins": 4.71754776730257,
"rewards/rejected": -4.680509679457721,
"step": 251
},
{
"epoch": 0.43617481609692776,
"grad_norm": 17.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 5272862.545454546,
"logits/rejected": -6936409.904761905,
"logps/chosen": -229.92167524857953,
"logps/rejected": -213.8749534970238,
"loss": 0.2408,
"rewards/chosen": -0.30214368213306775,
"rewards/margins": 3.3398687777581153,
"rewards/rejected": -3.642012459891183,
"step": 252
},
{
"epoch": 0.43790566854175683,
"grad_norm": 19.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3558934.933333333,
"logits/rejected": -2213124.2352941176,
"logps/chosen": -207.47721354166666,
"logps/rejected": -196.65370806525735,
"loss": 0.2565,
"rewards/chosen": 0.18845229148864745,
"rewards/margins": 3.44047677376691,
"rewards/rejected": -3.2520244822782627,
"step": 253
},
{
"epoch": 0.4396365209865859,
"grad_norm": 20.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3256860.5714285714,
"logits/rejected": -1937111.3333333333,
"logps/chosen": -222.31358119419642,
"logps/rejected": -198.45515950520834,
"loss": 0.2842,
"rewards/chosen": -0.027769644345555986,
"rewards/margins": 3.096243832556028,
"rewards/rejected": -3.124013476901584,
"step": 254
},
{
"epoch": 0.44136737343141497,
"grad_norm": 20.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 2231512.0,
"logits/rejected": -15707685.714285715,
"logps/chosen": -224.28355577256946,
"logps/rejected": -377.65659877232144,
"loss": 0.283,
"rewards/chosen": 0.1279101769129435,
"rewards/margins": 4.255186188788642,
"rewards/rejected": -4.127276011875698,
"step": 255
},
{
"epoch": 0.44309822587624403,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -3650349.0,
"logits/rejected": -25763404.0,
"logps/chosen": -175.8751220703125,
"logps/rejected": -384.6330261230469,
"loss": 0.2993,
"rewards/chosen": -0.2350717931985855,
"rewards/margins": 3.420280560851097,
"rewards/rejected": -3.6553523540496826,
"step": 256
},
{
"epoch": 0.4448290783210731,
"grad_norm": 22.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8227142.095238095,
"logits/rejected": -3991596.0,
"logps/chosen": -204.65394810267858,
"logps/rejected": -230.69422496448863,
"loss": 0.3679,
"rewards/chosen": -0.06990920929681688,
"rewards/margins": 4.00661361785162,
"rewards/rejected": -4.0765228271484375,
"step": 257
},
{
"epoch": 0.4465599307659022,
"grad_norm": 20.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9966890.105263159,
"logits/rejected": -14217319.384615384,
"logps/chosen": -199.6491827713816,
"logps/rejected": -275.9331242487981,
"loss": 0.3237,
"rewards/chosen": 0.01347437344099346,
"rewards/margins": 4.150257386418007,
"rewards/rejected": -4.136783012977014,
"step": 258
},
{
"epoch": 0.4482907832107313,
"grad_norm": 20.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13260032.94117647,
"logits/rejected": -3421154.6666666665,
"logps/chosen": -238.68637982536765,
"logps/rejected": -267.1708658854167,
"loss": 0.2502,
"rewards/chosen": 0.2879646806155934,
"rewards/margins": 4.020605773551791,
"rewards/rejected": -3.732641092936198,
"step": 259
},
{
"epoch": 0.45002163565556036,
"grad_norm": 18.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -20596262.85714286,
"logits/rejected": -7963298.666666667,
"logps/chosen": -236.05545479910714,
"logps/rejected": -297.91436089409723,
"loss": 0.2213,
"rewards/chosen": 0.6324899537222726,
"rewards/margins": 4.426475108615936,
"rewards/rejected": -3.7939851548936634,
"step": 260
},
{
"epoch": 0.4517524881003894,
"grad_norm": 15.9375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -14417637.333333334,
"logits/rejected": -13684048.0,
"logps/chosen": -196.896435546875,
"logps/rejected": -312.3822380514706,
"loss": 0.1969,
"rewards/chosen": 0.5348507563273112,
"rewards/margins": 4.734749868804333,
"rewards/rejected": -4.199899112477022,
"step": 261
},
{
"epoch": 0.4534833405452185,
"grad_norm": 19.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -24702128.0,
"logits/rejected": -9365918.222222222,
"logps/chosen": -337.36460658482144,
"logps/rejected": -256.26673719618054,
"loss": 0.2306,
"rewards/chosen": 0.24765947886875697,
"rewards/margins": 4.048638669271318,
"rewards/rejected": -3.8009791904025607,
"step": 262
},
{
"epoch": 0.4552141929900476,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12827164.631578946,
"logits/rejected": -11524402.461538462,
"logps/chosen": -181.87505139802633,
"logps/rejected": -344.0277569110577,
"loss": 0.2989,
"rewards/chosen": 0.03691702453713668,
"rewards/margins": 4.332692511409883,
"rewards/rejected": -4.2957754868727465,
"step": 263
},
{
"epoch": 0.4569450454348767,
"grad_norm": 20.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -18784899.555555556,
"logits/rejected": -12124728.0,
"logps/chosen": -233.41818576388889,
"logps/rejected": -315.662109375,
"loss": 0.3184,
"rewards/chosen": 0.08528125286102295,
"rewards/margins": 3.363411920411246,
"rewards/rejected": -3.278130667550223,
"step": 264
},
{
"epoch": 0.45867589787970575,
"grad_norm": 19.125,
"kl": 0.11611628532409668,
"learning_rate": 5e-06,
"logits/chosen": 1691305.142857143,
"logits/rejected": -6767852.0,
"logps/chosen": -285.6856166294643,
"logps/rejected": -247.17659505208334,
"loss": 0.2455,
"rewards/chosen": 0.24254277774265834,
"rewards/margins": 3.9829009184761657,
"rewards/rejected": -3.740358140733507,
"step": 265
},
{
"epoch": 0.4604067503245348,
"grad_norm": 20.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11006114.4,
"logits/rejected": -19388294.666666668,
"logps/chosen": -188.975390625,
"logps/rejected": -325.20001220703125,
"loss": 0.3136,
"rewards/chosen": 0.19371647834777833,
"rewards/margins": 3.5510452111562096,
"rewards/rejected": -3.357328732808431,
"step": 266
},
{
"epoch": 0.4621376027693639,
"grad_norm": 16.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -17104797.53846154,
"logits/rejected": -6318394.105263158,
"logps/chosen": -190.21593299278845,
"logps/rejected": -221.87461451480263,
"loss": 0.2442,
"rewards/chosen": 0.2514270819150485,
"rewards/margins": 3.471239374716755,
"rewards/rejected": -3.2198122928017066,
"step": 267
},
{
"epoch": 0.463868455214193,
"grad_norm": 20.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4255990.933333334,
"logits/rejected": -16446816.0,
"logps/chosen": -253.98564453125,
"logps/rejected": -270.46570542279414,
"loss": 0.2827,
"rewards/chosen": -0.036499599615732826,
"rewards/margins": 3.59961351109486,
"rewards/rejected": -3.6361131107105926,
"step": 268
},
{
"epoch": 0.4655993076590221,
"grad_norm": 17.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11160579.2,
"logits/rejected": -10607611.294117646,
"logps/chosen": -202.769384765625,
"logps/rejected": -240.4682186351103,
"loss": 0.2514,
"rewards/chosen": 0.4125640551249186,
"rewards/margins": 3.6268443518993903,
"rewards/rejected": -3.2142802967744717,
"step": 269
},
{
"epoch": 0.46733016010385114,
"grad_norm": 18.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -13615528.888888888,
"logits/rejected": -6793550.857142857,
"logps/chosen": -177.54604763454861,
"logps/rejected": -297.06703404017856,
"loss": 0.2885,
"rewards/chosen": 0.25184231334262425,
"rewards/margins": 3.5622371454087514,
"rewards/rejected": -3.310394832066127,
"step": 270
},
{
"epoch": 0.4690610125486802,
"grad_norm": 21.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -2008512.4210526317,
"logits/rejected": 1065751.2307692308,
"logps/chosen": -230.33552631578948,
"logps/rejected": -225.39548903245193,
"loss": 0.2912,
"rewards/chosen": 0.33769374144704717,
"rewards/margins": 3.8641099331349977,
"rewards/rejected": -3.5264161916879506,
"step": 271
},
{
"epoch": 0.4707918649935093,
"grad_norm": 20.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8079465.5,
"logits/rejected": -4624071.0,
"logps/chosen": -245.87030029296875,
"logps/rejected": -286.6881103515625,
"loss": 0.256,
"rewards/chosen": 0.18378782272338867,
"rewards/margins": 4.057176828384399,
"rewards/rejected": -3.8733890056610107,
"step": 272
},
{
"epoch": 0.4725227174383384,
"grad_norm": 17.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": 34063.02272727273,
"logits/rejected": 5305572.4,
"logps/chosen": -97.9369229403409,
"logps/rejected": -163.92113037109374,
"loss": 0.362,
"rewards/chosen": 0.07122220234437422,
"rewards/margins": 3.202643482251601,
"rewards/rejected": -3.1314212799072267,
"step": 273
},
{
"epoch": 0.47425356988316747,
"grad_norm": 19.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8214460.0,
"logits/rejected": -21400504.0,
"logps/chosen": -159.88619995117188,
"logps/rejected": -304.96246337890625,
"loss": 0.2742,
"rewards/chosen": 0.257793664932251,
"rewards/margins": 3.4434404373168945,
"rewards/rejected": -3.1856467723846436,
"step": 274
},
{
"epoch": 0.47598442232799654,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9168399.157894736,
"logits/rejected": -14751639.384615384,
"logps/chosen": -186.708251953125,
"logps/rejected": -292.65147986778845,
"loss": 0.2819,
"rewards/chosen": 0.2397254642687346,
"rewards/margins": 4.295501033304191,
"rewards/rejected": -4.055775569035457,
"step": 275
},
{
"epoch": 0.4777152747728256,
"grad_norm": 20.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9120151.529411765,
"logits/rejected": -2838455.7333333334,
"logps/chosen": -179.58385512408088,
"logps/rejected": -274.78151041666666,
"loss": 0.2887,
"rewards/chosen": 0.22131121859830968,
"rewards/margins": 3.1119616499134137,
"rewards/rejected": -2.890650431315104,
"step": 276
},
{
"epoch": 0.47944612721765467,
"grad_norm": 17.625,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -17451059.42857143,
"logits/rejected": -19632037.333333332,
"logps/chosen": -194.51698521205358,
"logps/rejected": -261.6064453125,
"loss": 0.255,
"rewards/chosen": 0.24481826169150217,
"rewards/margins": 3.8417675211316062,
"rewards/rejected": -3.596949259440104,
"step": 277
},
{
"epoch": 0.4811769796624838,
"grad_norm": 18.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11978563.2,
"logits/rejected": -11180960.94117647,
"logps/chosen": -204.08976236979166,
"logps/rejected": -332.37023207720586,
"loss": 0.2457,
"rewards/chosen": 0.2240306536356608,
"rewards/margins": 3.5974735839694154,
"rewards/rejected": -3.3734429303337548,
"step": 278
},
{
"epoch": 0.48290783210731286,
"grad_norm": 17.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -4350533.866666666,
"logits/rejected": -8590544.94117647,
"logps/chosen": -151.40662434895833,
"logps/rejected": -249.31729664522058,
"loss": 0.2934,
"rewards/chosen": 0.01796001394589742,
"rewards/margins": 2.6930718967727585,
"rewards/rejected": -2.675111882826861,
"step": 279
},
{
"epoch": 0.48463868455214193,
"grad_norm": 19.375,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -10933750.4,
"logits/rejected": -15255677.176470589,
"logps/chosen": -171.53665364583333,
"logps/rejected": -308.31043198529414,
"loss": 0.2806,
"rewards/chosen": 0.030906534194946288,
"rewards/margins": 4.104628924762501,
"rewards/rejected": -4.073722390567555,
"step": 280
},
{
"epoch": 0.486369536996971,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8110217.142857143,
"logits/rejected": -18321370.181818184,
"logps/chosen": -182.0908435639881,
"logps/rejected": -304.40236594460225,
"loss": 0.3329,
"rewards/chosen": 0.09797722952706474,
"rewards/margins": 3.671043247371525,
"rewards/rejected": -3.5730660178444604,
"step": 281
},
{
"epoch": 0.48810038944180006,
"grad_norm": 19.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -8963434.0,
"logits/rejected": -6483309.0,
"logps/chosen": -224.63430786132812,
"logps/rejected": -291.1379699707031,
"loss": 0.2706,
"rewards/chosen": 0.08380473405122757,
"rewards/margins": 3.3000806644558907,
"rewards/rejected": -3.216275930404663,
"step": 282
},
{
"epoch": 0.4898312418866292,
"grad_norm": 17.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -9069934.0,
"logits/rejected": -22540276.0,
"logps/chosen": -141.39923095703125,
"logps/rejected": -317.710693359375,
"loss": 0.3155,
"rewards/chosen": -0.18840433657169342,
"rewards/margins": 3.7216622680425644,
"rewards/rejected": -3.910066604614258,
"step": 283
},
{
"epoch": 0.49156209433145825,
"grad_norm": 23.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -11534782.315789474,
"logits/rejected": -14227288.615384616,
"logps/chosen": -347.19901315789474,
"logps/rejected": -233.33997521033655,
"loss": 0.2763,
"rewards/chosen": 0.6239749506900185,
"rewards/margins": 3.671866420792182,
"rewards/rejected": -3.0478914701021633,
"step": 284
},
{
"epoch": 0.4932929467762873,
"grad_norm": 19.0,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -7307938.52631579,
"logits/rejected": -6305785.846153846,
"logps/chosen": -182.99164782072367,
"logps/rejected": -234.64518855168268,
"loss": 0.3165,
"rewards/chosen": 0.0814883081536544,
"rewards/margins": 3.057790234986587,
"rewards/rejected": -2.9763019268329325,
"step": 285
},
{
"epoch": 0.4950237992211164,
"grad_norm": 17.125,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -2387972.2666666666,
"logits/rejected": -10976164.705882354,
"logps/chosen": -138.42179361979166,
"logps/rejected": -257.4625459558824,
"loss": 0.2974,
"rewards/chosen": -0.04797365665435791,
"rewards/margins": 3.496035608123331,
"rewards/rejected": -3.5440092647776886,
"step": 286
},
{
"epoch": 0.49675465166594546,
"grad_norm": 16.25,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -17446714.0,
"logits/rejected": -4344534.5,
"logps/chosen": -249.31387329101562,
"logps/rejected": -275.65106201171875,
"loss": 0.2517,
"rewards/chosen": 0.2581062614917755,
"rewards/margins": 3.798191577196121,
"rewards/rejected": -3.5400853157043457,
"step": 287
},
{
"epoch": 0.4984855041107746,
"grad_norm": 18.75,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -6731353.846153846,
"logits/rejected": -10663467.789473685,
"logps/chosen": -210.49735201322116,
"logps/rejected": -270.64185855263156,
"loss": 0.2199,
"rewards/chosen": 0.23169115873483512,
"rewards/margins": 4.13718504558208,
"rewards/rejected": -3.905493886847245,
"step": 288
},
{
"epoch": 0.5002163565556036,
"grad_norm": 20.5,
"kl": 0.0,
"learning_rate": 5e-06,
"logits/chosen": -12233035.2,
"logits/rejected": -17931900.0,
"logps/chosen": -199.65758056640624,
"logps/rejected": -295.8255208333333,
"loss": 0.3644,
"rewards/chosen": -0.10184909105300903,
"rewards/margins": 2.8798390905062354,
"rewards/rejected": -2.9816881815592446,
"step": 289
}
],
"logging_steps": 1,
"max_steps": 578,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 289,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}