{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5002163565556036, "eval_steps": 500, "global_step": 289, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017308524448290783, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 1.4285714285714287e-07, "logits/chosen": -6239313.454545454, "logits/rejected": -4940240.761904762, "logps/chosen": -236.17436079545453, "logps/rejected": -209.70107886904762, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0034617048896581565, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "logits/chosen": -2665428.3076923075, "logits/rejected": -1073632.5263157894, "logps/chosen": -155.0839562049279, "logps/rejected": -255.23524876644737, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.005192557334487235, "grad_norm": 39.25, "kl": 0.11506986618041992, "learning_rate": 4.285714285714286e-07, "logits/chosen": -1627763.2, "logits/rejected": -1337906.8235294118, "logps/chosen": -214.50914713541667, "logps/rejected": -210.7369887408088, "loss": 0.5011, "rewards/chosen": -0.005750782291094462, "rewards/margins": -0.006572681913773219, "rewards/rejected": 0.0008218996226787567, "step": 3 }, { "epoch": 0.006923409779316313, "grad_norm": 33.5, "kl": 0.13516950607299805, "learning_rate": 5.714285714285715e-07, "logits/chosen": -9900144.0, "logits/rejected": 2390790.4, "logps/chosen": -236.11850873161765, "logps/rejected": -187.40989583333334, "loss": 0.5052, "rewards/chosen": 0.00019769019940320184, "rewards/margins": -0.03652356716932035, "rewards/rejected": 0.03672125736872355, "step": 4 }, { "epoch": 0.00865426222414539, "grad_norm": 39.0, "kl": 0.16583895683288574, "learning_rate": 7.142857142857143e-07, "logits/chosen": 273089.5, "logits/rejected": -8537414.0, "logps/chosen": -191.99412536621094, "logps/rejected": -250.5272216796875, "loss": 0.5, "rewards/chosen": -0.018333029001951218, "rewards/margins": -0.014739224454388022, "rewards/rejected": -0.0035938045475631952, "step": 5 }, { "epoch": 0.01038511466897447, "grad_norm": 37.0, "kl": 0.1269383430480957, "learning_rate": 8.571428571428572e-07, "logits/chosen": 625734.125, "logits/rejected": -3864760.5, "logps/chosen": -130.1186065673828, "logps/rejected": -263.1868591308594, "loss": 0.4974, "rewards/chosen": 0.005134785547852516, "rewards/margins": 0.02763364464044571, "rewards/rejected": -0.022498859092593193, "step": 6 }, { "epoch": 0.012115967113803548, "grad_norm": 45.5, "kl": 0.058301448822021484, "learning_rate": 1.0000000000000002e-06, "logits/chosen": 11427891.2, "logits/rejected": -8692000.94117647, "logps/chosen": -247.42198893229167, "logps/rejected": -318.3026769301471, "loss": 0.4879, "rewards/chosen": 0.020061949888865154, "rewards/margins": 0.09507267031015135, "rewards/rejected": -0.07501072042128619, "step": 7 }, { "epoch": 0.013846819558632626, "grad_norm": 34.75, "kl": 0.10779595375061035, "learning_rate": 1.142857142857143e-06, "logits/chosen": 9745310.315789474, "logits/rejected": 4968272.0, "logps/chosen": -266.39432565789474, "logps/rejected": -200.4144568810096, "loss": 0.4926, "rewards/chosen": 0.03307872383218063, "rewards/margins": 0.08410431619597833, "rewards/rejected": -0.0510255923637977, "step": 8 }, { "epoch": 0.015577672003461706, "grad_norm": 35.75, "kl": 0.0004693269729614258, "learning_rate": 1.2857142857142856e-06, "logits/chosen": 2292229.3333333335, "logits/rejected": -5866576.571428572, "logps/chosen": -164.06934950086804, "logps/rejected": -255.37636021205358, "loss": 0.4857, "rewards/chosen": 0.017767790291044448, "rewards/margins": 0.12696768035964362, "rewards/rejected": -0.10919989006859916, "step": 9 }, { "epoch": 0.01730852444829078, "grad_norm": 58.0, "kl": 0.08134031295776367, "learning_rate": 1.4285714285714286e-06, "logits/chosen": -2206551.1428571427, "logits/rejected": -2327785.3333333335, "logps/chosen": -210.65478515625, "logps/rejected": -399.8186848958333, "loss": 0.4595, "rewards/chosen": 0.041608184576034546, "rewards/margins": 0.30469969577259487, "rewards/rejected": -0.2630915111965603, "step": 10 }, { "epoch": 0.019039376893119863, "grad_norm": 33.0, "kl": 0.06869983673095703, "learning_rate": 1.5714285714285714e-06, "logits/chosen": 5004582.315789473, "logits/rejected": 15390077.538461538, "logps/chosen": -168.22392835115133, "logps/rejected": -250.3277869591346, "loss": 0.4842, "rewards/chosen": 0.025214639149214093, "rewards/margins": 0.15980646617499442, "rewards/rejected": -0.13459182702578032, "step": 11 }, { "epoch": 0.02077022933794894, "grad_norm": 35.25, "kl": 0.03198128938674927, "learning_rate": 1.7142857142857145e-06, "logits/chosen": -6019438.5, "logits/rejected": -12351150.0, "logps/chosen": -203.1639404296875, "logps/rejected": -248.1376495361328, "loss": 0.4481, "rewards/chosen": 0.022723043337464333, "rewards/margins": 0.46131726540625095, "rewards/rejected": -0.4385942220687866, "step": 12 }, { "epoch": 0.02250108178277802, "grad_norm": 27.375, "kl": 0.03030562400817871, "learning_rate": 1.8571428571428573e-06, "logits/chosen": -1771344.705882353, "logits/rejected": 4848613.333333333, "logps/chosen": -156.79733455882354, "logps/rejected": -143.33746744791668, "loss": 0.4641, "rewards/chosen": 0.040735574329600614, "rewards/margins": 0.31227434055477965, "rewards/rejected": -0.27153876622517903, "step": 13 }, { "epoch": 0.024231934227607096, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -402277.15789473685, "logits/rejected": 8271367.384615385, "logps/chosen": -283.14478824013156, "logps/rejected": -204.32831280048077, "loss": 0.4584, "rewards/chosen": 0.008756919126761587, "rewards/margins": 0.42188657892619064, "rewards/rejected": -0.41312965979942906, "step": 14 }, { "epoch": 0.025962786672436174, "grad_norm": 36.75, "kl": 0.0032711029052734375, "learning_rate": 2.1428571428571427e-06, "logits/chosen": -811063.5882352941, "logits/rejected": -12282100.266666668, "logps/chosen": -176.79848345588235, "logps/rejected": -325.2669270833333, "loss": 0.3948, "rewards/chosen": 0.08565068244934082, "rewards/margins": 0.9919464588165283, "rewards/rejected": -0.9062957763671875, "step": 15 }, { "epoch": 0.027693639117265252, "grad_norm": 30.375, "kl": 0.007568359375, "learning_rate": 2.285714285714286e-06, "logits/chosen": -1084260.0, "logits/rejected": 31534450.666666668, "logps/chosen": -210.231787109375, "logps/rejected": -276.17873128255206, "loss": 0.4251, "rewards/chosen": 0.08407727479934693, "rewards/margins": 0.8169409394264221, "rewards/rejected": -0.7328636646270752, "step": 16 }, { "epoch": 0.02942449156209433, "grad_norm": 30.0, "kl": 0.0, "learning_rate": 2.428571428571429e-06, "logits/chosen": -1036163.6923076923, "logits/rejected": 961232.6315789474, "logps/chosen": -268.2616624098558, "logps/rejected": -225.40373149671052, "loss": 0.4054, "rewards/chosen": 0.010549396276473999, "rewards/margins": 0.7161211230252919, "rewards/rejected": -0.7055717267488179, "step": 17 }, { "epoch": 0.03115534400692341, "grad_norm": 26.25, "kl": 0.0, "learning_rate": 2.571428571428571e-06, "logits/chosen": 4818987.555555556, "logits/rejected": -827468.8571428572, "logps/chosen": -251.50027126736111, "logps/rejected": -213.26834542410714, "loss": 0.4032, "rewards/chosen": 0.03222567505306668, "rewards/margins": 1.113093238028269, "rewards/rejected": -1.0808675629752023, "step": 18 }, { "epoch": 0.03288619645175249, "grad_norm": 24.625, "kl": 0.0, "learning_rate": 2.7142857142857144e-06, "logits/chosen": -1698852.380952381, "logits/rejected": 2662217.6363636362, "logps/chosen": -172.64027622767858, "logps/rejected": -205.12626509232953, "loss": 0.4404, "rewards/chosen": -0.05192979744502476, "rewards/margins": 0.9354158980505807, "rewards/rejected": -0.9873456954956055, "step": 19 }, { "epoch": 0.03461704889658156, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 2.8571428571428573e-06, "logits/chosen": 6553660.0, "logits/rejected": -5781368.444444444, "logps/chosen": -143.31090436662947, "logps/rejected": -192.53050401475696, "loss": 0.3749, "rewards/chosen": 0.09408829041889735, "rewards/margins": 1.171789647094787, "rewards/rejected": -1.0777013566758897, "step": 20 }, { "epoch": 0.036347901341410645, "grad_norm": 20.375, "kl": 0.012153387069702148, "learning_rate": 3e-06, "logits/chosen": 1939333.8666666667, "logits/rejected": 1052395.0588235294, "logps/chosen": -177.10651041666668, "logps/rejected": -190.28341854319854, "loss": 0.3776, "rewards/chosen": 0.045921965440114336, "rewards/margins": 1.3893623017797283, "rewards/rejected": -1.343440336339614, "step": 21 }, { "epoch": 0.038078753786239726, "grad_norm": 21.0, "kl": 0.0, "learning_rate": 3.142857142857143e-06, "logits/chosen": 3900064.5, "logits/rejected": 2436417.0, "logps/chosen": -188.06832885742188, "logps/rejected": -307.90692138671875, "loss": 0.3542, "rewards/chosen": -0.08758784085512161, "rewards/margins": 1.7587207481265068, "rewards/rejected": -1.8463085889816284, "step": 22 }, { "epoch": 0.0398096062310688, "grad_norm": 22.625, "kl": 0.000914454460144043, "learning_rate": 3.285714285714286e-06, "logits/chosen": -42746.86666666667, "logits/rejected": 1372338.8235294118, "logps/chosen": -226.461865234375, "logps/rejected": -258.68396714154414, "loss": 0.3754, "rewards/chosen": -0.14824188550313314, "rewards/margins": 1.6312208166309432, "rewards/rejected": -1.7794627021340763, "step": 23 }, { "epoch": 0.04154045867589788, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 3.428571428571429e-06, "logits/chosen": -1758418.3333333333, "logits/rejected": 7633656.0, "logps/chosen": -143.17743598090277, "logps/rejected": -136.41956438337053, "loss": 0.4083, "rewards/chosen": -0.15943604045444065, "rewards/margins": 1.379212019935487, "rewards/rejected": -1.5386480603899275, "step": 24 }, { "epoch": 0.043271311120726956, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -3627161.777777778, "logits/rejected": -1282915.142857143, "logps/chosen": -189.07590060763889, "logps/rejected": -282.23025948660717, "loss": 0.3667, "rewards/chosen": -0.05861267778608534, "rewards/margins": 2.27708803850507, "rewards/rejected": -2.335700716291155, "step": 25 }, { "epoch": 0.04500216356555604, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 3.7142857142857146e-06, "logits/chosen": 3197816.8571428573, "logits/rejected": -24990.666666666668, "logps/chosen": -133.38133893694197, "logps/rejected": -252.71739366319446, "loss": 0.3729, "rewards/chosen": -0.2894209793635777, "rewards/margins": 1.5688878127506802, "rewards/rejected": -1.8583087921142578, "step": 26 }, { "epoch": 0.04673301601038511, "grad_norm": 22.0, "kl": 0.0, "learning_rate": 3.857142857142858e-06, "logits/chosen": 8530238.857142856, "logits/rejected": -6356856.888888889, "logps/chosen": -228.37095424107142, "logps/rejected": -290.1540256076389, "loss": 0.3187, "rewards/chosen": -0.04233703442982265, "rewards/margins": 2.3515855594286843, "rewards/rejected": -2.393922593858507, "step": 27 }, { "epoch": 0.04846386845521419, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 4.000000000000001e-06, "logits/chosen": 11541931.294117646, "logits/rejected": -18899741.866666667, "logps/chosen": -161.64244887408088, "logps/rejected": -298.1225911458333, "loss": 0.3221, "rewards/chosen": -0.20178667236776912, "rewards/margins": 3.408641692703845, "rewards/rejected": -3.6104283650716145, "step": 28 }, { "epoch": 0.050194720900043274, "grad_norm": 21.125, "kl": 0.0, "learning_rate": 4.1428571428571435e-06, "logits/chosen": -8176106.0, "logits/rejected": 8892046.0, "logps/chosen": -202.91030883789062, "logps/rejected": -320.7770690917969, "loss": 0.3603, "rewards/chosen": -0.17716556787490845, "rewards/margins": 2.8113109469413757, "rewards/rejected": -2.988476514816284, "step": 29 }, { "epoch": 0.05192557334487235, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 4.2857142857142855e-06, "logits/chosen": 1024021.4736842106, "logits/rejected": 4154760.0, "logps/chosen": -166.43669048108552, "logps/rejected": -148.49478853665866, "loss": 0.4417, "rewards/chosen": -0.38098611329731186, "rewards/margins": 1.4392063704579465, "rewards/rejected": -1.8201924837552583, "step": 30 }, { "epoch": 0.05365642578970143, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 4.428571428571429e-06, "logits/chosen": 12327768.727272727, "logits/rejected": -4335401.904761905, "logps/chosen": -141.8250732421875, "logps/rejected": -224.83528645833334, "loss": 0.3034, "rewards/chosen": -0.04076832803812894, "rewards/margins": 2.3152393841898284, "rewards/rejected": -2.3560077122279575, "step": 31 }, { "epoch": 0.055387278234530504, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 4.571428571428572e-06, "logits/chosen": -7621606.545454546, "logits/rejected": -5030670.857142857, "logps/chosen": -168.98353160511363, "logps/rejected": -240.65597098214286, "loss": 0.294, "rewards/chosen": -0.1550229029221968, "rewards/margins": 2.5555503843150613, "rewards/rejected": -2.710573287237258, "step": 32 }, { "epoch": 0.057118130679359586, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 4.714285714285715e-06, "logits/chosen": -4837878.153846154, "logits/rejected": -2727320.4210526315, "logps/chosen": -260.5615985576923, "logps/rejected": -309.63633326480266, "loss": 0.3038, "rewards/chosen": -0.1790018998659574, "rewards/margins": 3.2555804590464605, "rewards/rejected": -3.434582358912418, "step": 33 }, { "epoch": 0.05884898312418866, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 4.857142857142858e-06, "logits/chosen": 334761.73333333334, "logits/rejected": -6532100.705882353, "logps/chosen": -111.3039794921875, "logps/rejected": -291.9061638327206, "loss": 0.3181, "rewards/chosen": -0.061692579587300615, "rewards/margins": 3.2174934447980394, "rewards/rejected": -3.27918602438534, "step": 34 }, { "epoch": 0.06057983556901774, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5367810.0, "logits/rejected": -11132618.0, "logps/chosen": -157.48095703125, "logps/rejected": -343.7468566894531, "loss": 0.3208, "rewards/chosen": -0.24039308726787567, "rewards/margins": 3.439171150326729, "rewards/rejected": -3.6795642375946045, "step": 35 }, { "epoch": 0.06231068801384682, "grad_norm": 21.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4363456.888888889, "logits/rejected": -3689830.285714286, "logps/chosen": -200.380126953125, "logps/rejected": -326.18729073660717, "loss": 0.3453, "rewards/chosen": -0.15904908710055882, "rewards/margins": 3.3486854840838722, "rewards/rejected": -3.507734571184431, "step": 36 }, { "epoch": 0.0640415404586759, "grad_norm": 24.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14376078.4, "logits/rejected": -13662346.666666666, "logps/chosen": -232.297265625, "logps/rejected": -266.85874430338544, "loss": 0.3776, "rewards/chosen": -0.15787798166275024, "rewards/margins": 3.1688521107037864, "rewards/rejected": -3.3267300923665366, "step": 37 }, { "epoch": 0.06577239290350498, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9234006.4, "logits/rejected": 8716803.764705881, "logps/chosen": -195.22076822916668, "logps/rejected": -223.59991096047793, "loss": 0.3266, "rewards/chosen": 0.033547862370808916, "rewards/margins": 2.3299624059714525, "rewards/rejected": -2.2964145436006436, "step": 38 }, { "epoch": 0.06750324534833406, "grad_norm": 23.5, "kl": 0.22034478187561035, "learning_rate": 5e-06, "logits/chosen": -10664284.0, "logits/rejected": -6822504.5, "logps/chosen": -254.58663940429688, "logps/rejected": -303.74578857421875, "loss": 0.3069, "rewards/chosen": -0.025931095704436302, "rewards/margins": 2.927818799391389, "rewards/rejected": -2.953749895095825, "step": 39 }, { "epoch": 0.06923409779316313, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3069599.6666666665, "logits/rejected": -6284954.8, "logps/chosen": -277.9341634114583, "logps/rejected": -314.748974609375, "loss": 0.2888, "rewards/chosen": -0.13368964195251465, "rewards/margins": 2.755869913101196, "rewards/rejected": -2.8895595550537108, "step": 40 }, { "epoch": 0.07096495023799221, "grad_norm": 22.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 946472.0, "logits/rejected": 9102588.235294119, "logps/chosen": -248.21149088541668, "logps/rejected": -162.85635914522058, "loss": 0.3098, "rewards/chosen": 0.07899113496144612, "rewards/margins": 2.07641756067089, "rewards/rejected": -1.9974264257094438, "step": 41 }, { "epoch": 0.07269580268282129, "grad_norm": 21.0, "kl": 0.07075059413909912, "learning_rate": 5e-06, "logits/chosen": 7388380.19047619, "logits/rejected": -89609.45454545454, "logps/chosen": -191.81854538690476, "logps/rejected": -390.0617009943182, "loss": 0.3575, "rewards/chosen": 0.03016080175127302, "rewards/margins": 3.0582882986440287, "rewards/rejected": -3.028127496892756, "step": 42 }, { "epoch": 0.07442665512765037, "grad_norm": 22.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6876745.6, "logits/rejected": -1067452.5, "logps/chosen": -200.07301025390626, "logps/rejected": -290.6153564453125, "loss": 0.3613, "rewards/chosen": -0.03498818874359131, "rewards/margins": 2.9018725315729776, "rewards/rejected": -2.936860720316569, "step": 43 }, { "epoch": 0.07615750757247945, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3785227.111111111, "logits/rejected": 1574901.0, "logps/chosen": -188.12406412760416, "logps/rejected": -150.00552804129464, "loss": 0.3361, "rewards/chosen": 0.11941173341539171, "rewards/margins": 2.0707845612177773, "rewards/rejected": -1.9513728278023856, "step": 44 }, { "epoch": 0.07788836001730852, "grad_norm": 18.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3288104.0, "logits/rejected": -1550288.125, "logps/chosen": -169.7630157470703, "logps/rejected": -244.53619384765625, "loss": 0.3335, "rewards/chosen": -0.1161470040678978, "rewards/margins": 2.0714645758271217, "rewards/rejected": -2.1876115798950195, "step": 45 }, { "epoch": 0.0796192124621376, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2523540.222222222, "logits/rejected": 807282.2142857143, "logps/chosen": -206.785400390625, "logps/rejected": -197.57388741629464, "loss": 0.3727, "rewards/chosen": -0.07473884688483344, "rewards/margins": 1.7757124862973652, "rewards/rejected": -1.8504513331821986, "step": 46 }, { "epoch": 0.08135006490696668, "grad_norm": 22.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3043348.533333333, "logits/rejected": 10325686.588235294, "logps/chosen": -231.226220703125, "logps/rejected": -169.99207261029412, "loss": 0.3148, "rewards/chosen": 0.14197413126627603, "rewards/margins": 2.014080571193321, "rewards/rejected": -1.872106439927045, "step": 47 }, { "epoch": 0.08308091735179576, "grad_norm": 18.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1849486.9411764706, "logits/rejected": -2160865.3333333335, "logps/chosen": -169.53436638327207, "logps/rejected": -239.082568359375, "loss": 0.3451, "rewards/chosen": -0.1786177158355713, "rewards/margins": 2.253294515609741, "rewards/rejected": -2.4319122314453123, "step": 48 }, { "epoch": 0.08481176979662484, "grad_norm": 21.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 900298.5714285715, "logits/rejected": -6812352.7272727275, "logps/chosen": -166.2104259672619, "logps/rejected": -353.5106312144886, "loss": 0.3726, "rewards/chosen": -0.07294606594812303, "rewards/margins": 2.6174716572740895, "rewards/rejected": -2.6904177232222124, "step": 49 }, { "epoch": 0.08654262224145391, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9324354.461538462, "logits/rejected": -14040514.52631579, "logps/chosen": -223.0580115685096, "logps/rejected": -286.9508634868421, "loss": 0.2674, "rewards/chosen": 0.13786140772012564, "rewards/margins": 2.67845962452985, "rewards/rejected": -2.5405982168097245, "step": 50 }, { "epoch": 0.088273474686283, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 930246.5, "logits/rejected": -7422215.0, "logps/chosen": -238.2467803955078, "logps/rejected": -259.29217529296875, "loss": 0.294, "rewards/chosen": 0.032952681183815, "rewards/margins": 3.1361082941293716, "rewards/rejected": -3.1031556129455566, "step": 51 }, { "epoch": 0.09000432713111207, "grad_norm": 20.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2028204.625, "logits/rejected": -1362418.25, "logps/chosen": -190.04937744140625, "logps/rejected": -216.1826171875, "loss": 0.3165, "rewards/chosen": -0.050155334174633026, "rewards/margins": 2.5483616068959236, "rewards/rejected": -2.5985169410705566, "step": 52 }, { "epoch": 0.09173517957594116, "grad_norm": 26.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1491913.3333333333, "logits/rejected": 17979700.363636363, "logps/chosen": -285.51971726190476, "logps/rejected": -230.9169256036932, "loss": 0.3878, "rewards/chosen": -0.13681457156226748, "rewards/margins": 2.993390062670687, "rewards/rejected": -3.1302046342329546, "step": 53 }, { "epoch": 0.09346603202077022, "grad_norm": 16.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5621796.571428572, "logits/rejected": -3721314.222222222, "logps/chosen": -154.634521484375, "logps/rejected": -144.23682996961804, "loss": 0.3166, "rewards/chosen": 0.15212011337280273, "rewards/margins": 2.1092937787373858, "rewards/rejected": -1.9571736653645833, "step": 54 }, { "epoch": 0.0951968844655993, "grad_norm": 20.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13213840.0, "logits/rejected": -2418853.6, "logps/chosen": -264.71543375651044, "logps/rejected": -299.3638916015625, "loss": 0.2498, "rewards/chosen": 0.169629176457723, "rewards/margins": 2.8914440949757894, "rewards/rejected": -2.7218149185180662, "step": 55 }, { "epoch": 0.09692773691042839, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1270494.2222222222, "logits/rejected": 3427853.1428571427, "logps/chosen": -218.35677083333334, "logps/rejected": -151.46371023995536, "loss": 0.3539, "rewards/chosen": 0.18590817186567518, "rewards/margins": 1.9470306029395452, "rewards/rejected": -1.76112243107387, "step": 56 }, { "epoch": 0.09865858935525747, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 960138.5, "logits/rejected": -11440672.0, "logps/chosen": -194.89010620117188, "logps/rejected": -279.1370544433594, "loss": 0.3033, "rewards/chosen": 0.042164143174886703, "rewards/margins": 2.7914009653031826, "rewards/rejected": -2.749236822128296, "step": 57 }, { "epoch": 0.10038944180008655, "grad_norm": 17.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3341974.933333333, "logits/rejected": -3835131.7647058824, "logps/chosen": -99.36764322916666, "logps/rejected": -268.3916015625, "loss": 0.3016, "rewards/chosen": -0.05830394426981608, "rewards/margins": 2.6891182179544484, "rewards/rejected": -2.7474221622242645, "step": 58 }, { "epoch": 0.10212029424491562, "grad_norm": 20.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5338708.2105263155, "logits/rejected": -522996.92307692306, "logps/chosen": -177.21720805921052, "logps/rejected": -242.44989483173077, "loss": 0.3425, "rewards/chosen": -0.03957033157348633, "rewards/margins": 3.2665699812082143, "rewards/rejected": -3.3061403127817006, "step": 59 }, { "epoch": 0.1038511466897447, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4021004.5714285714, "logits/rejected": 3649617.3333333335, "logps/chosen": -150.47998046875, "logps/rejected": -242.73616536458334, "loss": 0.2824, "rewards/chosen": 0.04531372019222805, "rewards/margins": 2.6572553756691164, "rewards/rejected": -2.611941655476888, "step": 60 }, { "epoch": 0.10558199913457378, "grad_norm": 20.0, "kl": 0.0002346038818359375, "learning_rate": 5e-06, "logits/chosen": -4345515.0, "logits/rejected": -8544926.0, "logps/chosen": -253.4796142578125, "logps/rejected": -246.94094848632812, "loss": 0.2793, "rewards/chosen": 0.15215471386909485, "rewards/margins": 3.54859259724617, "rewards/rejected": -3.396437883377075, "step": 61 }, { "epoch": 0.10731285157940286, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1196661.3333333333, "logits/rejected": -3318421.1428571427, "logps/chosen": -154.80669487847223, "logps/rejected": -289.3799525669643, "loss": 0.3092, "rewards/chosen": 0.0665718052122328, "rewards/margins": 3.497844584404476, "rewards/rejected": -3.4312727791922435, "step": 62 }, { "epoch": 0.10904370402423194, "grad_norm": 20.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3965800.4444444445, "logits/rejected": -11330329.142857144, "logps/chosen": -182.16238064236111, "logps/rejected": -249.26039341517858, "loss": 0.3376, "rewards/chosen": -0.1392565303378635, "rewards/margins": 3.219647899506584, "rewards/rejected": -3.3589044298444475, "step": 63 }, { "epoch": 0.11077455646906101, "grad_norm": 19.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2474531.25, "logits/rejected": -5974583.5, "logps/chosen": -209.27244567871094, "logps/rejected": -302.0538635253906, "loss": 0.325, "rewards/chosen": 0.020520292222499847, "rewards/margins": 3.1491325721144676, "rewards/rejected": -3.1286122798919678, "step": 64 }, { "epoch": 0.11250540891389009, "grad_norm": 14.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5467615.384615385, "logits/rejected": -3365480.0, "logps/chosen": -144.33649151141827, "logps/rejected": -279.4246761924342, "loss": 0.2657, "rewards/chosen": -0.11599624156951904, "rewards/margins": 3.2443882477910897, "rewards/rejected": -3.3603844893606087, "step": 65 }, { "epoch": 0.11423626135871917, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1735104.4, "logits/rejected": 5294354.666666667, "logps/chosen": -211.195068359375, "logps/rejected": -247.84977213541666, "loss": 0.3939, "rewards/chosen": -0.25041675567626953, "rewards/margins": 3.1013142267862954, "rewards/rejected": -3.351730982462565, "step": 66 }, { "epoch": 0.11596711380354825, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5322431.555555556, "logits/rejected": -17504265.14285714, "logps/chosen": -156.61607530381946, "logps/rejected": -485.01771763392856, "loss": 0.3061, "rewards/chosen": 0.008888012833065458, "rewards/margins": 3.940254797065069, "rewards/rejected": -3.9313667842320035, "step": 67 }, { "epoch": 0.11769796624837732, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2996452.2666666666, "logits/rejected": 11716244.705882354, "logps/chosen": -192.19383138020834, "logps/rejected": -302.7353515625, "loss": 0.2934, "rewards/chosen": 0.03241715629895528, "rewards/margins": 2.8701628228028615, "rewards/rejected": -2.8377456665039062, "step": 68 }, { "epoch": 0.1194288186932064, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 14927777.142857144, "logits/rejected": -4720057.777777778, "logps/chosen": -149.42860630580358, "logps/rejected": -217.70494249131946, "loss": 0.2898, "rewards/chosen": -0.16934810365949357, "rewards/margins": 3.1008769973875983, "rewards/rejected": -3.270225101047092, "step": 69 }, { "epoch": 0.12115967113803548, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 44536797.538461536, "logits/rejected": -7564164.2105263155, "logps/chosen": -682.1751802884615, "logps/rejected": -270.47216796875, "loss": 0.2777, "rewards/chosen": -0.24122038254371056, "rewards/margins": 2.748665248816795, "rewards/rejected": -2.989885631360506, "step": 70 }, { "epoch": 0.12289052358286456, "grad_norm": 24.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8335642.105263158, "logits/rejected": -9525784.615384616, "logps/chosen": -303.1348170230263, "logps/rejected": -280.5024601862981, "loss": 0.3256, "rewards/chosen": 0.11083748466090153, "rewards/margins": 2.885276489412254, "rewards/rejected": -2.7744390047513523, "step": 71 }, { "epoch": 0.12462137602769364, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8330821.333333333, "logits/rejected": -3045587.4, "logps/chosen": -137.8349812825521, "logps/rejected": -273.801513671875, "loss": 0.1949, "rewards/chosen": 0.5404347976048788, "rewards/margins": 3.689098318417867, "rewards/rejected": -3.1486635208129883, "step": 72 }, { "epoch": 0.12635222847252273, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9142004.444444444, "logits/rejected": -3207848.8571428573, "logps/chosen": -188.33430989583334, "logps/rejected": -207.32901436941964, "loss": 0.3074, "rewards/chosen": 0.039681686295403376, "rewards/margins": 3.252643155673194, "rewards/rejected": -3.2129614693777904, "step": 73 }, { "epoch": 0.1280830809173518, "grad_norm": 22.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6030457.263157895, "logits/rejected": -3718584.0, "logps/chosen": -238.16568153782896, "logps/rejected": -195.5281700721154, "loss": 0.3596, "rewards/chosen": 0.006416631372351395, "rewards/margins": 2.2222877086898096, "rewards/rejected": -2.215871077317458, "step": 74 }, { "epoch": 0.12981393336218086, "grad_norm": 19.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3802380.705882353, "logits/rejected": 7472531.2, "logps/chosen": -168.80958467371323, "logps/rejected": -285.90989583333334, "loss": 0.3219, "rewards/chosen": -0.224185635061825, "rewards/margins": 2.7274849480273673, "rewards/rejected": -2.9516705830891925, "step": 75 }, { "epoch": 0.13154478580700996, "grad_norm": 21.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4579526.5, "logits/rejected": 6935348.0, "logps/chosen": -246.0178985595703, "logps/rejected": -326.98834228515625, "loss": 0.2722, "rewards/chosen": 0.10884374380111694, "rewards/margins": 3.900286853313446, "rewards/rejected": -3.791443109512329, "step": 76 }, { "epoch": 0.13327563825183902, "grad_norm": 23.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2948165.6, "logits/rejected": -10498065.88235294, "logps/chosen": -323.5631510416667, "logps/rejected": -268.2412683823529, "loss": 0.2814, "rewards/chosen": -0.016830217838287354, "rewards/margins": 3.3728826207273146, "rewards/rejected": -3.389712838565602, "step": 77 }, { "epoch": 0.13500649069666812, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3724001.263157895, "logits/rejected": -4905061.230769231, "logps/chosen": -196.80241313733552, "logps/rejected": -246.7076697716346, "loss": 0.3493, "rewards/chosen": 0.13470386203966642, "rewards/margins": 2.7459867975489813, "rewards/rejected": -2.611282935509315, "step": 78 }, { "epoch": 0.1367373431414972, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 882787.5, "logits/rejected": -2752756.8333333335, "logps/chosen": -152.87762451171875, "logps/rejected": -143.17495727539062, "loss": 0.3448, "rewards/chosen": -0.04928714632987976, "rewards/margins": 3.1339206834634146, "rewards/rejected": -3.1832078297932944, "step": 79 }, { "epoch": 0.13846819558632625, "grad_norm": 16.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10046429.6, "logits/rejected": -9446046.545454545, "logps/chosen": -211.123828125, "logps/rejected": -224.4654873934659, "loss": 0.2231, "rewards/chosen": -0.09817437529563904, "rewards/margins": 3.1476648303595454, "rewards/rejected": -3.2458392056551846, "step": 80 }, { "epoch": 0.14019904803115535, "grad_norm": 21.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1529026.8235294118, "logits/rejected": -18009732.266666666, "logps/chosen": -215.9851505055147, "logps/rejected": -285.50341796875, "loss": 0.3038, "rewards/chosen": 0.07493850062875186, "rewards/margins": 3.3579501278260175, "rewards/rejected": -3.283011627197266, "step": 81 }, { "epoch": 0.14192990047598442, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2760504.3076923075, "logits/rejected": -6295609.684210527, "logps/chosen": -247.9722618689904, "logps/rejected": -289.9179173519737, "loss": 0.2071, "rewards/chosen": 0.2483532978938176, "rewards/margins": 3.7582735185198453, "rewards/rejected": -3.509920220626028, "step": 82 }, { "epoch": 0.1436607529208135, "grad_norm": 18.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12023732.923076924, "logits/rejected": -5987361.263157895, "logps/chosen": -222.31482872596155, "logps/rejected": -229.85079152960526, "loss": 0.2564, "rewards/chosen": -0.01725879082312951, "rewards/margins": 3.232607895546114, "rewards/rejected": -3.2498666863692436, "step": 83 }, { "epoch": 0.14539160536564258, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 612477.6842105263, "logits/rejected": 610641.0769230769, "logps/chosen": -134.88838918585526, "logps/rejected": -210.28667743389423, "loss": 0.3361, "rewards/chosen": -0.02787588772020842, "rewards/margins": 2.743228191306234, "rewards/rejected": -2.7711040790264425, "step": 84 }, { "epoch": 0.14712245781047165, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8578859.294117646, "logits/rejected": -4098941.8666666667, "logps/chosen": -180.28768382352942, "logps/rejected": -149.85576171875, "loss": 0.3247, "rewards/chosen": 0.19388238121481502, "rewards/margins": 2.1001341614068725, "rewards/rejected": -1.9062517801920573, "step": 85 }, { "epoch": 0.14885331025530074, "grad_norm": 20.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1573394.5882352942, "logits/rejected": 10871098.666666666, "logps/chosen": -191.07115981158088, "logps/rejected": -244.60188802083334, "loss": 0.3206, "rewards/chosen": 0.2755900551291073, "rewards/margins": 2.1674577563416726, "rewards/rejected": -1.891867701212565, "step": 86 }, { "epoch": 0.1505841627001298, "grad_norm": 21.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1688797.142857143, "logits/rejected": -1483586.6666666667, "logps/chosen": -255.131103515625, "logps/rejected": -332.4247233072917, "loss": 0.2465, "rewards/chosen": 0.21271177700587682, "rewards/margins": 3.036478909235152, "rewards/rejected": -2.8237671322292752, "step": 87 }, { "epoch": 0.1523150151449589, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15565679.0, "logits/rejected": -2823755.25, "logps/chosen": -226.8282012939453, "logps/rejected": -287.0397644042969, "loss": 0.2921, "rewards/chosen": 0.2816123962402344, "rewards/margins": 2.9265496730804443, "rewards/rejected": -2.64493727684021, "step": 88 }, { "epoch": 0.15404586758978797, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -413132.4, "logits/rejected": -10101343.05882353, "logps/chosen": -230.32145182291666, "logps/rejected": -227.8623764935662, "loss": 0.3061, "rewards/chosen": 0.1068873405456543, "rewards/margins": 2.7386797456180347, "rewards/rejected": -2.6317924050723804, "step": 89 }, { "epoch": 0.15577672003461704, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2141461.846153846, "logits/rejected": -5844247.157894737, "logps/chosen": -115.6215350811298, "logps/rejected": -274.47286184210526, "loss": 0.2861, "rewards/chosen": -0.02882493459261381, "rewards/margins": 2.667185855780536, "rewards/rejected": -2.69601079037315, "step": 90 }, { "epoch": 0.15750757247944613, "grad_norm": 25.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12380478.545454545, "logits/rejected": -8278526.4, "logps/chosen": -197.9210759943182, "logps/rejected": -223.8676513671875, "loss": 0.3626, "rewards/chosen": 0.14342746951363303, "rewards/margins": 3.160957529328086, "rewards/rejected": -3.017530059814453, "step": 91 }, { "epoch": 0.1592384249242752, "grad_norm": 20.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13449718.588235294, "logits/rejected": -4683606.933333334, "logps/chosen": -317.87795840992646, "logps/rejected": -267.261279296875, "loss": 0.2726, "rewards/chosen": 0.3811823059530819, "rewards/margins": 3.948626662235634, "rewards/rejected": -3.567444356282552, "step": 92 }, { "epoch": 0.1609692773691043, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -543169.375, "logits/rejected": -1371199.125, "logps/chosen": -150.7501678466797, "logps/rejected": -181.0672149658203, "loss": 0.3291, "rewards/chosen": -0.053351566195487976, "rewards/margins": 2.6354714184999466, "rewards/rejected": -2.6888229846954346, "step": 93 }, { "epoch": 0.16270012981393336, "grad_norm": 18.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2523244.0, "logits/rejected": 2864016.380952381, "logps/chosen": -234.7637606534091, "logps/rejected": -178.11604817708334, "loss": 0.2795, "rewards/chosen": -0.16940477761355313, "rewards/margins": 2.3981288062545643, "rewards/rejected": -2.5675335838681175, "step": 94 }, { "epoch": 0.16443098225876243, "grad_norm": 21.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3101979.2, "logits/rejected": -5256099.0, "logps/chosen": -156.21822509765624, "logps/rejected": -249.0777791341146, "loss": 0.3557, "rewards/chosen": -0.0824066936969757, "rewards/margins": 3.3445211907227836, "rewards/rejected": -3.4269278844197593, "step": 95 }, { "epoch": 0.16616183470359153, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7612368.7272727275, "logits/rejected": -5602336.761904762, "logps/chosen": -176.8388338955966, "logps/rejected": -257.1624348958333, "loss": 0.2211, "rewards/chosen": 0.0035542053255167875, "rewards/margins": 3.556209743248694, "rewards/rejected": -3.5526555379231772, "step": 96 }, { "epoch": 0.1678926871484206, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1705522.0, "logits/rejected": -19220438.0, "logps/chosen": -229.12918090820312, "logps/rejected": -301.3127746582031, "loss": 0.2716, "rewards/chosen": 0.30286985635757446, "rewards/margins": 3.395688831806183, "rewards/rejected": -3.0928189754486084, "step": 97 }, { "epoch": 0.1696235395932497, "grad_norm": 24.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 12111708.307692308, "logits/rejected": -993389.2631578947, "logps/chosen": -285.0191180889423, "logps/rejected": -165.50485711348685, "loss": 0.2834, "rewards/chosen": 0.09072128626016471, "rewards/margins": 2.5426856144237133, "rewards/rejected": -2.4519643281635486, "step": 98 }, { "epoch": 0.17135439203807876, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6123332.705882353, "logits/rejected": -4620013.866666666, "logps/chosen": -249.7477596507353, "logps/rejected": -251.340185546875, "loss": 0.3179, "rewards/chosen": 0.006986297228757073, "rewards/margins": 2.8065402319618302, "rewards/rejected": -2.799553934733073, "step": 99 }, { "epoch": 0.17308524448290782, "grad_norm": 17.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3662578.0, "logits/rejected": -3268372.75, "logps/chosen": -159.33383178710938, "logps/rejected": -199.64552307128906, "loss": 0.3467, "rewards/chosen": -0.1916092038154602, "rewards/margins": 2.2172593474388123, "rewards/rejected": -2.4088685512542725, "step": 100 }, { "epoch": 0.17481609692773692, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11227556.0, "logits/rejected": -4947570.4, "logps/chosen": -226.8167521158854, "logps/rejected": -230.9011474609375, "loss": 0.2496, "rewards/chosen": 0.0410018265247345, "rewards/margins": 3.1503684341907503, "rewards/rejected": -3.109366607666016, "step": 101 }, { "epoch": 0.176546949372566, "grad_norm": 21.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4914299.368421053, "logits/rejected": -4945337.846153846, "logps/chosen": -197.87137643914474, "logps/rejected": -258.1477614182692, "loss": 0.3485, "rewards/chosen": -0.031160028357254833, "rewards/margins": 3.5723934501771506, "rewards/rejected": -3.603553478534405, "step": 102 }, { "epoch": 0.17827780181739505, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7837846.222222222, "logits/rejected": -11910576.0, "logps/chosen": -174.4716796875, "logps/rejected": -307.66469029017856, "loss": 0.3231, "rewards/chosen": -0.013415685130490197, "rewards/margins": 2.68944691819331, "rewards/rejected": -2.7028626033238004, "step": 103 }, { "epoch": 0.18000865426222415, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2126388.0, "logits/rejected": 277817.7894736842, "logps/chosen": -284.61337515024036, "logps/rejected": -307.56527549342104, "loss": 0.2237, "rewards/chosen": 0.27964045451237607, "rewards/margins": 3.5338759731184615, "rewards/rejected": -3.2542355186060856, "step": 104 }, { "epoch": 0.18173950670705322, "grad_norm": 21.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1334305.875, "logits/rejected": -5412106.0, "logps/chosen": -228.7758331298828, "logps/rejected": -258.7738342285156, "loss": 0.3184, "rewards/chosen": -0.1122078001499176, "rewards/margins": 2.8912404477596283, "rewards/rejected": -3.003448247909546, "step": 105 }, { "epoch": 0.1834703591518823, "grad_norm": 18.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10178373.05263158, "logits/rejected": -8054584.615384615, "logps/chosen": -173.59982781661185, "logps/rejected": -278.5968674879808, "loss": 0.3082, "rewards/chosen": 0.23594951629638672, "rewards/margins": 3.9558092997624326, "rewards/rejected": -3.719859783466046, "step": 106 }, { "epoch": 0.18520121159671138, "grad_norm": 20.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7455461.647058823, "logits/rejected": -10272849.066666666, "logps/chosen": -190.27541934742646, "logps/rejected": -201.00078125, "loss": 0.3043, "rewards/chosen": 0.15676203896017635, "rewards/margins": 2.838399357889213, "rewards/rejected": -2.6816373189290363, "step": 107 }, { "epoch": 0.18693206404154045, "grad_norm": 21.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8889729.6, "logits/rejected": -3655384.0, "logps/chosen": -190.6529541015625, "logps/rejected": -243.38692220052084, "loss": 0.3275, "rewards/chosen": 0.024685271084308624, "rewards/margins": 3.879149484137694, "rewards/rejected": -3.8544642130533853, "step": 108 }, { "epoch": 0.18866291648636954, "grad_norm": 24.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1406335.5789473683, "logits/rejected": -5790012.307692308, "logps/chosen": -160.40576171875, "logps/rejected": -207.287353515625, "loss": 0.3181, "rewards/chosen": -0.008714937850048668, "rewards/margins": 3.360927466559507, "rewards/rejected": -3.3696424044095554, "step": 109 }, { "epoch": 0.1903937689311986, "grad_norm": 17.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1734898.1333333333, "logits/rejected": -7056189.176470588, "logps/chosen": -146.405810546875, "logps/rejected": -183.34127987132354, "loss": 0.3017, "rewards/chosen": 0.023581977685292563, "rewards/margins": 2.700872501438739, "rewards/rejected": -2.6772905237534466, "step": 110 }, { "epoch": 0.1921246213760277, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 531886.4615384615, "logits/rejected": -6064568.421052632, "logps/chosen": -133.6158728966346, "logps/rejected": -266.82632606907896, "loss": 0.2827, "rewards/chosen": -0.16753161870516264, "rewards/margins": 2.9463925081708653, "rewards/rejected": -3.113924126876028, "step": 111 }, { "epoch": 0.19385547382085677, "grad_norm": 15.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7024642.4, "logits/rejected": -11717272.727272727, "logps/chosen": -179.82689208984374, "logps/rejected": -256.05178000710225, "loss": 0.1916, "rewards/chosen": 0.08213082551956177, "rewards/margins": 3.862654645876451, "rewards/rejected": -3.780523820356889, "step": 112 }, { "epoch": 0.19558632626568584, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7312199.5, "logits/rejected": -9691646.0, "logps/chosen": -190.84982299804688, "logps/rejected": -189.02178955078125, "loss": 0.2895, "rewards/chosen": -0.05902346968650818, "rewards/margins": 3.4700850546360016, "rewards/rejected": -3.5291085243225098, "step": 113 }, { "epoch": 0.19731717871051493, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8977872.0, "logits/rejected": -2104818.5, "logps/chosen": -153.08140563964844, "logps/rejected": -362.2707824707031, "loss": 0.2608, "rewards/chosen": 0.1350196748971939, "rewards/margins": 4.345773592591286, "rewards/rejected": -4.210753917694092, "step": 114 }, { "epoch": 0.199048031155344, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15021820.444444444, "logits/rejected": 3075554.8571428573, "logps/chosen": -254.52723524305554, "logps/rejected": -324.88724190848217, "loss": 0.2895, "rewards/chosen": 0.18979620933532715, "rewards/margins": 4.077897787094116, "rewards/rejected": -3.888101577758789, "step": 115 }, { "epoch": 0.2007788836001731, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8322480.0, "logits/rejected": 2723888.4210526315, "logps/chosen": -225.67595027043268, "logps/rejected": -165.18586811266448, "loss": 0.2731, "rewards/chosen": 0.10532364478478065, "rewards/margins": 3.168428977008773, "rewards/rejected": -3.0631053322239925, "step": 116 }, { "epoch": 0.20250973604500216, "grad_norm": 13.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3385337.6, "logits/rejected": -4593669.454545454, "logps/chosen": -155.44390869140625, "logps/rejected": -335.2124689275568, "loss": 0.2003, "rewards/chosen": -0.034807294607162476, "rewards/margins": 4.202970068563115, "rewards/rejected": -4.237777363170277, "step": 117 }, { "epoch": 0.20424058848983123, "grad_norm": 18.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2332099.8181818184, "logits/rejected": -2151819.8, "logps/chosen": -175.2345525568182, "logps/rejected": -296.2167724609375, "loss": 0.3551, "rewards/chosen": 0.0834602876143022, "rewards/margins": 4.01979642347856, "rewards/rejected": -3.936336135864258, "step": 118 }, { "epoch": 0.20597144093466033, "grad_norm": 19.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1813728.4210526317, "logits/rejected": -1663898.1538461538, "logps/chosen": -162.3670076069079, "logps/rejected": -307.88326322115387, "loss": 0.2952, "rewards/chosen": 0.22818475020559212, "rewards/margins": 3.5458337111994322, "rewards/rejected": -3.31764896099384, "step": 119 }, { "epoch": 0.2077022933794894, "grad_norm": 17.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5888752.533333333, "logits/rejected": -2605564.705882353, "logps/chosen": -156.10784505208332, "logps/rejected": -253.71030560661765, "loss": 0.3016, "rewards/chosen": -0.10760652224222819, "rewards/margins": 3.660627281899546, "rewards/rejected": -3.768233804141774, "step": 120 }, { "epoch": 0.2094331458243185, "grad_norm": 15.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7869675.076923077, "logits/rejected": -4793528.842105263, "logps/chosen": -159.2930626502404, "logps/rejected": -251.59606291118422, "loss": 0.2563, "rewards/chosen": -0.1771384019118089, "rewards/margins": 3.1404216936242726, "rewards/rejected": -3.3175600955360816, "step": 121 }, { "epoch": 0.21116399826914756, "grad_norm": 18.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3852856.0, "logits/rejected": 1017586.2352941176, "logps/chosen": -130.758642578125, "logps/rejected": -310.29397403492646, "loss": 0.2913, "rewards/chosen": -0.17760810852050782, "rewards/margins": 3.5648737963508155, "rewards/rejected": -3.7424819048713234, "step": 122 }, { "epoch": 0.21289485071397662, "grad_norm": 21.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 394461.8947368421, "logits/rejected": -11013158.153846154, "logps/chosen": -211.87534693667763, "logps/rejected": -266.10584435096155, "loss": 0.3519, "rewards/chosen": -0.09623796061465614, "rewards/margins": 3.0594592017200792, "rewards/rejected": -3.1556971623347354, "step": 123 }, { "epoch": 0.21462570315880572, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -375859.125, "logits/rejected": -18902716.0, "logps/chosen": -144.1176513671875, "logps/rejected": -328.43532307942706, "loss": 0.3587, "rewards/chosen": -0.10335218906402588, "rewards/margins": 3.1057602961858115, "rewards/rejected": -3.2091124852498374, "step": 124 }, { "epoch": 0.2163565556036348, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9888419.2, "logits/rejected": -13417634.823529411, "logps/chosen": -207.53665364583333, "logps/rejected": -305.8712373621324, "loss": 0.2801, "rewards/chosen": 0.16173944473266602, "rewards/margins": 3.5778553738313565, "rewards/rejected": -3.4161159290986904, "step": 125 }, { "epoch": 0.21808740804846388, "grad_norm": 15.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12050750.4, "logits/rejected": -13769786.181818182, "logps/chosen": -183.52490234375, "logps/rejected": -278.15651633522725, "loss": 0.2183, "rewards/chosen": 0.14027655124664307, "rewards/margins": 3.1395226283506914, "rewards/rejected": -2.9992460771040483, "step": 126 }, { "epoch": 0.21981826049329295, "grad_norm": 17.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12270530.666666666, "logits/rejected": -8869286.4, "logps/chosen": -190.94978841145834, "logps/rejected": -284.20654296875, "loss": 0.233, "rewards/chosen": 0.16113528609275818, "rewards/margins": 3.4381788194179537, "rewards/rejected": -3.2770435333251955, "step": 127 }, { "epoch": 0.22154911293812202, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15397617.066666666, "logits/rejected": -13504640.94117647, "logps/chosen": -223.714453125, "logps/rejected": -284.49543313419116, "loss": 0.2465, "rewards/chosen": 0.2658435821533203, "rewards/margins": 3.566493337294635, "rewards/rejected": -3.3006497551413143, "step": 128 }, { "epoch": 0.2232799653829511, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13344241.333333334, "logits/rejected": 3126395.2, "logps/chosen": -236.7048543294271, "logps/rejected": -158.71007080078124, "loss": 0.2397, "rewards/chosen": 0.05072679618994395, "rewards/margins": 3.053556347886721, "rewards/rejected": -3.0028295516967773, "step": 129 }, { "epoch": 0.22501081782778018, "grad_norm": 18.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1188826.4, "logits/rejected": -7702007.529411765, "logps/chosen": -194.06878255208332, "logps/rejected": -285.5570714613971, "loss": 0.287, "rewards/chosen": -0.050044012069702146, "rewards/margins": 2.950371789932251, "rewards/rejected": -3.000415802001953, "step": 130 }, { "epoch": 0.22674167027260925, "grad_norm": 24.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9090264.421052631, "logits/rejected": -13554491.076923076, "logps/chosen": -228.48311574835526, "logps/rejected": -317.5834209735577, "loss": 0.3322, "rewards/chosen": 0.17829758242556923, "rewards/margins": 2.7610747524601247, "rewards/rejected": -2.5827771700345554, "step": 131 }, { "epoch": 0.22847252271743834, "grad_norm": 20.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11592303.111111112, "logits/rejected": -3384686.285714286, "logps/chosen": -236.07706705729166, "logps/rejected": -231.01377650669642, "loss": 0.3204, "rewards/chosen": 0.08443025747934978, "rewards/margins": 3.5145191181273687, "rewards/rejected": -3.430088860648019, "step": 132 }, { "epoch": 0.2302033751622674, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4109541.0, "logits/rejected": -10256295.0, "logps/chosen": -150.5631561279297, "logps/rejected": -251.85006713867188, "loss": 0.3414, "rewards/chosen": -0.03023519366979599, "rewards/margins": 2.841710902750492, "rewards/rejected": -2.871946096420288, "step": 133 }, { "epoch": 0.2319342276070965, "grad_norm": 19.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17335990.153846152, "logits/rejected": -11810266.94736842, "logps/chosen": -255.87794846754807, "logps/rejected": -292.57041529605266, "loss": 0.2403, "rewards/chosen": -0.009397160548430223, "rewards/margins": 3.8956148698021043, "rewards/rejected": -3.9050120303505347, "step": 134 }, { "epoch": 0.23366508005192557, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3816658.933333333, "logits/rejected": -21506733.17647059, "logps/chosen": -192.03406575520833, "logps/rejected": -326.34670840992646, "loss": 0.2583, "rewards/chosen": -0.08073126475016276, "rewards/margins": 4.162721368378285, "rewards/rejected": -4.243452633128447, "step": 135 }, { "epoch": 0.23539593249675464, "grad_norm": 23.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5209688.421052632, "logits/rejected": 4241887.076923077, "logps/chosen": -241.80124383223685, "logps/rejected": -267.38955453725964, "loss": 0.3287, "rewards/chosen": 0.1192607001254433, "rewards/margins": 3.0787412792082254, "rewards/rejected": -2.9594805790827823, "step": 136 }, { "epoch": 0.23712678494158373, "grad_norm": 16.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13466715.733333332, "logits/rejected": -6725747.764705882, "logps/chosen": -169.23546549479167, "logps/rejected": -301.47449448529414, "loss": 0.2489, "rewards/chosen": 0.17315847078959148, "rewards/margins": 4.150024351419187, "rewards/rejected": -3.9768658806295956, "step": 137 }, { "epoch": 0.2388576373864128, "grad_norm": 20.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7070817.5, "logits/rejected": -5078294.5, "logps/chosen": -224.52960205078125, "logps/rejected": -250.922119140625, "loss": 0.299, "rewards/chosen": -0.062333978712558746, "rewards/margins": 3.91761764138937, "rewards/rejected": -3.9799516201019287, "step": 138 }, { "epoch": 0.2405884898312419, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10853539.555555556, "logits/rejected": -8516350.857142856, "logps/chosen": -245.61515299479166, "logps/rejected": -311.77559988839283, "loss": 0.2919, "rewards/chosen": 0.143819702996148, "rewards/margins": 4.062012430221316, "rewards/rejected": -3.9181927272251675, "step": 139 }, { "epoch": 0.24231934227607096, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23256322.666666668, "logits/rejected": -14564356.8, "logps/chosen": -252.69757080078125, "logps/rejected": -324.65302734375, "loss": 0.1926, "rewards/chosen": 0.15854175885518393, "rewards/margins": 4.718555339177449, "rewards/rejected": -4.5600135803222654, "step": 140 }, { "epoch": 0.24405019472090003, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13540496.0, "logits/rejected": -14330663.111111112, "logps/chosen": -261.66441127232144, "logps/rejected": -279.0700954861111, "loss": 0.2759, "rewards/chosen": -0.16050028800964355, "rewards/margins": 3.2266637219323053, "rewards/rejected": -3.387164009941949, "step": 141 }, { "epoch": 0.24578104716572913, "grad_norm": 22.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16567258.352941176, "logits/rejected": -9630573.866666667, "logps/chosen": -248.67431640625, "logps/rejected": -195.8955078125, "loss": 0.2962, "rewards/chosen": 0.18976323744829962, "rewards/margins": 3.4638149037080654, "rewards/rejected": -3.2740516662597656, "step": 142 }, { "epoch": 0.2475118996105582, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12402342.153846154, "logits/rejected": -7839504.842105263, "logps/chosen": -210.70182917668268, "logps/rejected": -379.2096525493421, "loss": 0.2114, "rewards/chosen": 0.16687591259296125, "rewards/margins": 5.201281671099335, "rewards/rejected": -5.034405758506374, "step": 143 }, { "epoch": 0.2492427520553873, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4255770.285714285, "logits/rejected": -3999491.111111111, "logps/chosen": -195.87995256696428, "logps/rejected": -214.72549099392361, "loss": 0.2982, "rewards/chosen": -0.21987019266401017, "rewards/margins": 2.6090391514793274, "rewards/rejected": -2.8289093441433377, "step": 144 }, { "epoch": 0.25097360450021633, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4979902.666666667, "logits/rejected": -11014548.57142857, "logps/chosen": -255.74172634548611, "logps/rejected": -255.93235560825892, "loss": 0.3164, "rewards/chosen": -0.09737168418036567, "rewards/margins": 3.5333697076827764, "rewards/rejected": -3.630741391863142, "step": 145 }, { "epoch": 0.25270445694504545, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7553311.5, "logits/rejected": -12855062.0, "logps/chosen": -181.3147735595703, "logps/rejected": -319.3401184082031, "loss": 0.2509, "rewards/chosen": 0.28162485361099243, "rewards/margins": 4.752773344516754, "rewards/rejected": -4.471148490905762, "step": 146 }, { "epoch": 0.2544353093898745, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18317777.6, "logits/rejected": -8850414.545454545, "logps/chosen": -208.5265380859375, "logps/rejected": -253.0011319247159, "loss": 0.2135, "rewards/chosen": 0.22829954624176024, "rewards/margins": 3.561434630914168, "rewards/rejected": -3.3331350846724077, "step": 147 }, { "epoch": 0.2561661618347036, "grad_norm": 22.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7776780.0, "logits/rejected": -13990768.0, "logps/chosen": -222.059326171875, "logps/rejected": -333.6552734375, "loss": 0.3596, "rewards/chosen": -0.05462043881416321, "rewards/margins": 2.7295163333415986, "rewards/rejected": -2.7841367721557617, "step": 148 }, { "epoch": 0.25789701427953265, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -501711.76470588235, "logits/rejected": -4720805.333333333, "logps/chosen": -165.69982192095588, "logps/rejected": -258.313623046875, "loss": 0.3126, "rewards/chosen": -0.00996632085126989, "rewards/margins": 2.9709932535302404, "rewards/rejected": -2.9809595743815103, "step": 149 }, { "epoch": 0.2596278667243617, "grad_norm": 16.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11498374.153846154, "logits/rejected": -12492654.315789474, "logps/chosen": -136.2483191856971, "logps/rejected": -243.18302837171052, "loss": 0.27, "rewards/chosen": -0.3298172950744629, "rewards/margins": 2.9719581854970833, "rewards/rejected": -3.3017754805715462, "step": 150 }, { "epoch": 0.26135871916919085, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18241306.0, "logits/rejected": -5504833.5, "logps/chosen": -285.53369140625, "logps/rejected": -346.5982360839844, "loss": 0.2638, "rewards/chosen": 0.2393263578414917, "rewards/margins": 3.530009150505066, "rewards/rejected": -3.290682792663574, "step": 151 }, { "epoch": 0.2630895716140199, "grad_norm": 21.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9077254.588235294, "logits/rejected": -16832766.933333334, "logps/chosen": -218.34030330882354, "logps/rejected": -284.46845703125, "loss": 0.3062, "rewards/chosen": 0.03087810558431289, "rewards/margins": 3.274915225131839, "rewards/rejected": -3.244037119547526, "step": 152 }, { "epoch": 0.264820424058849, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 3784820.0, "logits/rejected": -27976580.266666666, "logps/chosen": -176.8842342601103, "logps/rejected": -371.0309244791667, "loss": 0.2938, "rewards/chosen": 0.037597624694599825, "rewards/margins": 3.3108126006874383, "rewards/rejected": -3.2732149759928384, "step": 153 }, { "epoch": 0.26655127650367805, "grad_norm": 19.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 8110033.066666666, "logits/rejected": -6098681.411764706, "logps/chosen": -224.60833333333332, "logps/rejected": -270.91673368566177, "loss": 0.2667, "rewards/chosen": 0.19785807927449545, "rewards/margins": 3.2095348676045736, "rewards/rejected": -3.011676788330078, "step": 154 }, { "epoch": 0.2682821289485071, "grad_norm": 24.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10915676.16, "logits/rejected": -12048051.42857143, "logps/chosen": -200.02255859375, "logps/rejected": -363.83028738839283, "loss": 0.4141, "rewards/chosen": -0.20076709747314453, "rewards/margins": 4.246241580418179, "rewards/rejected": -4.447008677891323, "step": 155 }, { "epoch": 0.27001298139333624, "grad_norm": 21.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4039928.5, "logits/rejected": -297019.3125, "logps/chosen": -122.26431274414062, "logps/rejected": -205.24542236328125, "loss": 0.2976, "rewards/chosen": 0.13761137425899506, "rewards/margins": 2.9543447345495224, "rewards/rejected": -2.8167333602905273, "step": 156 }, { "epoch": 0.2717438338381653, "grad_norm": 24.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11950380.444444444, "logits/rejected": -14319524.57142857, "logps/chosen": -316.365234375, "logps/rejected": -238.67583356584822, "loss": 0.3578, "rewards/chosen": -0.22798464033338758, "rewards/margins": 2.9639355038839676, "rewards/rejected": -3.191920144217355, "step": 157 }, { "epoch": 0.2734746862829944, "grad_norm": 16.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7220196.923076923, "logits/rejected": -7274026.105263158, "logps/chosen": -168.70252403846155, "logps/rejected": -272.20877878289474, "loss": 0.2521, "rewards/chosen": 0.034194111824035645, "rewards/margins": 3.5594172916914286, "rewards/rejected": -3.525223179867393, "step": 158 }, { "epoch": 0.27520553872782344, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7790591.111111111, "logits/rejected": -4348877.714285715, "logps/chosen": -150.17240397135416, "logps/rejected": -257.23592703683033, "loss": 0.2867, "rewards/chosen": 0.2716523011525472, "rewards/margins": 3.2814045747121177, "rewards/rejected": -3.0097522735595703, "step": 159 }, { "epoch": 0.2769363911726525, "grad_norm": 21.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6992461.0, "logits/rejected": -9803589.0, "logps/chosen": -197.62159729003906, "logps/rejected": -200.63839721679688, "loss": 0.3119, "rewards/chosen": 0.2022910714149475, "rewards/margins": 2.61891371011734, "rewards/rejected": -2.4166226387023926, "step": 160 }, { "epoch": 0.27866724361748163, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2316156.0, "logits/rejected": -6923803.294117647, "logps/chosen": -157.97198893229168, "logps/rejected": -226.3733340992647, "loss": 0.2941, "rewards/chosen": -0.181062380472819, "rewards/margins": 3.137106035269943, "rewards/rejected": -3.318168415742762, "step": 161 }, { "epoch": 0.2803980960623107, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1817514.6666666667, "logits/rejected": -4328779.428571428, "logps/chosen": -125.52289496527777, "logps/rejected": -229.05801827566964, "loss": 0.3201, "rewards/chosen": 0.1051819192038642, "rewards/margins": 2.722233724972558, "rewards/rejected": -2.617051805768694, "step": 162 }, { "epoch": 0.28212894850713977, "grad_norm": 21.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7507249.411764706, "logits/rejected": 2358673.6, "logps/chosen": -203.8069278492647, "logps/rejected": -239.440380859375, "loss": 0.2975, "rewards/chosen": 0.04498655655804802, "rewards/margins": 3.3732679591459385, "rewards/rejected": -3.3282814025878906, "step": 163 }, { "epoch": 0.28385980095196883, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17538291.2, "logits/rejected": -8524349.176470589, "logps/chosen": -203.443017578125, "logps/rejected": -309.54041245404414, "loss": 0.2556, "rewards/chosen": 0.23375027974446613, "rewards/margins": 3.705425703759287, "rewards/rejected": -3.471675424014821, "step": 164 }, { "epoch": 0.2855906533967979, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11055366.857142856, "logits/rejected": -1121562.3333333333, "logps/chosen": -159.264892578125, "logps/rejected": -286.91015625, "loss": 0.2449, "rewards/chosen": 0.12227598258427211, "rewards/margins": 4.091070063530452, "rewards/rejected": -3.9687940809461804, "step": 165 }, { "epoch": 0.287321505841627, "grad_norm": 23.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 657442.6, "logits/rejected": -9425391.333333334, "logps/chosen": -220.0943603515625, "logps/rejected": -196.21675618489584, "loss": 0.3849, "rewards/chosen": -0.32888593673706057, "rewards/margins": 2.8519148190816246, "rewards/rejected": -3.180800755818685, "step": 166 }, { "epoch": 0.2890523582864561, "grad_norm": 22.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4699375.555555556, "logits/rejected": -3600353.1428571427, "logps/chosen": -265.4497341579861, "logps/rejected": -267.37051827566967, "loss": 0.2924, "rewards/chosen": 0.23302984237670898, "rewards/margins": 3.618199280330113, "rewards/rejected": -3.385169437953404, "step": 167 }, { "epoch": 0.29078321073128516, "grad_norm": 24.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15453594.666666666, "logits/rejected": -7304598.5, "logps/chosen": -211.82305908203125, "logps/rejected": -305.252685546875, "loss": 0.4196, "rewards/chosen": -0.15757346153259277, "rewards/margins": 3.7481679916381836, "rewards/rejected": -3.9057414531707764, "step": 168 }, { "epoch": 0.2925140631761142, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 917019.2941176471, "logits/rejected": -11583381.333333334, "logps/chosen": -187.60915958180146, "logps/rejected": -256.85833333333335, "loss": 0.2921, "rewards/chosen": -0.004517814692328958, "rewards/margins": 3.6537013320361864, "rewards/rejected": -3.6582191467285154, "step": 169 }, { "epoch": 0.2942449156209433, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5889556.705882353, "logits/rejected": -12821309.866666667, "logps/chosen": -256.27088120404414, "logps/rejected": -308.8744791666667, "loss": 0.2513, "rewards/chosen": 0.38170385360717773, "rewards/margins": 4.123530483245849, "rewards/rejected": -3.741826629638672, "step": 170 }, { "epoch": 0.2959757680657724, "grad_norm": 17.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 16747204.0, "logits/rejected": 620209.6, "logps/chosen": -243.82991536458334, "logps/rejected": -253.942626953125, "loss": 0.2081, "rewards/chosen": 0.4542102813720703, "rewards/margins": 3.8508056640625, "rewards/rejected": -3.3965953826904296, "step": 171 }, { "epoch": 0.2977066205106015, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12614680.615384616, "logits/rejected": -7096439.578947368, "logps/chosen": -130.17988469050482, "logps/rejected": -229.95723684210526, "loss": 0.2725, "rewards/chosen": 0.0035039232327387882, "rewards/margins": 2.607151255675173, "rewards/rejected": -2.603647332442434, "step": 172 }, { "epoch": 0.29943747295543055, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3154612.0, "logits/rejected": -9195659.2, "logps/chosen": -173.6625773111979, "logps/rejected": -270.675537109375, "loss": 0.2182, "rewards/chosen": 0.14845428864161173, "rewards/margins": 3.5584659616152443, "rewards/rejected": -3.4100116729736327, "step": 173 }, { "epoch": 0.3011683254002596, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9196400.0, "logits/rejected": -15175718.0, "logps/chosen": -157.8928680419922, "logps/rejected": -259.74560546875, "loss": 0.3143, "rewards/chosen": -0.17662523686885834, "rewards/margins": 3.229040876030922, "rewards/rejected": -3.4056661128997803, "step": 174 }, { "epoch": 0.3028991778450887, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15626394.352941176, "logits/rejected": -7392939.2, "logps/chosen": -241.25695082720588, "logps/rejected": -236.45672200520832, "loss": 0.2589, "rewards/chosen": 0.28153758890488567, "rewards/margins": 4.230858064165302, "rewards/rejected": -3.9493204752604165, "step": 175 }, { "epoch": 0.3046300302899178, "grad_norm": 22.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9918681.6, "logits/rejected": -10010532.0, "logps/chosen": -202.339111328125, "logps/rejected": -184.87152099609375, "loss": 0.3503, "rewards/chosen": -0.07323684692382812, "rewards/margins": 3.2869134902954102, "rewards/rejected": -3.3601503372192383, "step": 176 }, { "epoch": 0.3063608827347469, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9519585.0, "logits/rejected": -17195120.0, "logps/chosen": -151.671875, "logps/rejected": -259.06878662109375, "loss": 0.2859, "rewards/chosen": 0.010412598960101604, "rewards/margins": 3.337350751273334, "rewards/rejected": -3.3269381523132324, "step": 177 }, { "epoch": 0.30809173517957594, "grad_norm": 21.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16214270.11764706, "logits/rejected": -9008196.266666668, "logps/chosen": -273.4817899816176, "logps/rejected": -351.15856119791664, "loss": 0.2819, "rewards/chosen": 0.03458939930971931, "rewards/margins": 3.560055555315579, "rewards/rejected": -3.5254661560058596, "step": 178 }, { "epoch": 0.309822587624405, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4070659.5555555555, "logits/rejected": 564305.2857142857, "logps/chosen": -159.47158474392361, "logps/rejected": -243.50922502790178, "loss": 0.3114, "rewards/chosen": 0.13283884525299072, "rewards/margins": 3.35727219922202, "rewards/rejected": -3.224433353969029, "step": 179 }, { "epoch": 0.3115534400692341, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6579336.888888889, "logits/rejected": -11798860.57142857, "logps/chosen": -194.06880696614584, "logps/rejected": -265.37472098214283, "loss": 0.3398, "rewards/chosen": -0.14026531908247206, "rewards/margins": 2.981163579320151, "rewards/rejected": -3.121428898402623, "step": 180 }, { "epoch": 0.3132842925140632, "grad_norm": 17.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5290882.0, "logits/rejected": -21157156.0, "logps/chosen": -148.17227172851562, "logps/rejected": -296.1889953613281, "loss": 0.3044, "rewards/chosen": -0.1293964684009552, "rewards/margins": 3.0376605689525604, "rewards/rejected": -3.1670570373535156, "step": 181 }, { "epoch": 0.31501514495889227, "grad_norm": 21.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9015065.263157895, "logits/rejected": -6190756.923076923, "logps/chosen": -208.37291837993422, "logps/rejected": -260.5161884014423, "loss": 0.3085, "rewards/chosen": 0.10386697869551809, "rewards/margins": 3.3943309397832584, "rewards/rejected": -3.2904639610877404, "step": 182 }, { "epoch": 0.31674599740372134, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10990801.066666666, "logits/rejected": -6380602.352941177, "logps/chosen": -166.50836588541668, "logps/rejected": -261.99778837316177, "loss": 0.2828, "rewards/chosen": -0.09178520043690999, "rewards/margins": 3.105228430149602, "rewards/rejected": -3.197013630586512, "step": 183 }, { "epoch": 0.3184768498485504, "grad_norm": 22.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15325334.857142856, "logits/rejected": -24043242.181818184, "logps/chosen": -213.09461030505952, "logps/rejected": -345.95725319602275, "loss": 0.3397, "rewards/chosen": 0.22786199478876024, "rewards/margins": 2.5508950675204716, "rewards/rejected": -2.3230330727317114, "step": 184 }, { "epoch": 0.32020770229337947, "grad_norm": 21.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11346512.94117647, "logits/rejected": -2746510.4, "logps/chosen": -264.31959443933823, "logps/rejected": -207.83736979166667, "loss": 0.2974, "rewards/chosen": 0.12100423083585851, "rewards/margins": 2.763645679810468, "rewards/rejected": -2.6426414489746093, "step": 185 }, { "epoch": 0.3219385547382086, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10335120.0, "logits/rejected": -11680630.0, "logps/chosen": -169.1227264404297, "logps/rejected": -242.72560119628906, "loss": 0.3009, "rewards/chosen": 0.12381087243556976, "rewards/margins": 3.1314540952444077, "rewards/rejected": -3.007643222808838, "step": 186 }, { "epoch": 0.32366940718303766, "grad_norm": 18.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15741749.714285715, "logits/rejected": -5845185.333333333, "logps/chosen": -198.52054268973214, "logps/rejected": -295.60541449652777, "loss": 0.2393, "rewards/chosen": 0.4524484021323068, "rewards/margins": 3.021870806103661, "rewards/rejected": -2.569422403971354, "step": 187 }, { "epoch": 0.32540025962786673, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9898615.578947369, "logits/rejected": -2790789.846153846, "logps/chosen": -154.5710320723684, "logps/rejected": -181.50324894831732, "loss": 0.3752, "rewards/chosen": -0.13492245423166374, "rewards/margins": 2.425167830849466, "rewards/rejected": -2.56009028508113, "step": 188 }, { "epoch": 0.3271311120726958, "grad_norm": 19.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12387616.842105264, "logits/rejected": -8763428.923076924, "logps/chosen": -166.91708213404604, "logps/rejected": -238.5693359375, "loss": 0.3205, "rewards/chosen": 0.12465482009084601, "rewards/margins": 2.8608485503717955, "rewards/rejected": -2.7361937302809496, "step": 189 }, { "epoch": 0.32886196451752486, "grad_norm": 20.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11473281.454545455, "logits/rejected": -16561514.666666666, "logps/chosen": -248.29432262073863, "logps/rejected": -292.7100074404762, "loss": 0.2314, "rewards/chosen": 0.17398832061073996, "rewards/margins": 3.0009191387143486, "rewards/rejected": -2.826930818103609, "step": 190 }, { "epoch": 0.330592816962354, "grad_norm": 16.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8514306.823529411, "logits/rejected": -15071434.666666666, "logps/chosen": -131.8534725413603, "logps/rejected": -270.87555338541665, "loss": 0.3024, "rewards/chosen": 0.18799910825841568, "rewards/margins": 3.0057781406477386, "rewards/rejected": -2.817779032389323, "step": 191 }, { "epoch": 0.33232366940718305, "grad_norm": 19.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1210433.894736842, "logits/rejected": -8759064.0, "logps/chosen": -148.5827765213816, "logps/rejected": -183.75860126201923, "loss": 0.3729, "rewards/chosen": -0.010686732436481276, "rewards/margins": 2.6109067429053154, "rewards/rejected": -2.621593475341797, "step": 192 }, { "epoch": 0.3340545218520121, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1876100.6153846155, "logits/rejected": -12178267.789473685, "logps/chosen": -158.33451021634616, "logps/rejected": -272.60916940789474, "loss": 0.2534, "rewards/chosen": 0.049141957209660456, "rewards/margins": 3.0719871752657872, "rewards/rejected": -3.0228452180561267, "step": 193 }, { "epoch": 0.3357853742968412, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9391846.0, "logits/rejected": -7076478.5, "logps/chosen": -197.68260192871094, "logps/rejected": -201.0494384765625, "loss": 0.2827, "rewards/chosen": 0.280254989862442, "rewards/margins": 3.4145003855228424, "rewards/rejected": -3.1342453956604004, "step": 194 }, { "epoch": 0.33751622674167026, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19537841.066666666, "logits/rejected": -4787949.176470588, "logps/chosen": -203.43776041666666, "logps/rejected": -216.7041015625, "loss": 0.3368, "rewards/chosen": 0.03912758032480876, "rewards/margins": 2.348604769332736, "rewards/rejected": -2.3094771890079273, "step": 195 }, { "epoch": 0.3392470791864994, "grad_norm": 16.625, "kl": 0.16342926025390625, "learning_rate": 5e-06, "logits/chosen": -3170068.8421052634, "logits/rejected": -18859544.615384616, "logps/chosen": -124.50954718338816, "logps/rejected": -328.8591120793269, "loss": 0.2909, "rewards/chosen": 0.2587398478859349, "rewards/margins": 3.9374563607127078, "rewards/rejected": -3.6787165128267727, "step": 196 }, { "epoch": 0.34097793163132845, "grad_norm": 21.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 454123.25, "logits/rejected": -4452693.0, "logps/chosen": -233.21722412109375, "logps/rejected": -264.53143310546875, "loss": 0.2989, "rewards/chosen": -0.011935576796531677, "rewards/margins": 3.860817089676857, "rewards/rejected": -3.8727526664733887, "step": 197 }, { "epoch": 0.3427087840761575, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3420107.466666667, "logits/rejected": -14321411.764705881, "logps/chosen": -135.79720052083334, "logps/rejected": -363.07223690257354, "loss": 0.2492, "rewards/chosen": 0.19408594767252604, "rewards/margins": 3.735214442832797, "rewards/rejected": -3.541128495160271, "step": 198 }, { "epoch": 0.3444396365209866, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5829539.6, "logits/rejected": -15839922.666666666, "logps/chosen": -191.86925048828124, "logps/rejected": -276.8868408203125, "loss": 0.3512, "rewards/chosen": 0.14528814554214478, "rewards/margins": 2.9095884919166566, "rewards/rejected": -2.7643003463745117, "step": 199 }, { "epoch": 0.34617048896581565, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3732642.0, "logits/rejected": -6055160.0, "logps/chosen": -145.93505859375, "logps/rejected": -213.4742889404297, "loss": 0.2826, "rewards/chosen": 0.03191981464624405, "rewards/margins": 3.5864234939217567, "rewards/rejected": -3.5545036792755127, "step": 200 }, { "epoch": 0.3479013414106447, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10997382.666666666, "logits/rejected": -13361217.6, "logps/chosen": -214.18802897135416, "logps/rejected": -290.353857421875, "loss": 0.2108, "rewards/chosen": 0.08698128660519917, "rewards/margins": 3.8983208556969964, "rewards/rejected": -3.811339569091797, "step": 201 }, { "epoch": 0.34963219385547384, "grad_norm": 15.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16647024.0, "logits/rejected": -16219051.789473685, "logps/chosen": -266.0360576923077, "logps/rejected": -278.2851305509868, "loss": 0.2141, "rewards/chosen": 0.3400090290949895, "rewards/margins": 4.1297148075180985, "rewards/rejected": -3.7897057784231087, "step": 202 }, { "epoch": 0.3513630463003029, "grad_norm": 20.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14588352.0, "logits/rejected": -4774448.842105263, "logps/chosen": -257.1218825120192, "logps/rejected": -202.72636975740133, "loss": 0.2567, "rewards/chosen": -0.031910451558920055, "rewards/margins": 3.413597200322248, "rewards/rejected": -3.445507651881168, "step": 203 }, { "epoch": 0.353093898745132, "grad_norm": 21.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3514067.5, "logits/rejected": -21243462.0, "logps/chosen": -222.9899139404297, "logps/rejected": -302.18115234375, "loss": 0.2852, "rewards/chosen": -0.013363361358642578, "rewards/margins": 3.822953462600708, "rewards/rejected": -3.8363168239593506, "step": 204 }, { "epoch": 0.35482475118996104, "grad_norm": 23.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 10442356.444444444, "logits/rejected": -5046877.142857143, "logps/chosen": -239.98092990451389, "logps/rejected": -214.3353271484375, "loss": 0.3217, "rewards/chosen": 0.006140223807758755, "rewards/margins": 3.9656730977788803, "rewards/rejected": -3.9595328739711215, "step": 205 }, { "epoch": 0.3565556036347901, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3316098.5714285714, "logits/rejected": -3345895.5555555555, "logps/chosen": -141.17440359933036, "logps/rejected": -268.8661838107639, "loss": 0.2669, "rewards/chosen": -0.06388027327401298, "rewards/margins": 3.7650589526645724, "rewards/rejected": -3.8289392259385853, "step": 206 }, { "epoch": 0.35828645607961923, "grad_norm": 24.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10579598.476190476, "logits/rejected": -7490005.818181818, "logps/chosen": -215.40415736607142, "logps/rejected": -398.5939275568182, "loss": 0.3157, "rewards/chosen": 0.11144000007992699, "rewards/margins": 5.1400641011985355, "rewards/rejected": -5.028624101118608, "step": 207 }, { "epoch": 0.3600173085244483, "grad_norm": 18.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2552588.470588235, "logits/rejected": 11076235.733333332, "logps/chosen": -175.71145450367646, "logps/rejected": -340.4918619791667, "loss": 0.293, "rewards/chosen": 0.09705781235414393, "rewards/margins": 3.936987011572894, "rewards/rejected": -3.83992919921875, "step": 208 }, { "epoch": 0.36174816096927737, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8181181.0, "logits/rejected": -4103885.5, "logps/chosen": -175.54034423828125, "logps/rejected": -195.105712890625, "loss": 0.3223, "rewards/chosen": 0.006002817302942276, "rewards/margins": 2.481421146541834, "rewards/rejected": -2.4754183292388916, "step": 209 }, { "epoch": 0.36347901341410643, "grad_norm": 21.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6023152.0, "logits/rejected": -6930216.615384615, "logps/chosen": -220.12386924342104, "logps/rejected": -259.09130859375, "loss": 0.3206, "rewards/chosen": 0.02514595577591344, "rewards/margins": 3.885440004378678, "rewards/rejected": -3.8602940486027646, "step": 210 }, { "epoch": 0.3652098658589355, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8873345.846153846, "logits/rejected": -4841296.421052632, "logps/chosen": -222.28448016826923, "logps/rejected": -257.08958675986844, "loss": 0.2588, "rewards/chosen": -0.17216739287743202, "rewards/margins": 3.402585816286836, "rewards/rejected": -3.574753209164268, "step": 211 }, { "epoch": 0.3669407183037646, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1087914.2666666666, "logits/rejected": -13504388.705882354, "logps/chosen": -207.50533854166667, "logps/rejected": -327.5465303308824, "loss": 0.2455, "rewards/chosen": 0.09607280890146891, "rewards/margins": 4.1472624559028475, "rewards/rejected": -4.051189647001379, "step": 212 }, { "epoch": 0.3686715707485937, "grad_norm": 18.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12526222.933333334, "logits/rejected": -2303273.882352941, "logps/chosen": -217.412255859375, "logps/rejected": -159.63197954963235, "loss": 0.2619, "rewards/chosen": 0.2408916155497233, "rewards/margins": 3.14007298151652, "rewards/rejected": -2.899181365966797, "step": 213 }, { "epoch": 0.37040242319342276, "grad_norm": 21.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6768866.105263158, "logits/rejected": -11241085.538461538, "logps/chosen": -209.8044305098684, "logps/rejected": -294.76034780649036, "loss": 0.3237, "rewards/chosen": 0.22350662632992394, "rewards/margins": 2.894371611869287, "rewards/rejected": -2.670864985539363, "step": 214 }, { "epoch": 0.3721332756382518, "grad_norm": 20.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15369493.333333334, "logits/rejected": 2045517.2857142857, "logps/chosen": -224.15772840711804, "logps/rejected": -162.84868512834822, "loss": 0.2959, "rewards/chosen": 0.2034378316667345, "rewards/margins": 3.11308999667092, "rewards/rejected": -2.9096521650041853, "step": 215 }, { "epoch": 0.3738641280830809, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20242884.0, "logits/rejected": -7449432.0, "logps/chosen": -279.4679260253906, "logps/rejected": -204.05104064941406, "loss": 0.2813, "rewards/chosen": 0.30105486512184143, "rewards/margins": 3.196340948343277, "rewards/rejected": -2.8952860832214355, "step": 216 }, { "epoch": 0.37559498052791, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5433502.909090909, "logits/rejected": 500345.14285714284, "logps/chosen": -189.3955078125, "logps/rejected": -303.5468982514881, "loss": 0.2097, "rewards/chosen": 0.4285439144481312, "rewards/margins": 3.622101653705944, "rewards/rejected": -3.1935577392578125, "step": 217 }, { "epoch": 0.3773258329727391, "grad_norm": 18.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8591011.333333334, "logits/rejected": -6802850.4, "logps/chosen": -284.0384928385417, "logps/rejected": -196.76927490234374, "loss": 0.2485, "rewards/chosen": 0.3856252034505208, "rewards/margins": 3.309361775716146, "rewards/rejected": -2.923736572265625, "step": 218 }, { "epoch": 0.37905668541756815, "grad_norm": 14.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4748231.2, "logits/rejected": -13977383.272727273, "logps/chosen": -120.3367431640625, "logps/rejected": -259.1859685724432, "loss": 0.2385, "rewards/chosen": -0.07536518573760986, "rewards/margins": 3.3064329515803945, "rewards/rejected": -3.3817981373180044, "step": 219 }, { "epoch": 0.3807875378623972, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2363141.230769231, "logits/rejected": -15810723.368421054, "logps/chosen": -170.66443810096155, "logps/rejected": -376.7901675575658, "loss": 0.2529, "rewards/chosen": 0.12850810931279108, "rewards/margins": 3.2996522936261132, "rewards/rejected": -3.1711441843133223, "step": 220 }, { "epoch": 0.3825183903072263, "grad_norm": 19.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11173596.8, "logits/rejected": -14623304.470588235, "logps/chosen": -228.55849609375, "logps/rejected": -248.14662798713235, "loss": 0.2426, "rewards/chosen": 0.3422792116800944, "rewards/margins": 3.8824749011619417, "rewards/rejected": -3.5401956894818474, "step": 221 }, { "epoch": 0.3842492427520554, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20544094.769230768, "logits/rejected": -15617290.105263159, "logps/chosen": -238.24759615384616, "logps/rejected": -270.76454564144734, "loss": 0.2101, "rewards/chosen": 0.37209848257211536, "rewards/margins": 4.252398904035931, "rewards/rejected": -3.880300421463816, "step": 222 }, { "epoch": 0.3859800951968845, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2339170.5, "logits/rejected": -14668691.0, "logps/chosen": -168.8272247314453, "logps/rejected": -316.6446228027344, "loss": 0.2904, "rewards/chosen": 0.05972611904144287, "rewards/margins": 3.1063586473464966, "rewards/rejected": -3.0466325283050537, "step": 223 }, { "epoch": 0.38771094764171354, "grad_norm": 21.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10121347.42857143, "logits/rejected": -9601993.777777778, "logps/chosen": -243.92325265066964, "logps/rejected": -305.10259331597223, "loss": 0.2482, "rewards/chosen": 0.2614833116531372, "rewards/margins": 3.4309277137120566, "rewards/rejected": -3.1694444020589194, "step": 224 }, { "epoch": 0.3894418000865426, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 8372479.157894737, "logits/rejected": -18436361.846153848, "logps/chosen": -128.38247841282896, "logps/rejected": -282.94106820913464, "loss": 0.353, "rewards/chosen": -0.023535085351843583, "rewards/margins": 3.7766199674200913, "rewards/rejected": -3.800155052771935, "step": 225 }, { "epoch": 0.3911726525313717, "grad_norm": 21.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1354891.6842105263, "logits/rejected": -14058054.153846154, "logps/chosen": -161.87158203125, "logps/rejected": -339.60730919471155, "loss": 0.304, "rewards/chosen": 0.12897560470982602, "rewards/margins": 4.465609140241677, "rewards/rejected": -4.336633535531851, "step": 226 }, { "epoch": 0.3929035049762008, "grad_norm": 21.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10190957.176470589, "logits/rejected": -10005033.6, "logps/chosen": -191.16943359375, "logps/rejected": -214.92547200520832, "loss": 0.3543, "rewards/chosen": -0.27459220325245576, "rewards/margins": 2.7234729822944193, "rewards/rejected": -2.998065185546875, "step": 227 }, { "epoch": 0.39463435742102987, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12754082.133333333, "logits/rejected": 9428894.11764706, "logps/chosen": -160.12646484375, "logps/rejected": -193.08636833639707, "loss": 0.2904, "rewards/chosen": 0.34527934392293297, "rewards/margins": 2.9829243921766095, "rewards/rejected": -2.6376450482536766, "step": 228 }, { "epoch": 0.39636520986585894, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11040588.631578946, "logits/rejected": -28768635.076923076, "logps/chosen": -202.25439453125, "logps/rejected": -298.80213341346155, "loss": 0.2818, "rewards/chosen": 0.2841626719424599, "rewards/margins": 4.040860400026144, "rewards/rejected": -3.756697728083684, "step": 229 }, { "epoch": 0.398096062310688, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16696243.2, "logits/rejected": 782789.7058823529, "logps/chosen": -209.39425455729167, "logps/rejected": -243.5696518841912, "loss": 0.2405, "rewards/chosen": 0.21385353406270344, "rewards/margins": 3.5233376792832916, "rewards/rejected": -3.3094841452205883, "step": 230 }, { "epoch": 0.39982691475551707, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6847610.857142857, "logits/rejected": -341155.47222222225, "logps/chosen": -145.1298566545759, "logps/rejected": -159.03776041666666, "loss": 0.3048, "rewards/chosen": 0.034968899829047065, "rewards/margins": 2.7618577683728835, "rewards/rejected": -2.7268888685438366, "step": 231 }, { "epoch": 0.4015577672003462, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14881117.0, "logits/rejected": -15492494.0, "logps/chosen": -207.72711181640625, "logps/rejected": -320.58319091796875, "loss": 0.2664, "rewards/chosen": 0.16509190201759338, "rewards/margins": 4.239029794931412, "rewards/rejected": -4.073937892913818, "step": 232 }, { "epoch": 0.40328861964517526, "grad_norm": 20.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5929799.529411765, "logits/rejected": -12552819.2, "logps/chosen": -215.2191664751838, "logps/rejected": -290.16845703125, "loss": 0.2607, "rewards/chosen": 0.19065551196827607, "rewards/margins": 4.717087295008641, "rewards/rejected": -4.526431783040365, "step": 233 }, { "epoch": 0.40501947209000433, "grad_norm": 17.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 8185308.8, "logits/rejected": -14831882.352941176, "logps/chosen": -165.01144205729167, "logps/rejected": -273.51809512867646, "loss": 0.2833, "rewards/chosen": -0.0994392474492391, "rewards/margins": 3.8827245745004393, "rewards/rejected": -3.9821638219496784, "step": 234 }, { "epoch": 0.4067503245348334, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12612798.315789474, "logits/rejected": -19476348.307692308, "logps/chosen": -231.91385690789474, "logps/rejected": -316.7388446514423, "loss": 0.3007, "rewards/chosen": 0.1857273955094187, "rewards/margins": 5.110347803787664, "rewards/rejected": -4.924620408278245, "step": 235 }, { "epoch": 0.40848117697966246, "grad_norm": 14.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13798296.0, "logits/rejected": -13150624.0, "logps/chosen": -184.28253173828125, "logps/rejected": -293.38729580965907, "loss": 0.2181, "rewards/chosen": -0.08902863264083863, "rewards/margins": 3.767916405200958, "rewards/rejected": -3.856945037841797, "step": 236 }, { "epoch": 0.4102120294244916, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -5185518.933333334, "logits/rejected": -10506853.647058824, "logps/chosen": -195.71438802083333, "logps/rejected": -266.6700080422794, "loss": 0.2492, "rewards/chosen": 0.16841630935668944, "rewards/margins": 4.66889471727259, "rewards/rejected": -4.5004784079159, "step": 237 }, { "epoch": 0.41194288186932065, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -120490.13333333333, "logits/rejected": -4157334.588235294, "logps/chosen": -165.81774088541667, "logps/rejected": -338.9480985753676, "loss": 0.2444, "rewards/chosen": 0.16940480868021648, "rewards/margins": 3.9526734567156026, "rewards/rejected": -3.783268648035386, "step": 238 }, { "epoch": 0.4136737343141497, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13063847.466666667, "logits/rejected": -7603120.0, "logps/chosen": -217.052294921875, "logps/rejected": -281.8909696691176, "loss": 0.2602, "rewards/chosen": 0.0732549508412679, "rewards/margins": 3.9795507272084554, "rewards/rejected": -3.9062957763671875, "step": 239 }, { "epoch": 0.4154045867589788, "grad_norm": 19.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13696624.0, "logits/rejected": 21262386.82352941, "logps/chosen": -188.62125651041666, "logps/rejected": -326.68488625919116, "loss": 0.2636, "rewards/chosen": 0.14170858065287273, "rewards/margins": 3.8223358425439575, "rewards/rejected": -3.6806272618910847, "step": 240 }, { "epoch": 0.41713543920380786, "grad_norm": 15.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11527736.615384616, "logits/rejected": -13245899.789473685, "logps/chosen": -171.26971905048077, "logps/rejected": -316.2771638569079, "loss": 0.2359, "rewards/chosen": 0.039726394873399004, "rewards/margins": 4.057520953749838, "rewards/rejected": -4.017794558876439, "step": 241 }, { "epoch": 0.418866291648637, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10542560.0, "logits/rejected": -15128954.0, "logps/chosen": -197.97857666015625, "logps/rejected": -249.15786743164062, "loss": 0.2893, "rewards/chosen": -0.0119645856320858, "rewards/margins": 3.6342065073549747, "rewards/rejected": -3.6461710929870605, "step": 242 }, { "epoch": 0.42059714409346605, "grad_norm": 22.5, "kl": 0.13220763206481934, "learning_rate": 5e-06, "logits/chosen": -17448896.0, "logits/rejected": -10536552.533333333, "logps/chosen": -242.55230353860293, "logps/rejected": -180.8974609375, "loss": 0.3269, "rewards/chosen": 0.07533069217906278, "rewards/margins": 3.0032341854245055, "rewards/rejected": -2.927903493245443, "step": 243 }, { "epoch": 0.4223279965382951, "grad_norm": 16.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15767290.181818182, "logits/rejected": -11785589.333333334, "logps/chosen": -199.07419655539772, "logps/rejected": -267.2129371279762, "loss": 0.1908, "rewards/chosen": 0.3069478381763805, "rewards/margins": 3.4843519693845275, "rewards/rejected": -3.177404131208147, "step": 244 }, { "epoch": 0.4240588489831242, "grad_norm": 16.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9023009.777777778, "logits/rejected": -8307068.0, "logps/chosen": -138.5511474609375, "logps/rejected": -191.49358258928572, "loss": 0.2808, "rewards/chosen": 0.27963558832804364, "rewards/margins": 3.714873745327904, "rewards/rejected": -3.4352381569998607, "step": 245 }, { "epoch": 0.42578970142795325, "grad_norm": 14.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12051193.6, "logits/rejected": -10616396.235294119, "logps/chosen": -171.37771809895833, "logps/rejected": -222.20760569852942, "loss": 0.2502, "rewards/chosen": 0.1699681282043457, "rewards/margins": 4.40653590595021, "rewards/rejected": -4.236567777745864, "step": 246 }, { "epoch": 0.42752055387278237, "grad_norm": 19.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12088685.714285715, "logits/rejected": -5290852.0, "logps/chosen": -208.72258649553572, "logps/rejected": -252.75640190972223, "loss": 0.2659, "rewards/chosen": 0.08243453502655029, "rewards/margins": 3.2210644483566284, "rewards/rejected": -3.138629913330078, "step": 247 }, { "epoch": 0.42925140631761144, "grad_norm": 16.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18058203.2, "logits/rejected": -10416639.272727273, "logps/chosen": -224.68359375, "logps/rejected": -289.28861860795456, "loss": 0.2142, "rewards/chosen": -0.14810900688171386, "rewards/margins": 3.584905880147761, "rewards/rejected": -3.7330148870294746, "step": 248 }, { "epoch": 0.4309822587624405, "grad_norm": 18.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15116648.533333333, "logits/rejected": -3318948.0, "logps/chosen": -177.97216796875, "logps/rejected": -264.22144990808823, "loss": 0.2523, "rewards/chosen": 0.038343381881713864, "rewards/margins": 4.054038841584149, "rewards/rejected": -4.015695459702435, "step": 249 }, { "epoch": 0.4327131112072696, "grad_norm": 22.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11157197.47368421, "logits/rejected": 2493580.3076923075, "logps/chosen": -177.07334498355263, "logps/rejected": -290.3555438701923, "loss": 0.3557, "rewards/chosen": -0.19027650983710037, "rewards/margins": 3.0934826806489273, "rewards/rejected": -3.2837591904860277, "step": 250 }, { "epoch": 0.43444396365209864, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16668712.533333333, "logits/rejected": -3976555.294117647, "logps/chosen": -244.20847981770834, "logps/rejected": -327.9715935202206, "loss": 0.2648, "rewards/chosen": 0.03703808784484863, "rewards/margins": 4.71754776730257, "rewards/rejected": -4.680509679457721, "step": 251 }, { "epoch": 0.43617481609692776, "grad_norm": 17.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 5272862.545454546, "logits/rejected": -6936409.904761905, "logps/chosen": -229.92167524857953, "logps/rejected": -213.8749534970238, "loss": 0.2408, "rewards/chosen": -0.30214368213306775, "rewards/margins": 3.3398687777581153, "rewards/rejected": -3.642012459891183, "step": 252 }, { "epoch": 0.43790566854175683, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3558934.933333333, "logits/rejected": -2213124.2352941176, "logps/chosen": -207.47721354166666, "logps/rejected": -196.65370806525735, "loss": 0.2565, "rewards/chosen": 0.18845229148864745, "rewards/margins": 3.44047677376691, "rewards/rejected": -3.2520244822782627, "step": 253 }, { "epoch": 0.4396365209865859, "grad_norm": 20.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3256860.5714285714, "logits/rejected": -1937111.3333333333, "logps/chosen": -222.31358119419642, "logps/rejected": -198.45515950520834, "loss": 0.2842, "rewards/chosen": -0.027769644345555986, "rewards/margins": 3.096243832556028, "rewards/rejected": -3.124013476901584, "step": 254 }, { "epoch": 0.44136737343141497, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2231512.0, "logits/rejected": -15707685.714285715, "logps/chosen": -224.28355577256946, "logps/rejected": -377.65659877232144, "loss": 0.283, "rewards/chosen": 0.1279101769129435, "rewards/margins": 4.255186188788642, "rewards/rejected": -4.127276011875698, "step": 255 }, { "epoch": 0.44309822587624403, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3650349.0, "logits/rejected": -25763404.0, "logps/chosen": -175.8751220703125, "logps/rejected": -384.6330261230469, "loss": 0.2993, "rewards/chosen": -0.2350717931985855, "rewards/margins": 3.420280560851097, "rewards/rejected": -3.6553523540496826, "step": 256 }, { "epoch": 0.4448290783210731, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8227142.095238095, "logits/rejected": -3991596.0, "logps/chosen": -204.65394810267858, "logps/rejected": -230.69422496448863, "loss": 0.3679, "rewards/chosen": -0.06990920929681688, "rewards/margins": 4.00661361785162, "rewards/rejected": -4.0765228271484375, "step": 257 }, { "epoch": 0.4465599307659022, "grad_norm": 20.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9966890.105263159, "logits/rejected": -14217319.384615384, "logps/chosen": -199.6491827713816, "logps/rejected": -275.9331242487981, "loss": 0.3237, "rewards/chosen": 0.01347437344099346, "rewards/margins": 4.150257386418007, "rewards/rejected": -4.136783012977014, "step": 258 }, { "epoch": 0.4482907832107313, "grad_norm": 20.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13260032.94117647, "logits/rejected": -3421154.6666666665, "logps/chosen": -238.68637982536765, "logps/rejected": -267.1708658854167, "loss": 0.2502, "rewards/chosen": 0.2879646806155934, "rewards/margins": 4.020605773551791, "rewards/rejected": -3.732641092936198, "step": 259 }, { "epoch": 0.45002163565556036, "grad_norm": 18.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20596262.85714286, "logits/rejected": -7963298.666666667, "logps/chosen": -236.05545479910714, "logps/rejected": -297.91436089409723, "loss": 0.2213, "rewards/chosen": 0.6324899537222726, "rewards/margins": 4.426475108615936, "rewards/rejected": -3.7939851548936634, "step": 260 }, { "epoch": 0.4517524881003894, "grad_norm": 15.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14417637.333333334, "logits/rejected": -13684048.0, "logps/chosen": -196.896435546875, "logps/rejected": -312.3822380514706, "loss": 0.1969, "rewards/chosen": 0.5348507563273112, "rewards/margins": 4.734749868804333, "rewards/rejected": -4.199899112477022, "step": 261 }, { "epoch": 0.4534833405452185, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24702128.0, "logits/rejected": -9365918.222222222, "logps/chosen": -337.36460658482144, "logps/rejected": -256.26673719618054, "loss": 0.2306, "rewards/chosen": 0.24765947886875697, "rewards/margins": 4.048638669271318, "rewards/rejected": -3.8009791904025607, "step": 262 }, { "epoch": 0.4552141929900476, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12827164.631578946, "logits/rejected": -11524402.461538462, "logps/chosen": -181.87505139802633, "logps/rejected": -344.0277569110577, "loss": 0.2989, "rewards/chosen": 0.03691702453713668, "rewards/margins": 4.332692511409883, "rewards/rejected": -4.2957754868727465, "step": 263 }, { "epoch": 0.4569450454348767, "grad_norm": 20.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18784899.555555556, "logits/rejected": -12124728.0, "logps/chosen": -233.41818576388889, "logps/rejected": -315.662109375, "loss": 0.3184, "rewards/chosen": 0.08528125286102295, "rewards/margins": 3.363411920411246, "rewards/rejected": -3.278130667550223, "step": 264 }, { "epoch": 0.45867589787970575, "grad_norm": 19.125, "kl": 0.11611628532409668, "learning_rate": 5e-06, "logits/chosen": 1691305.142857143, "logits/rejected": -6767852.0, "logps/chosen": -285.6856166294643, "logps/rejected": -247.17659505208334, "loss": 0.2455, "rewards/chosen": 0.24254277774265834, "rewards/margins": 3.9829009184761657, "rewards/rejected": -3.740358140733507, "step": 265 }, { "epoch": 0.4604067503245348, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11006114.4, "logits/rejected": -19388294.666666668, "logps/chosen": -188.975390625, "logps/rejected": -325.20001220703125, "loss": 0.3136, "rewards/chosen": 0.19371647834777833, "rewards/margins": 3.5510452111562096, "rewards/rejected": -3.357328732808431, "step": 266 }, { "epoch": 0.4621376027693639, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17104797.53846154, "logits/rejected": -6318394.105263158, "logps/chosen": -190.21593299278845, "logps/rejected": -221.87461451480263, "loss": 0.2442, "rewards/chosen": 0.2514270819150485, "rewards/margins": 3.471239374716755, "rewards/rejected": -3.2198122928017066, "step": 267 }, { "epoch": 0.463868455214193, "grad_norm": 20.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4255990.933333334, "logits/rejected": -16446816.0, "logps/chosen": -253.98564453125, "logps/rejected": -270.46570542279414, "loss": 0.2827, "rewards/chosen": -0.036499599615732826, "rewards/margins": 3.59961351109486, "rewards/rejected": -3.6361131107105926, "step": 268 }, { "epoch": 0.4655993076590221, "grad_norm": 17.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11160579.2, "logits/rejected": -10607611.294117646, "logps/chosen": -202.769384765625, "logps/rejected": -240.4682186351103, "loss": 0.2514, "rewards/chosen": 0.4125640551249186, "rewards/margins": 3.6268443518993903, "rewards/rejected": -3.2142802967744717, "step": 269 }, { "epoch": 0.46733016010385114, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13615528.888888888, "logits/rejected": -6793550.857142857, "logps/chosen": -177.54604763454861, "logps/rejected": -297.06703404017856, "loss": 0.2885, "rewards/chosen": 0.25184231334262425, "rewards/margins": 3.5622371454087514, "rewards/rejected": -3.310394832066127, "step": 270 }, { "epoch": 0.4690610125486802, "grad_norm": 21.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2008512.4210526317, "logits/rejected": 1065751.2307692308, "logps/chosen": -230.33552631578948, "logps/rejected": -225.39548903245193, "loss": 0.2912, "rewards/chosen": 0.33769374144704717, "rewards/margins": 3.8641099331349977, "rewards/rejected": -3.5264161916879506, "step": 271 }, { "epoch": 0.4707918649935093, "grad_norm": 20.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8079465.5, "logits/rejected": -4624071.0, "logps/chosen": -245.87030029296875, "logps/rejected": -286.6881103515625, "loss": 0.256, "rewards/chosen": 0.18378782272338867, "rewards/margins": 4.057176828384399, "rewards/rejected": -3.8733890056610107, "step": 272 }, { "epoch": 0.4725227174383384, "grad_norm": 17.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 34063.02272727273, "logits/rejected": 5305572.4, "logps/chosen": -97.9369229403409, "logps/rejected": -163.92113037109374, "loss": 0.362, "rewards/chosen": 0.07122220234437422, "rewards/margins": 3.202643482251601, "rewards/rejected": -3.1314212799072267, "step": 273 }, { "epoch": 0.47425356988316747, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8214460.0, "logits/rejected": -21400504.0, "logps/chosen": -159.88619995117188, "logps/rejected": -304.96246337890625, "loss": 0.2742, "rewards/chosen": 0.257793664932251, "rewards/margins": 3.4434404373168945, "rewards/rejected": -3.1856467723846436, "step": 274 }, { "epoch": 0.47598442232799654, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9168399.157894736, "logits/rejected": -14751639.384615384, "logps/chosen": -186.708251953125, "logps/rejected": -292.65147986778845, "loss": 0.2819, "rewards/chosen": 0.2397254642687346, "rewards/margins": 4.295501033304191, "rewards/rejected": -4.055775569035457, "step": 275 }, { "epoch": 0.4777152747728256, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9120151.529411765, "logits/rejected": -2838455.7333333334, "logps/chosen": -179.58385512408088, "logps/rejected": -274.78151041666666, "loss": 0.2887, "rewards/chosen": 0.22131121859830968, "rewards/margins": 3.1119616499134137, "rewards/rejected": -2.890650431315104, "step": 276 }, { "epoch": 0.47944612721765467, "grad_norm": 17.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17451059.42857143, "logits/rejected": -19632037.333333332, "logps/chosen": -194.51698521205358, "logps/rejected": -261.6064453125, "loss": 0.255, "rewards/chosen": 0.24481826169150217, "rewards/margins": 3.8417675211316062, "rewards/rejected": -3.596949259440104, "step": 277 }, { "epoch": 0.4811769796624838, "grad_norm": 18.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11978563.2, "logits/rejected": -11180960.94117647, "logps/chosen": -204.08976236979166, "logps/rejected": -332.37023207720586, "loss": 0.2457, "rewards/chosen": 0.2240306536356608, "rewards/margins": 3.5974735839694154, "rewards/rejected": -3.3734429303337548, "step": 278 }, { "epoch": 0.48290783210731286, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4350533.866666666, "logits/rejected": -8590544.94117647, "logps/chosen": -151.40662434895833, "logps/rejected": -249.31729664522058, "loss": 0.2934, "rewards/chosen": 0.01796001394589742, "rewards/margins": 2.6930718967727585, "rewards/rejected": -2.675111882826861, "step": 279 }, { "epoch": 0.48463868455214193, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10933750.4, "logits/rejected": -15255677.176470589, "logps/chosen": -171.53665364583333, "logps/rejected": -308.31043198529414, "loss": 0.2806, "rewards/chosen": 0.030906534194946288, "rewards/margins": 4.104628924762501, "rewards/rejected": -4.073722390567555, "step": 280 }, { "epoch": 0.486369536996971, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8110217.142857143, "logits/rejected": -18321370.181818184, "logps/chosen": -182.0908435639881, "logps/rejected": -304.40236594460225, "loss": 0.3329, "rewards/chosen": 0.09797722952706474, "rewards/margins": 3.671043247371525, "rewards/rejected": -3.5730660178444604, "step": 281 }, { "epoch": 0.48810038944180006, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8963434.0, "logits/rejected": -6483309.0, "logps/chosen": -224.63430786132812, "logps/rejected": -291.1379699707031, "loss": 0.2706, "rewards/chosen": 0.08380473405122757, "rewards/margins": 3.3000806644558907, "rewards/rejected": -3.216275930404663, "step": 282 }, { "epoch": 0.4898312418866292, "grad_norm": 17.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9069934.0, "logits/rejected": -22540276.0, "logps/chosen": -141.39923095703125, "logps/rejected": -317.710693359375, "loss": 0.3155, "rewards/chosen": -0.18840433657169342, "rewards/margins": 3.7216622680425644, "rewards/rejected": -3.910066604614258, "step": 283 }, { "epoch": 0.49156209433145825, "grad_norm": 23.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11534782.315789474, "logits/rejected": -14227288.615384616, "logps/chosen": -347.19901315789474, "logps/rejected": -233.33997521033655, "loss": 0.2763, "rewards/chosen": 0.6239749506900185, "rewards/margins": 3.671866420792182, "rewards/rejected": -3.0478914701021633, "step": 284 }, { "epoch": 0.4932929467762873, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7307938.52631579, "logits/rejected": -6305785.846153846, "logps/chosen": -182.99164782072367, "logps/rejected": -234.64518855168268, "loss": 0.3165, "rewards/chosen": 0.0814883081536544, "rewards/margins": 3.057790234986587, "rewards/rejected": -2.9763019268329325, "step": 285 }, { "epoch": 0.4950237992211164, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2387972.2666666666, "logits/rejected": -10976164.705882354, "logps/chosen": -138.42179361979166, "logps/rejected": -257.4625459558824, "loss": 0.2974, "rewards/chosen": -0.04797365665435791, "rewards/margins": 3.496035608123331, "rewards/rejected": -3.5440092647776886, "step": 286 }, { "epoch": 0.49675465166594546, "grad_norm": 16.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17446714.0, "logits/rejected": -4344534.5, "logps/chosen": -249.31387329101562, "logps/rejected": -275.65106201171875, "loss": 0.2517, "rewards/chosen": 0.2581062614917755, "rewards/margins": 3.798191577196121, "rewards/rejected": -3.5400853157043457, "step": 287 }, { "epoch": 0.4984855041107746, "grad_norm": 18.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6731353.846153846, "logits/rejected": -10663467.789473685, "logps/chosen": -210.49735201322116, "logps/rejected": -270.64185855263156, "loss": 0.2199, "rewards/chosen": 0.23169115873483512, "rewards/margins": 4.13718504558208, "rewards/rejected": -3.905493886847245, "step": 288 }, { "epoch": 0.5002163565556036, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12233035.2, "logits/rejected": -17931900.0, "logps/chosen": -199.65758056640624, "logps/rejected": -295.8255208333333, "loss": 0.3644, "rewards/chosen": -0.10184909105300903, "rewards/margins": 2.8798390905062354, "rewards/rejected": -2.9816881815592446, "step": 289 } ], "logging_steps": 1, "max_steps": 578, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 289, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }