simonycl's picture
Upload folder using huggingface_hub
fdf1f8f verified
raw
history blame contribute delete
No virus
55.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999630314232902,
"eval_steps": 400,
"global_step": 507,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001971657424522489,
"grad_norm": 4.641391669893979,
"learning_rate": 9.803921568627451e-09,
"logits/chosen": -1.8306132555007935,
"logits/rejected": -1.2712628841400146,
"logps/chosen": -217.9743194580078,
"logps/rejected": -312.2440185546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.009858287122612447,
"grad_norm": 3.881553151172807,
"learning_rate": 4.901960784313725e-08,
"logits/chosen": -1.3956289291381836,
"logits/rejected": -1.324476718902588,
"logps/chosen": -213.20277404785156,
"logps/rejected": -243.072509765625,
"loss": 0.6932,
"rewards/accuracies": 0.3125,
"rewards/chosen": 9.495137783233076e-05,
"rewards/margins": -0.00030715527827851474,
"rewards/rejected": 0.0004021066124550998,
"step": 5
},
{
"epoch": 0.019716574245224893,
"grad_norm": 4.055647051577517,
"learning_rate": 9.80392156862745e-08,
"logits/chosen": -1.464820146560669,
"logits/rejected": -1.329075813293457,
"logps/chosen": -216.189697265625,
"logps/rejected": -249.85464477539062,
"loss": 0.6933,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": 0.0014236138667911291,
"rewards/margins": -0.0008448967710137367,
"rewards/rejected": 0.002268511103466153,
"step": 10
},
{
"epoch": 0.029574861367837338,
"grad_norm": 3.8004259300545313,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": -1.590954065322876,
"logits/rejected": -1.3920761346817017,
"logps/chosen": -227.84024047851562,
"logps/rejected": -267.3565368652344,
"loss": 0.6928,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0014651073142886162,
"rewards/margins": 0.0005061920965090394,
"rewards/rejected": 0.0009589152177795768,
"step": 15
},
{
"epoch": 0.039433148490449786,
"grad_norm": 4.362970881343374,
"learning_rate": 1.96078431372549e-07,
"logits/chosen": -1.4077281951904297,
"logits/rejected": -1.438763976097107,
"logps/chosen": -216.7683563232422,
"logps/rejected": -241.71524047851562,
"loss": 0.6921,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0071268146857619286,
"rewards/margins": 0.002329364651814103,
"rewards/rejected": -0.009456178173422813,
"step": 20
},
{
"epoch": 0.04929143561306223,
"grad_norm": 3.68850001761437,
"learning_rate": 2.4509803921568627e-07,
"logits/chosen": -1.368187427520752,
"logits/rejected": -1.3394204378128052,
"logps/chosen": -225.8297119140625,
"logps/rejected": -254.41439819335938,
"loss": 0.6899,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.015500446781516075,
"rewards/margins": 0.005922852084040642,
"rewards/rejected": -0.021423298865556717,
"step": 25
},
{
"epoch": 0.059149722735674676,
"grad_norm": 4.847654340669893,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": -1.4356650114059448,
"logits/rejected": -1.2754924297332764,
"logps/chosen": -221.5808563232422,
"logps/rejected": -255.44918823242188,
"loss": 0.6856,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.02648136578500271,
"rewards/margins": 0.015200227499008179,
"rewards/rejected": -0.04168159142136574,
"step": 30
},
{
"epoch": 0.06900800985828712,
"grad_norm": 6.653348898638824,
"learning_rate": 3.431372549019608e-07,
"logits/chosen": -1.347893476486206,
"logits/rejected": -1.2126632928848267,
"logps/chosen": -217.4748992919922,
"logps/rejected": -253.11001586914062,
"loss": 0.6716,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.04387308284640312,
"rewards/margins": 0.04525812342762947,
"rewards/rejected": -0.08913120627403259,
"step": 35
},
{
"epoch": 0.07886629698089957,
"grad_norm": 6.964114197906881,
"learning_rate": 3.92156862745098e-07,
"logits/chosen": -1.4753751754760742,
"logits/rejected": -1.3836042881011963,
"logps/chosen": -233.50979614257812,
"logps/rejected": -270.6595458984375,
"loss": 0.6487,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.09956349432468414,
"rewards/margins": 0.10375545918941498,
"rewards/rejected": -0.2033189833164215,
"step": 40
},
{
"epoch": 0.08872458410351201,
"grad_norm": 15.546171706823465,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": -1.4908992052078247,
"logits/rejected": -1.4922513961791992,
"logps/chosen": -260.85107421875,
"logps/rejected": -310.8064270019531,
"loss": 0.6302,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.39361271262168884,
"rewards/margins": 0.24461090564727783,
"rewards/rejected": -0.6382235884666443,
"step": 45
},
{
"epoch": 0.09858287122612445,
"grad_norm": 7.346421533742723,
"learning_rate": 4.901960784313725e-07,
"logits/chosen": -1.8035519123077393,
"logits/rejected": -1.7488648891448975,
"logps/chosen": -280.26544189453125,
"logps/rejected": -384.37969970703125,
"loss": 0.6188,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6396178007125854,
"rewards/margins": 0.6693423986434937,
"rewards/rejected": -1.308960199356079,
"step": 50
},
{
"epoch": 0.10844115834873691,
"grad_norm": 6.928842609814235,
"learning_rate": 4.999050767562379e-07,
"logits/chosen": -1.500614881515503,
"logits/rejected": -1.514692783355713,
"logps/chosen": -259.22607421875,
"logps/rejected": -324.70147705078125,
"loss": 0.5905,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.4987005591392517,
"rewards/margins": 0.39352884888648987,
"rewards/rejected": -0.8922293782234192,
"step": 55
},
{
"epoch": 0.11829944547134935,
"grad_norm": 5.183419407454259,
"learning_rate": 4.99519574616467e-07,
"logits/chosen": -1.6389617919921875,
"logits/rejected": -1.5824358463287354,
"logps/chosen": -283.13287353515625,
"logps/rejected": -382.1869201660156,
"loss": 0.6371,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7059253454208374,
"rewards/margins": 0.6277474164962769,
"rewards/rejected": -1.3336727619171143,
"step": 60
},
{
"epoch": 0.1281577325939618,
"grad_norm": 9.230406347476531,
"learning_rate": 4.988380179235842e-07,
"logits/chosen": -1.6305882930755615,
"logits/rejected": -1.6462520360946655,
"logps/chosen": -256.4553527832031,
"logps/rejected": -347.4143371582031,
"loss": 0.5805,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.49059420824050903,
"rewards/margins": 0.5759122371673584,
"rewards/rejected": -1.0665065050125122,
"step": 65
},
{
"epoch": 0.13801601971657423,
"grad_norm": 9.206165908014777,
"learning_rate": 4.978612153434526e-07,
"logits/chosen": -1.7708934545516968,
"logits/rejected": -1.7579914331436157,
"logps/chosen": -285.9685974121094,
"logps/rejected": -370.2804260253906,
"loss": 0.5983,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7101233601570129,
"rewards/margins": 0.5043641328811646,
"rewards/rejected": -1.2144873142242432,
"step": 70
},
{
"epoch": 0.1478743068391867,
"grad_norm": 7.723809446488398,
"learning_rate": 4.965903258506806e-07,
"logits/chosen": -1.8401196002960205,
"logits/rejected": -1.7219253778457642,
"logps/chosen": -289.95068359375,
"logps/rejected": -389.30889892578125,
"loss": 0.5573,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7532116174697876,
"rewards/margins": 0.6191812753677368,
"rewards/rejected": -1.372393012046814,
"step": 75
},
{
"epoch": 0.15773259396179914,
"grad_norm": 10.916878987391435,
"learning_rate": 4.950268573535011e-07,
"logits/chosen": -2.015733480453491,
"logits/rejected": -1.8680551052093506,
"logps/chosen": -325.2226257324219,
"logps/rejected": -428.49066162109375,
"loss": 0.5476,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.0503952503204346,
"rewards/margins": 0.6346156597137451,
"rewards/rejected": -1.6850106716156006,
"step": 80
},
{
"epoch": 0.16759088108441159,
"grad_norm": 13.425184009136764,
"learning_rate": 4.93172664904641e-07,
"logits/chosen": -1.8795242309570312,
"logits/rejected": -1.913556694984436,
"logps/chosen": -317.763916015625,
"logps/rejected": -424.55450439453125,
"loss": 0.5141,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.1219675540924072,
"rewards/margins": 0.7599529027938843,
"rewards/rejected": -1.881920576095581,
"step": 85
},
{
"epoch": 0.17744916820702403,
"grad_norm": 15.954473082571113,
"learning_rate": 4.910299485003033e-07,
"logits/chosen": -2.1529054641723633,
"logits/rejected": -2.0844523906707764,
"logps/chosen": -424.82891845703125,
"logps/rejected": -543.7278442382812,
"loss": 0.5199,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.0633959770202637,
"rewards/margins": 0.886804461479187,
"rewards/rejected": -2.950200319290161,
"step": 90
},
{
"epoch": 0.18730745532963647,
"grad_norm": 17.544754679380226,
"learning_rate": 4.886012504698769e-07,
"logits/chosen": -1.882367730140686,
"logits/rejected": -1.9553489685058594,
"logps/chosen": -406.643310546875,
"logps/rejected": -471.86553955078125,
"loss": 0.5386,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.862217664718628,
"rewards/margins": 0.4628971219062805,
"rewards/rejected": -2.3251149654388428,
"step": 95
},
{
"epoch": 0.1971657424522489,
"grad_norm": 13.476927825101471,
"learning_rate": 4.858894524594652e-07,
"logits/chosen": -2.1455252170562744,
"logits/rejected": -2.0651824474334717,
"logps/chosen": -392.774169921875,
"logps/rejected": -530.4494018554688,
"loss": 0.4917,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8280452489852905,
"rewards/margins": 0.902090847492218,
"rewards/rejected": -2.7301361560821533,
"step": 100
},
{
"epoch": 0.20702402957486138,
"grad_norm": 23.287769508042025,
"learning_rate": 4.828977720128198e-07,
"logits/chosen": -1.9681150913238525,
"logits/rejected": -1.9559170007705688,
"logps/chosen": -431.6632385253906,
"logps/rejected": -570.6896362304688,
"loss": 0.4834,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.2001523971557617,
"rewards/margins": 1.0563952922821045,
"rewards/rejected": -3.2565484046936035,
"step": 105
},
{
"epoch": 0.21688231669747382,
"grad_norm": 23.293354005808915,
"learning_rate": 4.796297587537285e-07,
"logits/chosen": -2.096468448638916,
"logits/rejected": -1.9595563411712646,
"logps/chosen": -497.79400634765625,
"logps/rejected": -643.2481689453125,
"loss": 0.483,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.711547374725342,
"rewards/margins": 1.0744675397872925,
"rewards/rejected": -3.7860145568847656,
"step": 110
},
{
"epoch": 0.22674060382008626,
"grad_norm": 16.815054431474035,
"learning_rate": 4.760892901743944e-07,
"logits/chosen": -2.1025643348693848,
"logits/rejected": -2.091360092163086,
"logps/chosen": -450.98028564453125,
"logps/rejected": -575.75439453125,
"loss": 0.4619,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.345163106918335,
"rewards/margins": 1.00288987159729,
"rewards/rejected": -3.348052978515625,
"step": 115
},
{
"epoch": 0.2365988909426987,
"grad_norm": 18.736725526597898,
"learning_rate": 4.7228056703479626e-07,
"logits/chosen": -1.9844331741333008,
"logits/rejected": -2.1090264320373535,
"logps/chosen": -505.28509521484375,
"logps/rejected": -649.0353393554688,
"loss": 0.4298,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.8585612773895264,
"rewards/margins": 1.334934949874878,
"rewards/rejected": -4.193496227264404,
"step": 120
},
{
"epoch": 0.24645717806531114,
"grad_norm": 29.41530429769772,
"learning_rate": 4.6820810837849535e-07,
"logits/chosen": -1.9075158834457397,
"logits/rejected": -1.952182412147522,
"logps/chosen": -443.2312927246094,
"logps/rejected": -584.6851196289062,
"loss": 0.4647,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.3362534046173096,
"rewards/margins": 1.1353000402450562,
"rewards/rejected": -3.471553087234497,
"step": 125
},
{
"epoch": 0.2563154651879236,
"grad_norm": 28.047847807749136,
"learning_rate": 4.63876746170797e-07,
"logits/chosen": -1.9407484531402588,
"logits/rejected": -1.9303442239761353,
"logps/chosen": -533.4217529296875,
"logps/rejected": -713.83740234375,
"loss": 0.4145,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.180513858795166,
"rewards/margins": 1.4735915660858154,
"rewards/rejected": -4.6541056632995605,
"step": 130
},
{
"epoch": 0.266173752310536,
"grad_norm": 18.58702447039976,
"learning_rate": 4.592916195656321e-07,
"logits/chosen": -2.0613300800323486,
"logits/rejected": -1.971636414527893,
"logps/chosen": -469.5445251464844,
"logps/rejected": -650.7494506835938,
"loss": 0.4332,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.464566946029663,
"rewards/margins": 1.3873087167739868,
"rewards/rejected": -3.8518757820129395,
"step": 135
},
{
"epoch": 0.27603203943314847,
"grad_norm": 43.43885248557689,
"learning_rate": 4.544581688079602e-07,
"logits/chosen": -1.8543685674667358,
"logits/rejected": -1.960680365562439,
"logps/chosen": -499.29150390625,
"logps/rejected": -682.0525512695312,
"loss": 0.4126,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.7792274951934814,
"rewards/margins": 1.672224998474121,
"rewards/rejected": -4.45145320892334,
"step": 140
},
{
"epoch": 0.2858903265557609,
"grad_norm": 25.06136332684734,
"learning_rate": 4.493821287789272e-07,
"logits/chosen": -2.0097248554229736,
"logits/rejected": -2.05975604057312,
"logps/chosen": -622.1812744140625,
"logps/rejected": -857.2575073242188,
"loss": 0.4115,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -4.092832565307617,
"rewards/margins": 1.9847408533096313,
"rewards/rejected": -6.077573299407959,
"step": 145
},
{
"epoch": 0.2957486136783734,
"grad_norm": 23.62970192824471,
"learning_rate": 4.4406952219143934e-07,
"logits/chosen": -1.9738140106201172,
"logits/rejected": -1.8969192504882812,
"logps/chosen": -505.8863220214844,
"logps/rejected": -674.2682495117188,
"loss": 0.4551,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.9365577697753906,
"rewards/margins": 1.3609775304794312,
"rewards/rejected": -4.297535419464111,
"step": 150
},
{
"epoch": 0.30560690080098585,
"grad_norm": 36.928411112871835,
"learning_rate": 4.38526652444224e-07,
"logits/chosen": -1.9676620960235596,
"logits/rejected": -1.9335002899169922,
"logps/chosen": -526.3443603515625,
"logps/rejected": -675.4140625,
"loss": 0.4316,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.1301074028015137,
"rewards/margins": 1.176997423171997,
"rewards/rejected": -4.30710506439209,
"step": 155
},
{
"epoch": 0.3154651879235983,
"grad_norm": 35.737507476172006,
"learning_rate": 4.3276009614285824e-07,
"logits/chosen": -2.08416748046875,
"logits/rejected": -2.0275375843048096,
"logps/chosen": -547.2161254882812,
"logps/rejected": -734.8326416015625,
"loss": 0.4361,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.296079635620117,
"rewards/margins": 1.489527940750122,
"rewards/rejected": -4.78560733795166,
"step": 160
},
{
"epoch": 0.32532347504621073,
"grad_norm": 25.388193696092944,
"learning_rate": 4.2677669529663686e-07,
"logits/chosen": -1.8684972524642944,
"logits/rejected": -1.98639714717865,
"logps/chosen": -492.37518310546875,
"logps/rejected": -663.5337524414062,
"loss": 0.3908,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.736380100250244,
"rewards/margins": 1.5142922401428223,
"rewards/rejected": -4.250672340393066,
"step": 165
},
{
"epoch": 0.33518176216882317,
"grad_norm": 26.163756341836816,
"learning_rate": 4.2058354920054043e-07,
"logits/chosen": -2.0008151531219482,
"logits/rejected": -2.1545004844665527,
"logps/chosen": -558.0103759765625,
"logps/rejected": -783.8531494140625,
"loss": 0.3635,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.2563652992248535,
"rewards/margins": 2.108079433441162,
"rewards/rejected": -5.364445209503174,
"step": 170
},
{
"epoch": 0.3450400492914356,
"grad_norm": 20.146161792615796,
"learning_rate": 4.141880060119336e-07,
"logits/chosen": -2.138545036315918,
"logits/rejected": -2.1449027061462402,
"logps/chosen": -580.8723754882812,
"logps/rejected": -799.7882690429688,
"loss": 0.4178,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.6856274604797363,
"rewards/margins": 1.9810088872909546,
"rewards/rejected": -5.6666364669799805,
"step": 175
},
{
"epoch": 0.35489833641404805,
"grad_norm": 20.25459576341684,
"learning_rate": 4.0759765403198877e-07,
"logits/chosen": -1.9771722555160522,
"logits/rejected": -1.9267823696136475,
"logps/chosen": -448.6309509277344,
"logps/rejected": -687.6984252929688,
"loss": 0.3941,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.361887216567993,
"rewards/margins": 1.849793791770935,
"rewards/rejected": -4.211681365966797,
"step": 180
},
{
"epoch": 0.3647566235366605,
"grad_norm": 23.732608340062967,
"learning_rate": 4.008203127021797e-07,
"logits/chosen": -2.0232439041137695,
"logits/rejected": -2.0282373428344727,
"logps/chosen": -536.0543212890625,
"logps/rejected": -753.0247802734375,
"loss": 0.3758,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.216007947921753,
"rewards/margins": 1.8679723739624023,
"rewards/rejected": -5.083980560302734,
"step": 185
},
{
"epoch": 0.37461491065927294,
"grad_norm": 33.821388543016646,
"learning_rate": 3.9386402332652754e-07,
"logits/chosen": -2.0202414989471436,
"logits/rejected": -1.956538200378418,
"logps/chosen": -628.9379272460938,
"logps/rejected": -831.4833984375,
"loss": 0.4385,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -4.222132205963135,
"rewards/margins": 1.7108278274536133,
"rewards/rejected": -5.93295955657959,
"step": 190
},
{
"epoch": 0.3844731977818854,
"grad_norm": 17.956228351745885,
"learning_rate": 3.867370395306068e-07,
"logits/chosen": -1.974908471107483,
"logits/rejected": -1.9330415725708008,
"logps/chosen": -509.0133361816406,
"logps/rejected": -720.5633544921875,
"loss": 0.3801,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.9117178916931152,
"rewards/margins": 1.6813218593597412,
"rewards/rejected": -4.593040466308594,
"step": 195
},
{
"epoch": 0.3943314849044978,
"grad_norm": 24.48103397679138,
"learning_rate": 3.794478174686328e-07,
"logits/chosen": -1.9475266933441162,
"logits/rejected": -1.9687010049819946,
"logps/chosen": -549.758544921875,
"logps/rejected": -740.8396606445312,
"loss": 0.4111,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.3982017040252686,
"rewards/margins": 1.5769809484481812,
"rewards/rejected": -4.97518253326416,
"step": 200
},
{
"epoch": 0.4041897720271103,
"grad_norm": 19.929793517914295,
"learning_rate": 3.720050057902495e-07,
"logits/chosen": -2.11773419380188,
"logits/rejected": -2.0510640144348145,
"logps/chosen": -678.2037353515625,
"logps/rejected": -897.92822265625,
"loss": 0.3989,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -4.697990894317627,
"rewards/margins": 1.6948550939559937,
"rewards/rejected": -6.39284610748291,
"step": 205
},
{
"epoch": 0.41404805914972276,
"grad_norm": 19.138382009358025,
"learning_rate": 3.644174353789204e-07,
"logits/chosen": -1.96860671043396,
"logits/rejected": -1.9445680379867554,
"logps/chosen": -541.2803955078125,
"logps/rejected": -714.5045776367188,
"loss": 0.3758,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.2117228507995605,
"rewards/margins": 1.4313344955444336,
"rewards/rejected": -4.643057346343994,
"step": 210
},
{
"epoch": 0.4239063462723352,
"grad_norm": 22.61062071667254,
"learning_rate": 3.566941088741009e-07,
"logits/chosen": -1.9290311336517334,
"logits/rejected": -1.9250952005386353,
"logps/chosen": -502.6095275878906,
"logps/rejected": -698.4926147460938,
"loss": 0.3967,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.9526402950286865,
"rewards/margins": 1.6423494815826416,
"rewards/rejected": -4.594989776611328,
"step": 215
},
{
"epoch": 0.43376463339494764,
"grad_norm": 28.506261562704676,
"learning_rate": 3.488441899896217e-07,
"logits/chosen": -2.1637561321258545,
"logits/rejected": -1.9638168811798096,
"logps/chosen": -579.2008056640625,
"logps/rejected": -836.2589111328125,
"loss": 0.3974,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.5565972328186035,
"rewards/margins": 2.140427589416504,
"rewards/rejected": -5.697024345397949,
"step": 220
},
{
"epoch": 0.4436229205175601,
"grad_norm": 22.104238159035294,
"learning_rate": 3.408769926409574e-07,
"logits/chosen": -1.9999799728393555,
"logits/rejected": -1.9067310094833374,
"logps/chosen": -533.4635009765625,
"logps/rejected": -767.3900146484375,
"loss": 0.3601,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.1343350410461426,
"rewards/margins": 1.9703528881072998,
"rewards/rejected": -5.104687690734863,
"step": 225
},
{
"epoch": 0.4534812076401725,
"grad_norm": 21.86054071865173,
"learning_rate": 3.3280196989428263e-07,
"logits/chosen": -2.0549824237823486,
"logits/rejected": -2.079737424850464,
"logps/chosen": -571.4501342773438,
"logps/rejected": -805.6971435546875,
"loss": 0.3644,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.492208480834961,
"rewards/margins": 2.0622007846832275,
"rewards/rejected": -5.554409027099609,
"step": 230
},
{
"epoch": 0.46333949476278496,
"grad_norm": 28.670025336805338,
"learning_rate": 3.2462870275042367e-07,
"logits/chosen": -2.086364269256592,
"logits/rejected": -2.082109212875366,
"logps/chosen": -627.2444458007812,
"logps/rejected": -857.6990356445312,
"loss": 0.3692,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.9953174591064453,
"rewards/margins": 2.085484266281128,
"rewards/rejected": -6.080801963806152,
"step": 235
},
{
"epoch": 0.4731977818853974,
"grad_norm": 23.098928119258375,
"learning_rate": 3.1636688877701806e-07,
"logits/chosen": -1.9278815984725952,
"logits/rejected": -2.008877992630005,
"logps/chosen": -536.9634399414062,
"logps/rejected": -782.7907104492188,
"loss": 0.3307,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.128661632537842,
"rewards/margins": 2.186957836151123,
"rewards/rejected": -5.315619468688965,
"step": 240
},
{
"epoch": 0.48305606900800985,
"grad_norm": 18.256316767301172,
"learning_rate": 3.080263306023669e-07,
"logits/chosen": -1.9272663593292236,
"logits/rejected": -1.9132862091064453,
"logps/chosen": -510.236328125,
"logps/rejected": -714.0992431640625,
"loss": 0.3866,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.079207420349121,
"rewards/margins": 1.7918453216552734,
"rewards/rejected": -4.8710527420043945,
"step": 245
},
{
"epoch": 0.4929143561306223,
"grad_norm": 21.751680260746046,
"learning_rate": 2.996169242846328e-07,
"logits/chosen": -1.8919010162353516,
"logits/rejected": -1.9492820501327515,
"logps/chosen": -575.5780029296875,
"logps/rejected": -819.9085693359375,
"loss": 0.3276,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.592189311981201,
"rewards/margins": 2.1720731258392334,
"rewards/rejected": -5.764262676239014,
"step": 250
},
{
"epoch": 0.5027726432532348,
"grad_norm": 22.663811321818965,
"learning_rate": 2.911486475701835e-07,
"logits/chosen": -1.8436260223388672,
"logits/rejected": -1.8624000549316406,
"logps/chosen": -532.0939331054688,
"logps/rejected": -772.1865234375,
"loss": 0.3646,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.234412670135498,
"rewards/margins": 2.037332773208618,
"rewards/rejected": -5.271745681762695,
"step": 255
},
{
"epoch": 0.5126309303758472,
"grad_norm": 45.86539600331869,
"learning_rate": 2.826315480550129e-07,
"logits/chosen": -1.8276054859161377,
"logits/rejected": -1.944835901260376,
"logps/chosen": -522.5578002929688,
"logps/rejected": -729.6175537109375,
"loss": 0.3653,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.1025471687316895,
"rewards/margins": 1.926390290260315,
"rewards/rejected": -5.028937339782715,
"step": 260
},
{
"epoch": 0.5224892174984597,
"grad_norm": 33.790931231853406,
"learning_rate": 2.740757312632854e-07,
"logits/chosen": -1.9260978698730469,
"logits/rejected": -1.8717044591903687,
"logps/chosen": -576.6935424804688,
"logps/rejected": -834.5701904296875,
"loss": 0.3316,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.7069427967071533,
"rewards/margins": 2.2525296211242676,
"rewards/rejected": -5.959472179412842,
"step": 265
},
{
"epoch": 0.532347504621072,
"grad_norm": 32.72135751726444,
"learning_rate": 2.654913486571487e-07,
"logits/chosen": -1.928877830505371,
"logits/rejected": -1.9832346439361572,
"logps/chosen": -580.7061767578125,
"logps/rejected": -821.77734375,
"loss": 0.3773,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.5770630836486816,
"rewards/margins": 2.1589841842651367,
"rewards/rejected": -5.73604679107666,
"step": 270
},
{
"epoch": 0.5422057917436846,
"grad_norm": 18.44880000765859,
"learning_rate": 2.5688858559204053e-07,
"logits/chosen": -1.8500230312347412,
"logits/rejected": -1.8931682109832764,
"logps/chosen": -484.74420166015625,
"logps/rejected": -701.5289916992188,
"loss": 0.3747,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.6471669673919678,
"rewards/margins": 1.9642257690429688,
"rewards/rejected": -4.611392974853516,
"step": 275
},
{
"epoch": 0.5520640788662969,
"grad_norm": 23.335141498824942,
"learning_rate": 2.4827764923178246e-07,
"logits/chosen": -1.8331562280654907,
"logits/rejected": -1.9513938426971436,
"logps/chosen": -470.59405517578125,
"logps/rejected": -660.6781005859375,
"loss": 0.3683,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.504983901977539,
"rewards/margins": 1.7337911128997803,
"rewards/rejected": -4.23877477645874,
"step": 280
},
{
"epoch": 0.5619223659889094,
"grad_norm": 26.351304197321983,
"learning_rate": 2.3966875643779667e-07,
"logits/chosen": -2.0291342735290527,
"logits/rejected": -1.9187507629394531,
"logps/chosen": -495.74639892578125,
"logps/rejected": -760.9273681640625,
"loss": 0.3506,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.894580125808716,
"rewards/margins": 2.167811632156372,
"rewards/rejected": -5.062391757965088,
"step": 285
},
{
"epoch": 0.5717806531115218,
"grad_norm": 21.633547530781627,
"learning_rate": 2.3107212164681774e-07,
"logits/chosen": -1.871260643005371,
"logits/rejected": -1.916135549545288,
"logps/chosen": -529.8262939453125,
"logps/rejected": -772.5482177734375,
"loss": 0.36,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.0958144664764404,
"rewards/margins": 2.1699347496032715,
"rewards/rejected": -5.265749454498291,
"step": 290
},
{
"epoch": 0.5816389402341343,
"grad_norm": 28.65131510288306,
"learning_rate": 2.2249794475148019e-07,
"logits/chosen": -2.063917636871338,
"logits/rejected": -2.049710750579834,
"logps/chosen": -510.1465759277344,
"logps/rejected": -759.2296752929688,
"loss": 0.3827,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.9606268405914307,
"rewards/margins": 2.110110282897949,
"rewards/rejected": -5.070736885070801,
"step": 295
},
{
"epoch": 0.5914972273567468,
"grad_norm": 23.359544656067033,
"learning_rate": 2.1395639899816332e-07,
"logits/chosen": -2.2645859718322754,
"logits/rejected": -1.9906375408172607,
"logps/chosen": -541.4847412109375,
"logps/rejected": -846.5947265625,
"loss": 0.3488,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.3356003761291504,
"rewards/margins": 2.4959442615509033,
"rewards/rejected": -5.831544399261475,
"step": 300
},
{
"epoch": 0.6013555144793592,
"grad_norm": 24.944829150573064,
"learning_rate": 2.0545761891645177e-07,
"logits/chosen": -2.0867130756378174,
"logits/rejected": -2.074833393096924,
"logps/chosen": -642.1096801757812,
"logps/rejected": -906.7780151367188,
"loss": 0.3502,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -4.175184726715088,
"rewards/margins": 2.322237253189087,
"rewards/rejected": -6.497422218322754,
"step": 305
},
{
"epoch": 0.6112138016019717,
"grad_norm": 24.979816541182146,
"learning_rate": 1.9701168829453305e-07,
"logits/chosen": -1.932847023010254,
"logits/rejected": -1.9259026050567627,
"logps/chosen": -570.7978515625,
"logps/rejected": -823.3259887695312,
"loss": 0.3411,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.579385757446289,
"rewards/margins": 2.1752305030822754,
"rewards/rejected": -5.7546162605285645,
"step": 310
},
{
"epoch": 0.6210720887245841,
"grad_norm": 24.585502500513254,
"learning_rate": 1.886286282148002e-07,
"logits/chosen": -2.069624662399292,
"logits/rejected": -1.978257179260254,
"logps/chosen": -547.39794921875,
"logps/rejected": -787.85302734375,
"loss": 0.331,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.470412015914917,
"rewards/margins": 2.049595594406128,
"rewards/rejected": -5.520008087158203,
"step": 315
},
{
"epoch": 0.6309303758471966,
"grad_norm": 29.24520617120494,
"learning_rate": 1.8031838516385422e-07,
"logits/chosen": -2.089122772216797,
"logits/rejected": -2.0376973152160645,
"logps/chosen": -622.2824096679688,
"logps/rejected": -920.2009887695312,
"loss": 0.3733,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -4.0230793952941895,
"rewards/margins": 2.4440813064575195,
"rewards/rejected": -6.467160701751709,
"step": 320
},
{
"epoch": 0.640788662969809,
"grad_norm": 20.742877534346576,
"learning_rate": 1.7209081923101472e-07,
"logits/chosen": -2.0211918354034424,
"logits/rejected": -2.014601230621338,
"logps/chosen": -589.6067504882812,
"logps/rejected": -773.7950439453125,
"loss": 0.3436,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.677738904953003,
"rewards/margins": 1.7034008502960205,
"rewards/rejected": -5.381140232086182,
"step": 325
},
{
"epoch": 0.6506469500924215,
"grad_norm": 16.903001935618324,
"learning_rate": 1.639556924093404e-07,
"logits/chosen": -1.8897491693496704,
"logits/rejected": -1.88128662109375,
"logps/chosen": -517.2490844726562,
"logps/rejected": -746.6140747070312,
"loss": 0.3561,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.064331531524658,
"rewards/margins": 1.9852135181427002,
"rewards/rejected": -5.0495452880859375,
"step": 330
},
{
"epoch": 0.6605052372150338,
"grad_norm": 19.809662336676986,
"learning_rate": 1.5592265701304114e-07,
"logits/chosen": -2.0255661010742188,
"logits/rejected": -1.944502592086792,
"logps/chosen": -566.5452270507812,
"logps/rejected": -803.533203125,
"loss": 0.3705,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.5436272621154785,
"rewards/margins": 2.0192878246307373,
"rewards/rejected": -5.562914848327637,
"step": 335
},
{
"epoch": 0.6703635243376463,
"grad_norm": 26.202979422607854,
"learning_rate": 1.4800124422502334e-07,
"logits/chosen": -1.918569803237915,
"logits/rejected": -2.0119967460632324,
"logps/chosen": -601.0817260742188,
"logps/rejected": -845.7435302734375,
"loss": 0.3597,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.8363289833068848,
"rewards/margins": 2.0620241165161133,
"rewards/rejected": -5.89835262298584,
"step": 340
},
{
"epoch": 0.6802218114602587,
"grad_norm": 26.077309548266044,
"learning_rate": 1.4020085278815743e-07,
"logits/chosen": -2.0037617683410645,
"logits/rejected": -1.8837954998016357,
"logps/chosen": -645.288818359375,
"logps/rejected": -909.7770385742188,
"loss": 0.3434,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -4.325263500213623,
"rewards/margins": 2.183290719985962,
"rewards/rejected": -6.508553981781006,
"step": 345
},
{
"epoch": 0.6900800985828712,
"grad_norm": 20.206758195915803,
"learning_rate": 1.3253073785368545e-07,
"logits/chosen": -1.97844660282135,
"logits/rejected": -1.9779163599014282,
"logps/chosen": -656.6150512695312,
"logps/rejected": -917.7893676757812,
"loss": 0.3432,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -4.431666374206543,
"rewards/margins": 2.2607076168060303,
"rewards/rejected": -6.692374229431152,
"step": 350
},
{
"epoch": 0.6999383857054837,
"grad_norm": 24.24143829005782,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": -2.066188335418701,
"logits/rejected": -2.054232120513916,
"logps/chosen": -643.3806762695312,
"logps/rejected": -863.3739013671875,
"loss": 0.3583,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.284465789794922,
"rewards/margins": 1.9454777240753174,
"rewards/rejected": -6.22994327545166,
"step": 355
},
{
"epoch": 0.7097966728280961,
"grad_norm": 17.76728293699117,
"learning_rate": 1.1761757443482285e-07,
"logits/chosen": -1.8952592611312866,
"logits/rejected": -1.8270065784454346,
"logps/chosen": -567.1143798828125,
"logps/rejected": -791.7889404296875,
"loss": 0.3722,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.542332172393799,
"rewards/margins": 1.8870967626571655,
"rewards/rejected": -5.429428577423096,
"step": 360
},
{
"epoch": 0.7196549599507086,
"grad_norm": 25.546590661527123,
"learning_rate": 1.1039222039359644e-07,
"logits/chosen": -1.9491792917251587,
"logits/rejected": -1.8340580463409424,
"logps/chosen": -522.3615112304688,
"logps/rejected": -782.1358032226562,
"loss": 0.3194,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.073195219039917,
"rewards/margins": 2.195949077606201,
"rewards/rejected": -5.269144535064697,
"step": 365
},
{
"epoch": 0.729513247073321,
"grad_norm": 25.390221264918292,
"learning_rate": 1.0333251074666608e-07,
"logits/chosen": -1.8948665857315063,
"logits/rejected": -1.8821592330932617,
"logps/chosen": -578.3306884765625,
"logps/rejected": -830.9544677734375,
"loss": 0.3285,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.6443309783935547,
"rewards/margins": 2.2878963947296143,
"rewards/rejected": -5.932227611541748,
"step": 370
},
{
"epoch": 0.7393715341959335,
"grad_norm": 23.32295380693496,
"learning_rate": 9.644682182758304e-08,
"logits/chosen": -1.8538382053375244,
"logits/rejected": -1.8016763925552368,
"logps/chosen": -604.1889038085938,
"logps/rejected": -873.8849487304688,
"loss": 0.3055,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.8995869159698486,
"rewards/margins": 2.3217251300811768,
"rewards/rejected": -6.221312046051025,
"step": 375
},
{
"epoch": 0.7492298213185459,
"grad_norm": 23.34487045577994,
"learning_rate": 8.974332349459992e-08,
"logits/chosen": -1.913751244544983,
"logits/rejected": -1.8759132623672485,
"logps/chosen": -620.8341064453125,
"logps/rejected": -879.5367431640625,
"loss": 0.3497,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.9944968223571777,
"rewards/margins": 2.277583360671997,
"rewards/rejected": -6.272080421447754,
"step": 380
},
{
"epoch": 0.7590881084411584,
"grad_norm": 32.80860271044305,
"learning_rate": 8.322996943714672e-08,
"logits/chosen": -1.9127395153045654,
"logits/rejected": -1.716653823852539,
"logps/chosen": -559.0478515625,
"logps/rejected": -869.8646240234375,
"loss": 0.3553,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.513237714767456,
"rewards/margins": 2.5553982257843018,
"rewards/rejected": -6.068636894226074,
"step": 385
},
{
"epoch": 0.7689463955637708,
"grad_norm": 22.70407770691601,
"learning_rate": 7.691448773879256e-08,
"logits/chosen": -1.8521419763565063,
"logits/rejected": -1.7435353994369507,
"logps/chosen": -503.28369140625,
"logps/rejected": -769.7913818359375,
"loss": 0.3582,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.978849172592163,
"rewards/margins": 2.2009291648864746,
"rewards/rejected": -5.179778575897217,
"step": 390
},
{
"epoch": 0.7788046826863833,
"grad_norm": 26.31642324943315,
"learning_rate": 7.080437170788722e-08,
"logits/chosen": -1.9601354598999023,
"logits/rejected": -1.8902816772460938,
"logps/chosen": -511.15478515625,
"logps/rejected": -765.9757690429688,
"loss": 0.3126,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.040039539337158,
"rewards/margins": 2.2116754055023193,
"rewards/rejected": -5.251715183258057,
"step": 395
},
{
"epoch": 0.7886629698089956,
"grad_norm": 73.40724109949657,
"learning_rate": 6.490687098676332e-08,
"logits/chosen": -1.776098608970642,
"logits/rejected": -1.7230415344238281,
"logps/chosen": -568.650634765625,
"logps/rejected": -813.4503173828125,
"loss": 0.347,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.520684003829956,
"rewards/margins": 2.105372190475464,
"rewards/rejected": -5.626055717468262,
"step": 400
},
{
"epoch": 0.7886629698089956,
"eval_logits/chosen": -2.5592944622039795,
"eval_logits/rejected": -2.4283623695373535,
"eval_logps/chosen": -432.9762878417969,
"eval_logps/rejected": -481.0541076660156,
"eval_loss": 0.5772423148155212,
"eval_rewards/accuracies": 0.6794354915618896,
"eval_rewards/chosen": -1.7008415460586548,
"eval_rewards/margins": 0.37072598934173584,
"eval_rewards/rejected": -2.0715677738189697,
"eval_runtime": 324.936,
"eval_samples_per_second": 6.081,
"eval_steps_per_second": 0.382,
"step": 400
},
{
"epoch": 0.7985212569316081,
"grad_norm": 33.809746730746596,
"learning_rate": 5.9228982950048414e-08,
"logits/chosen": -1.7156673669815063,
"logits/rejected": -1.7448875904083252,
"logps/chosen": -582.68603515625,
"logps/rejected": -882.1572265625,
"loss": 0.3578,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.6797871589660645,
"rewards/margins": 2.6076254844665527,
"rewards/rejected": -6.287413120269775,
"step": 405
},
{
"epoch": 0.8083795440542206,
"grad_norm": 23.982039805708112,
"learning_rate": 5.3777444402291345e-08,
"logits/chosen": -1.9656894207000732,
"logits/rejected": -1.7757899761199951,
"logps/chosen": -602.1336059570312,
"logps/rejected": -914.3304443359375,
"loss": 0.2749,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.811291456222534,
"rewards/margins": 2.5836830139160156,
"rewards/rejected": -6.394974708557129,
"step": 410
},
{
"epoch": 0.818237831176833,
"grad_norm": 21.999889032487328,
"learning_rate": 4.855872358475546e-08,
"logits/chosen": -1.883536696434021,
"logits/rejected": -1.8990424871444702,
"logps/chosen": -593.3975219726562,
"logps/rejected": -852.3743896484375,
"loss": 0.3421,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.749640941619873,
"rewards/margins": 2.2763512134552,
"rewards/rejected": -6.025992393493652,
"step": 415
},
{
"epoch": 0.8280961182994455,
"grad_norm": 22.43509931864549,
"learning_rate": 4.357901250086107e-08,
"logits/chosen": -1.9897289276123047,
"logits/rejected": -1.8019778728485107,
"logps/chosen": -604.5925903320312,
"logps/rejected": -907.0695190429688,
"loss": 0.34,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.7366394996643066,
"rewards/margins": 2.591937303543091,
"rewards/rejected": -6.328576564788818,
"step": 420
},
{
"epoch": 0.8379544054220579,
"grad_norm": 24.272876807226076,
"learning_rate": 3.884421956938377e-08,
"logits/chosen": -1.7035375833511353,
"logits/rejected": -1.8067095279693604,
"logps/chosen": -621.3763427734375,
"logps/rejected": -809.0113525390625,
"loss": 0.3274,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.943162441253662,
"rewards/margins": 1.8734540939331055,
"rewards/rejected": -5.816616535186768,
"step": 425
},
{
"epoch": 0.8478126925446704,
"grad_norm": 20.673588966056126,
"learning_rate": 3.435996261412591e-08,
"logits/chosen": -1.7106269598007202,
"logits/rejected": -1.7173693180084229,
"logps/chosen": -582.3190307617188,
"logps/rejected": -837.8707275390625,
"loss": 0.3204,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.7369446754455566,
"rewards/margins": 2.223895311355591,
"rewards/rejected": -5.960839748382568,
"step": 430
},
{
"epoch": 0.8576709796672828,
"grad_norm": 21.70614636700232,
"learning_rate": 3.013156219837776e-08,
"logits/chosen": -2.0358176231384277,
"logits/rejected": -1.7434278726577759,
"logps/chosen": -567.6253662109375,
"logps/rejected": -890.8966674804688,
"loss": 0.3264,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.516098737716675,
"rewards/margins": 2.7037405967712402,
"rewards/rejected": -6.219839096069336,
"step": 435
},
{
"epoch": 0.8675292667898953,
"grad_norm": 32.0982872650184,
"learning_rate": 2.6164035312078447e-08,
"logits/chosen": -1.87311589717865,
"logits/rejected": -1.8581056594848633,
"logps/chosen": -588.0389404296875,
"logps/rejected": -895.1696166992188,
"loss": 0.3188,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.7721753120422363,
"rewards/margins": 2.676305055618286,
"rewards/rejected": -6.448480129241943,
"step": 440
},
{
"epoch": 0.8773875539125077,
"grad_norm": 21.51066896519883,
"learning_rate": 2.2462089419165776e-08,
"logits/chosen": -1.8648655414581299,
"logits/rejected": -1.7761850357055664,
"logps/chosen": -582.1537475585938,
"logps/rejected": -876.07080078125,
"loss": 0.3584,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.821885347366333,
"rewards/margins": 2.4609155654907227,
"rewards/rejected": -6.282800197601318,
"step": 445
},
{
"epoch": 0.8872458410351202,
"grad_norm": 23.61458187816769,
"learning_rate": 1.9030116872178314e-08,
"logits/chosen": -1.8204158544540405,
"logits/rejected": -1.798825979232788,
"logps/chosen": -608.7778930664062,
"logps/rejected": -842.8968505859375,
"loss": 0.355,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.7794837951660156,
"rewards/margins": 2.1480660438537598,
"rewards/rejected": -5.927549839019775,
"step": 450
},
{
"epoch": 0.8971041281577325,
"grad_norm": 19.975596086165712,
"learning_rate": 1.5872189700736337e-08,
"logits/chosen": -1.7636759281158447,
"logits/rejected": -1.8992855548858643,
"logps/chosen": -585.3933715820312,
"logps/rejected": -801.01025390625,
"loss": 0.3525,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.633349657058716,
"rewards/margins": 1.9284839630126953,
"rewards/rejected": -5.561833381652832,
"step": 455
},
{
"epoch": 0.906962415280345,
"grad_norm": 24.116575473235745,
"learning_rate": 1.2992054780085692e-08,
"logits/chosen": -1.6149314641952515,
"logits/rejected": -1.6830947399139404,
"logps/chosen": -552.21728515625,
"logps/rejected": -793.0897216796875,
"loss": 0.3263,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.4062886238098145,
"rewards/margins": 2.0678482055664062,
"rewards/rejected": -5.474137306213379,
"step": 460
},
{
"epoch": 0.9168207024029574,
"grad_norm": 20.57194341940523,
"learning_rate": 1.0393129385436823e-08,
"logits/chosen": -1.9199676513671875,
"logits/rejected": -1.8623239994049072,
"logps/chosen": -570.3748779296875,
"logps/rejected": -817.0477294921875,
"loss": 0.3323,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.562505006790161,
"rewards/margins": 2.1840949058532715,
"rewards/rejected": -5.7465996742248535,
"step": 465
},
{
"epoch": 0.9266789895255699,
"grad_norm": 23.30110640610616,
"learning_rate": 8.078497137373242e-09,
"logits/chosen": -1.7810325622558594,
"logits/rejected": -1.7818634510040283,
"logps/chosen": -555.9640502929688,
"logps/rejected": -835.23876953125,
"loss": 0.3237,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.4883415699005127,
"rewards/margins": 2.179206609725952,
"rewards/rejected": -5.667548179626465,
"step": 470
},
{
"epoch": 0.9365372766481824,
"grad_norm": 21.422635902068766,
"learning_rate": 6.0509043431410945e-09,
"logits/chosen": -1.7087141275405884,
"logits/rejected": -1.772657036781311,
"logps/chosen": -568.8113403320312,
"logps/rejected": -804.4452514648438,
"loss": 0.3425,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.5404930114746094,
"rewards/margins": 2.15417742729187,
"rewards/rejected": -5.694670677185059,
"step": 475
},
{
"epoch": 0.9463955637707948,
"grad_norm": 22.24576845817703,
"learning_rate": 4.312756738160145e-09,
"logits/chosen": -1.8130733966827393,
"logits/rejected": -1.7939121723175049,
"logps/chosen": -561.7185668945312,
"logps/rejected": -826.4733276367188,
"loss": 0.3187,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.5638492107391357,
"rewards/margins": 2.2962565422058105,
"rewards/rejected": -5.860105991363525,
"step": 480
},
{
"epoch": 0.9562538508934073,
"grad_norm": 20.818504977861426,
"learning_rate": 2.8661166316229223e-09,
"logits/chosen": -1.7990143299102783,
"logits/rejected": -1.7799808979034424,
"logps/chosen": -545.7501220703125,
"logps/rejected": -777.5648193359375,
"loss": 0.338,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.4501044750213623,
"rewards/margins": 1.9422149658203125,
"rewards/rejected": -5.392319202423096,
"step": 485
},
{
"epoch": 0.9661121380160197,
"grad_norm": 21.367843020001658,
"learning_rate": 1.7127004595681727e-09,
"logits/chosen": -1.8907989263534546,
"logits/rejected": -1.803995132446289,
"logps/chosen": -572.9863891601562,
"logps/rejected": -869.6575317382812,
"loss": 0.3514,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.549314498901367,
"rewards/margins": 2.517167568206787,
"rewards/rejected": -6.066482067108154,
"step": 490
},
{
"epoch": 0.9759704251386322,
"grad_norm": 26.301841729679015,
"learning_rate": 8.538767483325383e-10,
"logits/chosen": -1.6898645162582397,
"logits/rejected": -1.872666597366333,
"logps/chosen": -564.5504760742188,
"logps/rejected": -813.0301513671875,
"loss": 0.3249,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.5329060554504395,
"rewards/margins": 2.330728054046631,
"rewards/rejected": -5.863633632659912,
"step": 495
},
{
"epoch": 0.9858287122612446,
"grad_norm": 28.33067138539654,
"learning_rate": 2.9066449079634404e-10,
"logits/chosen": -1.81964910030365,
"logits/rejected": -1.7677667140960693,
"logps/chosen": -553.2039184570312,
"logps/rejected": -806.8800048828125,
"loss": 0.3026,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.421668291091919,
"rewards/margins": 2.2354369163513184,
"rewards/rejected": -5.657104969024658,
"step": 500
},
{
"epoch": 0.9956869993838571,
"grad_norm": 23.713797105940532,
"learning_rate": 2.3731937350224273e-11,
"logits/chosen": -1.9265756607055664,
"logits/rejected": -1.8447071313858032,
"logps/chosen": -565.0730590820312,
"logps/rejected": -841.3292236328125,
"loss": 0.3122,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.6411995887756348,
"rewards/margins": 2.3857717514038086,
"rewards/rejected": -6.026970863342285,
"step": 505
},
{
"epoch": 0.999630314232902,
"step": 507,
"total_flos": 0.0,
"train_loss": 0.41502543125867375,
"train_runtime": 18234.8908,
"train_samples_per_second": 3.56,
"train_steps_per_second": 0.028
}
],
"logging_steps": 5,
"max_steps": 507,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}