diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8352 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 5811, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 8.591065292096219e-10, + "logits/chosen": -2.764016628265381, + "logits/rejected": -2.674347400665283, + "logps/chosen": -108.92428588867188, + "logps/rejected": -112.40267944335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 8.59106529209622e-09, + "logits/chosen": -2.863027811050415, + "logits/rejected": -2.838684320449829, + "logps/chosen": -327.73529052734375, + "logps/rejected": -250.00613403320312, + "loss": 0.6943, + "rewards/accuracies": 0.5277777910232544, + "rewards/chosen": -0.003003156976774335, + "rewards/margins": 0.0038251648657023907, + "rewards/rejected": -0.006828321143984795, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -2.7698731422424316, + "logits/rejected": -2.747614622116089, + "logps/chosen": -251.3875274658203, + "logps/rejected": -184.10693359375, + "loss": 0.6945, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0009023567545227706, + "rewards/margins": -0.0005918591632507741, + "rewards/rejected": -0.0003104977367911488, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 2.5773195876288656e-08, + "logits/chosen": -2.8563497066497803, + "logits/rejected": -2.8292155265808105, + "logps/chosen": -318.81866455078125, + "logps/rejected": -268.5263977050781, + "loss": 0.691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.013460609130561352, + "rewards/margins": 0.014691811986267567, + "rewards/rejected": -0.0012312005273997784, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.7859857082366943, + "logits/rejected": -2.753185749053955, + "logps/chosen": -324.4197692871094, + "logps/rejected": -235.1029510498047, + "loss": 0.6847, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.004841436631977558, + "rewards/margins": 0.015062311664223671, + "rewards/rejected": -0.010220875963568687, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.29553264604811e-08, + "logits/chosen": -2.918607473373413, + "logits/rejected": -2.89624285697937, + "logps/chosen": -264.66351318359375, + "logps/rejected": -208.9918212890625, + "loss": 0.6823, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01985430344939232, + "rewards/margins": 0.03327787667512894, + "rewards/rejected": -0.013423572294414043, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -2.7966980934143066, + "logits/rejected": -2.808544874191284, + "logps/chosen": -266.514404296875, + "logps/rejected": -252.36001586914062, + "loss": 0.6705, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018798351287841797, + "rewards/margins": 0.05238068103790283, + "rewards/rejected": -0.03358232229948044, + "step": 60 + }, + { + "epoch": 0.04, + "learning_rate": 6.013745704467354e-08, + "logits/chosen": -2.8935234546661377, + "logits/rejected": -2.854685068130493, + "logps/chosen": -312.78436279296875, + "logps/rejected": -256.33526611328125, + "loss": 0.6655, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03346724063158035, + "rewards/margins": 0.07332305610179901, + "rewards/rejected": -0.03985581547021866, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -2.8555781841278076, + "logits/rejected": -2.838266372680664, + "logps/chosen": -291.6562805175781, + "logps/rejected": -253.54049682617188, + "loss": 0.6428, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05274033546447754, + "rewards/margins": 0.10176833719015121, + "rewards/rejected": -0.04902799427509308, + "step": 80 + }, + { + "epoch": 0.05, + "learning_rate": 7.731958762886598e-08, + "logits/chosen": -2.8810267448425293, + "logits/rejected": -2.8738858699798584, + "logps/chosen": -312.50506591796875, + "logps/rejected": -256.6100158691406, + "loss": 0.6394, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.032647717744112015, + "rewards/margins": 0.1116027683019638, + "rewards/rejected": -0.07895506173372269, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.717987537384033, + "logits/rejected": -2.7490057945251465, + "logps/chosen": -275.51495361328125, + "logps/rejected": -199.8630828857422, + "loss": 0.6299, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.03528403118252754, + "rewards/margins": 0.14646485447883606, + "rewards/rejected": -0.11118084192276001, + "step": 100 + }, + { + "epoch": 0.06, + "learning_rate": 9.450171821305841e-08, + "logits/chosen": -2.7796645164489746, + "logits/rejected": -2.830543041229248, + "logps/chosen": -273.095703125, + "logps/rejected": -247.6790313720703, + "loss": 0.6182, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.0012518130242824554, + "rewards/margins": 0.21374264359474182, + "rewards/rejected": -0.21249084174633026, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.800854206085205, + "logits/rejected": -2.893012523651123, + "logps/chosen": -235.9143829345703, + "logps/rejected": -214.0904998779297, + "loss": 0.5884, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.017621120437979698, + "rewards/margins": 0.21942290663719177, + "rewards/rejected": -0.20180177688598633, + "step": 120 + }, + { + "epoch": 0.07, + "learning_rate": 1.1168384879725086e-07, + "logits/chosen": -2.9455106258392334, + "logits/rejected": -2.866570472717285, + "logps/chosen": -317.5361022949219, + "logps/rejected": -241.4765167236328, + "loss": 0.5885, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.15108627080917358, + "rewards/margins": 0.3808462917804718, + "rewards/rejected": -0.22976000607013702, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -2.828317403793335, + "logits/rejected": -2.8364932537078857, + "logps/chosen": -298.90313720703125, + "logps/rejected": -277.0294189453125, + "loss": 0.5635, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.043794650584459305, + "rewards/margins": 0.49751076102256775, + "rewards/rejected": -0.45371612906455994, + "step": 140 + }, + { + "epoch": 0.08, + "learning_rate": 1.2886597938144328e-07, + "logits/chosen": -2.898667335510254, + "logits/rejected": -2.882783889770508, + "logps/chosen": -291.00213623046875, + "logps/rejected": -247.44692993164062, + "loss": 0.5565, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.1469222903251648, + "rewards/margins": 0.6005532145500183, + "rewards/rejected": -0.4536309242248535, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.813276767730713, + "logits/rejected": -2.8065598011016846, + "logps/chosen": -301.55999755859375, + "logps/rejected": -246.9912567138672, + "loss": 0.538, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.14675188064575195, + "rewards/margins": 0.5681458115577698, + "rewards/rejected": -0.4213939309120178, + "step": 160 + }, + { + "epoch": 0.09, + "learning_rate": 1.4604810996563573e-07, + "logits/chosen": -2.8698763847351074, + "logits/rejected": -2.8171753883361816, + "logps/chosen": -285.3466796875, + "logps/rejected": -235.87338256835938, + "loss": 0.4963, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.16148421168327332, + "rewards/margins": 0.8422476053237915, + "rewards/rejected": -0.6807633638381958, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -2.8068044185638428, + "logits/rejected": -2.8266568183898926, + "logps/chosen": -237.0592803955078, + "logps/rejected": -211.4211883544922, + "loss": 0.5179, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04243529960513115, + "rewards/margins": 0.6374794840812683, + "rewards/rejected": -0.5950442552566528, + "step": 180 + }, + { + "epoch": 0.1, + "learning_rate": 1.6323024054982818e-07, + "logits/chosen": -2.780351161956787, + "logits/rejected": -2.834380626678467, + "logps/chosen": -267.52972412109375, + "logps/rejected": -268.3015441894531, + "loss": 0.5784, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0268848929554224, + "rewards/margins": 0.57568359375, + "rewards/rejected": -0.5487987399101257, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.724424123764038, + "logits/rejected": -2.759570598602295, + "logps/chosen": -283.513427734375, + "logps/rejected": -192.94766235351562, + "loss": 0.4916, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.18617114424705505, + "rewards/margins": 0.8166147470474243, + "rewards/rejected": -0.6304435133934021, + "step": 200 + }, + { + "epoch": 0.11, + "learning_rate": 1.804123711340206e-07, + "logits/chosen": -2.766322612762451, + "logits/rejected": -2.6512322425842285, + "logps/chosen": -264.46697998046875, + "logps/rejected": -257.443603515625, + "loss": 0.5674, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12685665488243103, + "rewards/margins": 0.5820873975753784, + "rewards/rejected": -0.7089440822601318, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.8477821350097656, + "logits/rejected": -2.830031394958496, + "logps/chosen": -287.61212158203125, + "logps/rejected": -240.40194702148438, + "loss": 0.5498, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.06794139742851257, + "rewards/margins": 0.7129371166229248, + "rewards/rejected": -0.780878484249115, + "step": 220 + }, + { + "epoch": 0.12, + "learning_rate": 1.9759450171821303e-07, + "logits/chosen": -2.856313943862915, + "logits/rejected": -2.8552908897399902, + "logps/chosen": -293.28033447265625, + "logps/rejected": -254.1317138671875, + "loss": 0.5135, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.050720613449811935, + "rewards/margins": 0.640368640422821, + "rewards/rejected": -0.6910892724990845, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -2.8190102577209473, + "logits/rejected": -2.8251214027404785, + "logps/chosen": -336.1227111816406, + "logps/rejected": -227.45346069335938, + "loss": 0.5406, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09252417087554932, + "rewards/margins": 0.6168856024742126, + "rewards/rejected": -0.709409773349762, + "step": 240 + }, + { + "epoch": 0.13, + "learning_rate": 2.1477663230240549e-07, + "logits/chosen": -2.9281716346740723, + "logits/rejected": -2.908295154571533, + "logps/chosen": -273.67584228515625, + "logps/rejected": -254.62588500976562, + "loss": 0.5117, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.07370147854089737, + "rewards/margins": 0.9147092700004578, + "rewards/rejected": -0.8410077095031738, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -2.911724805831909, + "logits/rejected": -2.909519672393799, + "logps/chosen": -300.3340759277344, + "logps/rejected": -246.3933868408203, + "loss": 0.5363, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.15076453983783722, + "rewards/margins": 0.8897884488105774, + "rewards/rejected": -0.7390238642692566, + "step": 260 + }, + { + "epoch": 0.14, + "learning_rate": 2.3195876288659794e-07, + "logits/chosen": -2.8644649982452393, + "logits/rejected": -2.8738067150115967, + "logps/chosen": -278.7558898925781, + "logps/rejected": -239.0220489501953, + "loss": 0.4945, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01367796678096056, + "rewards/margins": 0.7261922955513, + "rewards/rejected": -0.7125142812728882, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.877744674682617, + "logits/rejected": -2.819153070449829, + "logps/chosen": -313.3977966308594, + "logps/rejected": -251.2105255126953, + "loss": 0.5324, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08043201267719269, + "rewards/margins": 0.8994860649108887, + "rewards/rejected": -0.8190540075302124, + "step": 280 + }, + { + "epoch": 0.15, + "learning_rate": 2.4914089347079036e-07, + "logits/chosen": -2.7928099632263184, + "logits/rejected": -2.8494484424591064, + "logps/chosen": -302.0984802246094, + "logps/rejected": -284.1147155761719, + "loss": 0.5093, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3942905068397522, + "rewards/margins": 1.0846911668777466, + "rewards/rejected": -0.6904006004333496, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -2.839244842529297, + "logits/rejected": -2.8479905128479004, + "logps/chosen": -262.75225830078125, + "logps/rejected": -247.9908447265625, + "loss": 0.5291, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.08927704393863678, + "rewards/margins": 0.8800823092460632, + "rewards/rejected": -0.7908053398132324, + "step": 300 + }, + { + "epoch": 0.16, + "learning_rate": 2.663230240549828e-07, + "logits/chosen": -2.8887572288513184, + "logits/rejected": -2.907589912414551, + "logps/chosen": -272.9616394042969, + "logps/rejected": -226.31192016601562, + "loss": 0.5171, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.0031876713037490845, + "rewards/margins": 0.7666963338851929, + "rewards/rejected": -0.7635086178779602, + "step": 310 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.8778905868530273, + "logits/rejected": -2.9233052730560303, + "logps/chosen": -297.2710876464844, + "logps/rejected": -233.50241088867188, + "loss": 0.4689, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.038285039365291595, + "rewards/margins": 1.1589080095291138, + "rewards/rejected": -1.120622992515564, + "step": 320 + }, + { + "epoch": 0.17, + "learning_rate": 2.835051546391752e-07, + "logits/chosen": -2.852926254272461, + "logits/rejected": -2.9000022411346436, + "logps/chosen": -303.80938720703125, + "logps/rejected": -248.91494750976562, + "loss": 0.4865, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.016001278534531593, + "rewards/margins": 1.0812021493911743, + "rewards/rejected": -1.0972034931182861, + "step": 330 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.8354809284210205, + "logits/rejected": -2.8505842685699463, + "logps/chosen": -289.55194091796875, + "logps/rejected": -248.23849487304688, + "loss": 0.5226, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.08476074039936066, + "rewards/margins": 0.8404847979545593, + "rewards/rejected": -0.7557238936424255, + "step": 340 + }, + { + "epoch": 0.18, + "learning_rate": 3.006872852233677e-07, + "logits/chosen": -2.8878281116485596, + "logits/rejected": -2.8713698387145996, + "logps/chosen": -234.1008758544922, + "logps/rejected": -228.55288696289062, + "loss": 0.4795, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06374388188123703, + "rewards/margins": 1.0862773656845093, + "rewards/rejected": -1.1500213146209717, + "step": 350 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -2.8875787258148193, + "logits/rejected": -2.853342056274414, + "logps/chosen": -266.7445373535156, + "logps/rejected": -220.08349609375, + "loss": 0.4635, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.18792326748371124, + "rewards/margins": 1.1800565719604492, + "rewards/rejected": -0.9921333193778992, + "step": 360 + }, + { + "epoch": 0.19, + "learning_rate": 3.178694158075601e-07, + "logits/chosen": -2.8684146404266357, + "logits/rejected": -2.859692096710205, + "logps/chosen": -254.9929656982422, + "logps/rejected": -205.95156860351562, + "loss": 0.4846, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.16485366225242615, + "rewards/margins": 1.232887864112854, + "rewards/rejected": -1.068034052848816, + "step": 370 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -2.8387057781219482, + "logits/rejected": -2.7803750038146973, + "logps/chosen": -242.21554565429688, + "logps/rejected": -221.7504119873047, + "loss": 0.566, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.01200074888765812, + "rewards/margins": 0.9040949940681458, + "rewards/rejected": -0.9160957336425781, + "step": 380 + }, + { + "epoch": 0.2, + "learning_rate": 3.3505154639175255e-07, + "logits/chosen": -2.893998146057129, + "logits/rejected": -2.8745360374450684, + "logps/chosen": -257.35675048828125, + "logps/rejected": -215.04263305664062, + "loss": 0.49, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.014016789384186268, + "rewards/margins": 1.0278851985931396, + "rewards/rejected": -1.0138683319091797, + "step": 390 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -2.944995880126953, + "logits/rejected": -2.921973705291748, + "logps/chosen": -255.16989135742188, + "logps/rejected": -192.85818481445312, + "loss": 0.5511, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16071584820747375, + "rewards/margins": 0.8611815571784973, + "rewards/rejected": -1.0218971967697144, + "step": 400 + }, + { + "epoch": 0.21, + "learning_rate": 3.5223367697594503e-07, + "logits/chosen": -2.824647903442383, + "logits/rejected": -2.78361439704895, + "logps/chosen": -311.44757080078125, + "logps/rejected": -225.4349822998047, + "loss": 0.4273, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0005831479793414474, + "rewards/margins": 1.1429133415222168, + "rewards/rejected": -1.1434962749481201, + "step": 410 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.7857377529144287, + "logits/rejected": -2.771965980529785, + "logps/chosen": -284.3433837890625, + "logps/rejected": -241.23867797851562, + "loss": 0.5274, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.19046534597873688, + "rewards/margins": 1.2829582691192627, + "rewards/rejected": -1.4734236001968384, + "step": 420 + }, + { + "epoch": 0.22, + "learning_rate": 3.6941580756013745e-07, + "logits/chosen": -2.8471944332122803, + "logits/rejected": -2.8916735649108887, + "logps/chosen": -243.7864532470703, + "logps/rejected": -219.95291137695312, + "loss": 0.5155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4037954807281494, + "rewards/margins": 1.2392082214355469, + "rewards/rejected": -1.6430038213729858, + "step": 430 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.827336072921753, + "logits/rejected": -2.7996811866760254, + "logps/chosen": -297.21405029296875, + "logps/rejected": -289.5452880859375, + "loss": 0.5969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2905265688896179, + "rewards/margins": 0.8604179620742798, + "rewards/rejected": -1.150944471359253, + "step": 440 + }, + { + "epoch": 0.23, + "learning_rate": 3.865979381443299e-07, + "logits/chosen": -2.8593180179595947, + "logits/rejected": -2.851736545562744, + "logps/chosen": -268.6583557128906, + "logps/rejected": -254.41995239257812, + "loss": 0.5034, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.054434485733509064, + "rewards/margins": 0.8534797430038452, + "rewards/rejected": -0.9079142808914185, + "step": 450 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -2.836496114730835, + "logits/rejected": -2.8268418312072754, + "logps/chosen": -273.56561279296875, + "logps/rejected": -258.1180419921875, + "loss": 0.594, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.26750993728637695, + "rewards/margins": 1.0689507722854614, + "rewards/rejected": -1.336460828781128, + "step": 460 + }, + { + "epoch": 0.24, + "learning_rate": 4.037800687285223e-07, + "logits/chosen": -2.9143097400665283, + "logits/rejected": -2.8867809772491455, + "logps/chosen": -320.6771240234375, + "logps/rejected": -207.6370086669922, + "loss": 0.4564, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2321922332048416, + "rewards/margins": 1.1562436819076538, + "rewards/rejected": -1.388435959815979, + "step": 470 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -2.939049243927002, + "logits/rejected": -2.9370107650756836, + "logps/chosen": -288.6817932128906, + "logps/rejected": -250.44735717773438, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09014900773763657, + "rewards/margins": 1.0462675094604492, + "rewards/rejected": -1.1364164352416992, + "step": 480 + }, + { + "epoch": 0.25, + "learning_rate": 4.209621993127148e-07, + "logits/chosen": -2.859022855758667, + "logits/rejected": -2.8309292793273926, + "logps/chosen": -271.74542236328125, + "logps/rejected": -241.3363494873047, + "loss": 0.4504, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.31187617778778076, + "rewards/margins": 1.1031101942062378, + "rewards/rejected": -1.4149863719940186, + "step": 490 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -2.998340129852295, + "logits/rejected": -3.001023292541504, + "logps/chosen": -274.80035400390625, + "logps/rejected": -258.14520263671875, + "loss": 0.533, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.30341288447380066, + "rewards/margins": 0.8886991739273071, + "rewards/rejected": -1.1921122074127197, + "step": 500 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.948715925216675, + "eval_logits/rejected": -2.9319217205047607, + "eval_logps/chosen": -277.62506103515625, + "eval_logps/rejected": -246.04127502441406, + "eval_loss": 0.508408784866333, + "eval_rewards/accuracies": 0.777999997138977, + "eval_rewards/chosen": -0.19016194343566895, + "eval_rewards/margins": 1.1778383255004883, + "eval_rewards/rejected": -1.3680005073547363, + "eval_runtime": 452.2133, + "eval_samples_per_second": 4.423, + "eval_steps_per_second": 0.276, + "step": 500 + }, + { + "epoch": 0.26, + "learning_rate": 4.381443298969072e-07, + "logits/chosen": -2.897766351699829, + "logits/rejected": -2.8928945064544678, + "logps/chosen": -295.4362487792969, + "logps/rejected": -250.251953125, + "loss": 0.5221, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.33991071581840515, + "rewards/margins": 0.7788680195808411, + "rewards/rejected": -1.1187787055969238, + "step": 510 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -2.905595064163208, + "logits/rejected": -2.915565252304077, + "logps/chosen": -252.61865234375, + "logps/rejected": -219.57925415039062, + "loss": 0.5462, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.25931137800216675, + "rewards/margins": 1.3003042936325073, + "rewards/rejected": -1.5596157312393188, + "step": 520 + }, + { + "epoch": 0.27, + "learning_rate": 4.5532646048109964e-07, + "logits/chosen": -2.8665707111358643, + "logits/rejected": -2.8539767265319824, + "logps/chosen": -280.4945373535156, + "logps/rejected": -233.099365234375, + "loss": 0.5134, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2802669405937195, + "rewards/margins": 0.9665705561637878, + "rewards/rejected": -1.2468374967575073, + "step": 530 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -2.88356614112854, + "logits/rejected": -2.887423276901245, + "logps/chosen": -286.5166931152344, + "logps/rejected": -255.9084930419922, + "loss": 0.601, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5912821888923645, + "rewards/margins": 0.7030351758003235, + "rewards/rejected": -1.2943174839019775, + "step": 540 + }, + { + "epoch": 0.28, + "learning_rate": 4.7250859106529206e-07, + "logits/chosen": -2.884530544281006, + "logits/rejected": -2.823495626449585, + "logps/chosen": -270.32037353515625, + "logps/rejected": -246.2440185546875, + "loss": 0.5912, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.40074628591537476, + "rewards/margins": 1.3706839084625244, + "rewards/rejected": -1.771430253982544, + "step": 550 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -2.8146512508392334, + "logits/rejected": -2.834014415740967, + "logps/chosen": -312.0709533691406, + "logps/rejected": -268.26416015625, + "loss": 0.5438, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.3308074474334717, + "rewards/margins": 1.189579963684082, + "rewards/rejected": -1.5203872919082642, + "step": 560 + }, + { + "epoch": 0.29, + "learning_rate": 4.896907216494845e-07, + "logits/chosen": -2.852008819580078, + "logits/rejected": -2.841087818145752, + "logps/chosen": -278.21600341796875, + "logps/rejected": -263.9369201660156, + "loss": 0.5615, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5327223539352417, + "rewards/margins": 0.9639410972595215, + "rewards/rejected": -1.4966634511947632, + "step": 570 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -2.827453136444092, + "logits/rejected": -2.918525457382202, + "logps/chosen": -274.576904296875, + "logps/rejected": -215.708251953125, + "loss": 0.5615, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.22379016876220703, + "rewards/margins": 1.179864525794983, + "rewards/rejected": -1.40365469455719, + "step": 580 + }, + { + "epoch": 0.3, + "learning_rate": 4.992350353796136e-07, + "logits/chosen": -2.834676504135132, + "logits/rejected": -2.772709369659424, + "logps/chosen": -248.90567016601562, + "logps/rejected": -249.8413543701172, + "loss": 0.4855, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.11263740062713623, + "rewards/margins": 1.379455327987671, + "rewards/rejected": -1.4920928478240967, + "step": 590 + }, + { + "epoch": 0.31, + "learning_rate": 4.982788296041308e-07, + "logits/chosen": -2.8848588466644287, + "logits/rejected": -2.909480094909668, + "logps/chosen": -250.5892791748047, + "logps/rejected": -227.27001953125, + "loss": 0.6065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.33602461218833923, + "rewards/margins": 1.2359898090362549, + "rewards/rejected": -1.572014570236206, + "step": 600 + }, + { + "epoch": 0.31, + "learning_rate": 4.973226238286479e-07, + "logits/chosen": -2.855553150177002, + "logits/rejected": -2.922438144683838, + "logps/chosen": -331.48419189453125, + "logps/rejected": -273.5794982910156, + "loss": 0.5389, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.20098695158958435, + "rewards/margins": 1.2247838973999023, + "rewards/rejected": -1.4257709980010986, + "step": 610 + }, + { + "epoch": 0.32, + "learning_rate": 4.96366418053165e-07, + "logits/chosen": -2.958442211151123, + "logits/rejected": -2.9399642944335938, + "logps/chosen": -284.391845703125, + "logps/rejected": -267.7436218261719, + "loss": 0.5924, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.27081987261772156, + "rewards/margins": 0.6422858238220215, + "rewards/rejected": -0.9131056666374207, + "step": 620 + }, + { + "epoch": 0.33, + "learning_rate": 4.954102122776821e-07, + "logits/chosen": -2.942708969116211, + "logits/rejected": -2.9726662635803223, + "logps/chosen": -255.775634765625, + "logps/rejected": -202.54544067382812, + "loss": 0.5454, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2214779406785965, + "rewards/margins": 1.1896545886993408, + "rewards/rejected": -1.411132574081421, + "step": 630 + }, + { + "epoch": 0.33, + "learning_rate": 4.944540065021993e-07, + "logits/chosen": -2.7782981395721436, + "logits/rejected": -2.740140199661255, + "logps/chosen": -243.0159149169922, + "logps/rejected": -211.75009155273438, + "loss": 0.5355, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5337392091751099, + "rewards/margins": 1.1862469911575317, + "rewards/rejected": -1.7199862003326416, + "step": 640 + }, + { + "epoch": 0.34, + "learning_rate": 4.934978007267163e-07, + "logits/chosen": -2.857133626937866, + "logits/rejected": -2.9078116416931152, + "logps/chosen": -285.61309814453125, + "logps/rejected": -253.51596069335938, + "loss": 0.5293, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.08902885764837265, + "rewards/margins": 1.221697449684143, + "rewards/rejected": -1.3107261657714844, + "step": 650 + }, + { + "epoch": 0.34, + "learning_rate": 4.925415949512335e-07, + "logits/chosen": -2.792689800262451, + "logits/rejected": -2.7753195762634277, + "logps/chosen": -336.81976318359375, + "logps/rejected": -265.2138366699219, + "loss": 0.5447, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.26736006140708923, + "rewards/margins": 1.6899057626724243, + "rewards/rejected": -1.957265853881836, + "step": 660 + }, + { + "epoch": 0.35, + "learning_rate": 4.915853891757506e-07, + "logits/chosen": -2.6831443309783936, + "logits/rejected": -2.6355414390563965, + "logps/chosen": -199.3243865966797, + "logps/rejected": -248.56185913085938, + "loss": 0.6114, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.28927290439605713, + "rewards/margins": 0.5596956014633179, + "rewards/rejected": -0.848968505859375, + "step": 670 + }, + { + "epoch": 0.35, + "learning_rate": 4.906291834002677e-07, + "logits/chosen": -2.7662174701690674, + "logits/rejected": -2.7773642539978027, + "logps/chosen": -287.2375183105469, + "logps/rejected": -260.99151611328125, + "loss": 0.4939, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.39218276739120483, + "rewards/margins": 1.1262353658676147, + "rewards/rejected": -1.5184181928634644, + "step": 680 + }, + { + "epoch": 0.36, + "learning_rate": 4.896729776247848e-07, + "logits/chosen": -2.8319334983825684, + "logits/rejected": -2.820892333984375, + "logps/chosen": -291.27972412109375, + "logps/rejected": -251.38760375976562, + "loss": 0.4687, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18871352076530457, + "rewards/margins": 1.4281032085418701, + "rewards/rejected": -1.616816759109497, + "step": 690 + }, + { + "epoch": 0.36, + "learning_rate": 4.88716771849302e-07, + "logits/chosen": -2.8433375358581543, + "logits/rejected": -2.8692548274993896, + "logps/chosen": -330.5716552734375, + "logps/rejected": -274.46893310546875, + "loss": 0.5239, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.07384981215000153, + "rewards/margins": 1.4511958360671997, + "rewards/rejected": -1.5250459909439087, + "step": 700 + }, + { + "epoch": 0.37, + "learning_rate": 4.87760566073819e-07, + "logits/chosen": -2.7395122051239014, + "logits/rejected": -2.781907558441162, + "logps/chosen": -310.0351257324219, + "logps/rejected": -240.80520629882812, + "loss": 0.5632, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20480065047740936, + "rewards/margins": 1.400748610496521, + "rewards/rejected": -1.6055494546890259, + "step": 710 + }, + { + "epoch": 0.37, + "learning_rate": 4.868043602983362e-07, + "logits/chosen": -2.8281850814819336, + "logits/rejected": -2.814466953277588, + "logps/chosen": -301.6328125, + "logps/rejected": -299.40936279296875, + "loss": 0.5002, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5126577615737915, + "rewards/margins": 1.7754802703857422, + "rewards/rejected": -2.288137912750244, + "step": 720 + }, + { + "epoch": 0.38, + "learning_rate": 4.858481545228533e-07, + "logits/chosen": -2.8933727741241455, + "logits/rejected": -2.8719959259033203, + "logps/chosen": -322.64056396484375, + "logps/rejected": -285.211181640625, + "loss": 0.5076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3348848521709442, + "rewards/margins": 1.523197889328003, + "rewards/rejected": -1.8580827713012695, + "step": 730 + }, + { + "epoch": 0.38, + "learning_rate": 4.848919487473704e-07, + "logits/chosen": -2.811448574066162, + "logits/rejected": -2.7747862339019775, + "logps/chosen": -302.3232727050781, + "logps/rejected": -296.32391357421875, + "loss": 0.6469, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5591434240341187, + "rewards/margins": 1.2536993026733398, + "rewards/rejected": -1.8128427267074585, + "step": 740 + }, + { + "epoch": 0.39, + "learning_rate": 4.839357429718875e-07, + "logits/chosen": -2.8409669399261475, + "logits/rejected": -2.7742369174957275, + "logps/chosen": -273.21160888671875, + "logps/rejected": -243.36630249023438, + "loss": 0.5355, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4651729464530945, + "rewards/margins": 1.2234070301055908, + "rewards/rejected": -1.6885799169540405, + "step": 750 + }, + { + "epoch": 0.39, + "learning_rate": 4.829795371964047e-07, + "logits/chosen": -2.8159842491149902, + "logits/rejected": -2.8242149353027344, + "logps/chosen": -305.82257080078125, + "logps/rejected": -267.11590576171875, + "loss": 0.5369, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4173976480960846, + "rewards/margins": 1.344733476638794, + "rewards/rejected": -1.7621314525604248, + "step": 760 + }, + { + "epoch": 0.4, + "learning_rate": 4.820233314209217e-07, + "logits/chosen": -2.718552589416504, + "logits/rejected": -2.695807456970215, + "logps/chosen": -265.13531494140625, + "logps/rejected": -233.3996124267578, + "loss": 0.5837, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5660188794136047, + "rewards/margins": 1.1836113929748535, + "rewards/rejected": -1.7496302127838135, + "step": 770 + }, + { + "epoch": 0.4, + "learning_rate": 4.810671256454389e-07, + "logits/chosen": -2.7111926078796387, + "logits/rejected": -2.733715772628784, + "logps/chosen": -304.9040832519531, + "logps/rejected": -286.5145568847656, + "loss": 0.5114, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6240701675415039, + "rewards/margins": 1.450642466545105, + "rewards/rejected": -2.0747127532958984, + "step": 780 + }, + { + "epoch": 0.41, + "learning_rate": 4.80110919869956e-07, + "logits/chosen": -2.7404356002807617, + "logits/rejected": -2.7429983615875244, + "logps/chosen": -301.89862060546875, + "logps/rejected": -229.8195343017578, + "loss": 0.5147, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.38781577348709106, + "rewards/margins": 1.3693140745162964, + "rewards/rejected": -1.7571296691894531, + "step": 790 + }, + { + "epoch": 0.41, + "learning_rate": 4.791547140944731e-07, + "logits/chosen": -2.643702507019043, + "logits/rejected": -2.6573736667633057, + "logps/chosen": -227.3995819091797, + "logps/rejected": -227.03079223632812, + "loss": 0.5413, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.20422391593456268, + "rewards/margins": 1.3888452053070068, + "rewards/rejected": -1.593069314956665, + "step": 800 + }, + { + "epoch": 0.42, + "learning_rate": 4.781985083189902e-07, + "logits/chosen": -2.707505702972412, + "logits/rejected": -2.6472725868225098, + "logps/chosen": -254.8813018798828, + "logps/rejected": -265.68927001953125, + "loss": 0.6213, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.28768256306648254, + "rewards/margins": 1.237850546836853, + "rewards/rejected": -1.5255329608917236, + "step": 810 + }, + { + "epoch": 0.42, + "learning_rate": 4.772423025435074e-07, + "logits/chosen": -2.695797920227051, + "logits/rejected": -2.73037052154541, + "logps/chosen": -283.4263916015625, + "logps/rejected": -274.9232482910156, + "loss": 0.6503, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.31081074476242065, + "rewards/margins": 0.8685556650161743, + "rewards/rejected": -1.1793664693832397, + "step": 820 + }, + { + "epoch": 0.43, + "learning_rate": 4.762860967680244e-07, + "logits/chosen": -2.790220022201538, + "logits/rejected": -2.7574965953826904, + "logps/chosen": -240.9722137451172, + "logps/rejected": -196.33604431152344, + "loss": 0.5816, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1742028295993805, + "rewards/margins": 1.068560004234314, + "rewards/rejected": -1.2427630424499512, + "step": 830 + }, + { + "epoch": 0.43, + "learning_rate": 4.7532989099254154e-07, + "logits/chosen": -2.7602648735046387, + "logits/rejected": -2.7110064029693604, + "logps/chosen": -262.978271484375, + "logps/rejected": -234.6663055419922, + "loss": 0.5501, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.31769394874572754, + "rewards/margins": 1.0034860372543335, + "rewards/rejected": -1.321179986000061, + "step": 840 + }, + { + "epoch": 0.44, + "learning_rate": 4.7437368521705866e-07, + "logits/chosen": -2.8047854900360107, + "logits/rejected": -2.840674638748169, + "logps/chosen": -252.6496124267578, + "logps/rejected": -260.48602294921875, + "loss": 0.6849, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21668288111686707, + "rewards/margins": 1.5027235746383667, + "rewards/rejected": -1.7194064855575562, + "step": 850 + }, + { + "epoch": 0.44, + "learning_rate": 4.7341747944157577e-07, + "logits/chosen": -2.8253366947174072, + "logits/rejected": -2.776054859161377, + "logps/chosen": -274.93511962890625, + "logps/rejected": -252.29086303710938, + "loss": 0.5782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4284784197807312, + "rewards/margins": 0.9962869882583618, + "rewards/rejected": -1.4247655868530273, + "step": 860 + }, + { + "epoch": 0.45, + "learning_rate": 4.724612736660929e-07, + "logits/chosen": -2.7298855781555176, + "logits/rejected": -2.7826218605041504, + "logps/chosen": -292.0563049316406, + "logps/rejected": -245.8948211669922, + "loss": 0.7585, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1329361200332642, + "rewards/margins": 0.6591729521751404, + "rewards/rejected": -1.7921088933944702, + "step": 870 + }, + { + "epoch": 0.45, + "learning_rate": 4.7150506789061006e-07, + "logits/chosen": -2.800131320953369, + "logits/rejected": -2.76381254196167, + "logps/chosen": -292.432373046875, + "logps/rejected": -294.9343566894531, + "loss": 0.5039, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4364054203033447, + "rewards/margins": 1.5577386617660522, + "rewards/rejected": -1.9941442012786865, + "step": 880 + }, + { + "epoch": 0.46, + "learning_rate": 4.7054886211512717e-07, + "logits/chosen": -2.769394874572754, + "logits/rejected": -2.8211991786956787, + "logps/chosen": -284.194091796875, + "logps/rejected": -250.88818359375, + "loss": 0.5477, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2785911560058594, + "rewards/margins": 1.1287821531295776, + "rewards/rejected": -1.407373309135437, + "step": 890 + }, + { + "epoch": 0.46, + "learning_rate": 4.695926563396443e-07, + "logits/chosen": -2.868389844894409, + "logits/rejected": -2.934311628341675, + "logps/chosen": -275.0090026855469, + "logps/rejected": -234.48464965820312, + "loss": 0.5243, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4162035584449768, + "rewards/margins": 0.9356048703193665, + "rewards/rejected": -1.3518084287643433, + "step": 900 + }, + { + "epoch": 0.47, + "learning_rate": 4.686364505641614e-07, + "logits/chosen": -2.837700128555298, + "logits/rejected": -2.8741488456726074, + "logps/chosen": -275.07470703125, + "logps/rejected": -236.64950561523438, + "loss": 0.48, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2955784201622009, + "rewards/margins": 1.4538462162017822, + "rewards/rejected": -1.7494245767593384, + "step": 910 + }, + { + "epoch": 0.47, + "learning_rate": 4.676802447886785e-07, + "logits/chosen": -2.7928593158721924, + "logits/rejected": -2.8100712299346924, + "logps/chosen": -269.1537170410156, + "logps/rejected": -243.173828125, + "loss": 0.6521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4626309871673584, + "rewards/margins": 1.3189350366592407, + "rewards/rejected": -1.7815659046173096, + "step": 920 + }, + { + "epoch": 0.48, + "learning_rate": 4.6672403901319564e-07, + "logits/chosen": -2.819791793823242, + "logits/rejected": -2.783853054046631, + "logps/chosen": -258.08160400390625, + "logps/rejected": -241.07174682617188, + "loss": 0.4852, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03211576119065285, + "rewards/margins": 1.6092383861541748, + "rewards/rejected": -1.641353964805603, + "step": 930 + }, + { + "epoch": 0.49, + "learning_rate": 4.6576783323771275e-07, + "logits/chosen": -2.71333384513855, + "logits/rejected": -2.7661778926849365, + "logps/chosen": -232.42251586914062, + "logps/rejected": -220.47915649414062, + "loss": 0.5073, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.39604687690734863, + "rewards/margins": 1.278062105178833, + "rewards/rejected": -1.6741091012954712, + "step": 940 + }, + { + "epoch": 0.49, + "learning_rate": 4.6481162746222987e-07, + "logits/chosen": -2.7065939903259277, + "logits/rejected": -2.6961522102355957, + "logps/chosen": -305.9462890625, + "logps/rejected": -261.17340087890625, + "loss": 0.7714, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5392950773239136, + "rewards/margins": 0.5164933204650879, + "rewards/rejected": -2.055788278579712, + "step": 950 + }, + { + "epoch": 0.5, + "learning_rate": 4.63855421686747e-07, + "logits/chosen": -2.7435028553009033, + "logits/rejected": -2.7790913581848145, + "logps/chosen": -296.8639221191406, + "logps/rejected": -247.61825561523438, + "loss": 0.5541, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.42261379957199097, + "rewards/margins": 1.4009877443313599, + "rewards/rejected": -1.823601484298706, + "step": 960 + }, + { + "epoch": 0.5, + "learning_rate": 4.628992159112641e-07, + "logits/chosen": -2.782947063446045, + "logits/rejected": -2.694589138031006, + "logps/chosen": -264.4646301269531, + "logps/rejected": -268.305908203125, + "loss": 0.5559, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.18068067729473114, + "rewards/margins": 1.476174235343933, + "rewards/rejected": -1.6568549871444702, + "step": 970 + }, + { + "epoch": 0.51, + "learning_rate": 4.6194301013578116e-07, + "logits/chosen": -2.82590913772583, + "logits/rejected": -2.785083293914795, + "logps/chosen": -326.63250732421875, + "logps/rejected": -253.30032348632812, + "loss": 0.5292, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.36352962255477905, + "rewards/margins": 1.4459807872772217, + "rewards/rejected": -1.8095104694366455, + "step": 980 + }, + { + "epoch": 0.51, + "learning_rate": 4.609868043602983e-07, + "logits/chosen": -2.802238941192627, + "logits/rejected": -2.833404302597046, + "logps/chosen": -255.2863006591797, + "logps/rejected": -233.3296661376953, + "loss": 0.4816, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32216745615005493, + "rewards/margins": 1.170350432395935, + "rewards/rejected": -1.4925178289413452, + "step": 990 + }, + { + "epoch": 0.52, + "learning_rate": 4.600305985848154e-07, + "logits/chosen": -2.7925350666046143, + "logits/rejected": -2.776784658432007, + "logps/chosen": -248.7510986328125, + "logps/rejected": -246.3361053466797, + "loss": 0.4907, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3822530508041382, + "rewards/margins": 0.8374401926994324, + "rewards/rejected": -1.2196933031082153, + "step": 1000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.844231605529785, + "eval_logits/rejected": -2.8401119709014893, + "eval_logps/chosen": -279.0693054199219, + "eval_logps/rejected": -250.51393127441406, + "eval_loss": 0.5233809947967529, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -0.3345881402492523, + "eval_rewards/margins": 1.4806790351867676, + "eval_rewards/rejected": -1.8152673244476318, + "eval_runtime": 452.4269, + "eval_samples_per_second": 4.421, + "eval_steps_per_second": 0.276, + "step": 1000 + }, + { + "epoch": 0.52, + "learning_rate": 4.590743928093325e-07, + "logits/chosen": -2.675524950027466, + "logits/rejected": -2.6706223487854004, + "logps/chosen": -316.2507629394531, + "logps/rejected": -263.38165283203125, + "loss": 0.5685, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6011537313461304, + "rewards/margins": 1.0471036434173584, + "rewards/rejected": -1.6482574939727783, + "step": 1010 + }, + { + "epoch": 0.53, + "learning_rate": 4.581181870338497e-07, + "logits/chosen": -2.683260440826416, + "logits/rejected": -2.792513370513916, + "logps/chosen": -333.0619201660156, + "logps/rejected": -292.0895080566406, + "loss": 0.5133, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4557833671569824, + "rewards/margins": 1.5725176334381104, + "rewards/rejected": -2.0283007621765137, + "step": 1020 + }, + { + "epoch": 0.53, + "learning_rate": 4.571619812583668e-07, + "logits/chosen": -2.7991647720336914, + "logits/rejected": -2.7419731616973877, + "logps/chosen": -270.207763671875, + "logps/rejected": -275.93701171875, + "loss": 0.4559, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.46183767914772034, + "rewards/margins": 1.4133819341659546, + "rewards/rejected": -1.8752195835113525, + "step": 1030 + }, + { + "epoch": 0.54, + "learning_rate": 4.562057754828839e-07, + "logits/chosen": -2.690382480621338, + "logits/rejected": -2.8083395957946777, + "logps/chosen": -287.6054992675781, + "logps/rejected": -252.50442504882812, + "loss": 0.5438, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5533614158630371, + "rewards/margins": 1.6856781244277954, + "rewards/rejected": -2.239039897918701, + "step": 1040 + }, + { + "epoch": 0.54, + "learning_rate": 4.55249569707401e-07, + "logits/chosen": -2.7608683109283447, + "logits/rejected": -2.758338212966919, + "logps/chosen": -236.76528930664062, + "logps/rejected": -256.64666748046875, + "loss": 0.4936, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7707513570785522, + "rewards/margins": 1.3510288000106812, + "rewards/rejected": -2.1217801570892334, + "step": 1050 + }, + { + "epoch": 0.55, + "learning_rate": 4.5429336393191814e-07, + "logits/chosen": -2.6740434169769287, + "logits/rejected": -2.7331643104553223, + "logps/chosen": -257.99456787109375, + "logps/rejected": -221.4962158203125, + "loss": 0.6079, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6206737756729126, + "rewards/margins": 0.7699206471443176, + "rewards/rejected": -1.3905946016311646, + "step": 1060 + }, + { + "epoch": 0.55, + "learning_rate": 4.5333715815643525e-07, + "logits/chosen": -2.7877304553985596, + "logits/rejected": -2.815929889678955, + "logps/chosen": -302.9522399902344, + "logps/rejected": -267.2388916015625, + "loss": 0.5461, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6460703611373901, + "rewards/margins": 1.2526905536651611, + "rewards/rejected": -1.8987607955932617, + "step": 1070 + }, + { + "epoch": 0.56, + "learning_rate": 4.5238095238095237e-07, + "logits/chosen": -2.796346426010132, + "logits/rejected": -2.7912135124206543, + "logps/chosen": -281.9361267089844, + "logps/rejected": -246.5717315673828, + "loss": 0.5535, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7052322030067444, + "rewards/margins": 1.1822354793548584, + "rewards/rejected": -1.8874677419662476, + "step": 1080 + }, + { + "epoch": 0.56, + "learning_rate": 4.514247466054695e-07, + "logits/chosen": -2.7953383922576904, + "logits/rejected": -2.739332914352417, + "logps/chosen": -229.3057403564453, + "logps/rejected": -223.58279418945312, + "loss": 0.5363, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6706255078315735, + "rewards/margins": 1.3192580938339233, + "rewards/rejected": -1.9898836612701416, + "step": 1090 + }, + { + "epoch": 0.57, + "learning_rate": 4.504685408299866e-07, + "logits/chosen": -2.7090344429016113, + "logits/rejected": -2.6705334186553955, + "logps/chosen": -302.71417236328125, + "logps/rejected": -292.3421325683594, + "loss": 0.5132, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.564530074596405, + "rewards/margins": 1.3910361528396606, + "rewards/rejected": -1.955566167831421, + "step": 1100 + }, + { + "epoch": 0.57, + "learning_rate": 4.495123350545037e-07, + "logits/chosen": -2.767871379852295, + "logits/rejected": -2.7415690422058105, + "logps/chosen": -302.222900390625, + "logps/rejected": -287.3324279785156, + "loss": 0.5608, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4624261260032654, + "rewards/margins": 1.2597681283950806, + "rewards/rejected": -1.7221943140029907, + "step": 1110 + }, + { + "epoch": 0.58, + "learning_rate": 4.4855612927902083e-07, + "logits/chosen": -2.6437506675720215, + "logits/rejected": -2.641035318374634, + "logps/chosen": -303.7630310058594, + "logps/rejected": -254.9109649658203, + "loss": 1.1531, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7806851863861084, + "rewards/margins": 1.3015507459640503, + "rewards/rejected": -2.0822360515594482, + "step": 1120 + }, + { + "epoch": 0.58, + "learning_rate": 4.4759992350353795e-07, + "logits/chosen": -2.648594379425049, + "logits/rejected": -2.6584677696228027, + "logps/chosen": -293.0738830566406, + "logps/rejected": -233.2223358154297, + "loss": 0.4252, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7711928486824036, + "rewards/margins": 1.5724222660064697, + "rewards/rejected": -2.3436150550842285, + "step": 1130 + }, + { + "epoch": 0.59, + "learning_rate": 4.46643717728055e-07, + "logits/chosen": -2.597867727279663, + "logits/rejected": -2.697968006134033, + "logps/chosen": -251.0467529296875, + "logps/rejected": -260.2559509277344, + "loss": 0.6018, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7678476572036743, + "rewards/margins": 1.4263780117034912, + "rewards/rejected": -2.194225788116455, + "step": 1140 + }, + { + "epoch": 0.59, + "learning_rate": 4.4568751195257213e-07, + "logits/chosen": -2.6516098976135254, + "logits/rejected": -2.5899970531463623, + "logps/chosen": -331.1034240722656, + "logps/rejected": -261.8896484375, + "loss": 0.5869, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.24301430583000183, + "rewards/margins": 1.839286208152771, + "rewards/rejected": -2.0823006629943848, + "step": 1150 + }, + { + "epoch": 0.6, + "learning_rate": 4.447313061770893e-07, + "logits/chosen": -2.6758570671081543, + "logits/rejected": -2.602381706237793, + "logps/chosen": -263.66851806640625, + "logps/rejected": -281.92852783203125, + "loss": 0.5807, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.30647602677345276, + "rewards/margins": 1.638983964920044, + "rewards/rejected": -1.9454599618911743, + "step": 1160 + }, + { + "epoch": 0.6, + "learning_rate": 4.437751004016064e-07, + "logits/chosen": -2.6164095401763916, + "logits/rejected": -2.6072592735290527, + "logps/chosen": -236.7494659423828, + "logps/rejected": -247.128173828125, + "loss": 0.4963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6536493301391602, + "rewards/margins": 1.0608946084976196, + "rewards/rejected": -1.7145439386367798, + "step": 1170 + }, + { + "epoch": 0.61, + "learning_rate": 4.4281889462612353e-07, + "logits/chosen": -2.7885098457336426, + "logits/rejected": -2.735015869140625, + "logps/chosen": -273.19390869140625, + "logps/rejected": -229.81619262695312, + "loss": 0.5465, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5660006403923035, + "rewards/margins": 1.4190986156463623, + "rewards/rejected": -1.985099196434021, + "step": 1180 + }, + { + "epoch": 0.61, + "learning_rate": 4.4186268885064064e-07, + "logits/chosen": -2.7694408893585205, + "logits/rejected": -2.7590034008026123, + "logps/chosen": -299.28875732421875, + "logps/rejected": -251.94491577148438, + "loss": 0.531, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6542151570320129, + "rewards/margins": 1.0486948490142822, + "rewards/rejected": -1.70291006565094, + "step": 1190 + }, + { + "epoch": 0.62, + "learning_rate": 4.4090648307515776e-07, + "logits/chosen": -2.7691597938537598, + "logits/rejected": -2.7554731369018555, + "logps/chosen": -229.202392578125, + "logps/rejected": -191.56576538085938, + "loss": 0.5307, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9604787826538086, + "rewards/margins": 0.9451109766960144, + "rewards/rejected": -1.9055898189544678, + "step": 1200 + }, + { + "epoch": 0.62, + "learning_rate": 4.399502772996749e-07, + "logits/chosen": -2.7866642475128174, + "logits/rejected": -2.7626142501831055, + "logps/chosen": -278.181396484375, + "logps/rejected": -256.399658203125, + "loss": 0.4674, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6451205611228943, + "rewards/margins": 1.4265587329864502, + "rewards/rejected": -2.0716793537139893, + "step": 1210 + }, + { + "epoch": 0.63, + "learning_rate": 4.38994071524192e-07, + "logits/chosen": -2.765838623046875, + "logits/rejected": -2.835886001586914, + "logps/chosen": -258.55694580078125, + "logps/rejected": -223.63119506835938, + "loss": 0.572, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7563117742538452, + "rewards/margins": 1.1331018209457397, + "rewards/rejected": -1.889413595199585, + "step": 1220 + }, + { + "epoch": 0.64, + "learning_rate": 4.380378657487091e-07, + "logits/chosen": -2.9007599353790283, + "logits/rejected": -2.8781092166900635, + "logps/chosen": -322.48297119140625, + "logps/rejected": -309.03057861328125, + "loss": 0.6139, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5290047526359558, + "rewards/margins": 1.6923503875732422, + "rewards/rejected": -2.2213549613952637, + "step": 1230 + }, + { + "epoch": 0.64, + "learning_rate": 4.370816599732262e-07, + "logits/chosen": -2.882289171218872, + "logits/rejected": -2.8788981437683105, + "logps/chosen": -311.59112548828125, + "logps/rejected": -237.2730712890625, + "loss": 0.5165, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0326735973358154, + "rewards/margins": 1.2838331460952759, + "rewards/rejected": -2.316506862640381, + "step": 1240 + }, + { + "epoch": 0.65, + "learning_rate": 4.3612545419774334e-07, + "logits/chosen": -2.735339641571045, + "logits/rejected": -2.765554189682007, + "logps/chosen": -240.76547241210938, + "logps/rejected": -268.36077880859375, + "loss": 0.5601, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.080916166305542, + "rewards/margins": 0.9319782257080078, + "rewards/rejected": -2.01289439201355, + "step": 1250 + }, + { + "epoch": 0.65, + "learning_rate": 4.3516924842226045e-07, + "logits/chosen": -2.712817907333374, + "logits/rejected": -2.779609203338623, + "logps/chosen": -303.5237731933594, + "logps/rejected": -254.60684204101562, + "loss": 0.6256, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.225891351699829, + "rewards/margins": 1.0712993144989014, + "rewards/rejected": -2.2971906661987305, + "step": 1260 + }, + { + "epoch": 0.66, + "learning_rate": 4.3421304264677757e-07, + "logits/chosen": -2.8182456493377686, + "logits/rejected": -2.7474656105041504, + "logps/chosen": -265.8131103515625, + "logps/rejected": -225.51089477539062, + "loss": 0.5053, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8482934236526489, + "rewards/margins": 1.4276056289672852, + "rewards/rejected": -2.2758994102478027, + "step": 1270 + }, + { + "epoch": 0.66, + "learning_rate": 4.332568368712947e-07, + "logits/chosen": -2.829192876815796, + "logits/rejected": -2.8889195919036865, + "logps/chosen": -276.78814697265625, + "logps/rejected": -252.64096069335938, + "loss": 0.5599, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9601444005966187, + "rewards/margins": 1.1756095886230469, + "rewards/rejected": -2.135754108428955, + "step": 1280 + }, + { + "epoch": 0.67, + "learning_rate": 4.323006310958118e-07, + "logits/chosen": -2.8162477016448975, + "logits/rejected": -2.890505313873291, + "logps/chosen": -323.9197692871094, + "logps/rejected": -273.29632568359375, + "loss": 0.5012, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6982527375221252, + "rewards/margins": 1.4712746143341064, + "rewards/rejected": -2.169527292251587, + "step": 1290 + }, + { + "epoch": 0.67, + "learning_rate": 4.313444253203289e-07, + "logits/chosen": -2.783569812774658, + "logits/rejected": -2.7339978218078613, + "logps/chosen": -263.76531982421875, + "logps/rejected": -239.9626922607422, + "loss": 0.5372, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8767280578613281, + "rewards/margins": 1.3299367427825928, + "rewards/rejected": -2.2066650390625, + "step": 1300 + }, + { + "epoch": 0.68, + "learning_rate": 4.3038821954484603e-07, + "logits/chosen": -2.7811901569366455, + "logits/rejected": -2.737088918685913, + "logps/chosen": -286.2353210449219, + "logps/rejected": -256.3047180175781, + "loss": 0.4805, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0940454006195068, + "rewards/margins": 1.132187843322754, + "rewards/rejected": -2.22623348236084, + "step": 1310 + }, + { + "epoch": 0.68, + "learning_rate": 4.2943201376936315e-07, + "logits/chosen": -2.726167917251587, + "logits/rejected": -2.688605546951294, + "logps/chosen": -282.53546142578125, + "logps/rejected": -257.32373046875, + "loss": 0.547, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0240520238876343, + "rewards/margins": 1.4254390001296997, + "rewards/rejected": -2.449491024017334, + "step": 1320 + }, + { + "epoch": 0.69, + "learning_rate": 4.2847580799388026e-07, + "logits/chosen": -2.7360429763793945, + "logits/rejected": -2.7077505588531494, + "logps/chosen": -302.6694030761719, + "logps/rejected": -286.1494140625, + "loss": 0.5727, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7704175114631653, + "rewards/margins": 1.3586149215698242, + "rewards/rejected": -2.1290321350097656, + "step": 1330 + }, + { + "epoch": 0.69, + "learning_rate": 4.275196022183974e-07, + "logits/chosen": -2.687389373779297, + "logits/rejected": -2.7357983589172363, + "logps/chosen": -301.34136962890625, + "logps/rejected": -234.6636199951172, + "loss": 0.5274, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9257659912109375, + "rewards/margins": 1.1349513530731201, + "rewards/rejected": -2.0607173442840576, + "step": 1340 + }, + { + "epoch": 0.7, + "learning_rate": 4.265633964429145e-07, + "logits/chosen": -2.770357847213745, + "logits/rejected": -2.746288776397705, + "logps/chosen": -268.09576416015625, + "logps/rejected": -213.56039428710938, + "loss": 0.5658, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6878668665885925, + "rewards/margins": 1.0466588735580444, + "rewards/rejected": -1.7345256805419922, + "step": 1350 + }, + { + "epoch": 0.7, + "learning_rate": 4.256071906674316e-07, + "logits/chosen": -2.7798330783843994, + "logits/rejected": -2.805274486541748, + "logps/chosen": -317.8987731933594, + "logps/rejected": -276.82562255859375, + "loss": 0.627, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.550291895866394, + "rewards/margins": 1.2872064113616943, + "rewards/rejected": -1.8374983072280884, + "step": 1360 + }, + { + "epoch": 0.71, + "learning_rate": 4.246509848919487e-07, + "logits/chosen": -2.8107523918151855, + "logits/rejected": -2.842200756072998, + "logps/chosen": -262.02874755859375, + "logps/rejected": -265.257080078125, + "loss": 0.5532, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4975528120994568, + "rewards/margins": 1.5920393466949463, + "rewards/rejected": -2.089592218399048, + "step": 1370 + }, + { + "epoch": 0.71, + "learning_rate": 4.2369477911646584e-07, + "logits/chosen": -2.8077638149261475, + "logits/rejected": -2.801124334335327, + "logps/chosen": -273.29962158203125, + "logps/rejected": -232.51559448242188, + "loss": 0.5683, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7220560312271118, + "rewards/margins": 1.1169109344482422, + "rewards/rejected": -1.8389670848846436, + "step": 1380 + }, + { + "epoch": 0.72, + "learning_rate": 4.2273857334098296e-07, + "logits/chosen": -2.7139172554016113, + "logits/rejected": -2.7589526176452637, + "logps/chosen": -287.19781494140625, + "logps/rejected": -242.8853302001953, + "loss": 0.5754, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5711519122123718, + "rewards/margins": 1.1763086318969727, + "rewards/rejected": -1.7474607229232788, + "step": 1390 + }, + { + "epoch": 0.72, + "learning_rate": 4.2178236756550007e-07, + "logits/chosen": -2.696545124053955, + "logits/rejected": -2.714484214782715, + "logps/chosen": -297.3968505859375, + "logps/rejected": -245.6906280517578, + "loss": 0.6399, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8875021934509277, + "rewards/margins": 1.0341614484786987, + "rewards/rejected": -1.9216636419296265, + "step": 1400 + }, + { + "epoch": 0.73, + "learning_rate": 4.208261617900172e-07, + "logits/chosen": -2.7567496299743652, + "logits/rejected": -2.7744107246398926, + "logps/chosen": -283.526611328125, + "logps/rejected": -214.8311309814453, + "loss": 0.4682, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7966445684432983, + "rewards/margins": 1.516661286354065, + "rewards/rejected": -2.3133058547973633, + "step": 1410 + }, + { + "epoch": 0.73, + "learning_rate": 4.198699560145343e-07, + "logits/chosen": -2.694303512573242, + "logits/rejected": -2.5877633094787598, + "logps/chosen": -268.08709716796875, + "logps/rejected": -245.7215576171875, + "loss": 0.6334, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9871991872787476, + "rewards/margins": 1.256882905960083, + "rewards/rejected": -2.24408221244812, + "step": 1420 + }, + { + "epoch": 0.74, + "learning_rate": 4.189137502390514e-07, + "logits/chosen": -2.728818416595459, + "logits/rejected": -2.7532124519348145, + "logps/chosen": -279.16851806640625, + "logps/rejected": -277.90826416015625, + "loss": 0.6436, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7359957098960876, + "rewards/margins": 1.080309271812439, + "rewards/rejected": -1.8163049221038818, + "step": 1430 + }, + { + "epoch": 0.74, + "learning_rate": 4.179575444635686e-07, + "logits/chosen": -2.718947649002075, + "logits/rejected": -2.7442736625671387, + "logps/chosen": -337.0057373046875, + "logps/rejected": -276.1755676269531, + "loss": 0.5953, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6292416453361511, + "rewards/margins": 1.0549640655517578, + "rewards/rejected": -1.6842056512832642, + "step": 1440 + }, + { + "epoch": 0.75, + "learning_rate": 4.170013386880857e-07, + "logits/chosen": -2.7211525440216064, + "logits/rejected": -2.678626775741577, + "logps/chosen": -274.9634704589844, + "logps/rejected": -284.86328125, + "loss": 0.5041, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.6449888944625854, + "rewards/margins": 1.4034655094146729, + "rewards/rejected": -2.0484542846679688, + "step": 1450 + }, + { + "epoch": 0.75, + "learning_rate": 4.1604513291260277e-07, + "logits/chosen": -2.6686198711395264, + "logits/rejected": -2.7258832454681396, + "logps/chosen": -265.8595886230469, + "logps/rejected": -262.75701904296875, + "loss": 0.5408, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6771360635757446, + "rewards/margins": 1.6670234203338623, + "rewards/rejected": -2.3441596031188965, + "step": 1460 + }, + { + "epoch": 0.76, + "learning_rate": 4.150889271371199e-07, + "logits/chosen": -2.618217945098877, + "logits/rejected": -2.6638619899749756, + "logps/chosen": -291.3425598144531, + "logps/rejected": -239.32754516601562, + "loss": 0.5876, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7784534096717834, + "rewards/margins": 1.4896891117095947, + "rewards/rejected": -2.2681422233581543, + "step": 1470 + }, + { + "epoch": 0.76, + "learning_rate": 4.14132721361637e-07, + "logits/chosen": -2.626051187515259, + "logits/rejected": -2.742926836013794, + "logps/chosen": -277.51007080078125, + "logps/rejected": -207.0497283935547, + "loss": 0.4242, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9323328733444214, + "rewards/margins": 1.7172333002090454, + "rewards/rejected": -2.649566173553467, + "step": 1480 + }, + { + "epoch": 0.77, + "learning_rate": 4.131765155861541e-07, + "logits/chosen": -2.5984182357788086, + "logits/rejected": -2.634274482727051, + "logps/chosen": -243.6094970703125, + "logps/rejected": -216.2739715576172, + "loss": 0.5322, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8016673922538757, + "rewards/margins": 1.508610486984253, + "rewards/rejected": -2.3102779388427734, + "step": 1490 + }, + { + "epoch": 0.77, + "learning_rate": 4.1222030981067123e-07, + "logits/chosen": -2.6750636100769043, + "logits/rejected": -2.736687183380127, + "logps/chosen": -305.0487976074219, + "logps/rejected": -273.23553466796875, + "loss": 0.4388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9884425401687622, + "rewards/margins": 1.6452935934066772, + "rewards/rejected": -2.6337361335754395, + "step": 1500 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.744432210922241, + "eval_logits/rejected": -2.741956949234009, + "eval_logps/chosen": -283.57977294921875, + "eval_logps/rejected": -255.0812225341797, + "eval_loss": 0.5201703310012817, + "eval_rewards/accuracies": 0.7919999957084656, + "eval_rewards/chosen": -0.7856327891349792, + "eval_rewards/margins": 1.4863603115081787, + "eval_rewards/rejected": -2.271993398666382, + "eval_runtime": 454.3263, + "eval_samples_per_second": 4.402, + "eval_steps_per_second": 0.275, + "step": 1500 + }, + { + "epoch": 0.78, + "learning_rate": 4.1126410403518835e-07, + "logits/chosen": -2.6532881259918213, + "logits/rejected": -2.7087454795837402, + "logps/chosen": -256.3421936035156, + "logps/rejected": -271.71173095703125, + "loss": 0.5294, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9765220880508423, + "rewards/margins": 1.416123628616333, + "rewards/rejected": -2.392645835876465, + "step": 1510 + }, + { + "epoch": 0.78, + "learning_rate": 4.1030789825970546e-07, + "logits/chosen": -2.638579845428467, + "logits/rejected": -2.668213367462158, + "logps/chosen": -310.85296630859375, + "logps/rejected": -279.1716003417969, + "loss": 0.5471, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8188529014587402, + "rewards/margins": 1.2735222578048706, + "rewards/rejected": -2.092374801635742, + "step": 1520 + }, + { + "epoch": 0.79, + "learning_rate": 4.093516924842226e-07, + "logits/chosen": -2.7053446769714355, + "logits/rejected": -2.705913543701172, + "logps/chosen": -292.74359130859375, + "logps/rejected": -268.7305908203125, + "loss": 0.4502, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.3197025656700134, + "rewards/margins": 1.5472514629364014, + "rewards/rejected": -1.8669540882110596, + "step": 1530 + }, + { + "epoch": 0.8, + "learning_rate": 4.083954867087397e-07, + "logits/chosen": -2.820955276489258, + "logits/rejected": -2.781850814819336, + "logps/chosen": -263.65234375, + "logps/rejected": -251.3944854736328, + "loss": 0.5528, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6488135457038879, + "rewards/margins": 1.2752244472503662, + "rewards/rejected": -1.9240379333496094, + "step": 1540 + }, + { + "epoch": 0.8, + "learning_rate": 4.074392809332568e-07, + "logits/chosen": -2.742565155029297, + "logits/rejected": -2.7900197505950928, + "logps/chosen": -328.9388732910156, + "logps/rejected": -272.40667724609375, + "loss": 0.5103, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5513970255851746, + "rewards/margins": 2.098090410232544, + "rewards/rejected": -2.6494877338409424, + "step": 1550 + }, + { + "epoch": 0.81, + "learning_rate": 4.064830751577739e-07, + "logits/chosen": -2.8128983974456787, + "logits/rejected": -2.783550977706909, + "logps/chosen": -264.9004211425781, + "logps/rejected": -248.5589141845703, + "loss": 0.5191, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4322876036167145, + "rewards/margins": 1.4417130947113037, + "rewards/rejected": -1.8740005493164062, + "step": 1560 + }, + { + "epoch": 0.81, + "learning_rate": 4.0552686938229104e-07, + "logits/chosen": -2.7836945056915283, + "logits/rejected": -2.79536771774292, + "logps/chosen": -264.26385498046875, + "logps/rejected": -233.51791381835938, + "loss": 0.4382, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4552520215511322, + "rewards/margins": 1.2747576236724854, + "rewards/rejected": -1.73000967502594, + "step": 1570 + }, + { + "epoch": 0.82, + "learning_rate": 4.045706636068082e-07, + "logits/chosen": -2.7296574115753174, + "logits/rejected": -2.784824848175049, + "logps/chosen": -277.8669738769531, + "logps/rejected": -247.8280792236328, + "loss": 0.4745, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6441946625709534, + "rewards/margins": 1.571209192276001, + "rewards/rejected": -2.2154037952423096, + "step": 1580 + }, + { + "epoch": 0.82, + "learning_rate": 4.036144578313253e-07, + "logits/chosen": -2.669766664505005, + "logits/rejected": -2.749067544937134, + "logps/chosen": -280.0926208496094, + "logps/rejected": -257.1477966308594, + "loss": 0.5338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6168959736824036, + "rewards/margins": 1.3998132944107056, + "rewards/rejected": -2.016709327697754, + "step": 1590 + }, + { + "epoch": 0.83, + "learning_rate": 4.0265825205584244e-07, + "logits/chosen": -2.7413103580474854, + "logits/rejected": -2.712337017059326, + "logps/chosen": -296.93768310546875, + "logps/rejected": -267.0952453613281, + "loss": 0.5875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6115021705627441, + "rewards/margins": 1.2376142740249634, + "rewards/rejected": -1.849116325378418, + "step": 1600 + }, + { + "epoch": 0.83, + "learning_rate": 4.0170204628035956e-07, + "logits/chosen": -2.6845192909240723, + "logits/rejected": -2.729642152786255, + "logps/chosen": -226.81564331054688, + "logps/rejected": -218.29470825195312, + "loss": 0.4993, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.4952973425388336, + "rewards/margins": 1.4554481506347656, + "rewards/rejected": -1.9507455825805664, + "step": 1610 + }, + { + "epoch": 0.84, + "learning_rate": 4.007458405048766e-07, + "logits/chosen": -2.680030345916748, + "logits/rejected": -2.713993549346924, + "logps/chosen": -313.85894775390625, + "logps/rejected": -285.9293212890625, + "loss": 0.4574, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6897695660591125, + "rewards/margins": 1.6455440521240234, + "rewards/rejected": -2.3353137969970703, + "step": 1620 + }, + { + "epoch": 0.84, + "learning_rate": 3.9978963472939373e-07, + "logits/chosen": -2.647779941558838, + "logits/rejected": -2.676213026046753, + "logps/chosen": -284.97760009765625, + "logps/rejected": -246.68838500976562, + "loss": 0.4985, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6747108697891235, + "rewards/margins": 1.6222903728485107, + "rewards/rejected": -2.297001361846924, + "step": 1630 + }, + { + "epoch": 0.85, + "learning_rate": 3.9883342895391085e-07, + "logits/chosen": -2.7175776958465576, + "logits/rejected": -2.674654722213745, + "logps/chosen": -325.5532531738281, + "logps/rejected": -255.57589721679688, + "loss": 0.4878, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6802691221237183, + "rewards/margins": 1.6955766677856445, + "rewards/rejected": -2.3758456707000732, + "step": 1640 + }, + { + "epoch": 0.85, + "learning_rate": 3.9787722317842796e-07, + "logits/chosen": -2.7575249671936035, + "logits/rejected": -2.7429704666137695, + "logps/chosen": -295.3287353515625, + "logps/rejected": -210.6437530517578, + "loss": 0.5416, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.2862042784690857, + "rewards/margins": 1.6433782577514648, + "rewards/rejected": -1.9295822381973267, + "step": 1650 + }, + { + "epoch": 0.86, + "learning_rate": 3.969210174029451e-07, + "logits/chosen": -2.683577060699463, + "logits/rejected": -2.5588877201080322, + "logps/chosen": -284.94488525390625, + "logps/rejected": -277.26715087890625, + "loss": 0.5569, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8422481417655945, + "rewards/margins": 1.1587899923324585, + "rewards/rejected": -2.001038074493408, + "step": 1660 + }, + { + "epoch": 0.86, + "learning_rate": 3.959648116274622e-07, + "logits/chosen": -2.6856627464294434, + "logits/rejected": -2.720485210418701, + "logps/chosen": -278.14727783203125, + "logps/rejected": -235.4019012451172, + "loss": 0.5908, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8898976445198059, + "rewards/margins": 0.9991616010665894, + "rewards/rejected": -1.8890594244003296, + "step": 1670 + }, + { + "epoch": 0.87, + "learning_rate": 3.950086058519793e-07, + "logits/chosen": -2.7345478534698486, + "logits/rejected": -2.7119522094726562, + "logps/chosen": -251.06784057617188, + "logps/rejected": -237.36767578125, + "loss": 0.5711, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6733026504516602, + "rewards/margins": 1.1799070835113525, + "rewards/rejected": -1.8532098531723022, + "step": 1680 + }, + { + "epoch": 0.87, + "learning_rate": 3.9405240007649643e-07, + "logits/chosen": -2.7996246814727783, + "logits/rejected": -2.8810853958129883, + "logps/chosen": -263.982421875, + "logps/rejected": -253.3212127685547, + "loss": 0.4917, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6147769093513489, + "rewards/margins": 1.3312063217163086, + "rewards/rejected": -1.9459832906723022, + "step": 1690 + }, + { + "epoch": 0.88, + "learning_rate": 3.9309619430101354e-07, + "logits/chosen": -2.711601734161377, + "logits/rejected": -2.8019349575042725, + "logps/chosen": -267.4760437011719, + "logps/rejected": -244.1147918701172, + "loss": 0.5695, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6157582402229309, + "rewards/margins": 1.292510747909546, + "rewards/rejected": -1.9082691669464111, + "step": 1700 + }, + { + "epoch": 0.88, + "learning_rate": 3.9213998852553066e-07, + "logits/chosen": -2.7815704345703125, + "logits/rejected": -2.810908555984497, + "logps/chosen": -328.38531494140625, + "logps/rejected": -266.3671569824219, + "loss": 0.5704, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7039732933044434, + "rewards/margins": 1.1600509881973267, + "rewards/rejected": -1.8640244007110596, + "step": 1710 + }, + { + "epoch": 0.89, + "learning_rate": 3.9118378275004783e-07, + "logits/chosen": -2.833494186401367, + "logits/rejected": -2.773686170578003, + "logps/chosen": -274.27349853515625, + "logps/rejected": -314.40447998046875, + "loss": 0.5537, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7931226491928101, + "rewards/margins": 0.9070862531661987, + "rewards/rejected": -1.7002089023590088, + "step": 1720 + }, + { + "epoch": 0.89, + "learning_rate": 3.9022757697456494e-07, + "logits/chosen": -2.6811347007751465, + "logits/rejected": -2.7765626907348633, + "logps/chosen": -344.49310302734375, + "logps/rejected": -281.51678466796875, + "loss": 0.5263, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6116336584091187, + "rewards/margins": 1.0795291662216187, + "rewards/rejected": -1.6911628246307373, + "step": 1730 + }, + { + "epoch": 0.9, + "learning_rate": 3.8927137119908206e-07, + "logits/chosen": -2.7077999114990234, + "logits/rejected": -2.7165045738220215, + "logps/chosen": -303.66107177734375, + "logps/rejected": -229.548095703125, + "loss": 0.5082, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.65656977891922, + "rewards/margins": 1.3464813232421875, + "rewards/rejected": -2.003051280975342, + "step": 1740 + }, + { + "epoch": 0.9, + "learning_rate": 3.883151654235992e-07, + "logits/chosen": -2.7294795513153076, + "logits/rejected": -2.7539401054382324, + "logps/chosen": -296.2801818847656, + "logps/rejected": -263.555908203125, + "loss": 0.5555, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5686991214752197, + "rewards/margins": 1.5480784177780151, + "rewards/rejected": -2.1167776584625244, + "step": 1750 + }, + { + "epoch": 0.91, + "learning_rate": 3.873589596481163e-07, + "logits/chosen": -2.785902261734009, + "logits/rejected": -2.778597593307495, + "logps/chosen": -294.0135192871094, + "logps/rejected": -264.5918884277344, + "loss": 0.5686, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.79108726978302, + "rewards/margins": 1.2999193668365479, + "rewards/rejected": -2.0910067558288574, + "step": 1760 + }, + { + "epoch": 0.91, + "learning_rate": 3.864027538726334e-07, + "logits/chosen": -2.640261173248291, + "logits/rejected": -2.7362558841705322, + "logps/chosen": -277.26153564453125, + "logps/rejected": -251.80453491210938, + "loss": 0.5828, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.48934370279312134, + "rewards/margins": 1.6264740228652954, + "rewards/rejected": -2.1158177852630615, + "step": 1770 + }, + { + "epoch": 0.92, + "learning_rate": 3.8544654809715047e-07, + "logits/chosen": -2.7447402477264404, + "logits/rejected": -2.735809803009033, + "logps/chosen": -291.24432373046875, + "logps/rejected": -259.63238525390625, + "loss": 0.5448, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8588013648986816, + "rewards/margins": 1.3649778366088867, + "rewards/rejected": -2.2237792015075684, + "step": 1780 + }, + { + "epoch": 0.92, + "learning_rate": 3.844903423216676e-07, + "logits/chosen": -2.689377784729004, + "logits/rejected": -2.642862319946289, + "logps/chosen": -270.98809814453125, + "logps/rejected": -237.0521240234375, + "loss": 0.5087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5103562474250793, + "rewards/margins": 1.4526978731155396, + "rewards/rejected": -1.9630540609359741, + "step": 1790 + }, + { + "epoch": 0.93, + "learning_rate": 3.835341365461847e-07, + "logits/chosen": -2.6891281604766846, + "logits/rejected": -2.6219875812530518, + "logps/chosen": -276.0992126464844, + "logps/rejected": -229.54647827148438, + "loss": 0.5346, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5371229648590088, + "rewards/margins": 1.4801558256149292, + "rewards/rejected": -2.0172784328460693, + "step": 1800 + }, + { + "epoch": 0.93, + "learning_rate": 3.825779307707018e-07, + "logits/chosen": -2.6955854892730713, + "logits/rejected": -2.6945228576660156, + "logps/chosen": -209.4112548828125, + "logps/rejected": -230.22689819335938, + "loss": 0.5242, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.609640896320343, + "rewards/margins": 1.1101809740066528, + "rewards/rejected": -1.7198219299316406, + "step": 1810 + }, + { + "epoch": 0.94, + "learning_rate": 3.8162172499521893e-07, + "logits/chosen": -2.7102739810943604, + "logits/rejected": -2.7950327396392822, + "logps/chosen": -267.4234619140625, + "logps/rejected": -218.8787078857422, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6695321798324585, + "rewards/margins": 1.3147714138031006, + "rewards/rejected": -1.9843038320541382, + "step": 1820 + }, + { + "epoch": 0.94, + "learning_rate": 3.8066551921973605e-07, + "logits/chosen": -2.7370214462280273, + "logits/rejected": -2.7310547828674316, + "logps/chosen": -261.2284240722656, + "logps/rejected": -237.5937957763672, + "loss": 0.4986, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.37716975808143616, + "rewards/margins": 1.3623695373535156, + "rewards/rejected": -1.739539384841919, + "step": 1830 + }, + { + "epoch": 0.95, + "learning_rate": 3.7970931344425316e-07, + "logits/chosen": -2.667738437652588, + "logits/rejected": -2.7235381603240967, + "logps/chosen": -299.5805358886719, + "logps/rejected": -240.5843505859375, + "loss": 0.4653, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.5832570195198059, + "rewards/margins": 1.1436893939971924, + "rewards/rejected": -1.726946473121643, + "step": 1840 + }, + { + "epoch": 0.96, + "learning_rate": 3.787531076687703e-07, + "logits/chosen": -2.5823702812194824, + "logits/rejected": -2.6130504608154297, + "logps/chosen": -258.65521240234375, + "logps/rejected": -211.156005859375, + "loss": 0.5161, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.46967166662216187, + "rewards/margins": 1.632408857345581, + "rewards/rejected": -2.1020803451538086, + "step": 1850 + }, + { + "epoch": 0.96, + "learning_rate": 3.7779690189328745e-07, + "logits/chosen": -2.7206063270568848, + "logits/rejected": -2.7099740505218506, + "logps/chosen": -260.58782958984375, + "logps/rejected": -236.6365203857422, + "loss": 0.4783, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6446757316589355, + "rewards/margins": 1.2911062240600586, + "rewards/rejected": -1.9357818365097046, + "step": 1860 + }, + { + "epoch": 0.97, + "learning_rate": 3.7684069611780456e-07, + "logits/chosen": -2.6849565505981445, + "logits/rejected": -2.760693073272705, + "logps/chosen": -289.33135986328125, + "logps/rejected": -252.05615234375, + "loss": 0.5354, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3844161331653595, + "rewards/margins": 1.464693546295166, + "rewards/rejected": -1.8491096496582031, + "step": 1870 + }, + { + "epoch": 0.97, + "learning_rate": 3.758844903423217e-07, + "logits/chosen": -2.770711898803711, + "logits/rejected": -2.7400715351104736, + "logps/chosen": -257.98223876953125, + "logps/rejected": -249.1288604736328, + "loss": 0.5816, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5955790281295776, + "rewards/margins": 1.141638994216919, + "rewards/rejected": -1.737217903137207, + "step": 1880 + }, + { + "epoch": 0.98, + "learning_rate": 3.749282845668388e-07, + "logits/chosen": -2.781506061553955, + "logits/rejected": -2.7431671619415283, + "logps/chosen": -307.255615234375, + "logps/rejected": -271.7546081542969, + "loss": 0.5382, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7323135137557983, + "rewards/margins": 1.2955814599990845, + "rewards/rejected": -2.027894973754883, + "step": 1890 + }, + { + "epoch": 0.98, + "learning_rate": 3.739720787913559e-07, + "logits/chosen": -2.6961607933044434, + "logits/rejected": -2.7294247150421143, + "logps/chosen": -267.5713195800781, + "logps/rejected": -234.83511352539062, + "loss": 0.5006, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7748560905456543, + "rewards/margins": 1.3794810771942139, + "rewards/rejected": -2.1543374061584473, + "step": 1900 + }, + { + "epoch": 0.99, + "learning_rate": 3.73015873015873e-07, + "logits/chosen": -2.6831729412078857, + "logits/rejected": -2.697099208831787, + "logps/chosen": -286.3196105957031, + "logps/rejected": -262.9601135253906, + "loss": 0.5058, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3332211971282959, + "rewards/margins": 1.1857125759124756, + "rewards/rejected": -1.518933653831482, + "step": 1910 + }, + { + "epoch": 0.99, + "learning_rate": 3.7205966724039014e-07, + "logits/chosen": -2.5785605907440186, + "logits/rejected": -2.6058080196380615, + "logps/chosen": -294.5285339355469, + "logps/rejected": -252.9921112060547, + "loss": 0.5, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6818346381187439, + "rewards/margins": 1.2134017944335938, + "rewards/rejected": -1.895236611366272, + "step": 1920 + }, + { + "epoch": 1.0, + "learning_rate": 3.711034614649072e-07, + "logits/chosen": -2.751162528991699, + "logits/rejected": -2.7023558616638184, + "logps/chosen": -310.20550537109375, + "logps/rejected": -245.3313751220703, + "loss": 0.4926, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6286559700965881, + "rewards/margins": 1.6120086908340454, + "rewards/rejected": -2.24066424369812, + "step": 1930 + }, + { + "epoch": 1.0, + "learning_rate": 3.701472556894243e-07, + "logits/chosen": -2.6212620735168457, + "logits/rejected": -2.65401029586792, + "logps/chosen": -253.7644500732422, + "logps/rejected": -285.94927978515625, + "loss": 0.3858, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.09027113765478134, + "rewards/margins": 2.7949814796447754, + "rewards/rejected": -2.7047104835510254, + "step": 1940 + }, + { + "epoch": 1.01, + "learning_rate": 3.6919104991394144e-07, + "logits/chosen": -2.749549388885498, + "logits/rejected": -2.7355539798736572, + "logps/chosen": -262.36444091796875, + "logps/rejected": -266.4080505371094, + "loss": 0.0969, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.437258005142212, + "rewards/margins": 5.972604751586914, + "rewards/rejected": -4.535346508026123, + "step": 1950 + }, + { + "epoch": 1.01, + "learning_rate": 3.6823484413845855e-07, + "logits/chosen": -2.644953966140747, + "logits/rejected": -2.643900156021118, + "logps/chosen": -271.19232177734375, + "logps/rejected": -290.2998352050781, + "loss": 0.1308, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.7727033495903015, + "rewards/margins": 5.208821773529053, + "rewards/rejected": -4.436118125915527, + "step": 1960 + }, + { + "epoch": 1.02, + "learning_rate": 3.6727863836297567e-07, + "logits/chosen": -2.7043509483337402, + "logits/rejected": -2.6822307109832764, + "logps/chosen": -246.55972290039062, + "logps/rejected": -267.2857971191406, + "loss": 0.0869, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1716320514678955, + "rewards/margins": 6.148016929626465, + "rewards/rejected": -4.976385116577148, + "step": 1970 + }, + { + "epoch": 1.02, + "learning_rate": 3.663224325874928e-07, + "logits/chosen": -2.6752538681030273, + "logits/rejected": -2.640815019607544, + "logps/chosen": -243.1986083984375, + "logps/rejected": -281.6815185546875, + "loss": 0.108, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6313158273696899, + "rewards/margins": 5.219941139221191, + "rewards/rejected": -4.588625907897949, + "step": 1980 + }, + { + "epoch": 1.03, + "learning_rate": 3.653662268120099e-07, + "logits/chosen": -2.6890158653259277, + "logits/rejected": -2.636115312576294, + "logps/chosen": -250.166748046875, + "logps/rejected": -315.3585510253906, + "loss": 0.0855, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1859726905822754, + "rewards/margins": 5.5249199867248535, + "rewards/rejected": -4.338947772979736, + "step": 1990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6441002103652707e-07, + "logits/chosen": -2.6496829986572266, + "logits/rejected": -2.697693347930908, + "logps/chosen": -268.20355224609375, + "logps/rejected": -300.24993896484375, + "loss": 0.0651, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7303746938705444, + "rewards/margins": 5.9827704429626465, + "rewards/rejected": -5.2523956298828125, + "step": 2000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.7412195205688477, + "eval_logits/rejected": -2.733457565307617, + "eval_logps/chosen": -285.7675476074219, + "eval_logps/rejected": -261.06353759765625, + "eval_loss": 0.5048810243606567, + "eval_rewards/accuracies": 0.7860000133514404, + "eval_rewards/chosen": -1.0044103860855103, + "eval_rewards/margins": 1.8658151626586914, + "eval_rewards/rejected": -2.870225667953491, + "eval_runtime": 453.132, + "eval_samples_per_second": 4.414, + "eval_steps_per_second": 0.276, + "step": 2000 + }, + { + "epoch": 1.04, + "learning_rate": 3.634538152610442e-07, + "logits/chosen": -2.6881024837493896, + "logits/rejected": -2.7203176021575928, + "logps/chosen": -273.32574462890625, + "logps/rejected": -304.735107421875, + "loss": 0.0758, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1392464637756348, + "rewards/margins": 6.492955207824707, + "rewards/rejected": -5.353708744049072, + "step": 2010 + }, + { + "epoch": 1.04, + "learning_rate": 3.624976094855613e-07, + "logits/chosen": -2.643216609954834, + "logits/rejected": -2.6377549171447754, + "logps/chosen": -258.87518310546875, + "logps/rejected": -269.3851318359375, + "loss": 0.0707, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7527416944503784, + "rewards/margins": 5.787278175354004, + "rewards/rejected": -5.034537315368652, + "step": 2020 + }, + { + "epoch": 1.05, + "learning_rate": 3.615414037100784e-07, + "logits/chosen": -2.624013900756836, + "logits/rejected": -2.6467807292938232, + "logps/chosen": -287.6932678222656, + "logps/rejected": -270.98529052734375, + "loss": 0.0997, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.3131764531135559, + "rewards/margins": 5.307021141052246, + "rewards/rejected": -4.993845462799072, + "step": 2030 + }, + { + "epoch": 1.05, + "learning_rate": 3.6058519793459553e-07, + "logits/chosen": -2.583369493484497, + "logits/rejected": -2.6954777240753174, + "logps/chosen": -241.7144317626953, + "logps/rejected": -249.5287322998047, + "loss": 0.0879, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.655167818069458, + "rewards/margins": 5.203488826751709, + "rewards/rejected": -4.548320770263672, + "step": 2040 + }, + { + "epoch": 1.06, + "learning_rate": 3.5962899215911265e-07, + "logits/chosen": -2.672328472137451, + "logits/rejected": -2.6205334663391113, + "logps/chosen": -239.5384979248047, + "logps/rejected": -288.34893798828125, + "loss": 0.0544, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5333976745605469, + "rewards/margins": 5.3222761154174805, + "rewards/rejected": -4.788878440856934, + "step": 2050 + }, + { + "epoch": 1.06, + "learning_rate": 3.5867278638362976e-07, + "logits/chosen": -2.6396572589874268, + "logits/rejected": -2.728149652481079, + "logps/chosen": -308.57110595703125, + "logps/rejected": -312.836181640625, + "loss": 0.083, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8408229947090149, + "rewards/margins": 5.805523872375488, + "rewards/rejected": -4.964701175689697, + "step": 2060 + }, + { + "epoch": 1.07, + "learning_rate": 3.577165806081469e-07, + "logits/chosen": -2.668506145477295, + "logits/rejected": -2.717451810836792, + "logps/chosen": -257.14599609375, + "logps/rejected": -260.1763916015625, + "loss": 0.0988, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.4912979006767273, + "rewards/margins": 5.213704586029053, + "rewards/rejected": -4.72240686416626, + "step": 2070 + }, + { + "epoch": 1.07, + "learning_rate": 3.56760374832664e-07, + "logits/chosen": -2.7033512592315674, + "logits/rejected": -2.676906108856201, + "logps/chosen": -290.5976867675781, + "logps/rejected": -317.9124450683594, + "loss": 0.0423, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4838396310806274, + "rewards/margins": 7.55858850479126, + "rewards/rejected": -6.0747480392456055, + "step": 2080 + }, + { + "epoch": 1.08, + "learning_rate": 3.5580416905718106e-07, + "logits/chosen": -2.647249221801758, + "logits/rejected": -2.7124292850494385, + "logps/chosen": -305.52313232421875, + "logps/rejected": -265.80853271484375, + "loss": 0.1056, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6061897277832031, + "rewards/margins": 5.471752166748047, + "rewards/rejected": -4.86556339263916, + "step": 2090 + }, + { + "epoch": 1.08, + "learning_rate": 3.5484796328169817e-07, + "logits/chosen": -2.633441686630249, + "logits/rejected": -2.7382283210754395, + "logps/chosen": -268.3299560546875, + "logps/rejected": -253.24844360351562, + "loss": 0.07, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5186059474945068, + "rewards/margins": 5.430913925170898, + "rewards/rejected": -4.9123077392578125, + "step": 2100 + }, + { + "epoch": 1.09, + "learning_rate": 3.538917575062153e-07, + "logits/chosen": -2.594312906265259, + "logits/rejected": -2.6324267387390137, + "logps/chosen": -267.57550048828125, + "logps/rejected": -314.03460693359375, + "loss": 0.0835, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.33759838342666626, + "rewards/margins": 6.359553813934326, + "rewards/rejected": -6.021955490112305, + "step": 2110 + }, + { + "epoch": 1.09, + "learning_rate": 3.529355517307324e-07, + "logits/chosen": -2.6919217109680176, + "logits/rejected": -2.6473464965820312, + "logps/chosen": -247.5459747314453, + "logps/rejected": -294.58917236328125, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8844423294067383, + "rewards/margins": 6.708581447601318, + "rewards/rejected": -5.824139595031738, + "step": 2120 + }, + { + "epoch": 1.1, + "learning_rate": 3.519793459552495e-07, + "logits/chosen": -2.7207422256469727, + "logits/rejected": -2.644496440887451, + "logps/chosen": -267.00518798828125, + "logps/rejected": -298.27508544921875, + "loss": 0.0747, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.5102488398551941, + "rewards/margins": 5.6486406326293945, + "rewards/rejected": -5.138392448425293, + "step": 2130 + }, + { + "epoch": 1.1, + "learning_rate": 3.510231401797667e-07, + "logits/chosen": -2.675915479660034, + "logits/rejected": -2.623983860015869, + "logps/chosen": -301.0896301269531, + "logps/rejected": -301.5240478515625, + "loss": 0.092, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.039230428636074066, + "rewards/margins": 6.0254340171813965, + "rewards/rejected": -6.064664363861084, + "step": 2140 + }, + { + "epoch": 1.11, + "learning_rate": 3.500669344042838e-07, + "logits/chosen": -2.7287375926971436, + "logits/rejected": -2.673964738845825, + "logps/chosen": -256.0411682128906, + "logps/rejected": -286.02166748046875, + "loss": 0.082, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.15862272679805756, + "rewards/margins": 5.4255757331848145, + "rewards/rejected": -5.584197998046875, + "step": 2150 + }, + { + "epoch": 1.12, + "learning_rate": 3.491107286288009e-07, + "logits/chosen": -2.7414021492004395, + "logits/rejected": -2.6649553775787354, + "logps/chosen": -271.9679870605469, + "logps/rejected": -304.8786926269531, + "loss": 0.1284, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.0953565314412117, + "rewards/margins": 5.869844436645508, + "rewards/rejected": -5.965200901031494, + "step": 2160 + }, + { + "epoch": 1.12, + "learning_rate": 3.4815452285331803e-07, + "logits/chosen": -2.659775495529175, + "logits/rejected": -2.6203103065490723, + "logps/chosen": -289.34613037109375, + "logps/rejected": -282.0144958496094, + "loss": 0.0941, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.19078879058361053, + "rewards/margins": 5.521025657653809, + "rewards/rejected": -5.7118144035339355, + "step": 2170 + }, + { + "epoch": 1.13, + "learning_rate": 3.4719831707783515e-07, + "logits/chosen": -2.6245665550231934, + "logits/rejected": -2.659949779510498, + "logps/chosen": -300.9286193847656, + "logps/rejected": -322.0133056640625, + "loss": 0.0888, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.270366370677948, + "rewards/margins": 6.352386474609375, + "rewards/rejected": -6.082020282745361, + "step": 2180 + }, + { + "epoch": 1.13, + "learning_rate": 3.4624211130235227e-07, + "logits/chosen": -2.706652879714966, + "logits/rejected": -2.7077231407165527, + "logps/chosen": -248.646484375, + "logps/rejected": -265.32366943359375, + "loss": 0.085, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.16835294663906097, + "rewards/margins": 5.832309246063232, + "rewards/rejected": -6.000662326812744, + "step": 2190 + }, + { + "epoch": 1.14, + "learning_rate": 3.452859055268694e-07, + "logits/chosen": -2.6764402389526367, + "logits/rejected": -2.740021228790283, + "logps/chosen": -256.58892822265625, + "logps/rejected": -270.4902648925781, + "loss": 0.0928, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03022139146924019, + "rewards/margins": 6.150148391723633, + "rewards/rejected": -6.119926929473877, + "step": 2200 + }, + { + "epoch": 1.14, + "learning_rate": 3.443296997513865e-07, + "logits/chosen": -2.775790214538574, + "logits/rejected": -2.727541446685791, + "logps/chosen": -300.582275390625, + "logps/rejected": -329.0697326660156, + "loss": 0.1029, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.46084627509117126, + "rewards/margins": 6.592850685119629, + "rewards/rejected": -6.132004261016846, + "step": 2210 + }, + { + "epoch": 1.15, + "learning_rate": 3.433734939759036e-07, + "logits/chosen": -2.733781337738037, + "logits/rejected": -2.6996545791625977, + "logps/chosen": -289.87286376953125, + "logps/rejected": -335.86883544921875, + "loss": 0.1133, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7392681241035461, + "rewards/margins": 6.866480350494385, + "rewards/rejected": -6.127212047576904, + "step": 2220 + }, + { + "epoch": 1.15, + "learning_rate": 3.4241728820042073e-07, + "logits/chosen": -2.6640281677246094, + "logits/rejected": -2.626359224319458, + "logps/chosen": -248.7790985107422, + "logps/rejected": -306.9777526855469, + "loss": 0.1217, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.4639340043067932, + "rewards/margins": 5.437063694000244, + "rewards/rejected": -5.900998115539551, + "step": 2230 + }, + { + "epoch": 1.16, + "learning_rate": 3.4146108242493784e-07, + "logits/chosen": -2.7318482398986816, + "logits/rejected": -2.7393126487731934, + "logps/chosen": -236.71939086914062, + "logps/rejected": -281.84332275390625, + "loss": 0.1461, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.26268935203552246, + "rewards/margins": 5.945524215698242, + "rewards/rejected": -5.682834625244141, + "step": 2240 + }, + { + "epoch": 1.16, + "learning_rate": 3.405048766494549e-07, + "logits/chosen": -2.655292272567749, + "logits/rejected": -2.6065621376037598, + "logps/chosen": -292.45843505859375, + "logps/rejected": -272.82489013671875, + "loss": 0.1152, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3397136926651001, + "rewards/margins": 5.560425281524658, + "rewards/rejected": -5.220711708068848, + "step": 2250 + }, + { + "epoch": 1.17, + "learning_rate": 3.39548670873972e-07, + "logits/chosen": -2.5906424522399902, + "logits/rejected": -2.5299315452575684, + "logps/chosen": -310.4429626464844, + "logps/rejected": -342.58355712890625, + "loss": 0.1157, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.9430241584777832, + "rewards/margins": 7.4350996017456055, + "rewards/rejected": -6.4920759201049805, + "step": 2260 + }, + { + "epoch": 1.17, + "learning_rate": 3.3859246509848914e-07, + "logits/chosen": -2.5732579231262207, + "logits/rejected": -2.608187198638916, + "logps/chosen": -274.4580993652344, + "logps/rejected": -302.9696960449219, + "loss": 0.0827, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.47884711623191833, + "rewards/margins": 5.948543071746826, + "rewards/rejected": -5.469696044921875, + "step": 2270 + }, + { + "epoch": 1.18, + "learning_rate": 3.376362593230063e-07, + "logits/chosen": -2.5940709114074707, + "logits/rejected": -2.579684257507324, + "logps/chosen": -254.5391082763672, + "logps/rejected": -310.8402404785156, + "loss": 0.0961, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.21051082015037537, + "rewards/margins": 6.346388816833496, + "rewards/rejected": -6.13587760925293, + "step": 2280 + }, + { + "epoch": 1.18, + "learning_rate": 3.366800535475234e-07, + "logits/chosen": -2.609266757965088, + "logits/rejected": -2.4904086589813232, + "logps/chosen": -291.47637939453125, + "logps/rejected": -287.4329833984375, + "loss": 0.0767, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9694992899894714, + "rewards/margins": 7.04742431640625, + "rewards/rejected": -6.077925205230713, + "step": 2290 + }, + { + "epoch": 1.19, + "learning_rate": 3.3572384777204054e-07, + "logits/chosen": -2.6758570671081543, + "logits/rejected": -2.6098129749298096, + "logps/chosen": -292.984130859375, + "logps/rejected": -281.83056640625, + "loss": 0.0861, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6002046465873718, + "rewards/margins": 6.433353424072266, + "rewards/rejected": -5.833148002624512, + "step": 2300 + }, + { + "epoch": 1.19, + "learning_rate": 3.3476764199655765e-07, + "logits/chosen": -2.49656343460083, + "logits/rejected": -2.542083501815796, + "logps/chosen": -214.75625610351562, + "logps/rejected": -274.36273193359375, + "loss": 0.0838, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3019743859767914, + "rewards/margins": 6.335268974304199, + "rewards/rejected": -6.033293724060059, + "step": 2310 + }, + { + "epoch": 1.2, + "learning_rate": 3.3381143622107477e-07, + "logits/chosen": -2.5624756813049316, + "logits/rejected": -2.5002315044403076, + "logps/chosen": -310.534912109375, + "logps/rejected": -313.9328308105469, + "loss": 0.0543, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4683504104614258, + "rewards/margins": 6.916632175445557, + "rewards/rejected": -6.448281288146973, + "step": 2320 + }, + { + "epoch": 1.2, + "learning_rate": 3.328552304455919e-07, + "logits/chosen": -2.5515811443328857, + "logits/rejected": -2.5180673599243164, + "logps/chosen": -260.62518310546875, + "logps/rejected": -287.25152587890625, + "loss": 0.073, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.19289037585258484, + "rewards/margins": 6.111286163330078, + "rewards/rejected": -5.918396472930908, + "step": 2330 + }, + { + "epoch": 1.21, + "learning_rate": 3.31899024670109e-07, + "logits/chosen": -2.5265719890594482, + "logits/rejected": -2.423987865447998, + "logps/chosen": -279.4831237792969, + "logps/rejected": -292.02935791015625, + "loss": 0.0825, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.28345030546188354, + "rewards/margins": 6.494320869445801, + "rewards/rejected": -6.210869789123535, + "step": 2340 + }, + { + "epoch": 1.21, + "learning_rate": 3.309428188946261e-07, + "logits/chosen": -2.422034978866577, + "logits/rejected": -2.483975410461426, + "logps/chosen": -257.01348876953125, + "logps/rejected": -276.59686279296875, + "loss": 0.0929, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.13130250573158264, + "rewards/margins": 5.311938285827637, + "rewards/rejected": -5.443240165710449, + "step": 2350 + }, + { + "epoch": 1.22, + "learning_rate": 3.2998661311914323e-07, + "logits/chosen": -2.571733236312866, + "logits/rejected": -2.4926095008850098, + "logps/chosen": -268.9256591796875, + "logps/rejected": -298.5318603515625, + "loss": 0.0788, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.22903628647327423, + "rewards/margins": 6.610577583312988, + "rewards/rejected": -6.381541728973389, + "step": 2360 + }, + { + "epoch": 1.22, + "learning_rate": 3.2903040734366035e-07, + "logits/chosen": -2.5603127479553223, + "logits/rejected": -2.584486722946167, + "logps/chosen": -249.293212890625, + "logps/rejected": -303.9468688964844, + "loss": 0.0726, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.38736894726753235, + "rewards/margins": 6.439343452453613, + "rewards/rejected": -6.051974296569824, + "step": 2370 + }, + { + "epoch": 1.23, + "learning_rate": 3.2807420156817746e-07, + "logits/chosen": -2.4685256481170654, + "logits/rejected": -2.4727115631103516, + "logps/chosen": -247.81320190429688, + "logps/rejected": -277.7972717285156, + "loss": 0.2207, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3868095874786377, + "rewards/margins": 5.795322418212891, + "rewards/rejected": -5.408513069152832, + "step": 2380 + }, + { + "epoch": 1.23, + "learning_rate": 3.271179957926946e-07, + "logits/chosen": -2.5883705615997314, + "logits/rejected": -2.542631149291992, + "logps/chosen": -282.636962890625, + "logps/rejected": -294.6969299316406, + "loss": 0.098, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.005316281225532293, + "rewards/margins": 6.369308948516846, + "rewards/rejected": -6.363993167877197, + "step": 2390 + }, + { + "epoch": 1.24, + "learning_rate": 3.261617900172117e-07, + "logits/chosen": -2.628528118133545, + "logits/rejected": -2.605329990386963, + "logps/chosen": -239.5411834716797, + "logps/rejected": -289.7711181640625, + "loss": 0.1672, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6569315195083618, + "rewards/margins": 5.836136817932129, + "rewards/rejected": -5.179205894470215, + "step": 2400 + }, + { + "epoch": 1.24, + "learning_rate": 3.2520558424172876e-07, + "logits/chosen": -2.669081211090088, + "logits/rejected": -2.6567091941833496, + "logps/chosen": -270.3697814941406, + "logps/rejected": -276.68780517578125, + "loss": 0.0625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.47786712646484375, + "rewards/margins": 6.1428542137146, + "rewards/rejected": -5.664987564086914, + "step": 2410 + }, + { + "epoch": 1.25, + "learning_rate": 3.242493784662459e-07, + "logits/chosen": -2.595695972442627, + "logits/rejected": -2.6479713916778564, + "logps/chosen": -257.28521728515625, + "logps/rejected": -289.1241760253906, + "loss": 0.1023, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.023120785132050514, + "rewards/margins": 6.379412651062012, + "rewards/rejected": -6.356291770935059, + "step": 2420 + }, + { + "epoch": 1.25, + "learning_rate": 3.2329317269076304e-07, + "logits/chosen": -2.633089542388916, + "logits/rejected": -2.5774524211883545, + "logps/chosen": -285.16400146484375, + "logps/rejected": -334.11822509765625, + "loss": 0.1087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5232380628585815, + "rewards/margins": 6.513851165771484, + "rewards/rejected": -5.990612983703613, + "step": 2430 + }, + { + "epoch": 1.26, + "learning_rate": 3.2233696691528016e-07, + "logits/chosen": -2.5757319927215576, + "logits/rejected": -2.551025152206421, + "logps/chosen": -255.3240509033203, + "logps/rejected": -311.89337158203125, + "loss": 0.1209, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6545261144638062, + "rewards/margins": 6.234076023101807, + "rewards/rejected": -5.579549789428711, + "step": 2440 + }, + { + "epoch": 1.26, + "learning_rate": 3.2138076113979727e-07, + "logits/chosen": -2.6312050819396973, + "logits/rejected": -2.539020538330078, + "logps/chosen": -278.7816162109375, + "logps/rejected": -318.1822509765625, + "loss": 0.0936, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.04061394929885864, + "rewards/margins": 6.488400936126709, + "rewards/rejected": -6.447786808013916, + "step": 2450 + }, + { + "epoch": 1.27, + "learning_rate": 3.204245553643144e-07, + "logits/chosen": -2.5656325817108154, + "logits/rejected": -2.5055129528045654, + "logps/chosen": -309.80218505859375, + "logps/rejected": -306.3583679199219, + "loss": 0.0857, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.1567656546831131, + "rewards/margins": 5.968165397644043, + "rewards/rejected": -5.811399936676025, + "step": 2460 + }, + { + "epoch": 1.28, + "learning_rate": 3.194683495888315e-07, + "logits/chosen": -2.6194345951080322, + "logits/rejected": -2.524545431137085, + "logps/chosen": -304.95233154296875, + "logps/rejected": -278.0549621582031, + "loss": 0.1175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.33243006467819214, + "rewards/margins": 5.192269802093506, + "rewards/rejected": -4.85983943939209, + "step": 2470 + }, + { + "epoch": 1.28, + "learning_rate": 3.185121438133486e-07, + "logits/chosen": -2.5585827827453613, + "logits/rejected": -2.6347527503967285, + "logps/chosen": -282.5390625, + "logps/rejected": -360.55792236328125, + "loss": 0.086, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7870206832885742, + "rewards/margins": 7.211942195892334, + "rewards/rejected": -6.424921989440918, + "step": 2480 + }, + { + "epoch": 1.29, + "learning_rate": 3.1755593803786574e-07, + "logits/chosen": -2.5586822032928467, + "logits/rejected": -2.5810511112213135, + "logps/chosen": -228.8177490234375, + "logps/rejected": -280.386962890625, + "loss": 0.1051, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.21440930664539337, + "rewards/margins": 5.810070991516113, + "rewards/rejected": -6.02448034286499, + "step": 2490 + }, + { + "epoch": 1.29, + "learning_rate": 3.1659973226238285e-07, + "logits/chosen": -2.516174793243408, + "logits/rejected": -2.523841381072998, + "logps/chosen": -278.87884521484375, + "logps/rejected": -282.09747314453125, + "loss": 0.0887, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04227827861905098, + "rewards/margins": 6.174509048461914, + "rewards/rejected": -6.132230281829834, + "step": 2500 + }, + { + "epoch": 1.29, + "eval_logits/chosen": -2.6173453330993652, + "eval_logits/rejected": -2.5939838886260986, + "eval_logps/chosen": -295.6112976074219, + "eval_logps/rejected": -271.6174621582031, + "eval_loss": 0.5945637226104736, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -1.9887875318527222, + "eval_rewards/margins": 1.9368335008621216, + "eval_rewards/rejected": -3.925621271133423, + "eval_runtime": 453.7931, + "eval_samples_per_second": 4.407, + "eval_steps_per_second": 0.275, + "step": 2500 + }, + { + "epoch": 1.3, + "learning_rate": 3.1564352648689997e-07, + "logits/chosen": -2.6018643379211426, + "logits/rejected": -2.6282429695129395, + "logps/chosen": -302.58746337890625, + "logps/rejected": -285.2703857421875, + "loss": 0.0985, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.127798393368721, + "rewards/margins": 5.529982089996338, + "rewards/rejected": -5.40218448638916, + "step": 2510 + }, + { + "epoch": 1.3, + "learning_rate": 3.146873207114171e-07, + "logits/chosen": -2.5892491340637207, + "logits/rejected": -2.5546836853027344, + "logps/chosen": -241.8780975341797, + "logps/rejected": -288.4834899902344, + "loss": 0.1053, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.20376138389110565, + "rewards/margins": 6.397191047668457, + "rewards/rejected": -6.193429946899414, + "step": 2520 + }, + { + "epoch": 1.31, + "learning_rate": 3.137311149359342e-07, + "logits/chosen": -2.653477191925049, + "logits/rejected": -2.6052021980285645, + "logps/chosen": -312.23828125, + "logps/rejected": -331.75006103515625, + "loss": 0.0965, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2199176847934723, + "rewards/margins": 5.913000106811523, + "rewards/rejected": -5.693081855773926, + "step": 2530 + }, + { + "epoch": 1.31, + "learning_rate": 3.127749091604513e-07, + "logits/chosen": -2.583885669708252, + "logits/rejected": -2.624443531036377, + "logps/chosen": -303.43658447265625, + "logps/rejected": -341.8377380371094, + "loss": 0.0916, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8230945467948914, + "rewards/margins": 8.061508178710938, + "rewards/rejected": -7.2384138107299805, + "step": 2540 + }, + { + "epoch": 1.32, + "learning_rate": 3.1181870338496843e-07, + "logits/chosen": -2.5278661251068115, + "logits/rejected": -2.5652339458465576, + "logps/chosen": -211.1104278564453, + "logps/rejected": -290.1295471191406, + "loss": 0.1534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07069828361272812, + "rewards/margins": 5.71384859085083, + "rewards/rejected": -5.643150806427002, + "step": 2550 + }, + { + "epoch": 1.32, + "learning_rate": 3.108624976094856e-07, + "logits/chosen": -2.585292339324951, + "logits/rejected": -2.625404119491577, + "logps/chosen": -264.1541442871094, + "logps/rejected": -261.3079833984375, + "loss": 0.1008, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9801958799362183, + "rewards/margins": 5.846863746643066, + "rewards/rejected": -4.866667747497559, + "step": 2560 + }, + { + "epoch": 1.33, + "learning_rate": 3.0990629183400266e-07, + "logits/chosen": -2.601198196411133, + "logits/rejected": -2.5282673835754395, + "logps/chosen": -289.15863037109375, + "logps/rejected": -304.9074401855469, + "loss": 0.1013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5822550654411316, + "rewards/margins": 6.540297508239746, + "rewards/rejected": -5.958043098449707, + "step": 2570 + }, + { + "epoch": 1.33, + "learning_rate": 3.089500860585198e-07, + "logits/chosen": -2.5145509243011475, + "logits/rejected": -2.5709242820739746, + "logps/chosen": -233.1308135986328, + "logps/rejected": -259.27081298828125, + "loss": 0.109, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19086940586566925, + "rewards/margins": 5.136707305908203, + "rewards/rejected": -5.327577114105225, + "step": 2580 + }, + { + "epoch": 1.34, + "learning_rate": 3.079938802830369e-07, + "logits/chosen": -2.5653598308563232, + "logits/rejected": -2.6376378536224365, + "logps/chosen": -255.4644775390625, + "logps/rejected": -230.2383575439453, + "loss": 0.073, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.03477507829666138, + "rewards/margins": 5.211745262145996, + "rewards/rejected": -5.246520519256592, + "step": 2590 + }, + { + "epoch": 1.34, + "learning_rate": 3.07037674507554e-07, + "logits/chosen": -2.5367038249969482, + "logits/rejected": -2.5960888862609863, + "logps/chosen": -300.2066345214844, + "logps/rejected": -349.10064697265625, + "loss": 0.1305, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.460475355386734, + "rewards/margins": 6.672016143798828, + "rewards/rejected": -6.211541175842285, + "step": 2600 + }, + { + "epoch": 1.35, + "learning_rate": 3.060814687320711e-07, + "logits/chosen": -2.5328152179718018, + "logits/rejected": -2.514622688293457, + "logps/chosen": -292.62896728515625, + "logps/rejected": -287.25946044921875, + "loss": 0.0986, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.45202842354774475, + "rewards/margins": 6.34035587310791, + "rewards/rejected": -5.888327598571777, + "step": 2610 + }, + { + "epoch": 1.35, + "learning_rate": 3.0512526295658824e-07, + "logits/chosen": -2.5849661827087402, + "logits/rejected": -2.564257860183716, + "logps/chosen": -290.31890869140625, + "logps/rejected": -296.34967041015625, + "loss": 0.1226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20002314448356628, + "rewards/margins": 5.983278751373291, + "rewards/rejected": -5.783255100250244, + "step": 2620 + }, + { + "epoch": 1.36, + "learning_rate": 3.0416905718110536e-07, + "logits/chosen": -2.567880630493164, + "logits/rejected": -2.539358615875244, + "logps/chosen": -247.0207977294922, + "logps/rejected": -269.39459228515625, + "loss": 0.131, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.034124452620744705, + "rewards/margins": 6.358598709106445, + "rewards/rejected": -6.324474334716797, + "step": 2630 + }, + { + "epoch": 1.36, + "learning_rate": 3.0321285140562247e-07, + "logits/chosen": -2.608093738555908, + "logits/rejected": -2.5030605792999268, + "logps/chosen": -249.52786254882812, + "logps/rejected": -272.12176513671875, + "loss": 0.1505, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.8648720979690552, + "rewards/margins": 5.290918350219727, + "rewards/rejected": -6.155789375305176, + "step": 2640 + }, + { + "epoch": 1.37, + "learning_rate": 3.022566456301396e-07, + "logits/chosen": -2.6363885402679443, + "logits/rejected": -2.7040047645568848, + "logps/chosen": -264.7932434082031, + "logps/rejected": -294.1228942871094, + "loss": 0.1816, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.0637885183095932, + "rewards/margins": 6.300909042358398, + "rewards/rejected": -6.364696979522705, + "step": 2650 + }, + { + "epoch": 1.37, + "learning_rate": 3.013004398546567e-07, + "logits/chosen": -2.542020797729492, + "logits/rejected": -2.5166268348693848, + "logps/chosen": -271.88916015625, + "logps/rejected": -299.7735595703125, + "loss": 0.1132, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3094290792942047, + "rewards/margins": 5.991552352905273, + "rewards/rejected": -6.300982475280762, + "step": 2660 + }, + { + "epoch": 1.38, + "learning_rate": 3.003442340791738e-07, + "logits/chosen": -2.6782054901123047, + "logits/rejected": -2.670994758605957, + "logps/chosen": -265.01751708984375, + "logps/rejected": -290.5347900390625, + "loss": 0.0778, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.39196690917015076, + "rewards/margins": 6.586050510406494, + "rewards/rejected": -6.9780168533325195, + "step": 2670 + }, + { + "epoch": 1.38, + "learning_rate": 2.9938802830369093e-07, + "logits/chosen": -2.668478488922119, + "logits/rejected": -2.671940803527832, + "logps/chosen": -266.77191162109375, + "logps/rejected": -299.1286926269531, + "loss": 0.104, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.42280688881874084, + "rewards/margins": 5.9002275466918945, + "rewards/rejected": -6.323034763336182, + "step": 2680 + }, + { + "epoch": 1.39, + "learning_rate": 2.9843182252820805e-07, + "logits/chosen": -2.507829189300537, + "logits/rejected": -2.517129421234131, + "logps/chosen": -252.4044189453125, + "logps/rejected": -290.12060546875, + "loss": 0.074, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.012675967998802662, + "rewards/margins": 6.789121150970459, + "rewards/rejected": -6.801795959472656, + "step": 2690 + }, + { + "epoch": 1.39, + "learning_rate": 2.974756167527252e-07, + "logits/chosen": -2.6370203495025635, + "logits/rejected": -2.5525128841400146, + "logps/chosen": -240.90438842773438, + "logps/rejected": -289.1092224121094, + "loss": 0.0887, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.09208206832408905, + "rewards/margins": 5.901047706604004, + "rewards/rejected": -5.993129253387451, + "step": 2700 + }, + { + "epoch": 1.4, + "learning_rate": 2.9651941097724233e-07, + "logits/chosen": -2.576045513153076, + "logits/rejected": -2.587123394012451, + "logps/chosen": -280.9703369140625, + "logps/rejected": -267.0768127441406, + "loss": 0.115, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.17211000621318817, + "rewards/margins": 5.533499717712402, + "rewards/rejected": -5.361390113830566, + "step": 2710 + }, + { + "epoch": 1.4, + "learning_rate": 2.9556320520175945e-07, + "logits/chosen": -2.4903769493103027, + "logits/rejected": -2.542959213256836, + "logps/chosen": -272.4198913574219, + "logps/rejected": -274.2657775878906, + "loss": 0.1448, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.25773993134498596, + "rewards/margins": 5.519182205200195, + "rewards/rejected": -5.776921272277832, + "step": 2720 + }, + { + "epoch": 1.41, + "learning_rate": 2.946069994262765e-07, + "logits/chosen": -2.641730546951294, + "logits/rejected": -2.5455493927001953, + "logps/chosen": -292.0690612792969, + "logps/rejected": -322.41717529296875, + "loss": 0.1071, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.45149484276771545, + "rewards/margins": 6.424929141998291, + "rewards/rejected": -5.973433971405029, + "step": 2730 + }, + { + "epoch": 1.41, + "learning_rate": 2.9365079365079363e-07, + "logits/chosen": -2.602846622467041, + "logits/rejected": -2.6123135089874268, + "logps/chosen": -280.2417907714844, + "logps/rejected": -258.2378234863281, + "loss": 0.1074, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2893966734409332, + "rewards/margins": 6.021862983703613, + "rewards/rejected": -5.732466697692871, + "step": 2740 + }, + { + "epoch": 1.42, + "learning_rate": 2.9269458787531074e-07, + "logits/chosen": -2.4980530738830566, + "logits/rejected": -2.4847359657287598, + "logps/chosen": -241.2249755859375, + "logps/rejected": -305.1526794433594, + "loss": 0.0899, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.004615753889083862, + "rewards/margins": 5.619439601898193, + "rewards/rejected": -5.6240553855896, + "step": 2750 + }, + { + "epoch": 1.42, + "learning_rate": 2.9173838209982786e-07, + "logits/chosen": -2.618914842605591, + "logits/rejected": -2.564487934112549, + "logps/chosen": -264.39581298828125, + "logps/rejected": -246.0105438232422, + "loss": 0.0961, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.016721582040190697, + "rewards/margins": 5.4498291015625, + "rewards/rejected": -5.466550827026367, + "step": 2760 + }, + { + "epoch": 1.43, + "learning_rate": 2.90782176324345e-07, + "logits/chosen": -2.517812490463257, + "logits/rejected": -2.5693249702453613, + "logps/chosen": -340.0567932128906, + "logps/rejected": -288.3482360839844, + "loss": 0.0991, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.5332422256469727, + "rewards/margins": 6.4156060218811035, + "rewards/rejected": -5.882363319396973, + "step": 2770 + }, + { + "epoch": 1.44, + "learning_rate": 2.898259705488621e-07, + "logits/chosen": -2.7078089714050293, + "logits/rejected": -2.634216070175171, + "logps/chosen": -264.4534606933594, + "logps/rejected": -270.3084411621094, + "loss": 0.0881, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09578968584537506, + "rewards/margins": 5.951535224914551, + "rewards/rejected": -5.855745315551758, + "step": 2780 + }, + { + "epoch": 1.44, + "learning_rate": 2.888697647733792e-07, + "logits/chosen": -2.6018433570861816, + "logits/rejected": -2.5431325435638428, + "logps/chosen": -325.04339599609375, + "logps/rejected": -321.00384521484375, + "loss": 0.0834, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6666939854621887, + "rewards/margins": 7.204258918762207, + "rewards/rejected": -6.537564277648926, + "step": 2790 + }, + { + "epoch": 1.45, + "learning_rate": 2.879135589978963e-07, + "logits/chosen": -2.544001340866089, + "logits/rejected": -2.519418954849243, + "logps/chosen": -244.0146484375, + "logps/rejected": -299.86334228515625, + "loss": 0.0933, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.21346421539783478, + "rewards/margins": 5.652622222900391, + "rewards/rejected": -5.439157962799072, + "step": 2800 + }, + { + "epoch": 1.45, + "learning_rate": 2.8695735322241344e-07, + "logits/chosen": -2.5885682106018066, + "logits/rejected": -2.5310218334198, + "logps/chosen": -282.88092041015625, + "logps/rejected": -294.2001647949219, + "loss": 0.0818, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4320918917655945, + "rewards/margins": 6.334897518157959, + "rewards/rejected": -5.902805328369141, + "step": 2810 + }, + { + "epoch": 1.46, + "learning_rate": 2.8600114744693055e-07, + "logits/chosen": -2.4299087524414062, + "logits/rejected": -2.4698925018310547, + "logps/chosen": -251.56283569335938, + "logps/rejected": -303.685546875, + "loss": 0.0708, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.48298412561416626, + "rewards/margins": 6.807847023010254, + "rewards/rejected": -6.324862480163574, + "step": 2820 + }, + { + "epoch": 1.46, + "learning_rate": 2.8504494167144767e-07, + "logits/chosen": -2.4746925830841064, + "logits/rejected": -2.4473392963409424, + "logps/chosen": -267.5787048339844, + "logps/rejected": -305.0351867675781, + "loss": 0.1244, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.123781718313694, + "rewards/margins": 6.507838249206543, + "rewards/rejected": -6.384057521820068, + "step": 2830 + }, + { + "epoch": 1.47, + "learning_rate": 2.8408873589596484e-07, + "logits/chosen": -2.513401985168457, + "logits/rejected": -2.4804389476776123, + "logps/chosen": -263.47601318359375, + "logps/rejected": -278.4566345214844, + "loss": 0.0877, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06293614208698273, + "rewards/margins": 6.116005897521973, + "rewards/rejected": -6.053069591522217, + "step": 2840 + }, + { + "epoch": 1.47, + "learning_rate": 2.8313253012048195e-07, + "logits/chosen": -2.317784547805786, + "logits/rejected": -2.4009079933166504, + "logps/chosen": -243.90869140625, + "logps/rejected": -322.0810546875, + "loss": 0.1898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.003992212004959583, + "rewards/margins": 6.088549613952637, + "rewards/rejected": -6.084556579589844, + "step": 2850 + }, + { + "epoch": 1.48, + "learning_rate": 2.8217632434499907e-07, + "logits/chosen": -2.4719886779785156, + "logits/rejected": -2.3840715885162354, + "logps/chosen": -293.1739501953125, + "logps/rejected": -318.64990234375, + "loss": 0.0858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09377191215753555, + "rewards/margins": 6.726716041564941, + "rewards/rejected": -6.820487976074219, + "step": 2860 + }, + { + "epoch": 1.48, + "learning_rate": 2.812201185695162e-07, + "logits/chosen": -2.4654247760772705, + "logits/rejected": -2.4648938179016113, + "logps/chosen": -238.2162322998047, + "logps/rejected": -299.6322937011719, + "loss": 0.0761, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.07485105842351913, + "rewards/margins": 5.973660469055176, + "rewards/rejected": -5.898809909820557, + "step": 2870 + }, + { + "epoch": 1.49, + "learning_rate": 2.802639127940333e-07, + "logits/chosen": -2.512289047241211, + "logits/rejected": -2.473268985748291, + "logps/chosen": -267.739990234375, + "logps/rejected": -249.61135864257812, + "loss": 0.0935, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15142297744750977, + "rewards/margins": 5.871346950531006, + "rewards/rejected": -6.022769927978516, + "step": 2880 + }, + { + "epoch": 1.49, + "learning_rate": 2.7930770701855036e-07, + "logits/chosen": -2.570551872253418, + "logits/rejected": -2.468529224395752, + "logps/chosen": -271.27410888671875, + "logps/rejected": -282.323974609375, + "loss": 0.1277, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5963325500488281, + "rewards/margins": 7.158364295959473, + "rewards/rejected": -6.562032222747803, + "step": 2890 + }, + { + "epoch": 1.5, + "learning_rate": 2.783515012430675e-07, + "logits/chosen": -2.406557559967041, + "logits/rejected": -2.4173083305358887, + "logps/chosen": -268.2488708496094, + "logps/rejected": -279.4272155761719, + "loss": 0.1144, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4366177022457123, + "rewards/margins": 6.323246955871582, + "rewards/rejected": -5.886629581451416, + "step": 2900 + }, + { + "epoch": 1.5, + "learning_rate": 2.773952954675846e-07, + "logits/chosen": -2.4727766513824463, + "logits/rejected": -2.4939932823181152, + "logps/chosen": -266.2540283203125, + "logps/rejected": -250.78445434570312, + "loss": 0.0947, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.21414819359779358, + "rewards/margins": 5.781444549560547, + "rewards/rejected": -5.567296504974365, + "step": 2910 + }, + { + "epoch": 1.51, + "learning_rate": 2.764390896921017e-07, + "logits/chosen": -2.4799656867980957, + "logits/rejected": -2.466733932495117, + "logps/chosen": -298.5470275878906, + "logps/rejected": -284.695556640625, + "loss": 0.0868, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.38546982407569885, + "rewards/margins": 6.353014945983887, + "rewards/rejected": -5.967545509338379, + "step": 2920 + }, + { + "epoch": 1.51, + "learning_rate": 2.754828839166188e-07, + "logits/chosen": -2.458850383758545, + "logits/rejected": -2.3630478382110596, + "logps/chosen": -259.45361328125, + "logps/rejected": -270.87664794921875, + "loss": 0.0972, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5138736367225647, + "rewards/margins": 7.339796543121338, + "rewards/rejected": -6.82592248916626, + "step": 2930 + }, + { + "epoch": 1.52, + "learning_rate": 2.7452667814113594e-07, + "logits/chosen": -2.532984495162964, + "logits/rejected": -2.6302952766418457, + "logps/chosen": -287.7354431152344, + "logps/rejected": -295.53240966796875, + "loss": 0.0902, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5111293196678162, + "rewards/margins": 6.166231155395508, + "rewards/rejected": -5.655101776123047, + "step": 2940 + }, + { + "epoch": 1.52, + "learning_rate": 2.7357047236565306e-07, + "logits/chosen": -2.53074312210083, + "logits/rejected": -2.509186267852783, + "logps/chosen": -256.3830261230469, + "logps/rejected": -287.47113037109375, + "loss": 0.0582, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4252433180809021, + "rewards/margins": 6.003683090209961, + "rewards/rejected": -5.578439712524414, + "step": 2950 + }, + { + "epoch": 1.53, + "learning_rate": 2.7261426659017017e-07, + "logits/chosen": -2.465263843536377, + "logits/rejected": -2.441957950592041, + "logps/chosen": -259.81170654296875, + "logps/rejected": -287.27825927734375, + "loss": 0.1006, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.16452935338020325, + "rewards/margins": 6.049590110778809, + "rewards/rejected": -5.885060787200928, + "step": 2960 + }, + { + "epoch": 1.53, + "learning_rate": 2.716580608146873e-07, + "logits/chosen": -2.467651605606079, + "logits/rejected": -2.4969096183776855, + "logps/chosen": -325.9266662597656, + "logps/rejected": -307.8146057128906, + "loss": 0.0931, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5208323001861572, + "rewards/margins": 7.010321140289307, + "rewards/rejected": -6.489488124847412, + "step": 2970 + }, + { + "epoch": 1.54, + "learning_rate": 2.7070185503920446e-07, + "logits/chosen": -2.297741174697876, + "logits/rejected": -2.342559576034546, + "logps/chosen": -260.2572937011719, + "logps/rejected": -300.3307800292969, + "loss": 0.0574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3879585862159729, + "rewards/margins": 6.9699883460998535, + "rewards/rejected": -6.582029819488525, + "step": 2980 + }, + { + "epoch": 1.54, + "learning_rate": 2.6974564926372157e-07, + "logits/chosen": -2.491931438446045, + "logits/rejected": -2.3935139179229736, + "logps/chosen": -290.97320556640625, + "logps/rejected": -317.1175231933594, + "loss": 0.063, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28951650857925415, + "rewards/margins": 7.234450340270996, + "rewards/rejected": -6.944933891296387, + "step": 2990 + }, + { + "epoch": 1.55, + "learning_rate": 2.687894434882387e-07, + "logits/chosen": -2.462905168533325, + "logits/rejected": -2.491457223892212, + "logps/chosen": -242.7196502685547, + "logps/rejected": -251.0511932373047, + "loss": 0.0747, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3230946958065033, + "rewards/margins": 5.371783256530762, + "rewards/rejected": -5.694878578186035, + "step": 3000 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.520482063293457, + "eval_logits/rejected": -2.49690580368042, + "eval_logps/chosen": -295.31353759765625, + "eval_logps/rejected": -272.6327209472656, + "eval_loss": 0.5747828483581543, + "eval_rewards/accuracies": 0.7559999823570251, + "eval_rewards/chosen": -1.9590092897415161, + "eval_rewards/margins": 2.068134307861328, + "eval_rewards/rejected": -4.027143955230713, + "eval_runtime": 454.0222, + "eval_samples_per_second": 4.405, + "eval_steps_per_second": 0.275, + "step": 3000 + }, + { + "epoch": 1.55, + "learning_rate": 2.678332377127558e-07, + "logits/chosen": -2.430216073989868, + "logits/rejected": -2.52648663520813, + "logps/chosen": -302.43536376953125, + "logps/rejected": -307.7148742675781, + "loss": 0.0768, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5085604786872864, + "rewards/margins": 7.694282531738281, + "rewards/rejected": -7.185723304748535, + "step": 3010 + }, + { + "epoch": 1.56, + "learning_rate": 2.668770319372729e-07, + "logits/chosen": -2.472560167312622, + "logits/rejected": -2.4429688453674316, + "logps/chosen": -269.9737243652344, + "logps/rejected": -255.141845703125, + "loss": 0.079, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.05735977366566658, + "rewards/margins": 5.4194865226745605, + "rewards/rejected": -5.362127304077148, + "step": 3020 + }, + { + "epoch": 1.56, + "learning_rate": 2.6592082616179004e-07, + "logits/chosen": -2.4499270915985107, + "logits/rejected": -2.321331739425659, + "logps/chosen": -199.5952911376953, + "logps/rejected": -271.1117248535156, + "loss": 0.0957, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.10843601077795029, + "rewards/margins": 6.051784992218018, + "rewards/rejected": -5.9433488845825195, + "step": 3030 + }, + { + "epoch": 1.57, + "learning_rate": 2.649646203863071e-07, + "logits/chosen": -2.5263118743896484, + "logits/rejected": -2.501732110977173, + "logps/chosen": -257.1748046875, + "logps/rejected": -306.52545166015625, + "loss": 0.0858, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3991379737854004, + "rewards/margins": 6.244803428649902, + "rewards/rejected": -5.845664978027344, + "step": 3040 + }, + { + "epoch": 1.57, + "learning_rate": 2.640084146108242e-07, + "logits/chosen": -2.5237767696380615, + "logits/rejected": -2.5036301612854004, + "logps/chosen": -305.75408935546875, + "logps/rejected": -300.43505859375, + "loss": 0.0738, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.2263985425233841, + "rewards/margins": 7.201443672180176, + "rewards/rejected": -6.975044250488281, + "step": 3050 + }, + { + "epoch": 1.58, + "learning_rate": 2.6305220883534133e-07, + "logits/chosen": -2.4949545860290527, + "logits/rejected": -2.5045151710510254, + "logps/chosen": -239.982666015625, + "logps/rejected": -278.0937805175781, + "loss": 0.0746, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.044704683125019073, + "rewards/margins": 5.782896995544434, + "rewards/rejected": -5.738192558288574, + "step": 3060 + }, + { + "epoch": 1.58, + "learning_rate": 2.6209600305985845e-07, + "logits/chosen": -2.5282607078552246, + "logits/rejected": -2.5183358192443848, + "logps/chosen": -278.13458251953125, + "logps/rejected": -305.0899963378906, + "loss": 0.1216, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6105505228042603, + "rewards/margins": 7.445713996887207, + "rewards/rejected": -6.8351640701293945, + "step": 3070 + }, + { + "epoch": 1.59, + "learning_rate": 2.6113979728437556e-07, + "logits/chosen": -2.641510486602783, + "logits/rejected": -2.635889768600464, + "logps/chosen": -302.23468017578125, + "logps/rejected": -289.52545166015625, + "loss": 0.0945, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.28618529438972473, + "rewards/margins": 6.0523362159729, + "rewards/rejected": -5.766150951385498, + "step": 3080 + }, + { + "epoch": 1.6, + "learning_rate": 2.601835915088927e-07, + "logits/chosen": -2.5400562286376953, + "logits/rejected": -2.446110486984253, + "logps/chosen": -238.501220703125, + "logps/rejected": -286.86163330078125, + "loss": 0.0957, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.2534426152706146, + "rewards/margins": 6.206382751464844, + "rewards/rejected": -5.952939510345459, + "step": 3090 + }, + { + "epoch": 1.6, + "learning_rate": 2.592273857334098e-07, + "logits/chosen": -2.6041581630706787, + "logits/rejected": -2.4704174995422363, + "logps/chosen": -260.03790283203125, + "logps/rejected": -295.0962829589844, + "loss": 0.0959, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.3247249722480774, + "rewards/margins": 6.418676853179932, + "rewards/rejected": -6.09395170211792, + "step": 3100 + }, + { + "epoch": 1.61, + "learning_rate": 2.582711799579269e-07, + "logits/chosen": -2.6691956520080566, + "logits/rejected": -2.6473820209503174, + "logps/chosen": -294.5990905761719, + "logps/rejected": -310.1783142089844, + "loss": 0.0778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.013369923457503319, + "rewards/margins": 6.645904541015625, + "rewards/rejected": -6.632534027099609, + "step": 3110 + }, + { + "epoch": 1.61, + "learning_rate": 2.573149741824441e-07, + "logits/chosen": -2.5838215351104736, + "logits/rejected": -2.569681167602539, + "logps/chosen": -309.8135681152344, + "logps/rejected": -293.36224365234375, + "loss": 0.0819, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.18596038222312927, + "rewards/margins": 6.355560302734375, + "rewards/rejected": -6.169599533081055, + "step": 3120 + }, + { + "epoch": 1.62, + "learning_rate": 2.563587684069612e-07, + "logits/chosen": -2.5741264820098877, + "logits/rejected": -2.5943167209625244, + "logps/chosen": -303.80975341796875, + "logps/rejected": -300.5847473144531, + "loss": 0.0687, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.188811257481575, + "rewards/margins": 6.232892036437988, + "rewards/rejected": -6.421704292297363, + "step": 3130 + }, + { + "epoch": 1.62, + "learning_rate": 2.554025626314783e-07, + "logits/chosen": -2.476935863494873, + "logits/rejected": -2.5481014251708984, + "logps/chosen": -276.67401123046875, + "logps/rejected": -311.7854919433594, + "loss": 0.0848, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5916802883148193, + "rewards/margins": 6.4489240646362305, + "rewards/rejected": -7.040605068206787, + "step": 3140 + }, + { + "epoch": 1.63, + "learning_rate": 2.544463568559954e-07, + "logits/chosen": -2.555737018585205, + "logits/rejected": -2.5703513622283936, + "logps/chosen": -313.1715393066406, + "logps/rejected": -320.77484130859375, + "loss": 0.0695, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.053133077919483185, + "rewards/margins": 6.832341194152832, + "rewards/rejected": -6.779207706451416, + "step": 3150 + }, + { + "epoch": 1.63, + "learning_rate": 2.5349015108051254e-07, + "logits/chosen": -2.426164150238037, + "logits/rejected": -2.5023982524871826, + "logps/chosen": -259.065673828125, + "logps/rejected": -274.95361328125, + "loss": 0.0805, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.08334522694349289, + "rewards/margins": 6.496880531311035, + "rewards/rejected": -6.580225467681885, + "step": 3160 + }, + { + "epoch": 1.64, + "learning_rate": 2.5253394530502966e-07, + "logits/chosen": -2.479907512664795, + "logits/rejected": -2.4037084579467773, + "logps/chosen": -331.0310363769531, + "logps/rejected": -324.9789733886719, + "loss": 0.0667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15661577880382538, + "rewards/margins": 6.679041862487793, + "rewards/rejected": -6.522425651550293, + "step": 3170 + }, + { + "epoch": 1.64, + "learning_rate": 2.5157773952954677e-07, + "logits/chosen": -2.610734462738037, + "logits/rejected": -2.492011308670044, + "logps/chosen": -306.02264404296875, + "logps/rejected": -303.4534912109375, + "loss": 0.0959, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.17716434597969055, + "rewards/margins": 6.456340789794922, + "rewards/rejected": -6.279176235198975, + "step": 3180 + }, + { + "epoch": 1.65, + "learning_rate": 2.506215337540639e-07, + "logits/chosen": -2.574583053588867, + "logits/rejected": -2.423633575439453, + "logps/chosen": -261.4005126953125, + "logps/rejected": -308.74761962890625, + "loss": 0.1468, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08365867286920547, + "rewards/margins": 6.674959659576416, + "rewards/rejected": -6.591300010681152, + "step": 3190 + }, + { + "epoch": 1.65, + "learning_rate": 2.4966532797858095e-07, + "logits/chosen": -2.4675204753875732, + "logits/rejected": -2.391970157623291, + "logps/chosen": -287.25286865234375, + "logps/rejected": -287.6166687011719, + "loss": 0.1103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16912779211997986, + "rewards/margins": 6.987713813781738, + "rewards/rejected": -6.818585395812988, + "step": 3200 + }, + { + "epoch": 1.66, + "learning_rate": 2.4870912220309807e-07, + "logits/chosen": -2.4692444801330566, + "logits/rejected": -2.492424488067627, + "logps/chosen": -284.3822326660156, + "logps/rejected": -278.57940673828125, + "loss": 0.0819, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.0931725949048996, + "rewards/margins": 5.680792331695557, + "rewards/rejected": -5.587619304656982, + "step": 3210 + }, + { + "epoch": 1.66, + "learning_rate": 2.477529164276152e-07, + "logits/chosen": -2.5302493572235107, + "logits/rejected": -2.5462918281555176, + "logps/chosen": -274.50482177734375, + "logps/rejected": -305.86639404296875, + "loss": 0.1224, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.22570249438285828, + "rewards/margins": 6.788195610046387, + "rewards/rejected": -6.562493324279785, + "step": 3220 + }, + { + "epoch": 1.67, + "learning_rate": 2.4679671065213235e-07, + "logits/chosen": -2.4377293586730957, + "logits/rejected": -2.495702028274536, + "logps/chosen": -284.8072204589844, + "logps/rejected": -318.3940734863281, + "loss": 0.1224, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.27616390585899353, + "rewards/margins": 6.491647243499756, + "rewards/rejected": -6.215483665466309, + "step": 3230 + }, + { + "epoch": 1.67, + "learning_rate": 2.4584050487664947e-07, + "logits/chosen": -2.43880295753479, + "logits/rejected": -2.5362446308135986, + "logps/chosen": -283.766845703125, + "logps/rejected": -288.78924560546875, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6184531450271606, + "rewards/margins": 7.084000587463379, + "rewards/rejected": -6.46554708480835, + "step": 3240 + }, + { + "epoch": 1.68, + "learning_rate": 2.448842991011666e-07, + "logits/chosen": -2.446498394012451, + "logits/rejected": -2.405273199081421, + "logps/chosen": -293.029052734375, + "logps/rejected": -291.66790771484375, + "loss": 0.1185, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1758905053138733, + "rewards/margins": 5.736870288848877, + "rewards/rejected": -5.9127607345581055, + "step": 3250 + }, + { + "epoch": 1.68, + "learning_rate": 2.439280933256837e-07, + "logits/chosen": -2.4379031658172607, + "logits/rejected": -2.4364233016967773, + "logps/chosen": -242.67715454101562, + "logps/rejected": -309.957275390625, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2026246041059494, + "rewards/margins": 6.98794412612915, + "rewards/rejected": -6.7853193283081055, + "step": 3260 + }, + { + "epoch": 1.69, + "learning_rate": 2.429718875502008e-07, + "logits/chosen": -2.330289602279663, + "logits/rejected": -2.3436808586120605, + "logps/chosen": -273.34246826171875, + "logps/rejected": -333.4544372558594, + "loss": 0.0806, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.07439390569925308, + "rewards/margins": 6.902178764343262, + "rewards/rejected": -6.9765729904174805, + "step": 3270 + }, + { + "epoch": 1.69, + "learning_rate": 2.420156817747179e-07, + "logits/chosen": -2.3813562393188477, + "logits/rejected": -2.4932188987731934, + "logps/chosen": -278.9999084472656, + "logps/rejected": -269.7981872558594, + "loss": 0.059, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.24793286621570587, + "rewards/margins": 6.85955286026001, + "rewards/rejected": -6.611619472503662, + "step": 3280 + }, + { + "epoch": 1.7, + "learning_rate": 2.41059475999235e-07, + "logits/chosen": -2.4017083644866943, + "logits/rejected": -2.4412665367126465, + "logps/chosen": -251.3889923095703, + "logps/rejected": -283.4482727050781, + "loss": 0.0791, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6491401791572571, + "rewards/margins": 5.978515148162842, + "rewards/rejected": -6.627655982971191, + "step": 3290 + }, + { + "epoch": 1.7, + "learning_rate": 2.4010327022375216e-07, + "logits/chosen": -2.5032293796539307, + "logits/rejected": -2.48028826713562, + "logps/chosen": -304.7440185546875, + "logps/rejected": -320.63519287109375, + "loss": 0.1398, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.17621102929115295, + "rewards/margins": 7.24979305267334, + "rewards/rejected": -7.073582649230957, + "step": 3300 + }, + { + "epoch": 1.71, + "learning_rate": 2.391470644482693e-07, + "logits/chosen": -2.4918460845947266, + "logits/rejected": -2.436323642730713, + "logps/chosen": -289.7032165527344, + "logps/rejected": -310.91546630859375, + "loss": 0.0748, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.15517649054527283, + "rewards/margins": 6.368616104125977, + "rewards/rejected": -6.213438987731934, + "step": 3310 + }, + { + "epoch": 1.71, + "learning_rate": 2.3819085867278636e-07, + "logits/chosen": -2.4458746910095215, + "logits/rejected": -2.3586153984069824, + "logps/chosen": -211.6595916748047, + "logps/rejected": -250.60513305664062, + "loss": 0.0827, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6259239315986633, + "rewards/margins": 5.855485439300537, + "rewards/rejected": -6.481408596038818, + "step": 3320 + }, + { + "epoch": 1.72, + "learning_rate": 2.3723465289730348e-07, + "logits/chosen": -2.5415797233581543, + "logits/rejected": -2.482503652572632, + "logps/chosen": -305.93267822265625, + "logps/rejected": -257.7454833984375, + "loss": 0.1036, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0975867435336113, + "rewards/margins": 5.548628807067871, + "rewards/rejected": -5.4510416984558105, + "step": 3330 + }, + { + "epoch": 1.72, + "learning_rate": 2.362784471218206e-07, + "logits/chosen": -2.4980437755584717, + "logits/rejected": -2.5161662101745605, + "logps/chosen": -262.29510498046875, + "logps/rejected": -310.7427978515625, + "loss": 0.0731, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.31484660506248474, + "rewards/margins": 6.899011135101318, + "rewards/rejected": -6.584165096282959, + "step": 3340 + }, + { + "epoch": 1.73, + "learning_rate": 2.353222413463377e-07, + "logits/chosen": -2.462398052215576, + "logits/rejected": -2.387648582458496, + "logps/chosen": -273.9061279296875, + "logps/rejected": -313.47222900390625, + "loss": 0.0813, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.27307185530662537, + "rewards/margins": 6.449624061584473, + "rewards/rejected": -6.722695827484131, + "step": 3350 + }, + { + "epoch": 1.73, + "learning_rate": 2.3436603557085483e-07, + "logits/chosen": -2.559948205947876, + "logits/rejected": -2.435615062713623, + "logps/chosen": -279.50567626953125, + "logps/rejected": -314.01318359375, + "loss": 0.0912, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1880934089422226, + "rewards/margins": 5.78375768661499, + "rewards/rejected": -5.971850395202637, + "step": 3360 + }, + { + "epoch": 1.74, + "learning_rate": 2.3340982979537197e-07, + "logits/chosen": -2.477184772491455, + "logits/rejected": -2.506683588027954, + "logps/chosen": -345.5298156738281, + "logps/rejected": -338.9172668457031, + "loss": 0.0733, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6365272998809814, + "rewards/margins": 6.926106929779053, + "rewards/rejected": -6.28957986831665, + "step": 3370 + }, + { + "epoch": 1.74, + "learning_rate": 2.3245362401988909e-07, + "logits/chosen": -2.4809818267822266, + "logits/rejected": -2.4546568393707275, + "logps/chosen": -291.1073303222656, + "logps/rejected": -299.65234375, + "loss": 0.0924, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.744446873664856, + "rewards/margins": 7.913092613220215, + "rewards/rejected": -7.168646335601807, + "step": 3380 + }, + { + "epoch": 1.75, + "learning_rate": 2.314974182444062e-07, + "logits/chosen": -2.5422558784484863, + "logits/rejected": -2.5349318981170654, + "logps/chosen": -265.03131103515625, + "logps/rejected": -303.9304504394531, + "loss": 0.0583, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8401824831962585, + "rewards/margins": 6.261730194091797, + "rewards/rejected": -7.101912498474121, + "step": 3390 + }, + { + "epoch": 1.76, + "learning_rate": 2.305412124689233e-07, + "logits/chosen": -2.3767032623291016, + "logits/rejected": -2.358243465423584, + "logps/chosen": -271.84942626953125, + "logps/rejected": -295.79327392578125, + "loss": 0.0739, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.003557852003723383, + "rewards/margins": 7.05438232421875, + "rewards/rejected": -7.057940483093262, + "step": 3400 + }, + { + "epoch": 1.76, + "learning_rate": 2.295850066934404e-07, + "logits/chosen": -2.4502758979797363, + "logits/rejected": -2.3828396797180176, + "logps/chosen": -278.19732666015625, + "logps/rejected": -309.7081604003906, + "loss": 0.0784, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.501421332359314, + "rewards/margins": 6.933775424957275, + "rewards/rejected": -7.435196876525879, + "step": 3410 + }, + { + "epoch": 1.77, + "learning_rate": 2.2862880091795752e-07, + "logits/chosen": -2.4277431964874268, + "logits/rejected": -2.446159839630127, + "logps/chosen": -270.41741943359375, + "logps/rejected": -327.88714599609375, + "loss": 0.0823, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.05128968879580498, + "rewards/margins": 7.131208896636963, + "rewards/rejected": -7.182497501373291, + "step": 3420 + }, + { + "epoch": 1.77, + "learning_rate": 2.2767259514247464e-07, + "logits/chosen": -2.369462490081787, + "logits/rejected": -2.2862589359283447, + "logps/chosen": -275.76141357421875, + "logps/rejected": -328.746337890625, + "loss": 0.0831, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08160386979579926, + "rewards/margins": 6.414852142333984, + "rewards/rejected": -6.496455192565918, + "step": 3430 + }, + { + "epoch": 1.78, + "learning_rate": 2.2671638936699178e-07, + "logits/chosen": -2.2966933250427246, + "logits/rejected": -2.256803512573242, + "logps/chosen": -271.08062744140625, + "logps/rejected": -306.69134521484375, + "loss": 0.0773, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03467688709497452, + "rewards/margins": 6.333457946777344, + "rewards/rejected": -6.298781394958496, + "step": 3440 + }, + { + "epoch": 1.78, + "learning_rate": 2.257601835915089e-07, + "logits/chosen": -2.444070339202881, + "logits/rejected": -2.4232065677642822, + "logps/chosen": -327.8500671386719, + "logps/rejected": -316.18670654296875, + "loss": 0.0727, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.3836885094642639, + "rewards/margins": 7.016473293304443, + "rewards/rejected": -6.632785797119141, + "step": 3450 + }, + { + "epoch": 1.79, + "learning_rate": 2.24803977816026e-07, + "logits/chosen": -2.389791488647461, + "logits/rejected": -2.4729177951812744, + "logps/chosen": -283.513916015625, + "logps/rejected": -287.8778991699219, + "loss": 0.1155, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20163340866565704, + "rewards/margins": 7.153794288635254, + "rewards/rejected": -6.952160835266113, + "step": 3460 + }, + { + "epoch": 1.79, + "learning_rate": 2.2384777204054313e-07, + "logits/chosen": -2.516119956970215, + "logits/rejected": -2.493978977203369, + "logps/chosen": -290.07672119140625, + "logps/rejected": -308.48785400390625, + "loss": 0.0915, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6690025329589844, + "rewards/margins": 7.590751647949219, + "rewards/rejected": -6.921748161315918, + "step": 3470 + }, + { + "epoch": 1.8, + "learning_rate": 2.2289156626506022e-07, + "logits/chosen": -2.3329920768737793, + "logits/rejected": -2.2687020301818848, + "logps/chosen": -297.0395812988281, + "logps/rejected": -300.65667724609375, + "loss": 0.0788, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2801963686943054, + "rewards/margins": 6.951547145843506, + "rewards/rejected": -7.231744289398193, + "step": 3480 + }, + { + "epoch": 1.8, + "learning_rate": 2.2193536048957733e-07, + "logits/chosen": -2.523899555206299, + "logits/rejected": -2.528120279312134, + "logps/chosen": -317.03179931640625, + "logps/rejected": -318.65460205078125, + "loss": 0.0846, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.14804738759994507, + "rewards/margins": 6.38545560836792, + "rewards/rejected": -6.237408638000488, + "step": 3490 + }, + { + "epoch": 1.81, + "learning_rate": 2.2097915471409445e-07, + "logits/chosen": -2.4564592838287354, + "logits/rejected": -2.4133288860321045, + "logps/chosen": -283.5343933105469, + "logps/rejected": -309.0581359863281, + "loss": 0.101, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.22065086662769318, + "rewards/margins": 6.83633279800415, + "rewards/rejected": -7.05698299407959, + "step": 3500 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.527757167816162, + "eval_logits/rejected": -2.5068888664245605, + "eval_logps/chosen": -295.24420166015625, + "eval_logps/rejected": -274.2143859863281, + "eval_loss": 0.5783348679542542, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": -1.9520740509033203, + "eval_rewards/margins": 2.2332372665405273, + "eval_rewards/rejected": -4.185311317443848, + "eval_runtime": 452.5009, + "eval_samples_per_second": 4.42, + "eval_steps_per_second": 0.276, + "step": 3500 + }, + { + "epoch": 1.81, + "learning_rate": 2.200229489386116e-07, + "logits/chosen": -2.519521474838257, + "logits/rejected": -2.472960948944092, + "logps/chosen": -282.6535949707031, + "logps/rejected": -330.7150573730469, + "loss": 0.0975, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.10534670203924179, + "rewards/margins": 7.252067565917969, + "rewards/rejected": -7.357414245605469, + "step": 3510 + }, + { + "epoch": 1.82, + "learning_rate": 2.190667431631287e-07, + "logits/chosen": -2.394043445587158, + "logits/rejected": -2.4325778484344482, + "logps/chosen": -275.0331726074219, + "logps/rejected": -367.2027893066406, + "loss": 0.1485, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1844065934419632, + "rewards/margins": 7.9727044105529785, + "rewards/rejected": -7.788296699523926, + "step": 3520 + }, + { + "epoch": 1.82, + "learning_rate": 2.1811053738764582e-07, + "logits/chosen": -2.448549509048462, + "logits/rejected": -2.4569427967071533, + "logps/chosen": -246.14657592773438, + "logps/rejected": -308.890869140625, + "loss": 0.0624, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.19459474086761475, + "rewards/margins": 6.0366058349609375, + "rewards/rejected": -6.231200695037842, + "step": 3530 + }, + { + "epoch": 1.83, + "learning_rate": 2.1715433161216294e-07, + "logits/chosen": -2.4541616439819336, + "logits/rejected": -2.3277955055236816, + "logps/chosen": -274.67626953125, + "logps/rejected": -317.1044921875, + "loss": 0.0751, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.32509785890579224, + "rewards/margins": 6.280895233154297, + "rewards/rejected": -6.605993747711182, + "step": 3540 + }, + { + "epoch": 1.83, + "learning_rate": 2.1619812583668005e-07, + "logits/chosen": -2.5092625617980957, + "logits/rejected": -2.4619314670562744, + "logps/chosen": -258.93194580078125, + "logps/rejected": -278.76763916015625, + "loss": 0.1008, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.588664710521698, + "rewards/margins": 6.409360408782959, + "rewards/rejected": -6.998025417327881, + "step": 3550 + }, + { + "epoch": 1.84, + "learning_rate": 2.1524192006119714e-07, + "logits/chosen": -2.406841516494751, + "logits/rejected": -2.4362905025482178, + "logps/chosen": -252.5372772216797, + "logps/rejected": -290.995361328125, + "loss": 0.0803, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6738952994346619, + "rewards/margins": 6.143965721130371, + "rewards/rejected": -6.817861080169678, + "step": 3560 + }, + { + "epoch": 1.84, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -2.506378412246704, + "logits/rejected": -2.50364351272583, + "logps/chosen": -253.6861114501953, + "logps/rejected": -268.87969970703125, + "loss": 0.0661, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7515280246734619, + "rewards/margins": 5.812872886657715, + "rewards/rejected": -6.564399719238281, + "step": 3570 + }, + { + "epoch": 1.85, + "learning_rate": 2.133295085102314e-07, + "logits/chosen": -2.3749794960021973, + "logits/rejected": -2.441770076751709, + "logps/chosen": -265.58209228515625, + "logps/rejected": -287.2449645996094, + "loss": 0.0564, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5784146189689636, + "rewards/margins": 6.812424659729004, + "rewards/rejected": -7.390838623046875, + "step": 3580 + }, + { + "epoch": 1.85, + "learning_rate": 2.1237330273474851e-07, + "logits/chosen": -2.5062410831451416, + "logits/rejected": -2.5346713066101074, + "logps/chosen": -313.18817138671875, + "logps/rejected": -302.63458251953125, + "loss": 0.1101, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.45042672753334045, + "rewards/margins": 6.4521989822387695, + "rewards/rejected": -6.902626037597656, + "step": 3590 + }, + { + "epoch": 1.86, + "learning_rate": 2.1141709695926563e-07, + "logits/chosen": -2.4420742988586426, + "logits/rejected": -2.443021059036255, + "logps/chosen": -278.341064453125, + "logps/rejected": -274.72418212890625, + "loss": 0.0752, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.20474250614643097, + "rewards/margins": 6.852424621582031, + "rewards/rejected": -7.057167053222656, + "step": 3600 + }, + { + "epoch": 1.86, + "learning_rate": 2.1046089118378275e-07, + "logits/chosen": -2.5255866050720215, + "logits/rejected": -2.548825740814209, + "logps/chosen": -258.55224609375, + "logps/rejected": -315.3794250488281, + "loss": 0.0795, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.010959947481751442, + "rewards/margins": 7.67264461517334, + "rewards/rejected": -7.683606147766113, + "step": 3610 + }, + { + "epoch": 1.87, + "learning_rate": 2.0950468540829986e-07, + "logits/chosen": -2.4413561820983887, + "logits/rejected": -2.4037675857543945, + "logps/chosen": -281.16693115234375, + "logps/rejected": -287.8060302734375, + "loss": 0.1006, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.09261156618595123, + "rewards/margins": 6.9859113693237305, + "rewards/rejected": -6.8933000564575195, + "step": 3620 + }, + { + "epoch": 1.87, + "learning_rate": 2.0854847963281698e-07, + "logits/chosen": -2.480898380279541, + "logits/rejected": -2.3625235557556152, + "logps/chosen": -312.2430419921875, + "logps/rejected": -297.37713623046875, + "loss": 0.0805, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.038004614412784576, + "rewards/margins": 6.296678066253662, + "rewards/rejected": -6.258673667907715, + "step": 3630 + }, + { + "epoch": 1.88, + "learning_rate": 2.0759227385733407e-07, + "logits/chosen": -2.3831233978271484, + "logits/rejected": -2.4543986320495605, + "logps/chosen": -319.70159912109375, + "logps/rejected": -310.35638427734375, + "loss": 0.0865, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2939375936985016, + "rewards/margins": 6.249878883361816, + "rewards/rejected": -6.543816566467285, + "step": 3640 + }, + { + "epoch": 1.88, + "learning_rate": 2.066360680818512e-07, + "logits/chosen": -2.4780735969543457, + "logits/rejected": -2.4746382236480713, + "logps/chosen": -336.36346435546875, + "logps/rejected": -311.2398986816406, + "loss": 0.0663, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3942198157310486, + "rewards/margins": 6.873496055603027, + "rewards/rejected": -6.47927713394165, + "step": 3650 + }, + { + "epoch": 1.89, + "learning_rate": 2.0567986230636832e-07, + "logits/chosen": -2.5300562381744385, + "logits/rejected": -2.520646572113037, + "logps/chosen": -259.832763671875, + "logps/rejected": -299.1865539550781, + "loss": 0.1066, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.015356218442320824, + "rewards/margins": 6.783368110656738, + "rewards/rejected": -6.7987236976623535, + "step": 3660 + }, + { + "epoch": 1.89, + "learning_rate": 2.0472365653088544e-07, + "logits/chosen": -2.542448043823242, + "logits/rejected": -2.494253635406494, + "logps/chosen": -304.13446044921875, + "logps/rejected": -295.2920837402344, + "loss": 0.1075, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3823709487915039, + "rewards/margins": 5.561723232269287, + "rewards/rejected": -5.944094657897949, + "step": 3670 + }, + { + "epoch": 1.9, + "learning_rate": 2.0376745075540256e-07, + "logits/chosen": -2.5494580268859863, + "logits/rejected": -2.544642925262451, + "logps/chosen": -327.9197998046875, + "logps/rejected": -315.2252197265625, + "loss": 0.0836, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.03714234381914139, + "rewards/margins": 6.751425266265869, + "rewards/rejected": -6.788567543029785, + "step": 3680 + }, + { + "epoch": 1.91, + "learning_rate": 2.0281124497991967e-07, + "logits/chosen": -2.564119577407837, + "logits/rejected": -2.4600670337677, + "logps/chosen": -285.39801025390625, + "logps/rejected": -321.54742431640625, + "loss": 0.0885, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.11016042530536652, + "rewards/margins": 6.66648006439209, + "rewards/rejected": -6.556318759918213, + "step": 3690 + }, + { + "epoch": 1.91, + "learning_rate": 2.018550392044368e-07, + "logits/chosen": -2.59401798248291, + "logits/rejected": -2.5557501316070557, + "logps/chosen": -276.79486083984375, + "logps/rejected": -344.27130126953125, + "loss": 0.0822, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.26534900069236755, + "rewards/margins": 7.319410800933838, + "rewards/rejected": -7.054062843322754, + "step": 3700 + }, + { + "epoch": 1.92, + "learning_rate": 2.0089883342895388e-07, + "logits/chosen": -2.6014533042907715, + "logits/rejected": -2.5620899200439453, + "logps/chosen": -318.00531005859375, + "logps/rejected": -271.3183288574219, + "loss": 0.0996, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.26080864667892456, + "rewards/margins": 6.475592136383057, + "rewards/rejected": -6.214783668518066, + "step": 3710 + }, + { + "epoch": 1.92, + "learning_rate": 1.9994262765347102e-07, + "logits/chosen": -2.523375988006592, + "logits/rejected": -2.512540340423584, + "logps/chosen": -293.73431396484375, + "logps/rejected": -266.2129821777344, + "loss": 0.0836, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.26058363914489746, + "rewards/margins": 6.056021690368652, + "rewards/rejected": -6.316605091094971, + "step": 3720 + }, + { + "epoch": 1.93, + "learning_rate": 1.9898642187798813e-07, + "logits/chosen": -2.490314245223999, + "logits/rejected": -2.4401144981384277, + "logps/chosen": -289.86444091796875, + "logps/rejected": -321.94183349609375, + "loss": 0.0654, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.539985716342926, + "rewards/margins": 7.538631439208984, + "rewards/rejected": -6.9986467361450195, + "step": 3730 + }, + { + "epoch": 1.93, + "learning_rate": 1.9803021610250525e-07, + "logits/chosen": -2.5355782508850098, + "logits/rejected": -2.546966791152954, + "logps/chosen": -235.0393829345703, + "logps/rejected": -285.04840087890625, + "loss": 0.0731, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.02458820305764675, + "rewards/margins": 6.333740234375, + "rewards/rejected": -6.358328342437744, + "step": 3740 + }, + { + "epoch": 1.94, + "learning_rate": 1.9707401032702237e-07, + "logits/chosen": -2.481501579284668, + "logits/rejected": -2.4209208488464355, + "logps/chosen": -266.3040771484375, + "logps/rejected": -290.20208740234375, + "loss": 0.0538, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.03531923145055771, + "rewards/margins": 6.58829402923584, + "rewards/rejected": -6.552975654602051, + "step": 3750 + }, + { + "epoch": 1.94, + "learning_rate": 1.9611780455153948e-07, + "logits/chosen": -2.583940267562866, + "logits/rejected": -2.5408358573913574, + "logps/chosen": -239.2235870361328, + "logps/rejected": -294.5646057128906, + "loss": 0.1252, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.46535977721214294, + "rewards/margins": 6.524919033050537, + "rewards/rejected": -6.059558868408203, + "step": 3760 + }, + { + "epoch": 1.95, + "learning_rate": 1.951615987760566e-07, + "logits/chosen": -2.467961072921753, + "logits/rejected": -2.4821152687072754, + "logps/chosen": -277.696533203125, + "logps/rejected": -301.1275939941406, + "loss": 0.0803, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.28950750827789307, + "rewards/margins": 6.481263637542725, + "rewards/rejected": -6.7707719802856445, + "step": 3770 + }, + { + "epoch": 1.95, + "learning_rate": 1.942053930005737e-07, + "logits/chosen": -2.5769028663635254, + "logits/rejected": -2.512784004211426, + "logps/chosen": -287.0464782714844, + "logps/rejected": -302.48370361328125, + "loss": 0.0578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1127987951040268, + "rewards/margins": 7.0196099281311035, + "rewards/rejected": -6.906810760498047, + "step": 3780 + }, + { + "epoch": 1.96, + "learning_rate": 1.9324918722509086e-07, + "logits/chosen": -2.4373650550842285, + "logits/rejected": -2.4223923683166504, + "logps/chosen": -271.5222473144531, + "logps/rejected": -329.9495544433594, + "loss": 0.0852, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.07137300074100494, + "rewards/margins": 6.933795928955078, + "rewards/rejected": -7.005169868469238, + "step": 3790 + }, + { + "epoch": 1.96, + "learning_rate": 1.9229298144960794e-07, + "logits/chosen": -2.4774272441864014, + "logits/rejected": -2.485574722290039, + "logps/chosen": -289.32550048828125, + "logps/rejected": -297.17132568359375, + "loss": 0.1129, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4329603612422943, + "rewards/margins": 6.4090399742126465, + "rewards/rejected": -6.8420000076293945, + "step": 3800 + }, + { + "epoch": 1.97, + "learning_rate": 1.9133677567412506e-07, + "logits/chosen": -2.480685234069824, + "logits/rejected": -2.485748767852783, + "logps/chosen": -285.3147277832031, + "logps/rejected": -300.1988830566406, + "loss": 0.0892, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2884216904640198, + "rewards/margins": 6.330131530761719, + "rewards/rejected": -6.618553161621094, + "step": 3810 + }, + { + "epoch": 1.97, + "learning_rate": 1.9038056989864218e-07, + "logits/chosen": -2.5223355293273926, + "logits/rejected": -2.4684529304504395, + "logps/chosen": -249.61636352539062, + "logps/rejected": -274.7422790527344, + "loss": 0.1282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3523308336734772, + "rewards/margins": 5.8032026290893555, + "rewards/rejected": -5.450871467590332, + "step": 3820 + }, + { + "epoch": 1.98, + "learning_rate": 1.894243641231593e-07, + "logits/chosen": -2.3684513568878174, + "logits/rejected": -2.2880539894104004, + "logps/chosen": -269.7086486816406, + "logps/rejected": -254.0093231201172, + "loss": 0.0998, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3140656352043152, + "rewards/margins": 5.751679420471191, + "rewards/rejected": -6.065744400024414, + "step": 3830 + }, + { + "epoch": 1.98, + "learning_rate": 1.884681583476764e-07, + "logits/chosen": -2.474822521209717, + "logits/rejected": -2.5617165565490723, + "logps/chosen": -290.51214599609375, + "logps/rejected": -301.39251708984375, + "loss": 0.0547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7417269945144653, + "rewards/margins": 5.65903902053833, + "rewards/rejected": -6.400765895843506, + "step": 3840 + }, + { + "epoch": 1.99, + "learning_rate": 1.8751195257219352e-07, + "logits/chosen": -2.536726474761963, + "logits/rejected": -2.5045382976531982, + "logps/chosen": -254.51803588867188, + "logps/rejected": -287.4189758300781, + "loss": 0.1006, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.414132297039032, + "rewards/margins": 5.95471715927124, + "rewards/rejected": -6.368849754333496, + "step": 3850 + }, + { + "epoch": 1.99, + "learning_rate": 1.8655574679671067e-07, + "logits/chosen": -2.4787344932556152, + "logits/rejected": -2.4844470024108887, + "logps/chosen": -293.6579895019531, + "logps/rejected": -310.86761474609375, + "loss": 0.0978, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.47846388816833496, + "rewards/margins": 6.510424613952637, + "rewards/rejected": -6.988888740539551, + "step": 3860 + }, + { + "epoch": 2.0, + "learning_rate": 1.8559954102122778e-07, + "logits/chosen": -2.479588508605957, + "logits/rejected": -2.514061450958252, + "logps/chosen": -280.0299072265625, + "logps/rejected": -297.7696533203125, + "loss": 0.1121, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11727344989776611, + "rewards/margins": 6.20565128326416, + "rewards/rejected": -6.3229241371154785, + "step": 3870 + }, + { + "epoch": 2.0, + "learning_rate": 1.8464333524574487e-07, + "logits/chosen": -2.4771392345428467, + "logits/rejected": -2.4389753341674805, + "logps/chosen": -256.0650634765625, + "logps/rejected": -277.08477783203125, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06204669550061226, + "rewards/margins": 6.886547088623047, + "rewards/rejected": -6.82450008392334, + "step": 3880 + }, + { + "epoch": 2.01, + "learning_rate": 1.8368712947026199e-07, + "logits/chosen": -2.4898078441619873, + "logits/rejected": -2.522775173187256, + "logps/chosen": -287.1253967285156, + "logps/rejected": -301.94256591796875, + "loss": 0.0156, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.26958638429641724, + "rewards/margins": 7.4097700119018555, + "rewards/rejected": -7.140183448791504, + "step": 3890 + }, + { + "epoch": 2.01, + "learning_rate": 1.827309236947791e-07, + "logits/chosen": -2.4328882694244385, + "logits/rejected": -2.46454119682312, + "logps/chosen": -252.57009887695312, + "logps/rejected": -318.90521240234375, + "loss": 0.0219, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.20105068385601044, + "rewards/margins": 8.330595016479492, + "rewards/rejected": -8.129544258117676, + "step": 3900 + }, + { + "epoch": 2.02, + "learning_rate": 1.8177471791929622e-07, + "logits/chosen": -2.362769603729248, + "logits/rejected": -2.421079397201538, + "logps/chosen": -266.7387390136719, + "logps/rejected": -366.26568603515625, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3538321852684021, + "rewards/margins": 8.694085121154785, + "rewards/rejected": -8.340251922607422, + "step": 3910 + }, + { + "epoch": 2.02, + "learning_rate": 1.8081851214381333e-07, + "logits/chosen": -2.3552629947662354, + "logits/rejected": -2.3446311950683594, + "logps/chosen": -284.10064697265625, + "logps/rejected": -355.9640197753906, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04302588850259781, + "rewards/margins": 8.178693771362305, + "rewards/rejected": -8.13566780090332, + "step": 3920 + }, + { + "epoch": 2.03, + "learning_rate": 1.7986230636833047e-07, + "logits/chosen": -2.4111831188201904, + "logits/rejected": -2.5026137828826904, + "logps/chosen": -228.980712890625, + "logps/rejected": -296.83905029296875, + "loss": 0.0186, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2731500267982483, + "rewards/margins": 7.313241481781006, + "rewards/rejected": -7.586390495300293, + "step": 3930 + }, + { + "epoch": 2.03, + "learning_rate": 1.789061005928476e-07, + "logits/chosen": -2.5294995307922363, + "logits/rejected": -2.470599889755249, + "logps/chosen": -309.84344482421875, + "logps/rejected": -292.3616943359375, + "loss": 0.0254, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.040870584547519684, + "rewards/margins": 7.898612022399902, + "rewards/rejected": -7.8577423095703125, + "step": 3940 + }, + { + "epoch": 2.04, + "learning_rate": 1.7794989481736468e-07, + "logits/chosen": -2.4372847080230713, + "logits/rejected": -2.4185192584991455, + "logps/chosen": -296.27508544921875, + "logps/rejected": -320.5166931152344, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05343180149793625, + "rewards/margins": 8.25248908996582, + "rewards/rejected": -8.305920600891113, + "step": 3950 + }, + { + "epoch": 2.04, + "learning_rate": 1.769936890418818e-07, + "logits/chosen": -2.4072155952453613, + "logits/rejected": -2.3540451526641846, + "logps/chosen": -273.29339599609375, + "logps/rejected": -315.81890869140625, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36594390869140625, + "rewards/margins": 8.549717903137207, + "rewards/rejected": -8.183772087097168, + "step": 3960 + }, + { + "epoch": 2.05, + "learning_rate": 1.760374832663989e-07, + "logits/chosen": -2.407019853591919, + "logits/rejected": -2.346592664718628, + "logps/chosen": -274.61138916015625, + "logps/rejected": -335.91253662109375, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06718329340219498, + "rewards/margins": 8.160886764526367, + "rewards/rejected": -8.228068351745605, + "step": 3970 + }, + { + "epoch": 2.05, + "learning_rate": 1.7508127749091603e-07, + "logits/chosen": -2.4587526321411133, + "logits/rejected": -2.431252956390381, + "logps/chosen": -281.40447998046875, + "logps/rejected": -324.2808837890625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30483338236808777, + "rewards/margins": 8.109731674194336, + "rewards/rejected": -8.414565086364746, + "step": 3980 + }, + { + "epoch": 2.06, + "learning_rate": 1.7412507171543314e-07, + "logits/chosen": -2.4788222312927246, + "logits/rejected": -2.4190168380737305, + "logps/chosen": -273.2280578613281, + "logps/rejected": -290.38812255859375, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5941675305366516, + "rewards/margins": 8.313642501831055, + "rewards/rejected": -8.907809257507324, + "step": 3990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7316886593995028e-07, + "logits/chosen": -2.524629831314087, + "logits/rejected": -2.4027457237243652, + "logps/chosen": -255.3746795654297, + "logps/rejected": -309.2898254394531, + "loss": 0.0195, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.27210792899131775, + "rewards/margins": 8.531095504760742, + "rewards/rejected": -8.803202629089355, + "step": 4000 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -2.5158400535583496, + "eval_logits/rejected": -2.493459463119507, + "eval_logps/chosen": -305.0455017089844, + "eval_logps/rejected": -289.9938049316406, + "eval_loss": 0.625259518623352, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -2.932209014892578, + "eval_rewards/margins": 2.831045150756836, + "eval_rewards/rejected": -5.763253688812256, + "eval_runtime": 453.5801, + "eval_samples_per_second": 4.409, + "eval_steps_per_second": 0.276, + "step": 4000 + }, + { + "epoch": 2.07, + "learning_rate": 1.722126601644674e-07, + "logits/chosen": -2.39493989944458, + "logits/rejected": -2.40425705909729, + "logps/chosen": -286.09185791015625, + "logps/rejected": -304.4429016113281, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2783908247947693, + "rewards/margins": 7.916007041931152, + "rewards/rejected": -8.194398880004883, + "step": 4010 + }, + { + "epoch": 2.08, + "learning_rate": 1.7125645438898452e-07, + "logits/chosen": -2.4972126483917236, + "logits/rejected": -2.3945891857147217, + "logps/chosen": -275.5196228027344, + "logps/rejected": -324.5783996582031, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20885951817035675, + "rewards/margins": 9.706292152404785, + "rewards/rejected": -9.915151596069336, + "step": 4020 + }, + { + "epoch": 2.08, + "learning_rate": 1.703002486135016e-07, + "logits/chosen": -2.485563278198242, + "logits/rejected": -2.3387653827667236, + "logps/chosen": -294.34124755859375, + "logps/rejected": -329.8710021972656, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1807212084531784, + "rewards/margins": 9.752906799316406, + "rewards/rejected": -9.572185516357422, + "step": 4030 + }, + { + "epoch": 2.09, + "learning_rate": 1.6934404283801872e-07, + "logits/chosen": -2.4767284393310547, + "logits/rejected": -2.2920525074005127, + "logps/chosen": -288.2818908691406, + "logps/rejected": -346.46185302734375, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2743307054042816, + "rewards/margins": 9.252851486206055, + "rewards/rejected": -9.527181625366211, + "step": 4040 + }, + { + "epoch": 2.09, + "learning_rate": 1.6838783706253584e-07, + "logits/chosen": -2.425131320953369, + "logits/rejected": -2.5031018257141113, + "logps/chosen": -253.77273559570312, + "logps/rejected": -301.98980712890625, + "loss": 0.0374, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.680608332157135, + "rewards/margins": 8.351430892944336, + "rewards/rejected": -9.032038688659668, + "step": 4050 + }, + { + "epoch": 2.1, + "learning_rate": 1.6743163128705295e-07, + "logits/chosen": -2.4143614768981934, + "logits/rejected": -2.4409687519073486, + "logps/chosen": -264.8473815917969, + "logps/rejected": -324.42376708984375, + "loss": 0.0124, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.908231258392334, + "rewards/margins": 8.655680656433105, + "rewards/rejected": -9.563910484313965, + "step": 4060 + }, + { + "epoch": 2.1, + "learning_rate": 1.664754255115701e-07, + "logits/chosen": -2.4121220111846924, + "logits/rejected": -2.3599774837493896, + "logps/chosen": -262.0404052734375, + "logps/rejected": -341.3482666015625, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46254047751426697, + "rewards/margins": 9.965926170349121, + "rewards/rejected": -10.428467750549316, + "step": 4070 + }, + { + "epoch": 2.11, + "learning_rate": 1.655192197360872e-07, + "logits/chosen": -2.550471305847168, + "logits/rejected": -2.515150547027588, + "logps/chosen": -318.02349853515625, + "logps/rejected": -362.1571960449219, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4734498858451843, + "rewards/margins": 9.279683113098145, + "rewards/rejected": -9.753132820129395, + "step": 4080 + }, + { + "epoch": 2.11, + "learning_rate": 1.6456301396060433e-07, + "logits/chosen": -2.474956750869751, + "logits/rejected": -2.4845316410064697, + "logps/chosen": -312.4612121582031, + "logps/rejected": -371.6642150878906, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6691607236862183, + "rewards/margins": 8.871939659118652, + "rewards/rejected": -9.541101455688477, + "step": 4090 + }, + { + "epoch": 2.12, + "learning_rate": 1.6360680818512144e-07, + "logits/chosen": -2.4650988578796387, + "logits/rejected": -2.3654093742370605, + "logps/chosen": -273.10906982421875, + "logps/rejected": -328.1131591796875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6473243832588196, + "rewards/margins": 8.533228874206543, + "rewards/rejected": -9.18055248260498, + "step": 4100 + }, + { + "epoch": 2.12, + "learning_rate": 1.6265060240963853e-07, + "logits/chosen": -2.4874815940856934, + "logits/rejected": -2.346606969833374, + "logps/chosen": -313.10040283203125, + "logps/rejected": -328.90081787109375, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4576417803764343, + "rewards/margins": 8.776191711425781, + "rewards/rejected": -9.233833312988281, + "step": 4110 + }, + { + "epoch": 2.13, + "learning_rate": 1.6169439663415565e-07, + "logits/chosen": -2.5000967979431152, + "logits/rejected": -2.4653024673461914, + "logps/chosen": -291.36895751953125, + "logps/rejected": -340.0940246582031, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.645675539970398, + "rewards/margins": 10.322629928588867, + "rewards/rejected": -10.968305587768555, + "step": 4120 + }, + { + "epoch": 2.13, + "learning_rate": 1.6073819085867276e-07, + "logits/chosen": -2.5758209228515625, + "logits/rejected": -2.4631595611572266, + "logps/chosen": -273.5103759765625, + "logps/rejected": -392.3283386230469, + "loss": 0.0123, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.28731220960617065, + "rewards/margins": 9.878499984741211, + "rewards/rejected": -10.165812492370605, + "step": 4130 + }, + { + "epoch": 2.14, + "learning_rate": 1.597819850831899e-07, + "logits/chosen": -2.4753005504608154, + "logits/rejected": -2.4263906478881836, + "logps/chosen": -257.69207763671875, + "logps/rejected": -337.54058837890625, + "loss": 0.0285, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8755598068237305, + "rewards/margins": 9.041923522949219, + "rewards/rejected": -9.917482376098633, + "step": 4140 + }, + { + "epoch": 2.14, + "learning_rate": 1.5882577930770702e-07, + "logits/chosen": -2.386593818664551, + "logits/rejected": -2.465447187423706, + "logps/chosen": -290.82623291015625, + "logps/rejected": -332.8897399902344, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8657897710800171, + "rewards/margins": 9.042566299438477, + "rewards/rejected": -9.908356666564941, + "step": 4150 + }, + { + "epoch": 2.15, + "learning_rate": 1.5786957353222414e-07, + "logits/chosen": -2.412400245666504, + "logits/rejected": -2.4159159660339355, + "logps/chosen": -333.2178649902344, + "logps/rejected": -335.25787353515625, + "loss": 0.0111, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.09050460904836655, + "rewards/margins": 9.707788467407227, + "rewards/rejected": -9.798294067382812, + "step": 4160 + }, + { + "epoch": 2.15, + "learning_rate": 1.5691336775674125e-07, + "logits/chosen": -2.3777973651885986, + "logits/rejected": -2.3793344497680664, + "logps/chosen": -267.68084716796875, + "logps/rejected": -323.4200744628906, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5352864861488342, + "rewards/margins": 10.006217956542969, + "rewards/rejected": -10.541502952575684, + "step": 4170 + }, + { + "epoch": 2.16, + "learning_rate": 1.5595716198125837e-07, + "logits/chosen": -2.379467725753784, + "logits/rejected": -2.499807834625244, + "logps/chosen": -306.67608642578125, + "logps/rejected": -353.30084228515625, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9916990995407104, + "rewards/margins": 9.460695266723633, + "rewards/rejected": -10.452393531799316, + "step": 4180 + }, + { + "epoch": 2.16, + "learning_rate": 1.5500095620577546e-07, + "logits/chosen": -2.4940500259399414, + "logits/rejected": -2.462193489074707, + "logps/chosen": -270.82696533203125, + "logps/rejected": -294.65570068359375, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4269378185272217, + "rewards/margins": 8.49191665649414, + "rewards/rejected": -9.918853759765625, + "step": 4190 + }, + { + "epoch": 2.17, + "learning_rate": 1.5404475043029257e-07, + "logits/chosen": -2.4286954402923584, + "logits/rejected": -2.367332935333252, + "logps/chosen": -269.7999267578125, + "logps/rejected": -323.0738525390625, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.022783875465393, + "rewards/margins": 9.329833984375, + "rewards/rejected": -10.352617263793945, + "step": 4200 + }, + { + "epoch": 2.17, + "learning_rate": 1.5308854465480971e-07, + "logits/chosen": -2.498307704925537, + "logits/rejected": -2.450392961502075, + "logps/chosen": -280.20977783203125, + "logps/rejected": -296.2657775878906, + "loss": 0.0137, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5457987785339355, + "rewards/margins": 8.372919082641602, + "rewards/rejected": -8.918716430664062, + "step": 4210 + }, + { + "epoch": 2.18, + "learning_rate": 1.5213233887932683e-07, + "logits/chosen": -2.4233896732330322, + "logits/rejected": -2.4217326641082764, + "logps/chosen": -313.09271240234375, + "logps/rejected": -329.91314697265625, + "loss": 0.0078, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6310591697692871, + "rewards/margins": 8.810084342956543, + "rewards/rejected": -9.441144943237305, + "step": 4220 + }, + { + "epoch": 2.18, + "learning_rate": 1.5117613310384395e-07, + "logits/chosen": -2.441246747970581, + "logits/rejected": -2.4308252334594727, + "logps/chosen": -294.64312744140625, + "logps/rejected": -339.2587585449219, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9671104550361633, + "rewards/margins": 10.101009368896484, + "rewards/rejected": -11.068120956420898, + "step": 4230 + }, + { + "epoch": 2.19, + "learning_rate": 1.5021992732836106e-07, + "logits/chosen": -2.458059787750244, + "logits/rejected": -2.476111650466919, + "logps/chosen": -291.772705078125, + "logps/rejected": -376.759033203125, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0088956356048584, + "rewards/margins": 9.92273998260498, + "rewards/rejected": -10.93163776397705, + "step": 4240 + }, + { + "epoch": 2.19, + "learning_rate": 1.4926372155287818e-07, + "logits/chosen": -2.49906587600708, + "logits/rejected": -2.332627773284912, + "logps/chosen": -237.60049438476562, + "logps/rejected": -333.3525390625, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.078620433807373, + "rewards/margins": 9.37675666809082, + "rewards/rejected": -10.455377578735352, + "step": 4250 + }, + { + "epoch": 2.2, + "learning_rate": 1.483075157773953e-07, + "logits/chosen": -2.4600281715393066, + "logits/rejected": -2.4805192947387695, + "logps/chosen": -290.1564636230469, + "logps/rejected": -313.1746826171875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.712319552898407, + "rewards/margins": 8.697793006896973, + "rewards/rejected": -9.410112380981445, + "step": 4260 + }, + { + "epoch": 2.2, + "learning_rate": 1.4735131000191238e-07, + "logits/chosen": -2.4032301902770996, + "logits/rejected": -2.3650975227355957, + "logps/chosen": -251.8123016357422, + "logps/rejected": -277.9178771972656, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7331027984619141, + "rewards/margins": 8.663496971130371, + "rewards/rejected": -9.396598815917969, + "step": 4270 + }, + { + "epoch": 2.21, + "learning_rate": 1.4639510422642952e-07, + "logits/chosen": -2.410510540008545, + "logits/rejected": -2.3593618869781494, + "logps/chosen": -254.2480010986328, + "logps/rejected": -306.0626525878906, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2594799995422363, + "rewards/margins": 7.9070024490356445, + "rewards/rejected": -9.166482925415039, + "step": 4280 + }, + { + "epoch": 2.21, + "learning_rate": 1.4543889845094664e-07, + "logits/chosen": -2.4375698566436768, + "logits/rejected": -2.336656093597412, + "logps/chosen": -318.3421936035156, + "logps/rejected": -384.72772216796875, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4227966368198395, + "rewards/margins": 10.83735466003418, + "rewards/rejected": -11.260150909423828, + "step": 4290 + }, + { + "epoch": 2.22, + "learning_rate": 1.4448269267546376e-07, + "logits/chosen": -2.393700122833252, + "logits/rejected": -2.4179205894470215, + "logps/chosen": -343.4000244140625, + "logps/rejected": -361.1111755371094, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06643708050251007, + "rewards/margins": 10.324440002441406, + "rewards/rejected": -10.390877723693848, + "step": 4300 + }, + { + "epoch": 2.23, + "learning_rate": 1.4352648689998087e-07, + "logits/chosen": -2.451838731765747, + "logits/rejected": -2.507495880126953, + "logps/chosen": -259.57757568359375, + "logps/rejected": -311.0859375, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5939081311225891, + "rewards/margins": 9.141595840454102, + "rewards/rejected": -9.735504150390625, + "step": 4310 + }, + { + "epoch": 2.23, + "learning_rate": 1.42570281124498e-07, + "logits/chosen": -2.3880109786987305, + "logits/rejected": -2.4511330127716064, + "logps/chosen": -312.5009765625, + "logps/rejected": -357.44384765625, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6199862360954285, + "rewards/margins": 9.396383285522461, + "rewards/rejected": -10.01636791229248, + "step": 4320 + }, + { + "epoch": 2.24, + "learning_rate": 1.416140753490151e-07, + "logits/chosen": -2.399909734725952, + "logits/rejected": -2.409482955932617, + "logps/chosen": -312.19403076171875, + "logps/rejected": -365.5652160644531, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7507609128952026, + "rewards/margins": 10.391949653625488, + "rewards/rejected": -11.142709732055664, + "step": 4330 + }, + { + "epoch": 2.24, + "learning_rate": 1.4065786957353222e-07, + "logits/chosen": -2.466676712036133, + "logits/rejected": -2.3715767860412598, + "logps/chosen": -290.2066955566406, + "logps/rejected": -340.7693176269531, + "loss": 0.019, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.8540838956832886, + "rewards/margins": 9.24998950958252, + "rewards/rejected": -10.104074478149414, + "step": 4340 + }, + { + "epoch": 2.25, + "learning_rate": 1.3970166379804933e-07, + "logits/chosen": -2.4747912883758545, + "logits/rejected": -2.4412856101989746, + "logps/chosen": -332.2046203613281, + "logps/rejected": -336.59320068359375, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2478363513946533, + "rewards/margins": 9.040945053100586, + "rewards/rejected": -10.28878116607666, + "step": 4350 + }, + { + "epoch": 2.25, + "learning_rate": 1.3874545802256645e-07, + "logits/chosen": -2.4615142345428467, + "logits/rejected": -2.4157400131225586, + "logps/chosen": -275.7891845703125, + "logps/rejected": -314.43426513671875, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8941132426261902, + "rewards/margins": 9.226968765258789, + "rewards/rejected": -10.12108039855957, + "step": 4360 + }, + { + "epoch": 2.26, + "learning_rate": 1.3778925224708357e-07, + "logits/chosen": -2.3651604652404785, + "logits/rejected": -2.3336400985717773, + "logps/chosen": -268.6673889160156, + "logps/rejected": -325.3690490722656, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1127688884735107, + "rewards/margins": 8.900229454040527, + "rewards/rejected": -10.012998580932617, + "step": 4370 + }, + { + "epoch": 2.26, + "learning_rate": 1.3683304647160068e-07, + "logits/chosen": -2.4090378284454346, + "logits/rejected": -2.420701265335083, + "logps/chosen": -261.0252380371094, + "logps/rejected": -343.9711608886719, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9711012840270996, + "rewards/margins": 9.500223159790039, + "rewards/rejected": -10.471324920654297, + "step": 4380 + }, + { + "epoch": 2.27, + "learning_rate": 1.358768406961178e-07, + "logits/chosen": -2.3843369483947754, + "logits/rejected": -2.299290895462036, + "logps/chosen": -255.8084259033203, + "logps/rejected": -338.96917724609375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0958081483840942, + "rewards/margins": 10.103997230529785, + "rewards/rejected": -11.199804306030273, + "step": 4390 + }, + { + "epoch": 2.27, + "learning_rate": 1.349206349206349e-07, + "logits/chosen": -2.430680990219116, + "logits/rejected": -2.4080395698547363, + "logps/chosen": -250.6897735595703, + "logps/rejected": -306.9358215332031, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9635181427001953, + "rewards/margins": 8.765409469604492, + "rewards/rejected": -10.728927612304688, + "step": 4400 + }, + { + "epoch": 2.28, + "learning_rate": 1.3396442914515203e-07, + "logits/chosen": -2.3354380130767822, + "logits/rejected": -2.336620807647705, + "logps/chosen": -297.7576904296875, + "logps/rejected": -355.9744873046875, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7647383213043213, + "rewards/margins": 9.611452102661133, + "rewards/rejected": -11.376190185546875, + "step": 4410 + }, + { + "epoch": 2.28, + "learning_rate": 1.3300822336966917e-07, + "logits/chosen": -2.4746663570404053, + "logits/rejected": -2.2819876670837402, + "logps/chosen": -323.2002258300781, + "logps/rejected": -348.868896484375, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9656437039375305, + "rewards/margins": 10.08340835571289, + "rewards/rejected": -11.049051284790039, + "step": 4420 + }, + { + "epoch": 2.29, + "learning_rate": 1.3205201759418626e-07, + "logits/chosen": -2.3873531818389893, + "logits/rejected": -2.2998034954071045, + "logps/chosen": -319.4635925292969, + "logps/rejected": -344.7845153808594, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3506031632423401, + "rewards/margins": 9.482653617858887, + "rewards/rejected": -9.833256721496582, + "step": 4430 + }, + { + "epoch": 2.29, + "learning_rate": 1.3109581181870338e-07, + "logits/chosen": -2.4441051483154297, + "logits/rejected": -2.4536311626434326, + "logps/chosen": -329.62103271484375, + "logps/rejected": -332.23748779296875, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5083192586898804, + "rewards/margins": 9.463578224182129, + "rewards/rejected": -9.971896171569824, + "step": 4440 + }, + { + "epoch": 2.3, + "learning_rate": 1.301396060432205e-07, + "logits/chosen": -2.4361507892608643, + "logits/rejected": -2.4902892112731934, + "logps/chosen": -320.44659423828125, + "logps/rejected": -342.33905029296875, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1917698383331299, + "rewards/margins": 9.730718612670898, + "rewards/rejected": -10.922487258911133, + "step": 4450 + }, + { + "epoch": 2.3, + "learning_rate": 1.291834002677376e-07, + "logits/chosen": -2.414278745651245, + "logits/rejected": -2.304199695587158, + "logps/chosen": -286.6895751953125, + "logps/rejected": -354.45361328125, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6277958154678345, + "rewards/margins": 9.477842330932617, + "rewards/rejected": -11.10563850402832, + "step": 4460 + }, + { + "epoch": 2.31, + "learning_rate": 1.2822719449225472e-07, + "logits/chosen": -2.363389253616333, + "logits/rejected": -2.4042270183563232, + "logps/chosen": -227.5398406982422, + "logps/rejected": -313.44512939453125, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8537158966064453, + "rewards/margins": 8.813211441040039, + "rewards/rejected": -10.666927337646484, + "step": 4470 + }, + { + "epoch": 2.31, + "learning_rate": 1.2727098871677184e-07, + "logits/chosen": -2.544398784637451, + "logits/rejected": -2.509765625, + "logps/chosen": -353.1852111816406, + "logps/rejected": -389.2328186035156, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6934762597084045, + "rewards/margins": 9.873237609863281, + "rewards/rejected": -10.5667142868042, + "step": 4480 + }, + { + "epoch": 2.32, + "learning_rate": 1.2631478294128898e-07, + "logits/chosen": -2.536165237426758, + "logits/rejected": -2.4797139167785645, + "logps/chosen": -280.58880615234375, + "logps/rejected": -349.1954345703125, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3528211116790771, + "rewards/margins": 9.536347389221191, + "rewards/rejected": -10.889168739318848, + "step": 4490 + }, + { + "epoch": 2.32, + "learning_rate": 1.253585771658061e-07, + "logits/chosen": -2.4254403114318848, + "logits/rejected": -2.4034600257873535, + "logps/chosen": -253.6884002685547, + "logps/rejected": -346.30029296875, + "loss": 0.0191, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1951841115951538, + "rewards/margins": 9.793797492980957, + "rewards/rejected": -10.988981246948242, + "step": 4500 + }, + { + "epoch": 2.32, + "eval_logits/chosen": -2.503566265106201, + "eval_logits/rejected": -2.475616931915283, + "eval_logps/chosen": -317.906005859375, + "eval_logps/rejected": -308.5774230957031, + "eval_loss": 0.7214946150779724, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -4.218255043029785, + "eval_rewards/margins": 3.40336537361145, + "eval_rewards/rejected": -7.621620178222656, + "eval_runtime": 462.7177, + "eval_samples_per_second": 4.322, + "eval_steps_per_second": 0.27, + "step": 4500 + }, + { + "epoch": 2.33, + "learning_rate": 1.2440237139032319e-07, + "logits/chosen": -2.4602277278900146, + "logits/rejected": -2.4032809734344482, + "logps/chosen": -303.80816650390625, + "logps/rejected": -327.02874755859375, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4082835912704468, + "rewards/margins": 9.613999366760254, + "rewards/rejected": -11.022283554077148, + "step": 4510 + }, + { + "epoch": 2.33, + "learning_rate": 1.234461656148403e-07, + "logits/chosen": -2.486802816390991, + "logits/rejected": -2.4940314292907715, + "logps/chosen": -329.1534118652344, + "logps/rejected": -371.4541015625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.163053274154663, + "rewards/margins": 10.781423568725586, + "rewards/rejected": -11.944475173950195, + "step": 4520 + }, + { + "epoch": 2.34, + "learning_rate": 1.2248995983935742e-07, + "logits/chosen": -2.4711391925811768, + "logits/rejected": -2.485156536102295, + "logps/chosen": -306.0505065917969, + "logps/rejected": -345.5367431640625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9234344363212585, + "rewards/margins": 10.126235961914062, + "rewards/rejected": -11.049670219421387, + "step": 4530 + }, + { + "epoch": 2.34, + "learning_rate": 1.2153375406387456e-07, + "logits/chosen": -2.4170022010803223, + "logits/rejected": -2.4166598320007324, + "logps/chosen": -303.3820495605469, + "logps/rejected": -380.8604736328125, + "loss": 0.008, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3916549682617188, + "rewards/margins": 10.463330268859863, + "rewards/rejected": -11.854985237121582, + "step": 4540 + }, + { + "epoch": 2.35, + "learning_rate": 1.2057754828839165e-07, + "logits/chosen": -2.471839189529419, + "logits/rejected": -2.512411594390869, + "logps/chosen": -277.19158935546875, + "logps/rejected": -340.2514343261719, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9067068099975586, + "rewards/margins": 9.49809455871582, + "rewards/rejected": -10.404802322387695, + "step": 4550 + }, + { + "epoch": 2.35, + "learning_rate": 1.1962134251290876e-07, + "logits/chosen": -2.356781482696533, + "logits/rejected": -2.4188120365142822, + "logps/chosen": -306.0680847167969, + "logps/rejected": -314.0914001464844, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8832162618637085, + "rewards/margins": 8.965120315551758, + "rewards/rejected": -10.848337173461914, + "step": 4560 + }, + { + "epoch": 2.36, + "learning_rate": 1.1866513673742588e-07, + "logits/chosen": -2.484847068786621, + "logits/rejected": -2.363482713699341, + "logps/chosen": -310.15118408203125, + "logps/rejected": -303.5231628417969, + "loss": 0.0168, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3100998401641846, + "rewards/margins": 8.249720573425293, + "rewards/rejected": -9.559820175170898, + "step": 4570 + }, + { + "epoch": 2.36, + "learning_rate": 1.1770893096194301e-07, + "logits/chosen": -2.399364948272705, + "logits/rejected": -2.4284517765045166, + "logps/chosen": -300.3472595214844, + "logps/rejected": -332.69671630859375, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6987594366073608, + "rewards/margins": 9.020197868347168, + "rewards/rejected": -10.71895694732666, + "step": 4580 + }, + { + "epoch": 2.37, + "learning_rate": 1.1675272518646012e-07, + "logits/chosen": -2.4957313537597656, + "logits/rejected": -2.409392833709717, + "logps/chosen": -249.9640655517578, + "logps/rejected": -335.85638427734375, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2298882007598877, + "rewards/margins": 9.339942932128906, + "rewards/rejected": -10.569831848144531, + "step": 4590 + }, + { + "epoch": 2.37, + "learning_rate": 1.1579651941097724e-07, + "logits/chosen": -2.4879660606384277, + "logits/rejected": -2.507378101348877, + "logps/chosen": -326.9290466308594, + "logps/rejected": -343.4966735839844, + "loss": 0.0146, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4674627780914307, + "rewards/margins": 9.710068702697754, + "rewards/rejected": -11.177530288696289, + "step": 4600 + }, + { + "epoch": 2.38, + "learning_rate": 1.1484031363549436e-07, + "logits/chosen": -2.391278028488159, + "logits/rejected": -2.443850040435791, + "logps/chosen": -320.7591247558594, + "logps/rejected": -353.69281005859375, + "loss": 0.0219, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.702275037765503, + "rewards/margins": 10.41825008392334, + "rewards/rejected": -12.120525360107422, + "step": 4610 + }, + { + "epoch": 2.39, + "learning_rate": 1.1388410786001147e-07, + "logits/chosen": -2.4807400703430176, + "logits/rejected": -2.4674527645111084, + "logps/chosen": -266.72100830078125, + "logps/rejected": -361.2059631347656, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4482429027557373, + "rewards/margins": 9.929925918579102, + "rewards/rejected": -11.378170013427734, + "step": 4620 + }, + { + "epoch": 2.39, + "learning_rate": 1.1292790208452859e-07, + "logits/chosen": -2.442647933959961, + "logits/rejected": -2.4394030570983887, + "logps/chosen": -243.0096893310547, + "logps/rejected": -335.05023193359375, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2735203504562378, + "rewards/margins": 9.817770957946777, + "rewards/rejected": -11.091290473937988, + "step": 4630 + }, + { + "epoch": 2.4, + "learning_rate": 1.119716963090457e-07, + "logits/chosen": -2.385737657546997, + "logits/rejected": -2.3391549587249756, + "logps/chosen": -265.28271484375, + "logps/rejected": -349.02593994140625, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0982563495635986, + "rewards/margins": 9.236578941345215, + "rewards/rejected": -11.33483600616455, + "step": 4640 + }, + { + "epoch": 2.4, + "learning_rate": 1.1101549053356282e-07, + "logits/chosen": -2.52477765083313, + "logits/rejected": -2.4235565662384033, + "logps/chosen": -298.9849548339844, + "logps/rejected": -348.5369567871094, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4369876384735107, + "rewards/margins": 9.919187545776367, + "rewards/rejected": -11.356176376342773, + "step": 4650 + }, + { + "epoch": 2.41, + "learning_rate": 1.1005928475807993e-07, + "logits/chosen": -2.469526767730713, + "logits/rejected": -2.4453277587890625, + "logps/chosen": -252.61422729492188, + "logps/rejected": -331.5292663574219, + "loss": 0.01, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1783180236816406, + "rewards/margins": 9.28120231628418, + "rewards/rejected": -11.45952033996582, + "step": 4660 + }, + { + "epoch": 2.41, + "learning_rate": 1.0910307898259705e-07, + "logits/chosen": -2.3904144763946533, + "logits/rejected": -2.4100112915039062, + "logps/chosen": -267.6534729003906, + "logps/rejected": -374.3344421386719, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9095629453659058, + "rewards/margins": 10.048948287963867, + "rewards/rejected": -10.95850944519043, + "step": 4670 + }, + { + "epoch": 2.42, + "learning_rate": 1.0814687320711418e-07, + "logits/chosen": -2.421391010284424, + "logits/rejected": -2.308657646179199, + "logps/chosen": -240.39181518554688, + "logps/rejected": -371.52642822265625, + "loss": 0.0081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2718487977981567, + "rewards/margins": 10.701406478881836, + "rewards/rejected": -11.97325611114502, + "step": 4680 + }, + { + "epoch": 2.42, + "learning_rate": 1.0719066743163128e-07, + "logits/chosen": -2.453456401824951, + "logits/rejected": -2.4812474250793457, + "logps/chosen": -338.2146911621094, + "logps/rejected": -358.6303405761719, + "loss": 0.0148, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3517457246780396, + "rewards/margins": 10.342467308044434, + "rewards/rejected": -11.694211959838867, + "step": 4690 + }, + { + "epoch": 2.43, + "learning_rate": 1.062344616561484e-07, + "logits/chosen": -2.506812810897827, + "logits/rejected": -2.4892067909240723, + "logps/chosen": -294.2913818359375, + "logps/rejected": -364.78564453125, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3385236263275146, + "rewards/margins": 10.63171672821045, + "rewards/rejected": -11.970239639282227, + "step": 4700 + }, + { + "epoch": 2.43, + "learning_rate": 1.0527825588066551e-07, + "logits/chosen": -2.4706711769104004, + "logits/rejected": -2.462312698364258, + "logps/chosen": -301.4435119628906, + "logps/rejected": -339.53076171875, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9904528856277466, + "rewards/margins": 10.595855712890625, + "rewards/rejected": -11.586308479309082, + "step": 4710 + }, + { + "epoch": 2.44, + "learning_rate": 1.0432205010518264e-07, + "logits/chosen": -2.417384624481201, + "logits/rejected": -2.5014162063598633, + "logps/chosen": -252.42526245117188, + "logps/rejected": -357.0494079589844, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7958747148513794, + "rewards/margins": 10.516870498657227, + "rewards/rejected": -11.312746047973633, + "step": 4720 + }, + { + "epoch": 2.44, + "learning_rate": 1.0336584432969974e-07, + "logits/chosen": -2.3254947662353516, + "logits/rejected": -2.3634276390075684, + "logps/chosen": -249.72042846679688, + "logps/rejected": -326.01580810546875, + "loss": 0.0111, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8156722784042358, + "rewards/margins": 9.60567855834961, + "rewards/rejected": -10.421350479125977, + "step": 4730 + }, + { + "epoch": 2.45, + "learning_rate": 1.0240963855421686e-07, + "logits/chosen": -2.393904447555542, + "logits/rejected": -2.289703607559204, + "logps/chosen": -333.31494140625, + "logps/rejected": -349.7557067871094, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.729089617729187, + "rewards/margins": 9.651041030883789, + "rewards/rejected": -10.38012981414795, + "step": 4740 + }, + { + "epoch": 2.45, + "learning_rate": 1.0145343277873399e-07, + "logits/chosen": -2.4490630626678467, + "logits/rejected": -2.384364366531372, + "logps/chosen": -324.8036804199219, + "logps/rejected": -356.80157470703125, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9558230638504028, + "rewards/margins": 9.961849212646484, + "rewards/rejected": -10.917671203613281, + "step": 4750 + }, + { + "epoch": 2.46, + "learning_rate": 1.004972270032511e-07, + "logits/chosen": -2.2176153659820557, + "logits/rejected": -2.187372922897339, + "logps/chosen": -258.55267333984375, + "logps/rejected": -290.7706298828125, + "loss": 0.0121, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3212132453918457, + "rewards/margins": 9.305283546447754, + "rewards/rejected": -10.626497268676758, + "step": 4760 + }, + { + "epoch": 2.46, + "learning_rate": 9.95410212277682e-08, + "logits/chosen": -2.405513286590576, + "logits/rejected": -2.4268226623535156, + "logps/chosen": -293.0621032714844, + "logps/rejected": -351.65557861328125, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2851110696792603, + "rewards/margins": 9.79377555847168, + "rewards/rejected": -11.078888893127441, + "step": 4770 + }, + { + "epoch": 2.47, + "learning_rate": 9.858481545228532e-08, + "logits/chosen": -2.388866901397705, + "logits/rejected": -2.4224321842193604, + "logps/chosen": -300.51416015625, + "logps/rejected": -318.98834228515625, + "loss": 0.0171, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0021681785583496, + "rewards/margins": 9.520451545715332, + "rewards/rejected": -10.522619247436523, + "step": 4780 + }, + { + "epoch": 2.47, + "learning_rate": 9.762860967680245e-08, + "logits/chosen": -2.411489486694336, + "logits/rejected": -2.3945744037628174, + "logps/chosen": -295.2189025878906, + "logps/rejected": -373.933837890625, + "loss": 0.0129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4970338344573975, + "rewards/margins": 10.310312271118164, + "rewards/rejected": -11.80734634399414, + "step": 4790 + }, + { + "epoch": 2.48, + "learning_rate": 9.667240390131957e-08, + "logits/chosen": -2.500349760055542, + "logits/rejected": -2.551851749420166, + "logps/chosen": -313.2876892089844, + "logps/rejected": -354.4496154785156, + "loss": 0.0073, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8869138956069946, + "rewards/margins": 10.612105369567871, + "rewards/rejected": -11.49902057647705, + "step": 4800 + }, + { + "epoch": 2.48, + "learning_rate": 9.571619812583667e-08, + "logits/chosen": -2.3907971382141113, + "logits/rejected": -2.3780624866485596, + "logps/chosen": -294.43145751953125, + "logps/rejected": -354.15380859375, + "loss": 0.0184, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6661240458488464, + "rewards/margins": 10.202630996704102, + "rewards/rejected": -10.868753433227539, + "step": 4810 + }, + { + "epoch": 2.49, + "learning_rate": 9.47599923503538e-08, + "logits/chosen": -2.522645950317383, + "logits/rejected": -2.5387370586395264, + "logps/chosen": -282.9609069824219, + "logps/rejected": -352.8865051269531, + "loss": 0.0182, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1960705518722534, + "rewards/margins": 9.857826232910156, + "rewards/rejected": -11.053895950317383, + "step": 4820 + }, + { + "epoch": 2.49, + "learning_rate": 9.380378657487091e-08, + "logits/chosen": -2.5203070640563965, + "logits/rejected": -2.5354747772216797, + "logps/chosen": -282.48968505859375, + "logps/rejected": -340.5061340332031, + "loss": 0.0188, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3149895668029785, + "rewards/margins": 10.316043853759766, + "rewards/rejected": -11.631032943725586, + "step": 4830 + }, + { + "epoch": 2.5, + "learning_rate": 9.284758079938803e-08, + "logits/chosen": -2.5143141746520996, + "logits/rejected": -2.5719997882843018, + "logps/chosen": -290.4022216796875, + "logps/rejected": -334.06939697265625, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.034603476524353, + "rewards/margins": 9.760403633117676, + "rewards/rejected": -10.795007705688477, + "step": 4840 + }, + { + "epoch": 2.5, + "learning_rate": 9.189137502390513e-08, + "logits/chosen": -2.513765335083008, + "logits/rejected": -2.5539920330047607, + "logps/chosen": -294.8843078613281, + "logps/rejected": -402.92596435546875, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0495996475219727, + "rewards/margins": 9.208512306213379, + "rewards/rejected": -11.258111953735352, + "step": 4850 + }, + { + "epoch": 2.51, + "learning_rate": 9.093516924842226e-08, + "logits/chosen": -2.563143253326416, + "logits/rejected": -2.4798381328582764, + "logps/chosen": -272.5045166015625, + "logps/rejected": -361.7091979980469, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7693915367126465, + "rewards/margins": 10.272433280944824, + "rewards/rejected": -11.041825294494629, + "step": 4860 + }, + { + "epoch": 2.51, + "learning_rate": 8.997896347293938e-08, + "logits/chosen": -2.549731492996216, + "logits/rejected": -2.5603187084198, + "logps/chosen": -293.55511474609375, + "logps/rejected": -380.87969970703125, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2459195852279663, + "rewards/margins": 9.440132141113281, + "rewards/rejected": -10.686052322387695, + "step": 4870 + }, + { + "epoch": 2.52, + "learning_rate": 8.902275769745648e-08, + "logits/chosen": -2.4357972145080566, + "logits/rejected": -2.5012993812561035, + "logps/chosen": -261.88238525390625, + "logps/rejected": -351.6615295410156, + "loss": 0.0125, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.2481439113616943, + "rewards/margins": 10.309308052062988, + "rewards/rejected": -11.557451248168945, + "step": 4880 + }, + { + "epoch": 2.52, + "learning_rate": 8.806655192197361e-08, + "logits/chosen": -2.468061923980713, + "logits/rejected": -2.406736373901367, + "logps/chosen": -249.983154296875, + "logps/rejected": -365.0889587402344, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5393785238265991, + "rewards/margins": 11.374832153320312, + "rewards/rejected": -11.914213180541992, + "step": 4890 + }, + { + "epoch": 2.53, + "learning_rate": 8.711034614649072e-08, + "logits/chosen": -2.5306286811828613, + "logits/rejected": -2.5501136779785156, + "logps/chosen": -271.1980285644531, + "logps/rejected": -366.566650390625, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6836016178131104, + "rewards/margins": 10.52708625793457, + "rewards/rejected": -12.210687637329102, + "step": 4900 + }, + { + "epoch": 2.53, + "learning_rate": 8.615414037100784e-08, + "logits/chosen": -2.408055543899536, + "logits/rejected": -2.4738125801086426, + "logps/chosen": -352.69366455078125, + "logps/rejected": -341.93359375, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7806146144866943, + "rewards/margins": 9.49925422668457, + "rewards/rejected": -10.279869079589844, + "step": 4910 + }, + { + "epoch": 2.54, + "learning_rate": 8.519793459552494e-08, + "logits/chosen": -2.4429402351379395, + "logits/rejected": -2.4843060970306396, + "logps/chosen": -270.4953308105469, + "logps/rejected": -339.0693054199219, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.785670816898346, + "rewards/margins": 10.222851753234863, + "rewards/rejected": -11.008522987365723, + "step": 4920 + }, + { + "epoch": 2.55, + "learning_rate": 8.424172882004207e-08, + "logits/chosen": -2.4533307552337646, + "logits/rejected": -2.4679694175720215, + "logps/chosen": -287.844482421875, + "logps/rejected": -376.2936706542969, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5463220477104187, + "rewards/margins": 9.996697425842285, + "rewards/rejected": -10.543018341064453, + "step": 4930 + }, + { + "epoch": 2.55, + "learning_rate": 8.328552304455919e-08, + "logits/chosen": -2.5091729164123535, + "logits/rejected": -2.5005977153778076, + "logps/chosen": -256.46240234375, + "logps/rejected": -319.3448181152344, + "loss": 0.0136, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2730116844177246, + "rewards/margins": 9.923606872558594, + "rewards/rejected": -10.19661808013916, + "step": 4940 + }, + { + "epoch": 2.56, + "learning_rate": 8.23293172690763e-08, + "logits/chosen": -2.511229991912842, + "logits/rejected": -2.446113109588623, + "logps/chosen": -287.6640930175781, + "logps/rejected": -343.26190185546875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.262495756149292, + "rewards/margins": 10.62315559387207, + "rewards/rejected": -10.885650634765625, + "step": 4950 + }, + { + "epoch": 2.56, + "learning_rate": 8.137311149359343e-08, + "logits/chosen": -2.594197988510132, + "logits/rejected": -2.6038312911987305, + "logps/chosen": -337.53973388671875, + "logps/rejected": -340.3900451660156, + "loss": 0.0061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.43203097581863403, + "rewards/margins": 10.693909645080566, + "rewards/rejected": -11.125940322875977, + "step": 4960 + }, + { + "epoch": 2.57, + "learning_rate": 8.041690571811053e-08, + "logits/chosen": -2.512244939804077, + "logits/rejected": -2.429426670074463, + "logps/chosen": -290.28509521484375, + "logps/rejected": -350.6308288574219, + "loss": 0.0131, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1895757913589478, + "rewards/margins": 10.664027214050293, + "rewards/rejected": -11.853602409362793, + "step": 4970 + }, + { + "epoch": 2.57, + "learning_rate": 7.946069994262765e-08, + "logits/chosen": -2.457437038421631, + "logits/rejected": -2.4107303619384766, + "logps/chosen": -266.69757080078125, + "logps/rejected": -317.18695068359375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8658391833305359, + "rewards/margins": 9.216228485107422, + "rewards/rejected": -10.082067489624023, + "step": 4980 + }, + { + "epoch": 2.58, + "learning_rate": 7.850449416714476e-08, + "logits/chosen": -2.501028537750244, + "logits/rejected": -2.5449934005737305, + "logps/chosen": -306.799560546875, + "logps/rejected": -347.26971435546875, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9260441660881042, + "rewards/margins": 9.568941116333008, + "rewards/rejected": -10.494985580444336, + "step": 4990 + }, + { + "epoch": 2.58, + "learning_rate": 7.754828839166188e-08, + "logits/chosen": -2.5003304481506348, + "logits/rejected": -2.4379677772521973, + "logps/chosen": -280.774658203125, + "logps/rejected": -358.0619201660156, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2388416528701782, + "rewards/margins": 9.900052070617676, + "rewards/rejected": -11.138894081115723, + "step": 5000 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.543748617172241, + "eval_logits/rejected": -2.5156030654907227, + "eval_logps/chosen": -318.33062744140625, + "eval_logps/rejected": -309.8016052246094, + "eval_loss": 0.7340511679649353, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -4.260715961456299, + "eval_rewards/margins": 3.4833171367645264, + "eval_rewards/rejected": -7.744033336639404, + "eval_runtime": 462.765, + "eval_samples_per_second": 4.322, + "eval_steps_per_second": 0.27, + "step": 5000 + }, + { + "epoch": 2.59, + "learning_rate": 7.6592082616179e-08, + "logits/chosen": -2.523669481277466, + "logits/rejected": -2.453369617462158, + "logps/chosen": -326.9425354003906, + "logps/rejected": -382.0200500488281, + "loss": 0.0112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6532198190689087, + "rewards/margins": 10.079858779907227, + "rewards/rejected": -11.733078002929688, + "step": 5010 + }, + { + "epoch": 2.59, + "learning_rate": 7.563587684069611e-08, + "logits/chosen": -2.41379451751709, + "logits/rejected": -2.458996534347534, + "logps/chosen": -296.97430419921875, + "logps/rejected": -358.48809814453125, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40497732162475586, + "rewards/margins": 10.697502136230469, + "rewards/rejected": -11.10247802734375, + "step": 5020 + }, + { + "epoch": 2.6, + "learning_rate": 7.467967106521324e-08, + "logits/chosen": -2.474517345428467, + "logits/rejected": -2.508488178253174, + "logps/chosen": -267.0328063964844, + "logps/rejected": -270.98907470703125, + "loss": 0.0116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9263556599617004, + "rewards/margins": 9.430020332336426, + "rewards/rejected": -10.356375694274902, + "step": 5030 + }, + { + "epoch": 2.6, + "learning_rate": 7.372346528973034e-08, + "logits/chosen": -2.5095248222351074, + "logits/rejected": -2.512310743331909, + "logps/chosen": -295.5697937011719, + "logps/rejected": -332.3825988769531, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4971369504928589, + "rewards/margins": 9.871773719787598, + "rewards/rejected": -11.36890983581543, + "step": 5040 + }, + { + "epoch": 2.61, + "learning_rate": 7.276725951424746e-08, + "logits/chosen": -2.4795002937316895, + "logits/rejected": -2.3714241981506348, + "logps/chosen": -272.24066162109375, + "logps/rejected": -346.04986572265625, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.496202826499939, + "rewards/margins": 9.852384567260742, + "rewards/rejected": -11.348587989807129, + "step": 5050 + }, + { + "epoch": 2.61, + "learning_rate": 7.181105373876457e-08, + "logits/chosen": -2.4065327644348145, + "logits/rejected": -2.5047264099121094, + "logps/chosen": -276.74053955078125, + "logps/rejected": -318.2535705566406, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8203741908073425, + "rewards/margins": 10.269460678100586, + "rewards/rejected": -11.089835166931152, + "step": 5060 + }, + { + "epoch": 2.62, + "learning_rate": 7.08548479632817e-08, + "logits/chosen": -2.472975254058838, + "logits/rejected": -2.4483237266540527, + "logps/chosen": -320.4283752441406, + "logps/rejected": -343.42498779296875, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.471790075302124, + "rewards/margins": 10.54113483428955, + "rewards/rejected": -11.012925148010254, + "step": 5070 + }, + { + "epoch": 2.62, + "learning_rate": 6.98986421877988e-08, + "logits/chosen": -2.5092146396636963, + "logits/rejected": -2.537966012954712, + "logps/chosen": -303.90447998046875, + "logps/rejected": -370.0856018066406, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.359339714050293, + "rewards/margins": 9.674978256225586, + "rewards/rejected": -11.034318923950195, + "step": 5080 + }, + { + "epoch": 2.63, + "learning_rate": 6.894243641231592e-08, + "logits/chosen": -2.3674094676971436, + "logits/rejected": -2.351562023162842, + "logps/chosen": -286.71063232421875, + "logps/rejected": -349.8485107421875, + "loss": 0.0174, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.551408052444458, + "rewards/margins": 10.066386222839355, + "rewards/rejected": -11.617793083190918, + "step": 5090 + }, + { + "epoch": 2.63, + "learning_rate": 6.798623063683305e-08, + "logits/chosen": -2.399749279022217, + "logits/rejected": -2.496814012527466, + "logps/chosen": -293.72027587890625, + "logps/rejected": -356.86395263671875, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.522544264793396, + "rewards/margins": 11.700794219970703, + "rewards/rejected": -13.223337173461914, + "step": 5100 + }, + { + "epoch": 2.64, + "learning_rate": 6.703002486135017e-08, + "logits/chosen": -2.3967783451080322, + "logits/rejected": -2.476592779159546, + "logps/chosen": -260.53924560546875, + "logps/rejected": -364.6197204589844, + "loss": 0.0223, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.540137529373169, + "rewards/margins": 11.050385475158691, + "rewards/rejected": -12.590522766113281, + "step": 5110 + }, + { + "epoch": 2.64, + "learning_rate": 6.607381908586727e-08, + "logits/chosen": -2.4503352642059326, + "logits/rejected": -2.439924955368042, + "logps/chosen": -297.86279296875, + "logps/rejected": -390.11920166015625, + "loss": 0.0201, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1085379123687744, + "rewards/margins": 10.787110328674316, + "rewards/rejected": -11.895648002624512, + "step": 5120 + }, + { + "epoch": 2.65, + "learning_rate": 6.511761331038438e-08, + "logits/chosen": -2.5204012393951416, + "logits/rejected": -2.4319424629211426, + "logps/chosen": -248.1070556640625, + "logps/rejected": -328.4260559082031, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1627001762390137, + "rewards/margins": 9.001168251037598, + "rewards/rejected": -11.163866996765137, + "step": 5130 + }, + { + "epoch": 2.65, + "learning_rate": 6.416140753490151e-08, + "logits/chosen": -2.4711902141571045, + "logits/rejected": -2.5176639556884766, + "logps/chosen": -330.0944519042969, + "logps/rejected": -382.4728088378906, + "loss": 0.0131, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.061452865600586, + "rewards/margins": 10.265039443969727, + "rewards/rejected": -11.326491355895996, + "step": 5140 + }, + { + "epoch": 2.66, + "learning_rate": 6.320520175941863e-08, + "logits/chosen": -2.353011131286621, + "logits/rejected": -2.3878352642059326, + "logps/chosen": -275.98431396484375, + "logps/rejected": -333.1819763183594, + "loss": 0.0133, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6692678928375244, + "rewards/margins": 9.493376731872559, + "rewards/rejected": -11.162644386291504, + "step": 5150 + }, + { + "epoch": 2.66, + "learning_rate": 6.224899598393573e-08, + "logits/chosen": -2.5229997634887695, + "logits/rejected": -2.5036749839782715, + "logps/chosen": -331.75408935546875, + "logps/rejected": -387.4117126464844, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1152173280715942, + "rewards/margins": 10.071974754333496, + "rewards/rejected": -11.1871919631958, + "step": 5160 + }, + { + "epoch": 2.67, + "learning_rate": 6.129279020845286e-08, + "logits/chosen": -2.4617230892181396, + "logits/rejected": -2.560640811920166, + "logps/chosen": -277.8742370605469, + "logps/rejected": -311.23907470703125, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0628654956817627, + "rewards/margins": 10.087770462036133, + "rewards/rejected": -11.150636672973633, + "step": 5170 + }, + { + "epoch": 2.67, + "learning_rate": 6.033658443296998e-08, + "logits/chosen": -2.5219173431396484, + "logits/rejected": -2.521574020385742, + "logps/chosen": -306.21929931640625, + "logps/rejected": -358.87322998046875, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.613425612449646, + "rewards/margins": 10.280952453613281, + "rewards/rejected": -11.894378662109375, + "step": 5180 + }, + { + "epoch": 2.68, + "learning_rate": 5.9380378657487085e-08, + "logits/chosen": -2.5566489696502686, + "logits/rejected": -2.5286972522735596, + "logps/chosen": -304.65264892578125, + "logps/rejected": -353.1256103515625, + "loss": 0.005, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8878873586654663, + "rewards/margins": 10.550737380981445, + "rewards/rejected": -11.438623428344727, + "step": 5190 + }, + { + "epoch": 2.68, + "learning_rate": 5.842417288200421e-08, + "logits/chosen": -2.5313048362731934, + "logits/rejected": -2.540001392364502, + "logps/chosen": -319.0318298339844, + "logps/rejected": -362.86224365234375, + "loss": 0.0089, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8091020584106445, + "rewards/margins": 10.437647819519043, + "rewards/rejected": -11.246749877929688, + "step": 5200 + }, + { + "epoch": 2.69, + "learning_rate": 5.7467967106521317e-08, + "logits/chosen": -2.4078681468963623, + "logits/rejected": -2.4756946563720703, + "logps/chosen": -228.6869354248047, + "logps/rejected": -344.0455627441406, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5125707387924194, + "rewards/margins": 10.272712707519531, + "rewards/rejected": -11.78528118133545, + "step": 5210 + }, + { + "epoch": 2.69, + "learning_rate": 5.651176133103844e-08, + "logits/chosen": -2.577641010284424, + "logits/rejected": -2.535780668258667, + "logps/chosen": -287.9211120605469, + "logps/rejected": -376.558349609375, + "loss": 0.0136, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5841994285583496, + "rewards/margins": 11.391988754272461, + "rewards/rejected": -11.976190567016602, + "step": 5220 + }, + { + "epoch": 2.7, + "learning_rate": 5.555555555555555e-08, + "logits/chosen": -2.3901116847991943, + "logits/rejected": -2.4189741611480713, + "logps/chosen": -281.49957275390625, + "logps/rejected": -300.177001953125, + "loss": 0.0139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.406322717666626, + "rewards/margins": 9.543334007263184, + "rewards/rejected": -10.94965648651123, + "step": 5230 + }, + { + "epoch": 2.71, + "learning_rate": 5.459934978007267e-08, + "logits/chosen": -2.4920616149902344, + "logits/rejected": -2.421905994415283, + "logps/chosen": -297.64984130859375, + "logps/rejected": -357.3204040527344, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1165698766708374, + "rewards/margins": 11.096078872680664, + "rewards/rejected": -12.21264934539795, + "step": 5240 + }, + { + "epoch": 2.71, + "learning_rate": 5.3643144004589786e-08, + "logits/chosen": -2.469278573989868, + "logits/rejected": -2.357398271560669, + "logps/chosen": -338.11065673828125, + "logps/rejected": -361.2065124511719, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4490266740322113, + "rewards/margins": 11.35706615447998, + "rewards/rejected": -11.806093215942383, + "step": 5250 + }, + { + "epoch": 2.72, + "learning_rate": 5.26869382291069e-08, + "logits/chosen": -2.4685468673706055, + "logits/rejected": -2.4665069580078125, + "logps/chosen": -256.52935791015625, + "logps/rejected": -312.5513916015625, + "loss": 0.0425, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2890058755874634, + "rewards/margins": 9.67116928100586, + "rewards/rejected": -10.960175514221191, + "step": 5260 + }, + { + "epoch": 2.72, + "learning_rate": 5.173073245362402e-08, + "logits/chosen": -2.429325819015503, + "logits/rejected": -2.451594829559326, + "logps/chosen": -303.85986328125, + "logps/rejected": -367.44830322265625, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4256505966186523, + "rewards/margins": 10.259121894836426, + "rewards/rejected": -11.684773445129395, + "step": 5270 + }, + { + "epoch": 2.73, + "learning_rate": 5.077452667814113e-08, + "logits/chosen": -2.3756916522979736, + "logits/rejected": -2.465467691421509, + "logps/chosen": -260.43890380859375, + "logps/rejected": -343.8309020996094, + "loss": 0.0119, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6615362167358398, + "rewards/margins": 10.869677543640137, + "rewards/rejected": -12.531213760375977, + "step": 5280 + }, + { + "epoch": 2.73, + "learning_rate": 4.981832090265825e-08, + "logits/chosen": -2.505603790283203, + "logits/rejected": -2.3853371143341064, + "logps/chosen": -265.8559265136719, + "logps/rejected": -338.4347229003906, + "loss": 0.0214, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0404921770095825, + "rewards/margins": 10.665314674377441, + "rewards/rejected": -11.705805778503418, + "step": 5290 + }, + { + "epoch": 2.74, + "learning_rate": 4.8862115127175364e-08, + "logits/chosen": -2.474071979522705, + "logits/rejected": -2.5259792804718018, + "logps/chosen": -314.3756103515625, + "logps/rejected": -350.26910400390625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8292062282562256, + "rewards/margins": 9.556681632995605, + "rewards/rejected": -11.38588809967041, + "step": 5300 + }, + { + "epoch": 2.74, + "learning_rate": 4.790590935169248e-08, + "logits/chosen": -2.52644419670105, + "logits/rejected": -2.389683961868286, + "logps/chosen": -261.18524169921875, + "logps/rejected": -429.006591796875, + "loss": 0.0202, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.313180923461914, + "rewards/margins": 10.647405624389648, + "rewards/rejected": -11.960586547851562, + "step": 5310 + }, + { + "epoch": 2.75, + "learning_rate": 4.69497035762096e-08, + "logits/chosen": -2.4015848636627197, + "logits/rejected": -2.414374351501465, + "logps/chosen": -263.62530517578125, + "logps/rejected": -333.758544921875, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.166088819503784, + "rewards/margins": 10.134454727172852, + "rewards/rejected": -12.300543785095215, + "step": 5320 + }, + { + "epoch": 2.75, + "learning_rate": 4.599349780072671e-08, + "logits/chosen": -2.4161643981933594, + "logits/rejected": -2.3900132179260254, + "logps/chosen": -307.77691650390625, + "logps/rejected": -320.50286865234375, + "loss": 0.0129, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1430935859680176, + "rewards/margins": 10.124316215515137, + "rewards/rejected": -11.267410278320312, + "step": 5330 + }, + { + "epoch": 2.76, + "learning_rate": 4.5037292025243834e-08, + "logits/chosen": -2.4651598930358887, + "logits/rejected": -2.4811933040618896, + "logps/chosen": -316.4070739746094, + "logps/rejected": -383.3251953125, + "loss": 0.01, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1585972309112549, + "rewards/margins": 9.961041450500488, + "rewards/rejected": -11.119638442993164, + "step": 5340 + }, + { + "epoch": 2.76, + "learning_rate": 4.408108624976094e-08, + "logits/chosen": -2.570382595062256, + "logits/rejected": -2.4643959999084473, + "logps/chosen": -284.5560607910156, + "logps/rejected": -375.60638427734375, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8181749582290649, + "rewards/margins": 11.522770881652832, + "rewards/rejected": -12.340944290161133, + "step": 5350 + }, + { + "epoch": 2.77, + "learning_rate": 4.3124880474278065e-08, + "logits/chosen": -2.5256872177124023, + "logits/rejected": -2.4669735431671143, + "logps/chosen": -259.1166687011719, + "logps/rejected": -323.52471923828125, + "loss": 0.0206, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9384600520133972, + "rewards/margins": 9.923190116882324, + "rewards/rejected": -10.861650466918945, + "step": 5360 + }, + { + "epoch": 2.77, + "learning_rate": 4.2168674698795174e-08, + "logits/chosen": -2.4286413192749023, + "logits/rejected": -2.4100894927978516, + "logps/chosen": -257.6455993652344, + "logps/rejected": -365.386474609375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3937734365463257, + "rewards/margins": 10.573844909667969, + "rewards/rejected": -11.967617988586426, + "step": 5370 + }, + { + "epoch": 2.78, + "learning_rate": 4.1212468923312296e-08, + "logits/chosen": -2.3846893310546875, + "logits/rejected": -2.356142520904541, + "logps/chosen": -294.99725341796875, + "logps/rejected": -372.8056945800781, + "loss": 0.0129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4807031154632568, + "rewards/margins": 10.132972717285156, + "rewards/rejected": -11.613676071166992, + "step": 5380 + }, + { + "epoch": 2.78, + "learning_rate": 4.025626314782941e-08, + "logits/chosen": -2.3499274253845215, + "logits/rejected": -2.3001906871795654, + "logps/chosen": -292.0154113769531, + "logps/rejected": -337.46771240234375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8701134920120239, + "rewards/margins": 10.553464889526367, + "rewards/rejected": -11.423578262329102, + "step": 5390 + }, + { + "epoch": 2.79, + "learning_rate": 3.930005737234653e-08, + "logits/chosen": -2.4598124027252197, + "logits/rejected": -2.4664788246154785, + "logps/chosen": -263.979248046875, + "logps/rejected": -301.59130859375, + "loss": 0.0147, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.568913221359253, + "rewards/margins": 9.618475914001465, + "rewards/rejected": -11.187389373779297, + "step": 5400 + }, + { + "epoch": 2.79, + "learning_rate": 3.8343851596863644e-08, + "logits/chosen": -2.4709832668304443, + "logits/rejected": -2.414757490158081, + "logps/chosen": -258.98486328125, + "logps/rejected": -309.8785705566406, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1285693645477295, + "rewards/margins": 10.527348518371582, + "rewards/rejected": -11.655917167663574, + "step": 5410 + }, + { + "epoch": 2.8, + "learning_rate": 3.738764582138076e-08, + "logits/chosen": -2.455583333969116, + "logits/rejected": -2.375662088394165, + "logps/chosen": -331.9709777832031, + "logps/rejected": -379.05230712890625, + "loss": 0.0132, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4132988452911377, + "rewards/margins": 10.533462524414062, + "rewards/rejected": -11.946762084960938, + "step": 5420 + }, + { + "epoch": 2.8, + "learning_rate": 3.6431440045897875e-08, + "logits/chosen": -2.401982069015503, + "logits/rejected": -2.4286303520202637, + "logps/chosen": -284.4631042480469, + "logps/rejected": -377.6039123535156, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1193106174468994, + "rewards/margins": 10.629063606262207, + "rewards/rejected": -11.748373031616211, + "step": 5430 + }, + { + "epoch": 2.81, + "learning_rate": 3.547523427041499e-08, + "logits/chosen": -2.5329856872558594, + "logits/rejected": -2.52848744392395, + "logps/chosen": -295.4744567871094, + "logps/rejected": -393.1733703613281, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9103116989135742, + "rewards/margins": 11.393068313598633, + "rewards/rejected": -12.303380012512207, + "step": 5440 + }, + { + "epoch": 2.81, + "learning_rate": 3.4519028494932106e-08, + "logits/chosen": -2.411667823791504, + "logits/rejected": -2.3320624828338623, + "logps/chosen": -306.2268981933594, + "logps/rejected": -367.2950439453125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8251739740371704, + "rewards/margins": 10.851397514343262, + "rewards/rejected": -11.6765718460083, + "step": 5450 + }, + { + "epoch": 2.82, + "learning_rate": 3.356282271944923e-08, + "logits/chosen": -2.448438882827759, + "logits/rejected": -2.454094409942627, + "logps/chosen": -266.05267333984375, + "logps/rejected": -370.8230895996094, + "loss": 0.0156, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.932195782661438, + "rewards/margins": 10.790741920471191, + "rewards/rejected": -11.722939491271973, + "step": 5460 + }, + { + "epoch": 2.82, + "learning_rate": 3.260661694396634e-08, + "logits/chosen": -2.5592355728149414, + "logits/rejected": -2.4593288898468018, + "logps/chosen": -336.9600524902344, + "logps/rejected": -338.29656982421875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5741435289382935, + "rewards/margins": 10.8987455368042, + "rewards/rejected": -11.472888946533203, + "step": 5470 + }, + { + "epoch": 2.83, + "learning_rate": 3.165041116848346e-08, + "logits/chosen": -2.4696829319000244, + "logits/rejected": -2.4493181705474854, + "logps/chosen": -275.49609375, + "logps/rejected": -422.721923828125, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2121615409851074, + "rewards/margins": 11.010812759399414, + "rewards/rejected": -12.22297477722168, + "step": 5480 + }, + { + "epoch": 2.83, + "learning_rate": 3.0694205393000576e-08, + "logits/chosen": -2.4667916297912598, + "logits/rejected": -2.3578381538391113, + "logps/chosen": -262.20989990234375, + "logps/rejected": -345.19000244140625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7185470461845398, + "rewards/margins": 10.832283020019531, + "rewards/rejected": -11.550830841064453, + "step": 5490 + }, + { + "epoch": 2.84, + "learning_rate": 2.9737999617517688e-08, + "logits/chosen": -2.38720965385437, + "logits/rejected": -2.320250988006592, + "logps/chosen": -308.02581787109375, + "logps/rejected": -337.2959899902344, + "loss": 0.0092, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0859707593917847, + "rewards/margins": 10.13597583770752, + "rewards/rejected": -11.221944808959961, + "step": 5500 + }, + { + "epoch": 2.84, + "eval_logits/chosen": -2.5149059295654297, + "eval_logits/rejected": -2.4855728149414062, + "eval_logps/chosen": -319.4794006347656, + "eval_logps/rejected": -311.7966003417969, + "eval_loss": 0.732962429523468, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -4.375596046447754, + "eval_rewards/margins": 3.5679359436035156, + "eval_rewards/rejected": -7.943531513214111, + "eval_runtime": 463.3044, + "eval_samples_per_second": 4.317, + "eval_steps_per_second": 0.27, + "step": 5500 + }, + { + "epoch": 2.84, + "learning_rate": 2.8781793842034804e-08, + "logits/chosen": -2.3183465003967285, + "logits/rejected": -2.3097920417785645, + "logps/chosen": -262.3741455078125, + "logps/rejected": -333.424072265625, + "loss": 0.015, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.9249290227890015, + "rewards/margins": 9.927213668823242, + "rewards/rejected": -11.852142333984375, + "step": 5510 + }, + { + "epoch": 2.85, + "learning_rate": 2.782558806655192e-08, + "logits/chosen": -2.3864715099334717, + "logits/rejected": -2.427034616470337, + "logps/chosen": -284.58843994140625, + "logps/rejected": -362.7284851074219, + "loss": 0.0135, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.4909298419952393, + "rewards/margins": 9.80453872680664, + "rewards/rejected": -11.295469284057617, + "step": 5520 + }, + { + "epoch": 2.85, + "learning_rate": 2.6869382291069035e-08, + "logits/chosen": -2.4152259826660156, + "logits/rejected": -2.5505833625793457, + "logps/chosen": -306.852783203125, + "logps/rejected": -364.6080017089844, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.235000491142273, + "rewards/margins": 10.352682113647461, + "rewards/rejected": -11.587682723999023, + "step": 5530 + }, + { + "epoch": 2.86, + "learning_rate": 2.591317651558615e-08, + "logits/chosen": -2.4661903381347656, + "logits/rejected": -2.3790674209594727, + "logps/chosen": -267.7341003417969, + "logps/rejected": -350.3778991699219, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.195554256439209, + "rewards/margins": 10.710996627807617, + "rewards/rejected": -11.906549453735352, + "step": 5540 + }, + { + "epoch": 2.87, + "learning_rate": 2.4956970740103267e-08, + "logits/chosen": -2.466414213180542, + "logits/rejected": -2.3303287029266357, + "logps/chosen": -290.38360595703125, + "logps/rejected": -394.87103271484375, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4862712621688843, + "rewards/margins": 10.932287216186523, + "rewards/rejected": -12.418558120727539, + "step": 5550 + }, + { + "epoch": 2.87, + "learning_rate": 2.4000764964620386e-08, + "logits/chosen": -2.4945197105407715, + "logits/rejected": -2.4668118953704834, + "logps/chosen": -356.53009033203125, + "logps/rejected": -385.39581298828125, + "loss": 0.0193, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.3914432525634766, + "rewards/margins": 9.552518844604492, + "rewards/rejected": -10.943963050842285, + "step": 5560 + }, + { + "epoch": 2.88, + "learning_rate": 2.30445591891375e-08, + "logits/chosen": -2.5244903564453125, + "logits/rejected": -2.4784955978393555, + "logps/chosen": -332.05157470703125, + "logps/rejected": -369.33013916015625, + "loss": 0.0089, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6902099251747131, + "rewards/margins": 11.023591995239258, + "rewards/rejected": -11.713800430297852, + "step": 5570 + }, + { + "epoch": 2.88, + "learning_rate": 2.2088353413654617e-08, + "logits/chosen": -2.3487985134124756, + "logits/rejected": -2.35017728805542, + "logps/chosen": -245.8054656982422, + "logps/rejected": -308.2724914550781, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4316474199295044, + "rewards/margins": 10.602240562438965, + "rewards/rejected": -12.03388786315918, + "step": 5580 + }, + { + "epoch": 2.89, + "learning_rate": 2.1132147638171733e-08, + "logits/chosen": -2.430948257446289, + "logits/rejected": -2.4136359691619873, + "logps/chosen": -328.83123779296875, + "logps/rejected": -336.42816162109375, + "loss": 0.0124, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2018016129732132, + "rewards/margins": 10.71995735168457, + "rewards/rejected": -10.921757698059082, + "step": 5590 + }, + { + "epoch": 2.89, + "learning_rate": 2.0175941862688848e-08, + "logits/chosen": -2.4677605628967285, + "logits/rejected": -2.4360225200653076, + "logps/chosen": -256.86199951171875, + "logps/rejected": -328.8375549316406, + "loss": 0.0068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.112344741821289, + "rewards/margins": 10.564764022827148, + "rewards/rejected": -11.677107810974121, + "step": 5600 + }, + { + "epoch": 2.9, + "learning_rate": 1.9219736087205964e-08, + "logits/chosen": -2.398508071899414, + "logits/rejected": -2.424389123916626, + "logps/chosen": -263.18658447265625, + "logps/rejected": -337.9579772949219, + "loss": 0.0184, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.4862511157989502, + "rewards/margins": 9.879942893981934, + "rewards/rejected": -11.366193771362305, + "step": 5610 + }, + { + "epoch": 2.9, + "learning_rate": 1.826353031172308e-08, + "logits/chosen": -2.363523483276367, + "logits/rejected": -2.4009668827056885, + "logps/chosen": -320.4283142089844, + "logps/rejected": -435.572509765625, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7103533148765564, + "rewards/margins": 11.305900573730469, + "rewards/rejected": -12.016253471374512, + "step": 5620 + }, + { + "epoch": 2.91, + "learning_rate": 1.73073245362402e-08, + "logits/chosen": -2.418933391571045, + "logits/rejected": -2.3981566429138184, + "logps/chosen": -300.5345764160156, + "logps/rejected": -324.4273376464844, + "loss": 0.018, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3486359119415283, + "rewards/margins": 10.127408981323242, + "rewards/rejected": -11.476043701171875, + "step": 5630 + }, + { + "epoch": 2.91, + "learning_rate": 1.6351118760757314e-08, + "logits/chosen": -2.4389071464538574, + "logits/rejected": -2.330331802368164, + "logps/chosen": -280.1732482910156, + "logps/rejected": -337.8651123046875, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2679827213287354, + "rewards/margins": 9.86819076538086, + "rewards/rejected": -11.136173248291016, + "step": 5640 + }, + { + "epoch": 2.92, + "learning_rate": 1.539491298527443e-08, + "logits/chosen": -2.44217586517334, + "logits/rejected": -2.413456678390503, + "logps/chosen": -247.275146484375, + "logps/rejected": -348.5274963378906, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.907915711402893, + "rewards/margins": 9.879022598266602, + "rewards/rejected": -11.786938667297363, + "step": 5650 + }, + { + "epoch": 2.92, + "learning_rate": 1.4438707209791546e-08, + "logits/chosen": -2.368377685546875, + "logits/rejected": -2.5231080055236816, + "logps/chosen": -306.4520263671875, + "logps/rejected": -321.46710205078125, + "loss": 0.0285, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.2531068325042725, + "rewards/margins": 9.472089767456055, + "rewards/rejected": -10.72519588470459, + "step": 5660 + }, + { + "epoch": 2.93, + "learning_rate": 1.3482501434308661e-08, + "logits/chosen": -2.4074292182922363, + "logits/rejected": -2.306896448135376, + "logps/chosen": -296.39984130859375, + "logps/rejected": -327.9219055175781, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2679624557495117, + "rewards/margins": 9.967606544494629, + "rewards/rejected": -11.235568046569824, + "step": 5670 + }, + { + "epoch": 2.93, + "learning_rate": 1.2526295658825777e-08, + "logits/chosen": -2.5389842987060547, + "logits/rejected": -2.484412908554077, + "logps/chosen": -302.4338684082031, + "logps/rejected": -372.11822509765625, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3193234205245972, + "rewards/margins": 10.425549507141113, + "rewards/rejected": -11.744871139526367, + "step": 5680 + }, + { + "epoch": 2.94, + "learning_rate": 1.1570089883342895e-08, + "logits/chosen": -2.3705217838287354, + "logits/rejected": -2.409170389175415, + "logps/chosen": -312.36614990234375, + "logps/rejected": -406.3182678222656, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.161109209060669, + "rewards/margins": 11.536172866821289, + "rewards/rejected": -12.697282791137695, + "step": 5690 + }, + { + "epoch": 2.94, + "learning_rate": 1.061388410786001e-08, + "logits/chosen": -2.424706220626831, + "logits/rejected": -2.459883213043213, + "logps/chosen": -288.7680969238281, + "logps/rejected": -316.0674743652344, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.269592523574829, + "rewards/margins": 9.58020305633545, + "rewards/rejected": -10.849796295166016, + "step": 5700 + }, + { + "epoch": 2.95, + "learning_rate": 9.657678332377126e-09, + "logits/chosen": -2.3998100757598877, + "logits/rejected": -2.329770565032959, + "logps/chosen": -285.0074157714844, + "logps/rejected": -316.50445556640625, + "loss": 0.0137, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.7311685085296631, + "rewards/margins": 9.935151100158691, + "rewards/rejected": -10.6663179397583, + "step": 5710 + }, + { + "epoch": 2.95, + "learning_rate": 8.701472556894243e-09, + "logits/chosen": -2.3474459648132324, + "logits/rejected": -2.4268414974212646, + "logps/chosen": -265.39727783203125, + "logps/rejected": -366.6424865722656, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.919445276260376, + "rewards/margins": 10.569762229919434, + "rewards/rejected": -12.489209175109863, + "step": 5720 + }, + { + "epoch": 2.96, + "learning_rate": 7.745266781411359e-09, + "logits/chosen": -2.407890796661377, + "logits/rejected": -2.4742534160614014, + "logps/chosen": -316.9874572753906, + "logps/rejected": -378.4074401855469, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8994172811508179, + "rewards/margins": 10.128255844116211, + "rewards/rejected": -11.027674674987793, + "step": 5730 + }, + { + "epoch": 2.96, + "learning_rate": 6.7890610059284754e-09, + "logits/chosen": -2.4006435871124268, + "logits/rejected": -2.4294021129608154, + "logps/chosen": -243.5701446533203, + "logps/rejected": -294.20233154296875, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2013685703277588, + "rewards/margins": 9.605109214782715, + "rewards/rejected": -10.806478500366211, + "step": 5740 + }, + { + "epoch": 2.97, + "learning_rate": 5.832855230445592e-09, + "logits/chosen": -2.395141124725342, + "logits/rejected": -2.4526638984680176, + "logps/chosen": -296.9883117675781, + "logps/rejected": -336.3964538574219, + "loss": 0.0085, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.263058066368103, + "rewards/margins": 10.24799633026123, + "rewards/rejected": -11.511053085327148, + "step": 5750 + }, + { + "epoch": 2.97, + "learning_rate": 4.8766494549627085e-09, + "logits/chosen": -2.397413969039917, + "logits/rejected": -2.379274368286133, + "logps/chosen": -286.8966064453125, + "logps/rejected": -352.62103271484375, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2679819166660309, + "rewards/margins": 11.343156814575195, + "rewards/rejected": -11.611139297485352, + "step": 5760 + }, + { + "epoch": 2.98, + "learning_rate": 3.920443679479824e-09, + "logits/chosen": -2.501077175140381, + "logits/rejected": -2.474958896636963, + "logps/chosen": -324.0655822753906, + "logps/rejected": -336.5184020996094, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2807409763336182, + "rewards/margins": 9.82066535949707, + "rewards/rejected": -11.10140609741211, + "step": 5770 + }, + { + "epoch": 2.98, + "learning_rate": 2.96423790399694e-09, + "logits/chosen": -2.4477200508117676, + "logits/rejected": -2.471569538116455, + "logps/chosen": -294.63897705078125, + "logps/rejected": -357.42059326171875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2787177562713623, + "rewards/margins": 9.29156494140625, + "rewards/rejected": -10.570282936096191, + "step": 5780 + }, + { + "epoch": 2.99, + "learning_rate": 2.008032128514056e-09, + "logits/chosen": -2.3351516723632812, + "logits/rejected": -2.407452344894409, + "logps/chosen": -300.2392883300781, + "logps/rejected": -361.8913269042969, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1922436952590942, + "rewards/margins": 9.98654556274414, + "rewards/rejected": -11.178790092468262, + "step": 5790 + }, + { + "epoch": 2.99, + "learning_rate": 1.0518263530311723e-09, + "logits/chosen": -2.4000933170318604, + "logits/rejected": -2.4726662635803223, + "logps/chosen": -244.21127319335938, + "logps/rejected": -344.6064453125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5048989057540894, + "rewards/margins": 9.535536766052246, + "rewards/rejected": -11.040433883666992, + "step": 5800 + }, + { + "epoch": 3.0, + "learning_rate": 9.562057754828839e-11, + "logits/chosen": -2.3397300243377686, + "logits/rejected": -2.444467067718506, + "logps/chosen": -252.40072631835938, + "logps/rejected": -342.96197509765625, + "loss": 0.0231, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8680833578109741, + "rewards/margins": 9.311616897583008, + "rewards/rejected": -10.17970085144043, + "step": 5810 + }, + { + "epoch": 3.0, + "step": 5811, + "total_flos": 0.0, + "train_loss": 0.21807545795603994, + "train_runtime": 74953.8161, + "train_samples_per_second": 2.48, + "train_steps_per_second": 0.078 + } + ], + "logging_steps": 10, + "max_steps": 5811, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}