{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 84.02989783534706, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0397655963897705, "logits/rejected": -1.0092562437057495, "logps/chosen": -0.7628876566886902, "logps/rejected": -0.7414335012435913, "loss": 4.8657, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -7.628875732421875, "rewards/margins": -0.21454186737537384, "rewards/rejected": -7.414334774017334, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 65.98135057247741, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0671762228012085, "logits/rejected": -0.9972389340400696, "logps/chosen": -0.7587485909461975, "logps/rejected": -0.7000675201416016, "loss": 4.7692, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -7.587485313415527, "rewards/margins": -0.5868101119995117, "rewards/rejected": -7.000675201416016, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 81.81057773129896, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9940463900566101, "logits/rejected": -1.014291524887085, "logps/chosen": -0.7531558871269226, "logps/rejected": -0.7980529069900513, "loss": 5.0125, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -7.531558990478516, "rewards/margins": 0.44897016882896423, "rewards/rejected": -7.980528831481934, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 131.3611224894929, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9941411018371582, "logits/rejected": -0.9676485061645508, "logps/chosen": -0.7748786807060242, "logps/rejected": -0.7950050234794617, "loss": 4.3887, "rewards/accuracies": 0.5, "rewards/chosen": -7.748786926269531, "rewards/margins": 0.20126314461231232, "rewards/rejected": -7.950050354003906, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 41.98227986443809, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.06044602394104, "logits/rejected": -1.0300180912017822, "logps/chosen": -0.5242100358009338, "logps/rejected": -0.5184083580970764, "loss": 4.4531, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -5.242100715637207, "rewards/margins": -0.05801659822463989, "rewards/rejected": -5.184083461761475, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 68.84250778408698, "learning_rate": 6.382978723404255e-07, "logits/chosen": -1.0604535341262817, "logits/rejected": -1.0147985219955444, "logps/chosen": -0.5427820086479187, "logps/rejected": -0.538798451423645, "loss": 4.0481, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -5.427820205688477, "rewards/margins": -0.03983556479215622, "rewards/rejected": -5.3879852294921875, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 55.83166407300918, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.1716325283050537, "logits/rejected": -1.0896761417388916, "logps/chosen": -0.45515695214271545, "logps/rejected": -0.46327242255210876, "loss": 3.8398, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -4.55156946182251, "rewards/margins": 0.08115490525960922, "rewards/rejected": -4.632723808288574, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 47.314667796032154, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.1367859840393066, "logits/rejected": -1.0874762535095215, "logps/chosen": -0.3599122166633606, "logps/rejected": -0.4211342930793762, "loss": 3.8501, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.5991222858428955, "rewards/margins": 0.6122205853462219, "rewards/rejected": -4.211342811584473, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 43.42431109758399, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.17836332321167, "logits/rejected": -1.1281588077545166, "logps/chosen": -0.377611368894577, "logps/rejected": -0.42245084047317505, "loss": 3.7936, "rewards/accuracies": 0.53125, "rewards/chosen": -3.7761130332946777, "rewards/margins": 0.448394775390625, "rewards/rejected": -4.224508762359619, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 50.23949059041095, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.1873140335083008, "logits/rejected": -1.1317476034164429, "logps/chosen": -0.34768182039260864, "logps/rejected": -0.38563376665115356, "loss": 3.7644, "rewards/accuracies": 0.5, "rewards/chosen": -3.476818561553955, "rewards/margins": 0.3795194625854492, "rewards/rejected": -3.856337785720825, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 77.86452279542212, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.2336595058441162, "logits/rejected": -1.1919711828231812, "logps/chosen": -0.33353298902511597, "logps/rejected": -0.3969067931175232, "loss": 3.6758, "rewards/accuracies": 0.59375, "rewards/chosen": -3.3353302478790283, "rewards/margins": 0.6337377429008484, "rewards/rejected": -3.9690680503845215, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 48.54307450778913, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.2853938341140747, "logits/rejected": -1.2468154430389404, "logps/chosen": -0.3357910215854645, "logps/rejected": -0.36223435401916504, "loss": 3.6487, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -3.35791015625, "rewards/margins": 0.26443368196487427, "rewards/rejected": -3.6223437786102295, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 67.81573981196492, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.203680396080017, "logits/rejected": -1.1713194847106934, "logps/chosen": -0.3974061906337738, "logps/rejected": -0.4787389636039734, "loss": 3.5226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.9740614891052246, "rewards/margins": 0.813327431678772, "rewards/rejected": -4.787389278411865, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 43.70954674356485, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.2420989274978638, "logits/rejected": -1.211631178855896, "logps/chosen": -0.43263110518455505, "logps/rejected": -0.5376982688903809, "loss": 3.6077, "rewards/accuracies": 0.59375, "rewards/chosen": -4.3263115882873535, "rewards/margins": 1.0506718158721924, "rewards/rejected": -5.376982688903809, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 76.0561732944734, "learning_rate": 9.890738003669027e-07, "logits/chosen": -1.196466088294983, "logits/rejected": -1.1188781261444092, "logps/chosen": -0.4406164586544037, "logps/rejected": -0.5367096662521362, "loss": 3.5757, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.40616512298584, "rewards/margins": 0.9609323740005493, "rewards/rejected": -5.3670973777771, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 51.31544624480531, "learning_rate": 9.848447601883433e-07, "logits/chosen": -1.189353346824646, "logits/rejected": -1.1714979410171509, "logps/chosen": -0.38350600004196167, "logps/rejected": -0.5140829682350159, "loss": 3.4182, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.835059642791748, "rewards/margins": 1.3057689666748047, "rewards/rejected": -5.140829563140869, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 72.27870189202869, "learning_rate": 9.799376207714444e-07, "logits/chosen": -1.2069041728973389, "logits/rejected": -1.1819300651550293, "logps/chosen": -0.4050617218017578, "logps/rejected": -0.46999722719192505, "loss": 3.4683, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.05061674118042, "rewards/margins": 0.6493551731109619, "rewards/rejected": -4.699972152709961, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 65.15839777923942, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.2592518329620361, "logits/rejected": -1.2168278694152832, "logps/chosen": -0.49755024909973145, "logps/rejected": -0.6345678567886353, "loss": 3.5477, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.975502014160156, "rewards/margins": 1.3701757192611694, "rewards/rejected": -6.345677852630615, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 86.65912439673568, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.324328064918518, "logits/rejected": -1.2326462268829346, "logps/chosen": -0.5199041962623596, "logps/rejected": -0.5949305295944214, "loss": 3.3966, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.199041843414307, "rewards/margins": 0.7502638697624207, "rewards/rejected": -5.949306011199951, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 84.91017386381003, "learning_rate": 9.612209208833646e-07, "logits/chosen": -1.223847508430481, "logits/rejected": -1.193149209022522, "logps/chosen": -0.5218724012374878, "logps/rejected": -0.6197377443313599, "loss": 3.4071, "rewards/accuracies": 0.65625, "rewards/chosen": -5.218724250793457, "rewards/margins": 0.978653609752655, "rewards/rejected": -6.197378158569336, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 122.37976199382105, "learning_rate": 9.536793472839324e-07, "logits/chosen": -1.2392634153366089, "logits/rejected": -1.178882360458374, "logps/chosen": -0.47783392667770386, "logps/rejected": -0.6210560202598572, "loss": 3.3437, "rewards/accuracies": 0.71875, "rewards/chosen": -4.7783403396606445, "rewards/margins": 1.432220220565796, "rewards/rejected": -6.210559844970703, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 117.75268692874619, "learning_rate": 9.455032620941839e-07, "logits/chosen": -1.2091577053070068, "logits/rejected": -1.1453241109848022, "logps/chosen": -0.5471813678741455, "logps/rejected": -0.6967746615409851, "loss": 3.3036, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.471813678741455, "rewards/margins": 1.495932936668396, "rewards/rejected": -6.967746734619141, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 103.23266991280943, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.2898657321929932, "logits/rejected": -1.224526047706604, "logps/chosen": -0.5657516121864319, "logps/rejected": -0.684479296207428, "loss": 3.058, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.657515525817871, "rewards/margins": 1.18727707862854, "rewards/rejected": -6.84479284286499, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 87.5488547010946, "learning_rate": 9.272941683504808e-07, "logits/chosen": -1.2546017169952393, "logits/rejected": -1.157871961593628, "logps/chosen": -0.6318168640136719, "logps/rejected": -0.8914499282836914, "loss": 2.963, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.318168640136719, "rewards/margins": 2.596330165863037, "rewards/rejected": -8.914499282836914, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 136.85902662680772, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.336275339126587, "logits/rejected": -1.2872272729873657, "logps/chosen": -0.6883447766304016, "logps/rejected": -0.8685702085494995, "loss": 2.782, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.883447170257568, "rewards/margins": 1.8022544384002686, "rewards/rejected": -8.685701370239258, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 219.78249066687138, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.365232229232788, "logits/rejected": -1.354945182800293, "logps/chosen": -0.795397162437439, "logps/rejected": -1.2445639371871948, "loss": 2.7364, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.953972816467285, "rewards/margins": 4.491666793823242, "rewards/rejected": -12.445638656616211, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 108.29484064425847, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.3360868692398071, "logits/rejected": -1.2848550081253052, "logps/chosen": -0.8393806219100952, "logps/rejected": -1.0930553674697876, "loss": 2.7236, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.393805503845215, "rewards/margins": 2.536748170852661, "rewards/rejected": -10.930554389953613, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 111.40981489221316, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.4011452198028564, "logits/rejected": -1.3746473789215088, "logps/chosen": -0.8652567863464355, "logps/rejected": -1.1113642454147339, "loss": 2.6512, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.652568817138672, "rewards/margins": 2.4610743522644043, "rewards/rejected": -11.113641738891602, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 138.17602287907837, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.3082659244537354, "logits/rejected": -1.2775059938430786, "logps/chosen": -0.8726702928543091, "logps/rejected": -1.1962165832519531, "loss": 2.5784, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.726702690124512, "rewards/margins": 3.2354626655578613, "rewards/rejected": -11.962165832519531, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 115.35023284841587, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.3314917087554932, "logits/rejected": -1.306792974472046, "logps/chosen": -1.0279521942138672, "logps/rejected": -1.3382768630981445, "loss": 2.47, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -10.279521942138672, "rewards/margins": 3.1032464504241943, "rewards/rejected": -13.382769584655762, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 144.67956167588085, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.3292877674102783, "logits/rejected": -1.3038713932037354, "logps/chosen": -1.035979986190796, "logps/rejected": -1.4305903911590576, "loss": 2.4435, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -10.359800338745117, "rewards/margins": 3.946104049682617, "rewards/rejected": -14.305903434753418, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 106.03488572932915, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.3299791812896729, "logits/rejected": -1.3083471059799194, "logps/chosen": -1.1912587881088257, "logps/rejected": -1.648496389389038, "loss": 2.3299, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -11.912586212158203, "rewards/margins": 4.5723772048950195, "rewards/rejected": -16.484966278076172, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 124.30842086260031, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.3360395431518555, "logits/rejected": -1.3087151050567627, "logps/chosen": -1.2806798219680786, "logps/rejected": -1.7463128566741943, "loss": 2.1935, "rewards/accuracies": 0.75, "rewards/chosen": -12.806798934936523, "rewards/margins": 4.656330108642578, "rewards/rejected": -17.463130950927734, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 159.71509664144037, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.3390699625015259, "logits/rejected": -1.3161685466766357, "logps/chosen": -1.27903151512146, "logps/rejected": -1.8541208505630493, "loss": 2.1363, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -12.790315628051758, "rewards/margins": 5.750893592834473, "rewards/rejected": -18.541208267211914, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 163.13967408983922, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.359133243560791, "logits/rejected": -1.3104041814804077, "logps/chosen": -1.4678348302841187, "logps/rejected": -1.8786561489105225, "loss": 2.0099, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.678349494934082, "rewards/margins": 4.108211517333984, "rewards/rejected": -18.786561965942383, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 213.96398552844718, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.3180896043777466, "logits/rejected": -1.2993403673171997, "logps/chosen": -1.395212173461914, "logps/rejected": -1.9256620407104492, "loss": 1.8566, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.952122688293457, "rewards/margins": 5.304497718811035, "rewards/rejected": -19.25661849975586, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 151.287610930137, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.354866623878479, "logits/rejected": -1.2994146347045898, "logps/chosen": -1.5145620107650757, "logps/rejected": -2.0945541858673096, "loss": 1.9317, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.14561939239502, "rewards/margins": 5.799921989440918, "rewards/rejected": -20.945541381835938, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 147.29173146280465, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.348362684249878, "logits/rejected": -1.3590227365493774, "logps/chosen": -1.7428133487701416, "logps/rejected": -2.511070966720581, "loss": 1.7298, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -17.428131103515625, "rewards/margins": 7.682579040527344, "rewards/rejected": -25.1107120513916, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 107.1032490301315, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.3077692985534668, "logits/rejected": -1.2952277660369873, "logps/chosen": -1.8426529169082642, "logps/rejected": -2.392831325531006, "loss": 1.9538, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -18.426528930664062, "rewards/margins": 5.5017852783203125, "rewards/rejected": -23.928314208984375, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 190.9422038213508, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.3160083293914795, "logits/rejected": -1.3034440279006958, "logps/chosen": -1.7327241897583008, "logps/rejected": -2.3605778217315674, "loss": 1.6663, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -17.327241897583008, "rewards/margins": 6.278534889221191, "rewards/rejected": -23.605777740478516, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 161.95141010920645, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.3677047491073608, "logits/rejected": -1.3167794942855835, "logps/chosen": -1.8963912725448608, "logps/rejected": -2.526191234588623, "loss": 1.8748, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -18.96391487121582, "rewards/margins": 6.297998905181885, "rewards/rejected": -25.261911392211914, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 150.6462623725548, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.400187373161316, "logits/rejected": -1.3892953395843506, "logps/chosen": -2.041691541671753, "logps/rejected": -2.6361751556396484, "loss": 1.827, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -20.416913986206055, "rewards/margins": 5.9448370933532715, "rewards/rejected": -26.361751556396484, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 126.80315819769801, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.3723266124725342, "logits/rejected": -1.356567144393921, "logps/chosen": -2.0259296894073486, "logps/rejected": -2.754607677459717, "loss": 1.6874, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -20.259296417236328, "rewards/margins": 7.286777496337891, "rewards/rejected": -27.546072006225586, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 161.97036694522683, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.4270765781402588, "logits/rejected": -1.4134238958358765, "logps/chosen": -2.0706095695495605, "logps/rejected": -2.8078479766845703, "loss": 1.5613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.706096649169922, "rewards/margins": 7.372382164001465, "rewards/rejected": -28.078479766845703, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 119.50283654492473, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.4146029949188232, "logits/rejected": -1.3837422132492065, "logps/chosen": -2.119856595993042, "logps/rejected": -2.7663817405700684, "loss": 1.7069, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -21.198566436767578, "rewards/margins": 6.465248107910156, "rewards/rejected": -27.663818359375, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 109.77939760201329, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.4831273555755615, "logits/rejected": -1.435591459274292, "logps/chosen": -2.0505659580230713, "logps/rejected": -2.685992956161499, "loss": 1.5078, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -20.505659103393555, "rewards/margins": 6.354269027709961, "rewards/rejected": -26.859928131103516, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 147.37811459958783, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.4949449300765991, "logits/rejected": -1.469238042831421, "logps/chosen": -2.045539617538452, "logps/rejected": -2.816204071044922, "loss": 1.4911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.455394744873047, "rewards/margins": 7.706644535064697, "rewards/rejected": -28.162038803100586, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 132.440512722152, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.461572289466858, "logits/rejected": -1.4638675451278687, "logps/chosen": -1.9458844661712646, "logps/rejected": -2.7108004093170166, "loss": 1.5773, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -19.458845138549805, "rewards/margins": 7.6491570472717285, "rewards/rejected": -27.10800552368164, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 236.09806301372015, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.5521364212036133, "logits/rejected": -1.4946634769439697, "logps/chosen": -2.0292551517486572, "logps/rejected": -2.9242515563964844, "loss": 1.395, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -20.292551040649414, "rewards/margins": 8.949962615966797, "rewards/rejected": -29.242517471313477, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 142.79859396103615, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.4808663129806519, "logits/rejected": -1.4642443656921387, "logps/chosen": -2.046252489089966, "logps/rejected": -2.9118173122406006, "loss": 1.3738, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -20.462526321411133, "rewards/margins": 8.655647277832031, "rewards/rejected": -29.1181697845459, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 155.0378761351227, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.5233964920043945, "logits/rejected": -1.5014541149139404, "logps/chosen": -2.2131590843200684, "logps/rejected": -2.9324965476989746, "loss": 1.5353, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -22.131589889526367, "rewards/margins": 7.193373680114746, "rewards/rejected": -29.324962615966797, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 130.04315061980418, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.5009315013885498, "logits/rejected": -1.474130630493164, "logps/chosen": -2.1018893718719482, "logps/rejected": -2.8382973670959473, "loss": 1.5292, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -21.018896102905273, "rewards/margins": 7.364079475402832, "rewards/rejected": -28.382976531982422, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 119.27303382130957, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.4742642641067505, "logits/rejected": -1.4596500396728516, "logps/chosen": -2.2155094146728516, "logps/rejected": -3.0179553031921387, "loss": 1.1693, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -22.15509605407715, "rewards/margins": 8.024459838867188, "rewards/rejected": -30.179553985595703, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 131.97372035922703, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.4763275384902954, "logits/rejected": -1.4486229419708252, "logps/chosen": -1.990666151046753, "logps/rejected": -2.838904857635498, "loss": 1.3013, "rewards/accuracies": 0.875, "rewards/chosen": -19.906661987304688, "rewards/margins": 8.482388496398926, "rewards/rejected": -28.389049530029297, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 104.56387348722761, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.4566619396209717, "logits/rejected": -1.4216809272766113, "logps/chosen": -1.9770195484161377, "logps/rejected": -2.719089984893799, "loss": 1.4258, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -19.77019500732422, "rewards/margins": 7.4207048416137695, "rewards/rejected": -27.190898895263672, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 133.9300472658157, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.542373776435852, "logits/rejected": -1.5213029384613037, "logps/chosen": -2.0355706214904785, "logps/rejected": -2.883401393890381, "loss": 1.3022, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -20.3557071685791, "rewards/margins": 8.478304862976074, "rewards/rejected": -28.834014892578125, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 118.2550806898935, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.501997947692871, "logits/rejected": -1.4718389511108398, "logps/chosen": -2.043879270553589, "logps/rejected": -2.9162051677703857, "loss": 1.0025, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.438793182373047, "rewards/margins": 8.723258972167969, "rewards/rejected": -29.16205406188965, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 112.08802157558269, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.5190720558166504, "logits/rejected": -1.4788782596588135, "logps/chosen": -2.014256000518799, "logps/rejected": -2.7785563468933105, "loss": 1.3157, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -20.142559051513672, "rewards/margins": 7.64300537109375, "rewards/rejected": -27.785564422607422, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 122.16012718306597, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.4519203901290894, "logits/rejected": -1.4377477169036865, "logps/chosen": -2.052429437637329, "logps/rejected": -2.7946228981018066, "loss": 1.2035, "rewards/accuracies": 0.8125, "rewards/chosen": -20.524295806884766, "rewards/margins": 7.421932220458984, "rewards/rejected": -27.94622802734375, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 127.59960775053135, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.3882105350494385, "logits/rejected": -1.365216612815857, "logps/chosen": -2.0868842601776123, "logps/rejected": -2.7921385765075684, "loss": 1.5608, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -20.86884307861328, "rewards/margins": 7.0525383949279785, "rewards/rejected": -27.9213809967041, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 104.5591334100815, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.5032074451446533, "logits/rejected": -1.458888053894043, "logps/chosen": -2.047602415084839, "logps/rejected": -2.731414794921875, "loss": 1.3552, "rewards/accuracies": 0.8125, "rewards/chosen": -20.476024627685547, "rewards/margins": 6.8381242752075195, "rewards/rejected": -27.31414794921875, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 128.9435075925882, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.5344185829162598, "logits/rejected": -1.520206093788147, "logps/chosen": -2.182490587234497, "logps/rejected": -2.994981288909912, "loss": 1.3424, "rewards/accuracies": 0.90625, "rewards/chosen": -21.824905395507812, "rewards/margins": 8.124906539916992, "rewards/rejected": -29.949810028076172, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 117.862174484777, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.5316812992095947, "logits/rejected": -1.4777030944824219, "logps/chosen": -2.1463074684143066, "logps/rejected": -2.8947274684906006, "loss": 1.2773, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -21.46307373046875, "rewards/margins": 7.484200954437256, "rewards/rejected": -28.947277069091797, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 154.98050630680447, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.5196678638458252, "logits/rejected": -1.5156781673431396, "logps/chosen": -2.249702215194702, "logps/rejected": -3.2510273456573486, "loss": 1.0665, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -22.497024536132812, "rewards/margins": 10.013248443603516, "rewards/rejected": -32.51027297973633, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 180.45567365921923, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.549286127090454, "logits/rejected": -1.5002086162567139, "logps/chosen": -2.399672031402588, "logps/rejected": -3.504227876663208, "loss": 1.2384, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -23.996719360351562, "rewards/margins": 11.045561790466309, "rewards/rejected": -35.04228210449219, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 105.90784807533655, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.4591710567474365, "logits/rejected": -1.4205106496810913, "logps/chosen": -2.2293872833251953, "logps/rejected": -3.00614857673645, "loss": 1.2359, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -22.293874740600586, "rewards/margins": 7.767613410949707, "rewards/rejected": -30.061487197875977, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 124.3214763138239, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.495527744293213, "logits/rejected": -1.4803019762039185, "logps/chosen": -2.202894449234009, "logps/rejected": -3.0441994667053223, "loss": 1.1359, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -22.02894401550293, "rewards/margins": 8.413049697875977, "rewards/rejected": -30.441997528076172, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 156.03049853633283, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.5111547708511353, "logits/rejected": -1.4789291620254517, "logps/chosen": -2.1601741313934326, "logps/rejected": -3.021275281906128, "loss": 1.2419, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -21.601741790771484, "rewards/margins": 8.611013412475586, "rewards/rejected": -30.212757110595703, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 130.76746865900543, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.5371043682098389, "logits/rejected": -1.5177440643310547, "logps/chosen": -2.2243714332580566, "logps/rejected": -3.2498092651367188, "loss": 1.2299, "rewards/accuracies": 0.90625, "rewards/chosen": -22.243711471557617, "rewards/margins": 10.25438117980957, "rewards/rejected": -32.49809265136719, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 219.49500818808932, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.5225059986114502, "logits/rejected": -1.5286823511123657, "logps/chosen": -2.12577486038208, "logps/rejected": -3.068633794784546, "loss": 1.164, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -21.257749557495117, "rewards/margins": 9.428587913513184, "rewards/rejected": -30.686336517333984, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 148.30498812771748, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.4495574235916138, "logits/rejected": -1.4652235507965088, "logps/chosen": -2.066810131072998, "logps/rejected": -2.938152313232422, "loss": 1.1351, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -20.668102264404297, "rewards/margins": 8.713422775268555, "rewards/rejected": -29.38152503967285, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 116.32905868645479, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.5009536743164062, "logits/rejected": -1.5022971630096436, "logps/chosen": -2.0494394302368164, "logps/rejected": -3.0328898429870605, "loss": 1.1972, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -20.494396209716797, "rewards/margins": 9.834505081176758, "rewards/rejected": -30.328899383544922, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 108.7134031593527, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.5464454889297485, "logits/rejected": -1.4812748432159424, "logps/chosen": -2.1187500953674316, "logps/rejected": -3.0532031059265137, "loss": 1.253, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -21.187496185302734, "rewards/margins": 9.344534873962402, "rewards/rejected": -30.532033920288086, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 145.2134549922322, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.474498987197876, "logits/rejected": -1.4640263319015503, "logps/chosen": -2.100693941116333, "logps/rejected": -2.9064152240753174, "loss": 1.1015, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -21.00693702697754, "rewards/margins": 8.05721378326416, "rewards/rejected": -29.064151763916016, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 130.4190340462438, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.4943348169326782, "logits/rejected": -1.4995317459106445, "logps/chosen": -2.2358219623565674, "logps/rejected": -3.1511778831481934, "loss": 1.1776, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -22.35822105407715, "rewards/margins": 9.153559684753418, "rewards/rejected": -31.51177978515625, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 171.4769450829484, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.5206212997436523, "logits/rejected": -1.5025417804718018, "logps/chosen": -2.1182684898376465, "logps/rejected": -2.9665935039520264, "loss": 1.2678, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -21.18268394470215, "rewards/margins": 8.48325252532959, "rewards/rejected": -29.665935516357422, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 116.74061042296164, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.5152442455291748, "logits/rejected": -1.4944274425506592, "logps/chosen": -2.0695395469665527, "logps/rejected": -3.1040778160095215, "loss": 1.0078, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -20.69539451599121, "rewards/margins": 10.345380783081055, "rewards/rejected": -31.0407772064209, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 112.33137236518094, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.5178992748260498, "logits/rejected": -1.5032987594604492, "logps/chosen": -2.170666456222534, "logps/rejected": -3.162553310394287, "loss": 1.2224, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -21.7066650390625, "rewards/margins": 9.918868064880371, "rewards/rejected": -31.62552833557129, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 186.4853149645633, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.5229237079620361, "logits/rejected": -1.505030632019043, "logps/chosen": -2.0735864639282227, "logps/rejected": -2.9833900928497314, "loss": 1.1526, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -20.73586654663086, "rewards/margins": 9.098031997680664, "rewards/rejected": -29.833898544311523, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 96.6762011696182, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.5230014324188232, "logits/rejected": -1.5108994245529175, "logps/chosen": -2.109778642654419, "logps/rejected": -2.941293716430664, "loss": 1.1544, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -21.09778594970703, "rewards/margins": 8.315154075622559, "rewards/rejected": -29.412939071655273, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.710108757019043, "eval_logits/rejected": -1.7197374105453491, "eval_logps/chosen": -2.08453369140625, "eval_logps/rejected": -2.94063138961792, "eval_loss": 1.1388919353485107, "eval_rewards/accuracies": 0.8678861856460571, "eval_rewards/chosen": -20.845340728759766, "eval_rewards/margins": 8.560976028442383, "eval_rewards/rejected": -29.406312942504883, "eval_runtime": 123.1178, "eval_samples_per_second": 15.928, "eval_steps_per_second": 0.999, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 143.6008976778444, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.4884498119354248, "logits/rejected": -1.508371114730835, "logps/chosen": -2.2530012130737305, "logps/rejected": -3.071047782897949, "loss": 1.1786, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -22.530010223388672, "rewards/margins": 8.18046760559082, "rewards/rejected": -30.710479736328125, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 145.0618422989251, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.4991635084152222, "logits/rejected": -1.4899944067001343, "logps/chosen": -2.1319069862365723, "logps/rejected": -2.977902889251709, "loss": 1.073, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -21.31907081604004, "rewards/margins": 8.4599609375, "rewards/rejected": -29.77903175354004, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 121.04879854574509, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.5042963027954102, "logits/rejected": -1.4524286985397339, "logps/chosen": -2.0573108196258545, "logps/rejected": -3.0475449562072754, "loss": 1.2613, "rewards/accuracies": 0.875, "rewards/chosen": -20.573108673095703, "rewards/margins": 9.902341842651367, "rewards/rejected": -30.475452423095703, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 139.53176945804844, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.5310368537902832, "logits/rejected": -1.4835598468780518, "logps/chosen": -2.005772352218628, "logps/rejected": -3.0218088626861572, "loss": 1.1364, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -20.057723999023438, "rewards/margins": 10.160362243652344, "rewards/rejected": -30.218088150024414, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 136.53214771188803, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.525810718536377, "logits/rejected": -1.51809823513031, "logps/chosen": -2.186115026473999, "logps/rejected": -3.016331911087036, "loss": 1.4659, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -21.86115074157715, "rewards/margins": 8.302164077758789, "rewards/rejected": -30.163314819335938, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 159.75999561219473, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.5137279033660889, "logits/rejected": -1.5020883083343506, "logps/chosen": -2.2320022583007812, "logps/rejected": -3.141892671585083, "loss": 1.1123, "rewards/accuracies": 0.90625, "rewards/chosen": -22.320022583007812, "rewards/margins": 9.098905563354492, "rewards/rejected": -31.418926239013672, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 103.73846522782931, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.469313383102417, "logits/rejected": -1.4278925657272339, "logps/chosen": -1.988031029701233, "logps/rejected": -2.880598545074463, "loss": 1.1778, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -19.880308151245117, "rewards/margins": 8.925679206848145, "rewards/rejected": -28.805988311767578, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 103.8822744213963, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.4933596849441528, "logits/rejected": -1.4389324188232422, "logps/chosen": -2.1481704711914062, "logps/rejected": -3.0063669681549072, "loss": 1.0981, "rewards/accuracies": 0.90625, "rewards/chosen": -21.481706619262695, "rewards/margins": 8.581964492797852, "rewards/rejected": -30.063669204711914, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 105.17668873683024, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.4460530281066895, "logits/rejected": -1.431138277053833, "logps/chosen": -2.159626007080078, "logps/rejected": -3.2153289318084717, "loss": 1.1378, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -21.59626007080078, "rewards/margins": 10.557029724121094, "rewards/rejected": -32.153289794921875, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 149.56429045938268, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.5528205633163452, "logits/rejected": -1.5331635475158691, "logps/chosen": -2.0413358211517334, "logps/rejected": -2.8829147815704346, "loss": 1.1398, "rewards/accuracies": 0.875, "rewards/chosen": -20.413360595703125, "rewards/margins": 8.415786743164062, "rewards/rejected": -28.829147338867188, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 144.02520089734548, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.5203301906585693, "logits/rejected": -1.4858264923095703, "logps/chosen": -2.0674800872802734, "logps/rejected": -3.0988433361053467, "loss": 1.0231, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -20.674800872802734, "rewards/margins": 10.313634872436523, "rewards/rejected": -30.988433837890625, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 123.56718367330731, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.5208638906478882, "logits/rejected": -1.5143053531646729, "logps/chosen": -2.192969799041748, "logps/rejected": -3.1692872047424316, "loss": 1.1094, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -21.929697036743164, "rewards/margins": 9.763177871704102, "rewards/rejected": -31.692874908447266, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 128.9716796420016, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.4912371635437012, "logits/rejected": -1.496371865272522, "logps/chosen": -2.177973508834839, "logps/rejected": -3.1520798206329346, "loss": 1.2437, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -21.779735565185547, "rewards/margins": 9.741061210632324, "rewards/rejected": -31.520797729492188, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 2.0977548006004643, "train_runtime": 12789.999, "train_samples_per_second": 4.681, "train_steps_per_second": 0.037 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }