{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993060374739764, "eval_steps": 1000, "global_step": 720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013879250520471894, "grad_norm": 2.184646703877197, "learning_rate": 6.9444444444444435e-09, "logits/chosen": -1.157708764076233, "logits/rejected": -1.0856982469558716, "logps/chosen": -392.3704528808594, "logps/rejected": -422.7169189453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.013879250520471894, "grad_norm": 2.045186292766699, "learning_rate": 6.944444444444444e-08, "logits/chosen": -0.9201799035072327, "logits/rejected": -0.9960015416145325, "logps/chosen": -403.1753234863281, "logps/rejected": -406.3473815917969, "loss": 0.6931, "rewards/accuracies": 0.3680555522441864, "rewards/chosen": -0.0007031817804090679, "rewards/margins": -0.0005899361567571759, "rewards/rejected": -0.00011324579827487469, "step": 10 }, { "epoch": 0.027758501040943788, "grad_norm": 2.0501238605790117, "learning_rate": 1.3888888888888888e-07, "logits/chosen": -0.9155173301696777, "logits/rejected": -0.9719367027282715, "logps/chosen": -393.12945556640625, "logps/rejected": -407.33709716796875, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.0007024414371699095, "rewards/margins": 0.00011867408466059715, "rewards/rejected": 0.0005837674252688885, "step": 20 }, { "epoch": 0.041637751561415685, "grad_norm": 2.1931925008563304, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.9494641423225403, "logits/rejected": -1.0066778659820557, "logps/chosen": -429.01910400390625, "logps/rejected": -425.8172912597656, "loss": 0.6929, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.0010106399422511458, "rewards/margins": 0.0010079689091071486, "rewards/rejected": 2.671018592081964e-06, "step": 30 }, { "epoch": 0.055517002081887576, "grad_norm": 2.196961132429192, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -0.9400655627250671, "logits/rejected": -1.0188666582107544, "logps/chosen": -420.2276916503906, "logps/rejected": -405.5096130371094, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.002629968337714672, "rewards/margins": 0.001661272719502449, "rewards/rejected": 0.0009686955017969012, "step": 40 }, { "epoch": 0.06939625260235947, "grad_norm": 2.2620008213708194, "learning_rate": 3.472222222222222e-07, "logits/chosen": -0.9268127679824829, "logits/rejected": -0.9413528442382812, "logps/chosen": -407.0869140625, "logps/rejected": -433.43035888671875, "loss": 0.691, "rewards/accuracies": 0.703125, "rewards/chosen": 0.006026268471032381, "rewards/margins": 0.005162273999303579, "rewards/rejected": 0.0008639938896521926, "step": 50 }, { "epoch": 0.08327550312283137, "grad_norm": 2.9642516438971254, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.9514287710189819, "logits/rejected": -1.0300030708312988, "logps/chosen": -408.86181640625, "logps/rejected": -417.53106689453125, "loss": 0.6884, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.01152682676911354, "rewards/margins": 0.00950426422059536, "rewards/rejected": 0.002022563014179468, "step": 60 }, { "epoch": 0.09715475364330327, "grad_norm": 2.0059535680251264, "learning_rate": 4.861111111111111e-07, "logits/chosen": -0.9322255849838257, "logits/rejected": -0.9961916208267212, "logps/chosen": -416.08197021484375, "logps/rejected": -440.123291015625, "loss": 0.6845, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.02161681093275547, "rewards/margins": 0.016515256837010384, "rewards/rejected": 0.005101555027067661, "step": 70 }, { "epoch": 0.11103400416377515, "grad_norm": 2.959027274351803, "learning_rate": 4.998119881260575e-07, "logits/chosen": -0.8885990381240845, "logits/rejected": -0.9473252296447754, "logps/chosen": -412.66253662109375, "logps/rejected": -426.8203125, "loss": 0.6784, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.03722007945179939, "rewards/margins": 0.03375672921538353, "rewards/rejected": 0.0034633490722626448, "step": 80 }, { "epoch": 0.12491325468424705, "grad_norm": 2.896496746709743, "learning_rate": 4.990486745229364e-07, "logits/chosen": -0.9239265322685242, "logits/rejected": -1.015549659729004, "logps/chosen": -406.7080078125, "logps/rejected": -408.1942138671875, "loss": 0.6692, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.04666774719953537, "rewards/margins": 0.0515812449157238, "rewards/rejected": -0.004913502838462591, "step": 90 }, { "epoch": 0.13879250520471895, "grad_norm": 2.17437482262988, "learning_rate": 4.977001008412112e-07, "logits/chosen": -1.0140012502670288, "logits/rejected": -1.0407038927078247, "logps/chosen": -416.13800048828125, "logps/rejected": -424.24468994140625, "loss": 0.6619, "rewards/accuracies": 0.746874988079071, "rewards/chosen": 0.06093335896730423, "rewards/margins": 0.05933469533920288, "rewards/rejected": 0.0015986515209078789, "step": 100 }, { "epoch": 0.15267175572519084, "grad_norm": 2.0661816686924537, "learning_rate": 4.957694362057149e-07, "logits/chosen": -0.9822956919670105, "logits/rejected": -1.011765956878662, "logps/chosen": -417.8843688964844, "logps/rejected": -422.9627380371094, "loss": 0.6518, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.05249622464179993, "rewards/margins": 0.07984234392642975, "rewards/rejected": -0.027346113696694374, "step": 110 }, { "epoch": 0.16655100624566274, "grad_norm": 2.1244191981703135, "learning_rate": 4.932612176449559e-07, "logits/chosen": -0.9970356225967407, "logits/rejected": -1.0578349828720093, "logps/chosen": -407.3737487792969, "logps/rejected": -426.0777893066406, "loss": 0.6368, "rewards/accuracies": 0.75, "rewards/chosen": 0.04393979534506798, "rewards/margins": 0.10807327926158905, "rewards/rejected": -0.06413348764181137, "step": 120 }, { "epoch": 0.18043025676613464, "grad_norm": 2.0595896023881703, "learning_rate": 4.901813394291801e-07, "logits/chosen": -0.9531866908073425, "logits/rejected": -0.9673601984977722, "logps/chosen": -416.93572998046875, "logps/rejected": -443.47894287109375, "loss": 0.6201, "rewards/accuracies": 0.84375, "rewards/chosen": 0.03279733285307884, "rewards/margins": 0.1790144443511963, "rewards/rejected": -0.14621710777282715, "step": 130 }, { "epoch": 0.19430950728660654, "grad_norm": 2.169975326300987, "learning_rate": 4.865370392189376e-07, "logits/chosen": -0.9693125486373901, "logits/rejected": -0.9949439167976379, "logps/chosen": -396.1827697753906, "logps/rejected": -439.85198974609375, "loss": 0.6004, "rewards/accuracies": 0.84375, "rewards/chosen": 0.005037306807935238, "rewards/margins": 0.23115964233875275, "rewards/rejected": -0.22612233459949493, "step": 140 }, { "epoch": 0.2081887578070784, "grad_norm": 2.439547013921171, "learning_rate": 4.823368810567056e-07, "logits/chosen": -1.0007877349853516, "logits/rejected": -1.0211200714111328, "logps/chosen": -408.7399597167969, "logps/rejected": -449.01953125, "loss": 0.5878, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.07737134397029877, "rewards/margins": 0.25105008482933044, "rewards/rejected": -0.3284214437007904, "step": 150 }, { "epoch": 0.2220680083275503, "grad_norm": 2.191957070219856, "learning_rate": 4.775907352415367e-07, "logits/chosen": -0.9597848057746887, "logits/rejected": -0.9754905700683594, "logps/chosen": -404.9871826171875, "logps/rejected": -453.3285217285156, "loss": 0.572, "rewards/accuracies": 0.78125, "rewards/chosen": -0.14126327633857727, "rewards/margins": 0.30146196484565735, "rewards/rejected": -0.44272518157958984, "step": 160 }, { "epoch": 0.2359472588480222, "grad_norm": 2.8446228567237726, "learning_rate": 4.723097551340265e-07, "logits/chosen": -0.8638654947280884, "logits/rejected": -0.8823292851448059, "logps/chosen": -451.3196716308594, "logps/rejected": -487.5765075683594, "loss": 0.5457, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -0.2892600893974304, "rewards/margins": 0.37542229890823364, "rewards/rejected": -0.6646823287010193, "step": 170 }, { "epoch": 0.2498265093684941, "grad_norm": 2.488211046081056, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.9351462125778198, "logits/rejected": -0.9417260885238647, "logps/chosen": -442.58917236328125, "logps/rejected": -491.6702575683594, "loss": 0.5337, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -0.40233302116394043, "rewards/margins": 0.40838655829429626, "rewards/rejected": -0.8107194900512695, "step": 180 }, { "epoch": 0.263705759888966, "grad_norm": 3.2325426784137417, "learning_rate": 4.6019416057727577e-07, "logits/chosen": -0.9113900065422058, "logits/rejected": -0.9249471426010132, "logps/chosen": -454.93658447265625, "logps/rejected": -505.32763671875, "loss": 0.5093, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5173531174659729, "rewards/margins": 0.5457069277763367, "rewards/rejected": -1.0630600452423096, "step": 190 }, { "epoch": 0.2775850104094379, "grad_norm": 2.6935717929946374, "learning_rate": 4.5338801756574185e-07, "logits/chosen": -0.9233020544052124, "logits/rejected": -0.936613917350769, "logps/chosen": -477.2933654785156, "logps/rejected": -543.9907836914062, "loss": 0.5088, "rewards/accuracies": 0.765625, "rewards/chosen": -0.698850154876709, "rewards/margins": 0.6066737174987793, "rewards/rejected": -1.3055237531661987, "step": 200 }, { "epoch": 0.2914642609299098, "grad_norm": 2.6234937800693237, "learning_rate": 4.461039162298939e-07, "logits/chosen": -0.9328230619430542, "logits/rejected": -0.8883553743362427, "logps/chosen": -474.24176025390625, "logps/rejected": -560.80810546875, "loss": 0.4809, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7412362694740295, "rewards/margins": 0.7680382132530212, "rewards/rejected": -1.5092744827270508, "step": 210 }, { "epoch": 0.3053435114503817, "grad_norm": 3.024837883769618, "learning_rate": 4.3835897408191513e-07, "logits/chosen": -0.9258328676223755, "logits/rejected": -0.902255654335022, "logps/chosen": -545.10205078125, "logps/rejected": -611.480224609375, "loss": 0.4716, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": -0.9928766489028931, "rewards/margins": 0.8125397562980652, "rewards/rejected": -1.8054163455963135, "step": 220 }, { "epoch": 0.3192227619708536, "grad_norm": 4.999154415916733, "learning_rate": 4.301713916019286e-07, "logits/chosen": -0.864261269569397, "logits/rejected": -0.8518573045730591, "logps/chosen": -521.3760986328125, "logps/rejected": -621.6141967773438, "loss": 0.4618, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": -1.0177581310272217, "rewards/margins": 1.0421375036239624, "rewards/rejected": -2.0598955154418945, "step": 230 }, { "epoch": 0.3331020124913255, "grad_norm": 4.221789042426902, "learning_rate": 4.2156040946718343e-07, "logits/chosen": -0.8536814451217651, "logits/rejected": -0.855249285697937, "logps/chosen": -554.1260986328125, "logps/rejected": -660.2291259765625, "loss": 0.4504, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2824825048446655, "rewards/margins": 0.9512511491775513, "rewards/rejected": -2.233733654022217, "step": 240 }, { "epoch": 0.3469812630117974, "grad_norm": 2.9920792128032283, "learning_rate": 4.125462633367959e-07, "logits/chosen": -0.9102590680122375, "logits/rejected": -0.8657618761062622, "logps/chosen": -548.0431518554688, "logps/rejected": -677.654541015625, "loss": 0.4387, "rewards/accuracies": 0.809374988079071, "rewards/chosen": -1.2187608480453491, "rewards/margins": 1.2359611988067627, "rewards/rejected": -2.4547219276428223, "step": 250 }, { "epoch": 0.3608605135322693, "grad_norm": 3.276352885973741, "learning_rate": 4.031501362983007e-07, "logits/chosen": -0.9509126543998718, "logits/rejected": -0.8994159698486328, "logps/chosen": -542.1051635742188, "logps/rejected": -668.0875244140625, "loss": 0.44, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -1.3456058502197266, "rewards/margins": 1.152848243713379, "rewards/rejected": -2.4984538555145264, "step": 260 }, { "epoch": 0.3747397640527412, "grad_norm": 3.433959749322587, "learning_rate": 3.933941090877615e-07, "logits/chosen": -0.9043784141540527, "logits/rejected": -0.8728653192520142, "logps/chosen": -572.6734619140625, "logps/rejected": -690.0535278320312, "loss": 0.4269, "rewards/accuracies": 0.796875, "rewards/chosen": -1.49461030960083, "rewards/margins": 1.185040831565857, "rewards/rejected": -2.6796510219573975, "step": 270 }, { "epoch": 0.3886190145732131, "grad_norm": 3.135829679670571, "learning_rate": 3.833011082004228e-07, "logits/chosen": -0.9181197285652161, "logits/rejected": -0.8538551330566406, "logps/chosen": -580.3753662109375, "logps/rejected": -723.9002685546875, "loss": 0.4243, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.563520908355713, "rewards/margins": 1.419384479522705, "rewards/rejected": -2.982905387878418, "step": 280 }, { "epoch": 0.4024982650936849, "grad_norm": 3.4471828982674606, "learning_rate": 3.728948520138426e-07, "logits/chosen": -0.8911579847335815, "logits/rejected": -0.8040667772293091, "logps/chosen": -581.972412109375, "logps/rejected": -704.8370361328125, "loss": 0.4129, "rewards/accuracies": 0.8125, "rewards/chosen": -1.598099946975708, "rewards/margins": 1.2950611114501953, "rewards/rejected": -2.893160820007324, "step": 290 }, { "epoch": 0.4163775156141568, "grad_norm": 3.4538574024709185, "learning_rate": 3.6219979505011555e-07, "logits/chosen": -0.9019178152084351, "logits/rejected": -0.8638097047805786, "logps/chosen": -552.6265869140625, "logps/rejected": -678.7213745117188, "loss": 0.4264, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -1.5648345947265625, "rewards/margins": 1.3646188974380493, "rewards/rejected": -2.9294533729553223, "step": 300 }, { "epoch": 0.4302567661346287, "grad_norm": 4.0164936619199905, "learning_rate": 3.512410705081684e-07, "logits/chosen": -0.8717131614685059, "logits/rejected": -0.8151350021362305, "logps/chosen": -573.2371215820312, "logps/rejected": -731.2991333007812, "loss": 0.4135, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5790045261383057, "rewards/margins": 1.489929437637329, "rewards/rejected": -3.0689339637756348, "step": 310 }, { "epoch": 0.4441360166551006, "grad_norm": 3.5055099662080766, "learning_rate": 3.400444312011776e-07, "logits/chosen": -0.9052634239196777, "logits/rejected": -0.8478328585624695, "logps/chosen": -601.0723266601562, "logps/rejected": -743.2720947265625, "loss": 0.3968, "rewards/accuracies": 0.809374988079071, "rewards/chosen": -1.5770965814590454, "rewards/margins": 1.5724782943725586, "rewards/rejected": -3.1495749950408936, "step": 320 }, { "epoch": 0.4580152671755725, "grad_norm": 3.965189906947426, "learning_rate": 3.286361890379034e-07, "logits/chosen": -0.8289991617202759, "logits/rejected": -0.7707743644714355, "logps/chosen": -592.8557739257812, "logps/rejected": -759.8111572265625, "loss": 0.397, "rewards/accuracies": 0.8031250238418579, "rewards/chosen": -1.7602096796035767, "rewards/margins": 1.649737000465393, "rewards/rejected": -3.409946918487549, "step": 330 }, { "epoch": 0.4718945176960444, "grad_norm": 6.571433183841374, "learning_rate": 3.1704315319015936e-07, "logits/chosen": -0.8428624868392944, "logits/rejected": -0.807705283164978, "logps/chosen": -569.0560302734375, "logps/rejected": -749.4913330078125, "loss": 0.4054, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.711888074874878, "rewards/margins": 1.6754896640777588, "rewards/rejected": -3.387377977371216, "step": 340 }, { "epoch": 0.4857737682165163, "grad_norm": 3.3895657051926116, "learning_rate": 3.052925670917219e-07, "logits/chosen": -0.8292319178581238, "logits/rejected": -0.7592617273330688, "logps/chosen": -615.757080078125, "logps/rejected": -775.6668090820312, "loss": 0.3867, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -1.9108030796051025, "rewards/margins": 1.574349045753479, "rewards/rejected": -3.48515248298645, "step": 350 }, { "epoch": 0.4996530187369882, "grad_norm": 3.9806648944209155, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.822270393371582, "logits/rejected": -0.7529654502868652, "logps/chosen": -610.9036865234375, "logps/rejected": -785.0739135742188, "loss": 0.3914, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -2.06144380569458, "rewards/margins": 1.6444549560546875, "rewards/rejected": -3.7058987617492676, "step": 360 }, { "epoch": 0.5135322692574601, "grad_norm": 4.270423330616301, "learning_rate": 2.814295041880407e-07, "logits/chosen": -0.8429604768753052, "logits/rejected": -0.7863871455192566, "logps/chosen": -599.1904296875, "logps/rejected": -767.7243041992188, "loss": 0.391, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.957845687866211, "rewards/margins": 1.5971133708953857, "rewards/rejected": -3.5549590587615967, "step": 370 }, { "epoch": 0.527411519777932, "grad_norm": 3.711197534301825, "learning_rate": 2.6937310516798275e-07, "logits/chosen": -0.8030338287353516, "logits/rejected": -0.7218085527420044, "logps/chosen": -635.15380859375, "logps/rejected": -831.8131103515625, "loss": 0.3798, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.005901336669922, "rewards/margins": 1.8555805683135986, "rewards/rejected": -3.8614819049835205, "step": 380 }, { "epoch": 0.5412907702984039, "grad_norm": 4.065093944544315, "learning_rate": 2.5727117968577785e-07, "logits/chosen": -0.7184926867485046, "logits/rejected": -0.6400619745254517, "logps/chosen": -647.2830810546875, "logps/rejected": -867.3894653320312, "loss": 0.3737, "rewards/accuracies": 0.8531249761581421, "rewards/chosen": -2.1855947971343994, "rewards/margins": 2.0819170475006104, "rewards/rejected": -4.26751184463501, "step": 390 }, { "epoch": 0.5551700208188758, "grad_norm": 4.664675650179661, "learning_rate": 2.4515216705704393e-07, "logits/chosen": -0.7386522889137268, "logits/rejected": -0.6554276347160339, "logps/chosen": -680.4814453125, "logps/rejected": -875.1360473632812, "loss": 0.3768, "rewards/accuracies": 0.809374988079071, "rewards/chosen": -2.458164691925049, "rewards/margins": 1.9175488948822021, "rewards/rejected": -4.37571382522583, "step": 400 }, { "epoch": 0.5690492713393477, "grad_norm": 4.082366486193136, "learning_rate": 2.330445467518977e-07, "logits/chosen": -0.7022604942321777, "logits/rejected": -0.5973988175392151, "logps/chosen": -627.4280395507812, "logps/rejected": -786.0429077148438, "loss": 0.377, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2749786376953125, "rewards/margins": 1.6706234216690063, "rewards/rejected": -3.94560170173645, "step": 410 }, { "epoch": 0.5829285218598196, "grad_norm": 3.6701003128420386, "learning_rate": 2.209767714686924e-07, "logits/chosen": -0.672639787197113, "logits/rejected": -0.5875508189201355, "logps/chosen": -665.8011474609375, "logps/rejected": -878.91015625, "loss": 0.3537, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3907861709594727, "rewards/margins": 2.144279956817627, "rewards/rejected": -4.5350661277771, "step": 420 }, { "epoch": 0.5968077723802915, "grad_norm": 3.8553830986355764, "learning_rate": 2.0897720027066897e-07, "logits/chosen": -0.67729651927948, "logits/rejected": -0.5586274862289429, "logps/chosen": -661.219482421875, "logps/rejected": -875.0428466796875, "loss": 0.3715, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -2.52473783493042, "rewards/margins": 2.1635537147521973, "rewards/rejected": -4.688291072845459, "step": 430 }, { "epoch": 0.6106870229007634, "grad_norm": 4.080685306612081, "learning_rate": 1.970740319426474e-07, "logits/chosen": -0.65470290184021, "logits/rejected": -0.5402953028678894, "logps/chosen": -674.3743896484375, "logps/rejected": -868.76611328125, "loss": 0.3756, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.5917928218841553, "rewards/margins": 1.9496173858642578, "rewards/rejected": -4.54141092300415, "step": 440 }, { "epoch": 0.6245662734212353, "grad_norm": 3.7177874253136913, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -0.6337302327156067, "logits/rejected": -0.5308721661567688, "logps/chosen": -659.3401489257812, "logps/rejected": -889.49462890625, "loss": 0.3548, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.4972152709960938, "rewards/margins": 2.2179877758026123, "rewards/rejected": -4.715203285217285, "step": 450 }, { "epoch": 0.6384455239417072, "grad_norm": 3.781558261259466, "learning_rate": 1.7366850057622172e-07, "logits/chosen": -0.627717137336731, "logits/rejected": -0.5284063816070557, "logps/chosen": -665.6905517578125, "logps/rejected": -902.7052612304688, "loss": 0.3565, "rewards/accuracies": 0.8031250238418579, "rewards/chosen": -2.4058690071105957, "rewards/margins": 2.2597124576568604, "rewards/rejected": -4.665581703186035, "step": 460 }, { "epoch": 0.6523247744621791, "grad_norm": 3.624902658770024, "learning_rate": 1.622211401318028e-07, "logits/chosen": -0.5916509628295898, "logits/rejected": -0.48228612542152405, "logps/chosen": -683.482421875, "logps/rejected": -909.9552612304688, "loss": 0.3597, "rewards/accuracies": 0.840624988079071, "rewards/chosen": -2.451106071472168, "rewards/margins": 2.257086992263794, "rewards/rejected": -4.708193302154541, "step": 470 }, { "epoch": 0.666204024982651, "grad_norm": 4.657777670569194, "learning_rate": 1.5098005849021078e-07, "logits/chosen": -0.5622087717056274, "logits/rejected": -0.46162405610084534, "logps/chosen": -661.2738037109375, "logps/rejected": -859.2135009765625, "loss": 0.3586, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3591208457946777, "rewards/margins": 2.061002016067505, "rewards/rejected": -4.4201226234436035, "step": 480 }, { "epoch": 0.6800832755031229, "grad_norm": 6.251774756777437, "learning_rate": 1.3997167199892385e-07, "logits/chosen": -0.5363645553588867, "logits/rejected": -0.39701658487319946, "logps/chosen": -659.409912109375, "logps/rejected": -862.7174072265625, "loss": 0.3703, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.3470611572265625, "rewards/margins": 2.0817208290100098, "rewards/rejected": -4.428781986236572, "step": 490 }, { "epoch": 0.6939625260235948, "grad_norm": 4.437192094325177, "learning_rate": 1.2922185017584036e-07, "logits/chosen": -0.47550851106643677, "logits/rejected": -0.33102065324783325, "logps/chosen": -642.2039794921875, "logps/rejected": -921.5857543945312, "loss": 0.341, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.3800008296966553, "rewards/margins": 2.652977466583252, "rewards/rejected": -5.032978534698486, "step": 500 }, { "epoch": 0.7078417765440667, "grad_norm": 4.426537023215993, "learning_rate": 1.1875585491635998e-07, "logits/chosen": -0.5427089333534241, "logits/rejected": -0.3993372321128845, "logps/chosen": -642.7872924804688, "logps/rejected": -917.6959838867188, "loss": 0.339, "rewards/accuracies": 0.859375, "rewards/chosen": -2.4717633724212646, "rewards/margins": 2.438523054122925, "rewards/rejected": -4.910286903381348, "step": 510 }, { "epoch": 0.7217210270645386, "grad_norm": 3.8981108408032368, "learning_rate": 1.0859828112836539e-07, "logits/chosen": -0.4982399344444275, "logits/rejected": -0.3629917502403259, "logps/chosen": -662.3302001953125, "logps/rejected": -924.0505981445312, "loss": 0.3575, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5424540042877197, "rewards/margins": 2.429400682449341, "rewards/rejected": -4.9718546867370605, "step": 520 }, { "epoch": 0.7356002775850105, "grad_norm": 4.062590769922296, "learning_rate": 9.877299893461455e-08, "logits/chosen": -0.4675068259239197, "logits/rejected": -0.32594671845436096, "logps/chosen": -694.9034423828125, "logps/rejected": -893.17724609375, "loss": 0.3619, "rewards/accuracies": 0.828125, "rewards/chosen": -2.6744236946105957, "rewards/margins": 2.1196742057800293, "rewards/rejected": -4.794097900390625, "step": 530 }, { "epoch": 0.7494795281054824, "grad_norm": 4.00277278125475, "learning_rate": 8.930309757836516e-08, "logits/chosen": -0.4969969689846039, "logits/rejected": -0.33853963017463684, "logps/chosen": -672.2896118164062, "logps/rejected": -913.6337890625, "loss": 0.3511, "rewards/accuracies": 0.809374988079071, "rewards/chosen": -2.6400508880615234, "rewards/margins": 2.4135284423828125, "rewards/rejected": -5.053578853607178, "step": 540 }, { "epoch": 0.7633587786259542, "grad_norm": 4.354943956619566, "learning_rate": 8.021083116405173e-08, "logits/chosen": -0.45121559500694275, "logits/rejected": -0.34631314873695374, "logps/chosen": -652.8707885742188, "logps/rejected": -885.6276245117188, "loss": 0.358, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.4929685592651367, "rewards/margins": 2.304232120513916, "rewards/rejected": -4.797201156616211, "step": 550 }, { "epoch": 0.7772380291464261, "grad_norm": 4.835099186465556, "learning_rate": 7.151756636052527e-08, "logits/chosen": -0.4754953384399414, "logits/rejected": -0.3613481819629669, "logps/chosen": -677.6926879882812, "logps/rejected": -924.0533447265625, "loss": 0.3606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6459193229675293, "rewards/margins": 2.297471523284912, "rewards/rejected": -4.943390846252441, "step": 560 }, { "epoch": 0.7911172796668979, "grad_norm": 4.2131174135927765, "learning_rate": 6.324373218975104e-08, "logits/chosen": -0.44653385877609253, "logits/rejected": -0.3059902787208557, "logps/chosen": -659.1906127929688, "logps/rejected": -913.2021484375, "loss": 0.3647, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6062073707580566, "rewards/margins": 2.3299379348754883, "rewards/rejected": -4.936145782470703, "step": 570 }, { "epoch": 0.8049965301873698, "grad_norm": 4.3409883619650795, "learning_rate": 5.5408772018959996e-08, "logits/chosen": -0.44951170682907104, "logits/rejected": -0.3005351424217224, "logps/chosen": -656.5030517578125, "logps/rejected": -886.4778442382812, "loss": 0.3535, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -2.515747547149658, "rewards/margins": 2.289696455001831, "rewards/rejected": -4.805444240570068, "step": 580 }, { "epoch": 0.8188757807078417, "grad_norm": 13.005915598801213, "learning_rate": 4.8031097869072225e-08, "logits/chosen": -0.4361554980278015, "logits/rejected": -0.3123430609703064, "logps/chosen": -669.0994873046875, "logps/rejected": -921.1182861328125, "loss": 0.3566, "rewards/accuracies": 0.840624988079071, "rewards/chosen": -2.5885722637176514, "rewards/margins": 2.3980278968811035, "rewards/rejected": -4.986600399017334, "step": 590 }, { "epoch": 0.8327550312283136, "grad_norm": 4.507720819165163, "learning_rate": 4.112804714676593e-08, "logits/chosen": -0.4602123200893402, "logits/rejected": -0.3107297122478485, "logps/chosen": -671.499267578125, "logps/rejected": -886.6002807617188, "loss": 0.3403, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.505514144897461, "rewards/margins": 2.2939445972442627, "rewards/rejected": -4.7994585037231445, "step": 600 }, { "epoch": 0.8466342817487855, "grad_norm": 4.108687652880907, "learning_rate": 3.4715841901871545e-08, "logits/chosen": -0.4734552800655365, "logits/rejected": -0.31184083223342896, "logps/chosen": -664.9038696289062, "logps/rejected": -929.7863159179688, "loss": 0.3487, "rewards/accuracies": 0.846875011920929, "rewards/chosen": -2.5965065956115723, "rewards/margins": 2.5841736793518066, "rewards/rejected": -5.180680751800537, "step": 610 }, { "epoch": 0.8605135322692574, "grad_norm": 4.599039860177261, "learning_rate": 2.8809550705835546e-08, "logits/chosen": -0.43759673833847046, "logits/rejected": -0.25794321298599243, "logps/chosen": -715.013671875, "logps/rejected": -970.8460083007812, "loss": 0.3524, "rewards/accuracies": 0.84375, "rewards/chosen": -2.732679843902588, "rewards/margins": 2.611907482147217, "rewards/rejected": -5.344587802886963, "step": 620 }, { "epoch": 0.8743927827897293, "grad_norm": 4.4811322085497105, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -0.4395861029624939, "logits/rejected": -0.27333635091781616, "logps/chosen": -696.0693359375, "logps/rejected": -928.7999877929688, "loss": 0.3694, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7578296661376953, "rewards/margins": 2.2534213066101074, "rewards/rejected": -5.011250972747803, "step": 630 }, { "epoch": 0.8882720333102012, "grad_norm": 4.763510092667976, "learning_rate": 1.8569007682777415e-08, "logits/chosen": -0.4241456985473633, "logits/rejected": -0.29944029450416565, "logps/chosen": -663.63330078125, "logps/rejected": -864.45703125, "loss": 0.358, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.602560043334961, "rewards/margins": 2.116223096847534, "rewards/rejected": -4.718783378601074, "step": 640 }, { "epoch": 0.9021512838306731, "grad_norm": 4.368284739209083, "learning_rate": 1.4258820954781037e-08, "logits/chosen": -0.4327312409877777, "logits/rejected": -0.2682781219482422, "logps/chosen": -700.5999145507812, "logps/rejected": -931.2169799804688, "loss": 0.3583, "rewards/accuracies": 0.828125, "rewards/chosen": -2.735988140106201, "rewards/margins": 2.3255417346954346, "rewards/rejected": -5.061530113220215, "step": 650 }, { "epoch": 0.916030534351145, "grad_norm": 4.046149463634508, "learning_rate": 1.0502621921127774e-08, "logits/chosen": -0.43741026520729065, "logits/rejected": -0.2442634403705597, "logps/chosen": -671.8573608398438, "logps/rejected": -919.7029418945312, "loss": 0.3519, "rewards/accuracies": 0.846875011920929, "rewards/chosen": -2.6393163204193115, "rewards/margins": 2.4307634830474854, "rewards/rejected": -5.070079326629639, "step": 660 }, { "epoch": 0.9299097848716169, "grad_norm": 3.883568325725545, "learning_rate": 7.309237584595007e-09, "logits/chosen": -0.4095235764980316, "logits/rejected": -0.27290958166122437, "logps/chosen": -672.8810424804688, "logps/rejected": -929.7780151367188, "loss": 0.3433, "rewards/accuracies": 0.859375, "rewards/chosen": -2.6669869422912598, "rewards/margins": 2.416769504547119, "rewards/rejected": -5.083756446838379, "step": 670 }, { "epoch": 0.9437890353920888, "grad_norm": 5.273020682836859, "learning_rate": 4.6861723431538265e-09, "logits/chosen": -0.42638665437698364, "logits/rejected": -0.25011223554611206, "logps/chosen": -685.7161865234375, "logps/rejected": -900.3049926757812, "loss": 0.3666, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.629455089569092, "rewards/margins": 2.211458206176758, "rewards/rejected": -4.84091329574585, "step": 680 }, { "epoch": 0.9576682859125607, "grad_norm": 5.214078177947886, "learning_rate": 2.639590354763882e-09, "logits/chosen": -0.41801247000694275, "logits/rejected": -0.29952767491340637, "logps/chosen": -651.67724609375, "logps/rejected": -901.2472534179688, "loss": 0.3506, "rewards/accuracies": 0.84375, "rewards/chosen": -2.55751371383667, "rewards/margins": 2.3576016426086426, "rewards/rejected": -4.9151153564453125, "step": 690 }, { "epoch": 0.9715475364330326, "grad_norm": 4.655188648736662, "learning_rate": 1.1743010517085427e-09, "logits/chosen": -0.4516308307647705, "logits/rejected": -0.2846836745738983, "logps/chosen": -696.3230590820312, "logps/rejected": -928.4945068359375, "loss": 0.3491, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.7318358421325684, "rewards/margins": 2.3993608951568604, "rewards/rejected": -5.13119649887085, "step": 700 }, { "epoch": 0.9854267869535045, "grad_norm": 3.572705217532298, "learning_rate": 2.9374783851240923e-10, "logits/chosen": -0.39585572481155396, "logits/rejected": -0.26901108026504517, "logps/chosen": -655.4457397460938, "logps/rejected": -909.2194213867188, "loss": 0.3344, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4239559173583984, "rewards/margins": 2.4643149375915527, "rewards/rejected": -4.888271331787109, "step": 710 }, { "epoch": 0.9993060374739764, "grad_norm": 4.876709666740222, "learning_rate": 0.0, "logits/chosen": -0.4101489186286926, "logits/rejected": -0.2496207058429718, "logps/chosen": -666.4955444335938, "logps/rejected": -903.6095581054688, "loss": 0.3544, "rewards/accuracies": 0.8218749761581421, "rewards/chosen": -2.61322283744812, "rewards/margins": 2.3339571952819824, "rewards/rejected": -4.947180271148682, "step": 720 }, { "epoch": 0.9993060374739764, "step": 720, "total_flos": 0.0, "train_loss": 0.4493948830498589, "train_runtime": 20175.981, "train_samples_per_second": 9.142, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 720, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }