{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020931449502878076, "grad_norm": 56.60030299273906, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.8892595767974854, "logits/rejected": -2.840986728668213, "logps/chosen": -312.6982116699219, "logps/rejected": -359.79229736328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.020931449502878074, "grad_norm": 48.0738005946216, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.664259672164917, "logits/rejected": -2.628167152404785, "logps/chosen": -319.789306640625, "logps/rejected": -288.3993835449219, "loss": 0.6931, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.0027708299458026886, "rewards/margins": 0.0005994565435685217, "rewards/rejected": 0.002171373227611184, "step": 10 }, { "epoch": 0.04186289900575615, "grad_norm": 67.35204861005953, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.743551254272461, "logits/rejected": -2.6782517433166504, "logps/chosen": -310.35101318359375, "logps/rejected": -283.4342956542969, "loss": 0.6913, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.006935287266969681, "rewards/margins": 0.0024963519535958767, "rewards/rejected": 0.004438935313373804, "step": 20 }, { "epoch": 0.06279434850863422, "grad_norm": 37.91488625769742, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.729531764984131, "logits/rejected": -2.6502060890197754, "logps/chosen": -293.98309326171875, "logps/rejected": -251.10726928710938, "loss": 0.6814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.054005831480026245, "rewards/margins": 0.034085579216480255, "rewards/rejected": 0.019920259714126587, "step": 30 }, { "epoch": 0.0837257980115123, "grad_norm": 36.97741406624794, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.625835657119751, "logits/rejected": -2.5816197395324707, "logps/chosen": -273.37054443359375, "logps/rejected": -261.5669250488281, "loss": 0.6582, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.12765000760555267, "rewards/margins": 0.08208973705768585, "rewards/rejected": 0.04556027799844742, "step": 40 }, { "epoch": 0.10465724751439037, "grad_norm": 37.43059499290312, "learning_rate": 4.999731868769026e-07, "logits/chosen": -2.6400301456451416, "logits/rejected": -2.5608487129211426, "logps/chosen": -275.17523193359375, "logps/rejected": -275.0105285644531, "loss": 0.6272, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.20204421877861023, "rewards/margins": 0.16956469416618347, "rewards/rejected": 0.03247954696416855, "step": 50 }, { "epoch": 0.12558869701726844, "grad_norm": 57.10845065545151, "learning_rate": 4.990353313429303e-07, "logits/chosen": -2.642455577850342, "logits/rejected": -2.573876142501831, "logps/chosen": -251.33602905273438, "logps/rejected": -249.27578735351562, "loss": 0.5981, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23599937558174133, "rewards/margins": 0.3313826322555542, "rewards/rejected": -0.09538328647613525, "step": 60 }, { "epoch": 0.14652014652014653, "grad_norm": 35.22629948353397, "learning_rate": 4.967625656594781e-07, "logits/chosen": -2.5455336570739746, "logits/rejected": -2.5049350261688232, "logps/chosen": -302.450927734375, "logps/rejected": -292.55072021484375, "loss": 0.5917, "rewards/accuracies": 0.71875, "rewards/chosen": 0.12116215378046036, "rewards/margins": 0.40936508774757385, "rewards/rejected": -0.2882029414176941, "step": 70 }, { "epoch": 0.1674515960230246, "grad_norm": 47.22877045853146, "learning_rate": 4.93167072587771e-07, "logits/chosen": -2.6536507606506348, "logits/rejected": -2.5224609375, "logps/chosen": -340.2115783691406, "logps/rejected": -274.326904296875, "loss": 0.5907, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.13078466057777405, "rewards/margins": 0.5240527391433716, "rewards/rejected": -0.39326804876327515, "step": 80 }, { "epoch": 0.18838304552590268, "grad_norm": 37.16410104338103, "learning_rate": 4.882681251368548e-07, "logits/chosen": -2.5428051948547363, "logits/rejected": -2.4835638999938965, "logps/chosen": -254.80374145507812, "logps/rejected": -269.6417236328125, "loss": 0.5481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.027083083987236023, "rewards/margins": 0.6100192070007324, "rewards/rejected": -0.6371022462844849, "step": 90 }, { "epoch": 0.20931449502878074, "grad_norm": 33.7175693505583, "learning_rate": 4.820919832540181e-07, "logits/chosen": -2.525334119796753, "logits/rejected": -2.4599850177764893, "logps/chosen": -305.0451965332031, "logps/rejected": -294.32122802734375, "loss": 0.5775, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.04258927330374718, "rewards/margins": 0.7190525531768799, "rewards/rejected": -0.7616418600082397, "step": 100 }, { "epoch": 0.2302459445316588, "grad_norm": 41.20672223986865, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -2.5379371643066406, "logits/rejected": -2.4624836444854736, "logps/chosen": -295.3846130371094, "logps/rejected": -290.00823974609375, "loss": 0.5683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09885083138942719, "rewards/margins": 0.5707240700721741, "rewards/rejected": -0.6695749163627625, "step": 110 }, { "epoch": 0.25117739403453687, "grad_norm": 38.75944481368775, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -2.4620394706726074, "logits/rejected": -2.453782558441162, "logps/chosen": -280.3204040527344, "logps/rejected": -284.44219970703125, "loss": 0.5627, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15607118606567383, "rewards/margins": 0.6097468137741089, "rewards/rejected": -0.7658179998397827, "step": 120 }, { "epoch": 0.272108843537415, "grad_norm": 58.76765744852648, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -2.52392315864563, "logits/rejected": -2.481851100921631, "logps/chosen": -299.79156494140625, "logps/rejected": -292.2509765625, "loss": 0.5275, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.26476210355758667, "rewards/margins": 0.6276587843894958, "rewards/rejected": -0.8924208879470825, "step": 130 }, { "epoch": 0.29304029304029305, "grad_norm": 64.42453786695027, "learning_rate": 4.453763107901675e-07, "logits/chosen": -2.546255588531494, "logits/rejected": -2.483006715774536, "logps/chosen": -318.06182861328125, "logps/rejected": -286.57257080078125, "loss": 0.5517, "rewards/accuracies": 0.78125, "rewards/chosen": 0.06344182789325714, "rewards/margins": 0.8681875467300415, "rewards/rejected": -0.8047456741333008, "step": 140 }, { "epoch": 0.3139717425431711, "grad_norm": 49.33320813589767, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -2.5776188373565674, "logits/rejected": -2.5224177837371826, "logps/chosen": -294.88134765625, "logps/rejected": -271.59539794921875, "loss": 0.5302, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.04772792011499405, "rewards/margins": 0.6878777742385864, "rewards/rejected": -0.6401498317718506, "step": 150 }, { "epoch": 0.3349031920460492, "grad_norm": 41.399344035646344, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -2.6206777095794678, "logits/rejected": -2.5290703773498535, "logps/chosen": -279.5448303222656, "logps/rejected": -271.38916015625, "loss": 0.5468, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.03517674654722214, "rewards/margins": 0.8822032809257507, "rewards/rejected": -0.9173799753189087, "step": 160 }, { "epoch": 0.35583464154892724, "grad_norm": 39.72842833528202, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -2.4758784770965576, "logits/rejected": -2.437650442123413, "logps/chosen": -268.2413330078125, "logps/rejected": -257.02105712890625, "loss": 0.5403, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12392053753137589, "rewards/margins": 0.5926230549812317, "rewards/rejected": -0.7165434956550598, "step": 170 }, { "epoch": 0.37676609105180536, "grad_norm": 36.98022789966047, "learning_rate": 3.920161866827889e-07, "logits/chosen": -2.5508599281311035, "logits/rejected": -2.50437593460083, "logps/chosen": -284.5989074707031, "logps/rejected": -273.63800048828125, "loss": 0.5216, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.47948211431503296, "rewards/margins": 0.6584511995315552, "rewards/rejected": -1.137933373451233, "step": 180 }, { "epoch": 0.3976975405546834, "grad_norm": 83.96111613389628, "learning_rate": 3.765821230985757e-07, "logits/chosen": -2.479034185409546, "logits/rejected": -2.459357261657715, "logps/chosen": -265.3419189453125, "logps/rejected": -273.99237060546875, "loss": 0.5357, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2723972201347351, "rewards/margins": 0.5891710519790649, "rewards/rejected": -0.8615682721138, "step": 190 }, { "epoch": 0.4186289900575615, "grad_norm": 57.98472670273764, "learning_rate": 3.604695382782159e-07, "logits/chosen": -2.4622385501861572, "logits/rejected": -2.453322172164917, "logps/chosen": -261.9639587402344, "logps/rejected": -290.62188720703125, "loss": 0.5961, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21924467384815216, "rewards/margins": 0.6115727424621582, "rewards/rejected": -0.8308174014091492, "step": 200 }, { "epoch": 0.43956043956043955, "grad_norm": 58.79891040438903, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -2.5712344646453857, "logits/rejected": -2.463402509689331, "logps/chosen": -318.231201171875, "logps/rejected": -269.74798583984375, "loss": 0.5426, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1611379086971283, "rewards/margins": 0.7542437314987183, "rewards/rejected": -0.9153816103935242, "step": 210 }, { "epoch": 0.4604918890633176, "grad_norm": 79.8856418955716, "learning_rate": 3.265574537815398e-07, "logits/chosen": -2.5093963146209717, "logits/rejected": -2.497286319732666, "logps/chosen": -270.76568603515625, "logps/rejected": -272.580078125, "loss": 0.5446, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.10513055324554443, "rewards/margins": 0.9609693288803101, "rewards/rejected": -1.0660998821258545, "step": 220 }, { "epoch": 0.48142333856619574, "grad_norm": 38.74064389181431, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -2.6367149353027344, "logits/rejected": -2.564960479736328, "logps/chosen": -273.73529052734375, "logps/rejected": -270.94415283203125, "loss": 0.5278, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.16755743324756622, "rewards/margins": 1.0889314413070679, "rewards/rejected": -1.2564888000488281, "step": 230 }, { "epoch": 0.5023547880690737, "grad_norm": 40.4843251117903, "learning_rate": 2.910060778827554e-07, "logits/chosen": -2.633864641189575, "logits/rejected": -2.5612056255340576, "logps/chosen": -296.2731628417969, "logps/rejected": -290.04742431640625, "loss": 0.4978, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12019411474466324, "rewards/margins": 0.7597166895866394, "rewards/rejected": -0.8799108266830444, "step": 240 }, { "epoch": 0.5232862375719518, "grad_norm": 40.628977686293446, "learning_rate": 2.7285261601056697e-07, "logits/chosen": -2.611175060272217, "logits/rejected": -2.517091989517212, "logps/chosen": -294.7513122558594, "logps/rejected": -285.98199462890625, "loss": 0.5307, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2056296169757843, "rewards/margins": 1.1986558437347412, "rewards/rejected": -1.4042854309082031, "step": 250 }, { "epoch": 0.54421768707483, "grad_norm": 41.00335841137649, "learning_rate": 2.5457665670441937e-07, "logits/chosen": -2.6146559715270996, "logits/rejected": -2.608934164047241, "logps/chosen": -274.4793395996094, "logps/rejected": -270.13726806640625, "loss": 0.5159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.43834853172302246, "rewards/margins": 0.8028098344802856, "rewards/rejected": -1.241158366203308, "step": 260 }, { "epoch": 0.565149136577708, "grad_norm": 38.419381160509054, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -2.6012203693389893, "logits/rejected": -2.5700631141662598, "logps/chosen": -302.5823059082031, "logps/rejected": -297.1186218261719, "loss": 0.5141, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.3534887433052063, "rewards/margins": 1.0269334316253662, "rewards/rejected": -1.3804222345352173, "step": 270 }, { "epoch": 0.5860805860805861, "grad_norm": 61.56800465863709, "learning_rate": 2.1804923757009882e-07, "logits/chosen": -2.487819194793701, "logits/rejected": -2.443556308746338, "logps/chosen": -289.46136474609375, "logps/rejected": -258.09454345703125, "loss": 0.5389, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3526845872402191, "rewards/margins": 0.8081240653991699, "rewards/rejected": -1.1608086824417114, "step": 280 }, { "epoch": 0.6070120355834642, "grad_norm": 42.020314742546624, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -2.587932586669922, "logits/rejected": -2.517599582672119, "logps/chosen": -262.06488037109375, "logps/rejected": -273.3334045410156, "loss": 0.536, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5786847472190857, "rewards/margins": 0.6844012141227722, "rewards/rejected": -1.2630858421325684, "step": 290 }, { "epoch": 0.6279434850863422, "grad_norm": 45.77982161189827, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -2.551849603652954, "logits/rejected": -2.4827933311462402, "logps/chosen": -338.84259033203125, "logps/rejected": -313.6923828125, "loss": 0.5116, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.437159925699234, "rewards/margins": 0.7780275940895081, "rewards/rejected": -1.215187430381775, "step": 300 }, { "epoch": 0.6488749345892203, "grad_norm": 67.26009798996141, "learning_rate": 1.647817538357072e-07, "logits/chosen": -2.586299419403076, "logits/rejected": -2.5306968688964844, "logps/chosen": -315.3221130371094, "logps/rejected": -284.2604064941406, "loss": 0.5072, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.25200536847114563, "rewards/margins": 0.9943742752075195, "rewards/rejected": -1.2463796138763428, "step": 310 }, { "epoch": 0.6698063840920984, "grad_norm": 41.456508803565676, "learning_rate": 1.478143389201113e-07, "logits/chosen": -2.5629897117614746, "logits/rejected": -2.4874186515808105, "logps/chosen": -265.6336669921875, "logps/rejected": -254.78158569335938, "loss": 0.5157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22662892937660217, "rewards/margins": 1.0545392036437988, "rewards/rejected": -1.2811682224273682, "step": 320 }, { "epoch": 0.6907378335949764, "grad_norm": 36.05897194603855, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -2.5867385864257812, "logits/rejected": -2.570345878601074, "logps/chosen": -279.125, "logps/rejected": -296.114501953125, "loss": 0.5157, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.22260840237140656, "rewards/margins": 0.8764309883117676, "rewards/rejected": -1.0990393161773682, "step": 330 }, { "epoch": 0.7116692830978545, "grad_norm": 48.915795401449174, "learning_rate": 1.1561076868822755e-07, "logits/chosen": -2.541907548904419, "logits/rejected": -2.506343126296997, "logps/chosen": -307.07427978515625, "logps/rejected": -305.878662109375, "loss": 0.4979, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.31391283869743347, "rewards/margins": 0.9292833209037781, "rewards/rejected": -1.2431962490081787, "step": 340 }, { "epoch": 0.7326007326007326, "grad_norm": 42.37289903395956, "learning_rate": 1.0054723495346482e-07, "logits/chosen": -2.6136066913604736, "logits/rejected": -2.54303240776062, "logps/chosen": -274.64434814453125, "logps/rejected": -279.75482177734375, "loss": 0.4802, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.19893045723438263, "rewards/margins": 0.978558361530304, "rewards/rejected": -1.1774885654449463, "step": 350 }, { "epoch": 0.7535321821036107, "grad_norm": 38.677534171799124, "learning_rate": 8.628481651367875e-08, "logits/chosen": -2.58419132232666, "logits/rejected": -2.518683910369873, "logps/chosen": -319.4009094238281, "logps/rejected": -292.9573974609375, "loss": 0.5281, "rewards/accuracies": 0.75, "rewards/chosen": -0.31927159428596497, "rewards/margins": 1.0226125717163086, "rewards/rejected": -1.3418842554092407, "step": 360 }, { "epoch": 0.7744636316064888, "grad_norm": 46.36575225037569, "learning_rate": 7.289996455765748e-08, "logits/chosen": -2.5846099853515625, "logits/rejected": -2.5254452228546143, "logps/chosen": -284.2174072265625, "logps/rejected": -264.4429626464844, "loss": 0.5244, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.36106812953948975, "rewards/margins": 0.966556191444397, "rewards/rejected": -1.3276244401931763, "step": 370 }, { "epoch": 0.7953950811093669, "grad_norm": 39.86634863792299, "learning_rate": 6.046442623320145e-08, "logits/chosen": -2.47021484375, "logits/rejected": -2.45784854888916, "logps/chosen": -276.29632568359375, "logps/rejected": -316.617431640625, "loss": 0.5091, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7333633303642273, "rewards/margins": 1.0854023694992065, "rewards/rejected": -1.8187658786773682, "step": 380 }, { "epoch": 0.8163265306122449, "grad_norm": 40.36648943435987, "learning_rate": 4.904486005914027e-08, "logits/chosen": -2.5730109214782715, "logits/rejected": -2.515702962875366, "logps/chosen": -364.7529296875, "logps/rejected": -344.87060546875, "loss": 0.4929, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5375163555145264, "rewards/margins": 1.0361560583114624, "rewards/rejected": -1.5736725330352783, "step": 390 }, { "epoch": 0.837257980115123, "grad_norm": 56.32657464205283, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -2.525852918624878, "logits/rejected": -2.469883680343628, "logps/chosen": -276.40948486328125, "logps/rejected": -274.75274658203125, "loss": 0.5189, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5351244211196899, "rewards/margins": 0.9622423052787781, "rewards/rejected": -1.4973666667938232, "step": 400 }, { "epoch": 0.858189429618001, "grad_norm": 42.87274307233854, "learning_rate": 2.9492720416985e-08, "logits/chosen": -2.6081812381744385, "logits/rejected": -2.5553929805755615, "logps/chosen": -317.64312744140625, "logps/rejected": -301.38323974609375, "loss": 0.5085, "rewards/accuracies": 0.71875, "rewards/chosen": -0.42447155714035034, "rewards/margins": 1.0819838047027588, "rewards/rejected": -1.506455421447754, "step": 410 }, { "epoch": 0.8791208791208791, "grad_norm": 41.8015801763105, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -2.502769708633423, "logits/rejected": -2.475386381149292, "logps/chosen": -273.35150146484375, "logps/rejected": -296.51361083984375, "loss": 0.4965, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.42012089490890503, "rewards/margins": 0.9804800152778625, "rewards/rejected": -1.4006009101867676, "step": 420 }, { "epoch": 0.9000523286237572, "grad_norm": 42.916394522460244, "learning_rate": 1.4662207078575684e-08, "logits/chosen": -2.516355037689209, "logits/rejected": -2.4368109703063965, "logps/chosen": -307.8439025878906, "logps/rejected": -306.0263671875, "loss": 0.4975, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3919011950492859, "rewards/margins": 0.8122466802597046, "rewards/rejected": -1.2041478157043457, "step": 430 }, { "epoch": 0.9209837781266352, "grad_norm": 39.402077703083755, "learning_rate": 9.12094829893642e-09, "logits/chosen": -2.6104936599731445, "logits/rejected": -2.5414485931396484, "logps/chosen": -272.779052734375, "logps/rejected": -253.19808959960938, "loss": 0.5093, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.33945396542549133, "rewards/margins": 0.873080849647522, "rewards/rejected": -1.212534785270691, "step": 440 }, { "epoch": 0.9419152276295133, "grad_norm": 42.735602427543185, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -2.41510272026062, "logits/rejected": -2.391575574874878, "logps/chosen": -294.7413635253906, "logps/rejected": -298.8751220703125, "loss": 0.4984, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.45333218574523926, "rewards/margins": 0.9596071243286133, "rewards/rejected": -1.4129393100738525, "step": 450 }, { "epoch": 0.9628466771323915, "grad_norm": 54.40801480075315, "learning_rate": 1.9347820230782295e-09, "logits/chosen": -2.530097484588623, "logits/rejected": -2.441829204559326, "logps/chosen": -284.36077880859375, "logps/rejected": -262.52923583984375, "loss": 0.5071, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3923359513282776, "rewards/margins": 0.9908507466316223, "rewards/rejected": -1.3831866979599, "step": 460 }, { "epoch": 0.9837781266352695, "grad_norm": 39.7809456404364, "learning_rate": 3.2839470889836627e-10, "logits/chosen": -2.536561965942383, "logits/rejected": -2.4826693534851074, "logps/chosen": -328.4256591796875, "logps/rejected": -307.42376708984375, "loss": 0.4907, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.22636675834655762, "rewards/margins": 0.9801048040390015, "rewards/rejected": -1.2064714431762695, "step": 470 }, { "epoch": 0.9984301412872841, "eval_logits/chosen": -2.519351005554199, "eval_logits/rejected": -2.4782261848449707, "eval_logps/chosen": -271.4703674316406, "eval_logps/rejected": -289.6701354980469, "eval_loss": 0.5136305093765259, "eval_rewards/accuracies": 0.75390625, "eval_rewards/chosen": -0.3157782554626465, "eval_rewards/margins": 0.9721657633781433, "eval_rewards/rejected": -1.2879440784454346, "eval_runtime": 168.3617, "eval_samples_per_second": 11.879, "eval_steps_per_second": 0.19, "step": 477 }, { "epoch": 0.9984301412872841, "step": 477, "total_flos": 0.0, "train_loss": 0.5435615640516301, "train_runtime": 13423.4265, "train_samples_per_second": 4.554, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }