{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984168865435357, "eval_steps": 400, "global_step": 473, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021108179419525065, "grad_norm": 3.792602400172418, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -0.723710298538208, "logits/rejected": -1.1678439378738403, "logps/chosen": -266.5860900878906, "logps/rejected": -246.2262420654297, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010554089709762533, "grad_norm": 5.35027261694182, "learning_rate": 5.208333333333333e-08, "logits/chosen": -0.6524915099143982, "logits/rejected": -0.9277956485748291, "logps/chosen": -282.5875549316406, "logps/rejected": -269.2027893066406, "loss": 0.6933, "rewards/accuracies": 0.3828125, "rewards/chosen": 0.000355295545887202, "rewards/margins": -0.00032308147638104856, "rewards/rejected": 0.000678377109579742, "step": 5 }, { "epoch": 0.021108179419525065, "grad_norm": 5.266933872220353, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -0.6941147446632385, "logits/rejected": -1.03800368309021, "logps/chosen": -290.0839538574219, "logps/rejected": -274.08502197265625, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0008805571123957634, "rewards/margins": -0.0002368297427892685, "rewards/rejected": 0.00111738673876971, "step": 10 }, { "epoch": 0.0316622691292876, "grad_norm": 4.4222736963146785, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -0.6915597319602966, "logits/rejected": -1.0270450115203857, "logps/chosen": -286.4000549316406, "logps/rejected": -268.19305419921875, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0015847303438931704, "rewards/margins": -0.00021869130432605743, "rewards/rejected": 0.0018034216482192278, "step": 15 }, { "epoch": 0.04221635883905013, "grad_norm": 4.370999160332841, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.6628856658935547, "logits/rejected": -1.0627143383026123, "logps/chosen": -281.633056640625, "logps/rejected": -258.80975341796875, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004043369088321924, "rewards/margins": 0.0007513560703955591, "rewards/rejected": 0.0032920129597187042, "step": 20 }, { "epoch": 0.052770448548812667, "grad_norm": 4.295540874340828, "learning_rate": 2.604166666666667e-07, "logits/chosen": -0.6402955651283264, "logits/rejected": -0.9882392883300781, "logps/chosen": -303.6094055175781, "logps/rejected": -278.68792724609375, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": 0.012096477672457695, "rewards/margins": 0.002340012462809682, "rewards/rejected": 0.009756465442478657, "step": 25 }, { "epoch": 0.0633245382585752, "grad_norm": 4.480110631795238, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.6986342668533325, "logits/rejected": -1.0124592781066895, "logps/chosen": -277.3695983886719, "logps/rejected": -256.33648681640625, "loss": 0.6908, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.019197864457964897, "rewards/margins": 0.006392383016645908, "rewards/rejected": 0.01280547957867384, "step": 30 }, { "epoch": 0.07387862796833773, "grad_norm": 4.572546926633594, "learning_rate": 3.645833333333333e-07, "logits/chosen": -0.7217592597007751, "logits/rejected": -0.9826194047927856, "logps/chosen": -276.353515625, "logps/rejected": -269.84747314453125, "loss": 0.6889, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.03310415893793106, "rewards/margins": 0.008944300934672356, "rewards/rejected": 0.024159858003258705, "step": 35 }, { "epoch": 0.08443271767810026, "grad_norm": 3.950940685241822, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.6703137755393982, "logits/rejected": -1.0556083917617798, "logps/chosen": -277.72515869140625, "logps/rejected": -255.3736572265625, "loss": 0.6856, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.044867198914289474, "rewards/margins": 0.01742670312523842, "rewards/rejected": 0.027440497651696205, "step": 40 }, { "epoch": 0.09498680738786279, "grad_norm": 4.408045626085674, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -0.7604807615280151, "logits/rejected": -1.0656068325042725, "logps/chosen": -283.796142578125, "logps/rejected": -269.21075439453125, "loss": 0.6824, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.055293601006269455, "rewards/margins": 0.017781417816877365, "rewards/rejected": 0.03751217946410179, "step": 45 }, { "epoch": 0.10554089709762533, "grad_norm": 4.594023555859445, "learning_rate": 4.999726797933858e-07, "logits/chosen": -0.7825593948364258, "logits/rejected": -1.0136535167694092, "logps/chosen": -268.57232666015625, "logps/rejected": -254.4635772705078, "loss": 0.6786, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04131672903895378, "rewards/margins": 0.02473551593720913, "rewards/rejected": 0.016581213101744652, "step": 50 }, { "epoch": 0.11609498680738786, "grad_norm": 4.732128821227025, "learning_rate": 4.99665396039775e-07, "logits/chosen": -0.8582944869995117, "logits/rejected": -1.092308759689331, "logps/chosen": -272.50872802734375, "logps/rejected": -269.22015380859375, "loss": 0.6711, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.015134250745177269, "rewards/margins": 0.03893275931477547, "rewards/rejected": -0.02379850670695305, "step": 55 }, { "epoch": 0.1266490765171504, "grad_norm": 5.480552136086532, "learning_rate": 4.99017099386437e-07, "logits/chosen": -0.9315390586853027, "logits/rejected": -1.1771332025527954, "logps/chosen": -278.89837646484375, "logps/rejected": -268.14080810546875, "loss": 0.6679, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.014189760200679302, "rewards/margins": 0.06192191690206528, "rewards/rejected": -0.07611168175935745, "step": 60 }, { "epoch": 0.13720316622691292, "grad_norm": 5.176626164434011, "learning_rate": 4.980286753286194e-07, "logits/chosen": -0.8333457708358765, "logits/rejected": -1.3162130117416382, "logps/chosen": -288.89825439453125, "logps/rejected": -264.5441589355469, "loss": 0.6667, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.045755136758089066, "rewards/margins": 0.08817130327224731, "rewards/rejected": -0.13392645120620728, "step": 65 }, { "epoch": 0.14775725593667546, "grad_norm": 5.725175266189831, "learning_rate": 4.967014739346915e-07, "logits/chosen": -0.9382959604263306, "logits/rejected": -1.3034207820892334, "logps/chosen": -273.29193115234375, "logps/rejected": -274.21929931640625, "loss": 0.6606, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.05725777894258499, "rewards/margins": 0.08167224377393723, "rewards/rejected": -0.13892999291419983, "step": 70 }, { "epoch": 0.158311345646438, "grad_norm": 5.9050273856078395, "learning_rate": 4.950373080021136e-07, "logits/chosen": -1.0476350784301758, "logits/rejected": -1.337590217590332, "logps/chosen": -292.19378662109375, "logps/rejected": -282.83001708984375, "loss": 0.6585, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07018107920885086, "rewards/margins": 0.08405766636133194, "rewards/rejected": -0.1542387306690216, "step": 75 }, { "epoch": 0.16886543535620052, "grad_norm": 5.714632118731764, "learning_rate": 4.930384505813737e-07, "logits/chosen": -0.9645854830741882, "logits/rejected": -1.3480749130249023, "logps/chosen": -290.5950012207031, "logps/rejected": -275.71417236328125, "loss": 0.6617, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11630520969629288, "rewards/margins": 0.08103077113628387, "rewards/rejected": -0.19733598828315735, "step": 80 }, { "epoch": 0.17941952506596306, "grad_norm": 6.048274761863404, "learning_rate": 4.907076318712738e-07, "logits/chosen": -1.0770204067230225, "logits/rejected": -1.342997431755066, "logps/chosen": -301.7802734375, "logps/rejected": -287.3224792480469, "loss": 0.6561, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13322284817695618, "rewards/margins": 0.07080608606338501, "rewards/rejected": -0.2040289342403412, "step": 85 }, { "epoch": 0.18997361477572558, "grad_norm": 5.616972735220456, "learning_rate": 4.88048035489807e-07, "logits/chosen": -1.0288609266281128, "logits/rejected": -1.537954568862915, "logps/chosen": -303.514892578125, "logps/rejected": -282.09832763671875, "loss": 0.6458, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.12775097787380219, "rewards/margins": 0.11901189386844635, "rewards/rejected": -0.24676287174224854, "step": 90 }, { "epoch": 0.20052770448548812, "grad_norm": 6.041190762428844, "learning_rate": 4.85063294125718e-07, "logits/chosen": -1.1466128826141357, "logits/rejected": -1.4186201095581055, "logps/chosen": -323.9360046386719, "logps/rejected": -326.41461181640625, "loss": 0.6493, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18756112456321716, "rewards/margins": 0.12050308287143707, "rewards/rejected": -0.3080642521381378, "step": 95 }, { "epoch": 0.21108179419525067, "grad_norm": 7.792002911640772, "learning_rate": 4.817574845766874e-07, "logits/chosen": -1.1385769844055176, "logits/rejected": -1.4923776388168335, "logps/chosen": -314.1307373046875, "logps/rejected": -307.49102783203125, "loss": 0.6441, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.26007553935050964, "rewards/margins": 0.1371382772922516, "rewards/rejected": -0.397213876247406, "step": 100 }, { "epoch": 0.22163588390501318, "grad_norm": 6.885087311095594, "learning_rate": 4.781351221809166e-07, "logits/chosen": -1.1828514337539673, "logits/rejected": -1.624103307723999, "logps/chosen": -304.28204345703125, "logps/rejected": -294.31048583984375, "loss": 0.6373, "rewards/accuracies": 0.75, "rewards/chosen": -0.30514588952064514, "rewards/margins": 0.1688612401485443, "rewards/rejected": -0.47400718927383423, "step": 105 }, { "epoch": 0.23218997361477572, "grad_norm": 8.481883842604432, "learning_rate": 4.742011546497182e-07, "logits/chosen": -1.212425947189331, "logits/rejected": -1.3756533861160278, "logps/chosen": -313.9586486816406, "logps/rejected": -320.29425048828125, "loss": 0.6538, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.30393490195274353, "rewards/margins": 0.1464935690164566, "rewards/rejected": -0.45042848587036133, "step": 110 }, { "epoch": 0.24274406332453827, "grad_norm": 7.149769163847217, "learning_rate": 4.6996095530953875e-07, "logits/chosen": -1.2339892387390137, "logits/rejected": -1.58319890499115, "logps/chosen": -315.6721496582031, "logps/rejected": -308.2062072753906, "loss": 0.6291, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.32919472455978394, "rewards/margins": 0.1386784464120865, "rewards/rejected": -0.4678731858730316, "step": 115 }, { "epoch": 0.2532981530343008, "grad_norm": 7.759815340386084, "learning_rate": 4.654203157626399e-07, "logits/chosen": -1.2471096515655518, "logits/rejected": -1.6236129999160767, "logps/chosen": -341.6539611816406, "logps/rejected": -330.80926513671875, "loss": 0.6335, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4439183175563812, "rewards/margins": 0.12948934733867645, "rewards/rejected": -0.5734077095985413, "step": 120 }, { "epoch": 0.2638522427440633, "grad_norm": 8.303750659351337, "learning_rate": 4.605854379764673e-07, "logits/chosen": -1.2065553665161133, "logits/rejected": -1.5575497150421143, "logps/chosen": -347.19696044921875, "logps/rejected": -339.4477233886719, "loss": 0.63, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4391602873802185, "rewards/margins": 0.14842209219932556, "rewards/rejected": -0.5875824093818665, "step": 125 }, { "epoch": 0.27440633245382584, "grad_norm": 7.626112760961139, "learning_rate": 4.5546292581250857e-07, "logits/chosen": -1.1812589168548584, "logits/rejected": -1.513511300086975, "logps/chosen": -325.56005859375, "logps/rejected": -315.3307800292969, "loss": 0.6305, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5028723478317261, "rewards/margins": 0.12545283138751984, "rewards/rejected": -0.6283251643180847, "step": 130 }, { "epoch": 0.2849604221635884, "grad_norm": 8.681810962953072, "learning_rate": 4.5005977600621275e-07, "logits/chosen": -1.33579683303833, "logits/rejected": -1.586660623550415, "logps/chosen": -343.98089599609375, "logps/rejected": -351.74066162109375, "loss": 0.631, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5469980835914612, "rewards/margins": 0.19922946393489838, "rewards/rejected": -0.7462274432182312, "step": 135 }, { "epoch": 0.2955145118733509, "grad_norm": 9.263751197369732, "learning_rate": 4.443833686102919e-07, "logits/chosen": -1.4017233848571777, "logits/rejected": -1.7090505361557007, "logps/chosen": -355.2716369628906, "logps/rejected": -371.23492431640625, "loss": 0.6335, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.667505145072937, "rewards/margins": 0.2195053994655609, "rewards/rejected": -0.8870105743408203, "step": 140 }, { "epoch": 0.30606860158311344, "grad_norm": 8.944976382840098, "learning_rate": 4.384414569144561e-07, "logits/chosen": -1.3571860790252686, "logits/rejected": -1.624506950378418, "logps/chosen": -356.50885009765625, "logps/rejected": -361.44512939453125, "loss": 0.6242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.723587691783905, "rewards/margins": 0.22243139147758484, "rewards/rejected": -0.9460189938545227, "step": 145 }, { "epoch": 0.316622691292876, "grad_norm": 9.048728108809618, "learning_rate": 4.3224215685535287e-07, "logits/chosen": -1.2304835319519043, "logits/rejected": -1.607114553451538, "logps/chosen": -340.3996887207031, "logps/rejected": -343.8750915527344, "loss": 0.6193, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5864183902740479, "rewards/margins": 0.2611897587776184, "rewards/rejected": -0.8476082682609558, "step": 150 }, { "epoch": 0.32717678100263853, "grad_norm": 10.012310357130646, "learning_rate": 4.2579393593117364e-07, "logits/chosen": -1.3340481519699097, "logits/rejected": -1.707767128944397, "logps/chosen": -366.13104248046875, "logps/rejected": -364.83026123046875, "loss": 0.6204, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7475859522819519, "rewards/margins": 0.2101312130689621, "rewards/rejected": -0.9577171206474304, "step": 155 }, { "epoch": 0.33773087071240104, "grad_norm": 9.68044164663275, "learning_rate": 4.191056016360699e-07, "logits/chosen": -1.394718050956726, "logits/rejected": -1.6881500482559204, "logps/chosen": -368.72381591796875, "logps/rejected": -381.956298828125, "loss": 0.6135, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8789850473403931, "rewards/margins": 0.3012150526046753, "rewards/rejected": -1.1802000999450684, "step": 160 }, { "epoch": 0.3482849604221636, "grad_norm": 10.276456210059177, "learning_rate": 4.121862894301754e-07, "logits/chosen": -1.3367292881011963, "logits/rejected": -1.7920604944229126, "logps/chosen": -379.0816650390625, "logps/rejected": -372.62432861328125, "loss": 0.6186, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8941882252693176, "rewards/margins": 0.2552604675292969, "rewards/rejected": -1.1494486331939697, "step": 165 }, { "epoch": 0.35883905013192613, "grad_norm": 10.349641550261767, "learning_rate": 4.050454502616667e-07, "logits/chosen": -1.3888546228408813, "logits/rejected": -1.7364885807037354, "logps/chosen": -375.4383239746094, "logps/rejected": -369.5252685546875, "loss": 0.6183, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8307794332504272, "rewards/margins": 0.19674496352672577, "rewards/rejected": -1.027524471282959, "step": 170 }, { "epoch": 0.36939313984168864, "grad_norm": 10.29658804390271, "learning_rate": 3.976928376579047e-07, "logits/chosen": -1.4784464836120605, "logits/rejected": -1.8144117593765259, "logps/chosen": -355.7376708984375, "logps/rejected": -354.1457824707031, "loss": 0.6153, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8430948257446289, "rewards/margins": 0.21338331699371338, "rewards/rejected": -1.0564781427383423, "step": 175 }, { "epoch": 0.37994722955145116, "grad_norm": 20.628198563240826, "learning_rate": 3.9013849440328945e-07, "logits/chosen": -1.3779172897338867, "logits/rejected": -1.7602001428604126, "logps/chosen": -353.769287109375, "logps/rejected": -358.7577209472656, "loss": 0.6204, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8876091837882996, "rewards/margins": 0.22880685329437256, "rewards/rejected": -1.1164162158966064, "step": 180 }, { "epoch": 0.39050131926121373, "grad_norm": 10.868907026626026, "learning_rate": 3.8239273882202473e-07, "logits/chosen": -1.439247488975525, "logits/rejected": -1.8125137090682983, "logps/chosen": -412.8868103027344, "logps/rejected": -431.59063720703125, "loss": 0.6016, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1018221378326416, "rewards/margins": 0.4088074564933777, "rewards/rejected": -1.5106297731399536, "step": 185 }, { "epoch": 0.40105540897097625, "grad_norm": 10.784941413981636, "learning_rate": 3.7446615068452804e-07, "logits/chosen": -1.4441838264465332, "logits/rejected": -1.7783229351043701, "logps/chosen": -398.41009521484375, "logps/rejected": -396.8212890625, "loss": 0.594, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.115227222442627, "rewards/margins": 0.2704046070575714, "rewards/rejected": -1.3856319189071655, "step": 190 }, { "epoch": 0.41160949868073876, "grad_norm": 10.229960177651233, "learning_rate": 3.6636955675673743e-07, "logits/chosen": -1.5908405780792236, "logits/rejected": -1.9355300664901733, "logps/chosen": -426.3243713378906, "logps/rejected": -420.7511291503906, "loss": 0.5957, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1847457885742188, "rewards/margins": 0.32065972685813904, "rewards/rejected": -1.5054056644439697, "step": 195 }, { "epoch": 0.42216358839050133, "grad_norm": 18.20685869729302, "learning_rate": 3.5811401601205093e-07, "logits/chosen": -1.6325582265853882, "logits/rejected": -1.8879244327545166, "logps/chosen": -426.10943603515625, "logps/rejected": -426.29376220703125, "loss": 0.6339, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3495900630950928, "rewards/margins": 0.15765051543712616, "rewards/rejected": -1.507240653038025, "step": 200 }, { "epoch": 0.43271767810026385, "grad_norm": 10.716178488233457, "learning_rate": 3.497108045260995e-07, "logits/chosen": -1.6447012424468994, "logits/rejected": -1.9266440868377686, "logps/chosen": -422.4698181152344, "logps/rejected": -423.3296813964844, "loss": 0.6095, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2483638525009155, "rewards/margins": 0.21740670502185822, "rewards/rejected": -1.4657707214355469, "step": 205 }, { "epoch": 0.44327176781002636, "grad_norm": 9.319577970375986, "learning_rate": 3.411714000749838e-07, "logits/chosen": -1.5758410692214966, "logits/rejected": -1.9720706939697266, "logps/chosen": -413.7496032714844, "logps/rejected": -432.4217834472656, "loss": 0.5971, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2690123319625854, "rewards/margins": 0.31965065002441406, "rewards/rejected": -1.58866286277771, "step": 210 }, { "epoch": 0.45382585751978893, "grad_norm": 18.334377917058617, "learning_rate": 3.3250746645801287e-07, "logits/chosen": -1.6151403188705444, "logits/rejected": -1.9621028900146484, "logps/chosen": -431.717529296875, "logps/rejected": -438.23095703125, "loss": 0.5914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.601030945777893, "rewards/margins": 0.29736214876174927, "rewards/rejected": -1.8983930349349976, "step": 215 }, { "epoch": 0.46437994722955145, "grad_norm": 13.987559233928428, "learning_rate": 3.237308375663571e-07, "logits/chosen": -1.5672855377197266, "logits/rejected": -1.8798201084136963, "logps/chosen": -465.22882080078125, "logps/rejected": -480.69036865234375, "loss": 0.5731, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6366822719573975, "rewards/margins": 0.33864718675613403, "rewards/rejected": -1.9753293991088867, "step": 220 }, { "epoch": 0.47493403693931396, "grad_norm": 15.585874610978292, "learning_rate": 3.148535012193767e-07, "logits/chosen": -1.4787318706512451, "logits/rejected": -1.7937052249908447, "logps/chosen": -463.3704528808594, "logps/rejected": -513.5693359375, "loss": 0.5913, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7906410694122314, "rewards/margins": 0.637313723564148, "rewards/rejected": -2.42795467376709, "step": 225 }, { "epoch": 0.48548812664907653, "grad_norm": 10.989676492328872, "learning_rate": 3.0588758279070183e-07, "logits/chosen": -1.4634826183319092, "logits/rejected": -1.688738226890564, "logps/chosen": -402.5445556640625, "logps/rejected": -404.0518493652344, "loss": 0.62, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4222261905670166, "rewards/margins": 0.1772518903017044, "rewards/rejected": -1.599478006362915, "step": 230 }, { "epoch": 0.49604221635883905, "grad_norm": 10.557802697469821, "learning_rate": 2.968453286464312e-07, "logits/chosen": -1.386103868484497, "logits/rejected": -1.759375810623169, "logps/chosen": -398.8132629394531, "logps/rejected": -399.6328125, "loss": 0.5904, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1334482431411743, "rewards/margins": 0.23164169490337372, "rewards/rejected": -1.365089774131775, "step": 235 }, { "epoch": 0.5065963060686016, "grad_norm": 13.209672009218341, "learning_rate": 2.8773908941806877e-07, "logits/chosen": -1.5705225467681885, "logits/rejected": -1.753831148147583, "logps/chosen": -442.28857421875, "logps/rejected": -449.0203552246094, "loss": 0.5998, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6151583194732666, "rewards/margins": 0.23577281832695007, "rewards/rejected": -1.85093092918396, "step": 240 }, { "epoch": 0.5171503957783641, "grad_norm": 16.396333599315767, "learning_rate": 2.785813031330473e-07, "logits/chosen": -1.6287492513656616, "logits/rejected": -1.9647096395492554, "logps/chosen": -466.08599853515625, "logps/rejected": -482.62847900390625, "loss": 0.6041, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8849313259124756, "rewards/margins": 0.3867245614528656, "rewards/rejected": -2.271656036376953, "step": 245 }, { "epoch": 0.5277044854881267, "grad_norm": 10.479150105315131, "learning_rate": 2.693844782258779e-07, "logits/chosen": -1.6182796955108643, "logits/rejected": -1.851154088973999, "logps/chosen": -442.0950622558594, "logps/rejected": -452.76416015625, "loss": 0.6023, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5875468254089355, "rewards/margins": 0.27402180433273315, "rewards/rejected": -1.8615686893463135, "step": 250 }, { "epoch": 0.5382585751978892, "grad_norm": 11.245899562560366, "learning_rate": 2.601611764531342e-07, "logits/chosen": -1.5520964860916138, "logits/rejected": -1.8409061431884766, "logps/chosen": -385.7509765625, "logps/rejected": -413.82147216796875, "loss": 0.602, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3112901449203491, "rewards/margins": 0.3254047930240631, "rewards/rejected": -1.6366949081420898, "step": 255 }, { "epoch": 0.5488126649076517, "grad_norm": 10.216434963455866, "learning_rate": 2.5092399573560323e-07, "logits/chosen": -1.552223563194275, "logits/rejected": -1.9581362009048462, "logps/chosen": -435.2206115722656, "logps/rejected": -440.0597229003906, "loss": 0.6024, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5202934741973877, "rewards/margins": 0.2939620614051819, "rewards/rejected": -1.8142554759979248, "step": 260 }, { "epoch": 0.5593667546174143, "grad_norm": 15.557028702183048, "learning_rate": 2.4168555295104124e-07, "logits/chosen": -1.5453598499298096, "logits/rejected": -1.900339126586914, "logps/chosen": -430.10980224609375, "logps/rejected": -445.18658447265625, "loss": 0.5844, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5744996070861816, "rewards/margins": 0.3325752317905426, "rewards/rejected": -1.9070749282836914, "step": 265 }, { "epoch": 0.5699208443271768, "grad_norm": 17.943254997397123, "learning_rate": 2.3245846670103626e-07, "logits/chosen": -1.604867935180664, "logits/rejected": -2.0065605640411377, "logps/chosen": -474.488037109375, "logps/rejected": -498.0807189941406, "loss": 0.5789, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8745540380477905, "rewards/margins": 0.41972631216049194, "rewards/rejected": -2.294280529022217, "step": 270 }, { "epoch": 0.5804749340369393, "grad_norm": 24.025134545110568, "learning_rate": 2.232553400755159e-07, "logits/chosen": -1.5600621700286865, "logits/rejected": -1.9929841756820679, "logps/chosen": -506.9547424316406, "logps/rejected": -510.70306396484375, "loss": 0.6081, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.104123592376709, "rewards/margins": 0.3540397882461548, "rewards/rejected": -2.4581634998321533, "step": 275 }, { "epoch": 0.5910290237467019, "grad_norm": 12.929099239614445, "learning_rate": 2.1408874343844294e-07, "logits/chosen": -1.6627086400985718, "logits/rejected": -1.9773311614990234, "logps/chosen": -452.6092224121094, "logps/rejected": -466.3548889160156, "loss": 0.5697, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.777646780014038, "rewards/margins": 0.39899054169654846, "rewards/rejected": -2.1766371726989746, "step": 280 }, { "epoch": 0.6015831134564644, "grad_norm": 14.764167900995057, "learning_rate": 2.049711972582101e-07, "logits/chosen": -1.4953606128692627, "logits/rejected": -1.8248519897460938, "logps/chosen": -454.2190856933594, "logps/rejected": -484.0538635253906, "loss": 0.5691, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6517919301986694, "rewards/margins": 0.40098685026168823, "rewards/rejected": -2.052778720855713, "step": 285 }, { "epoch": 0.6121372031662269, "grad_norm": 16.272348359396457, "learning_rate": 1.9591515500618588e-07, "logits/chosen": -1.5684363842010498, "logits/rejected": -1.8171417713165283, "logps/chosen": -463.537841796875, "logps/rejected": -480.9203186035156, "loss": 0.5867, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7810500860214233, "rewards/margins": 0.29418668150901794, "rewards/rejected": -2.0752367973327637, "step": 290 }, { "epoch": 0.6226912928759895, "grad_norm": 14.742811810031489, "learning_rate": 1.8693298614677112e-07, "logits/chosen": -1.466384768486023, "logits/rejected": -1.8593746423721313, "logps/chosen": -479.5718688964844, "logps/rejected": -491.52154541015625, "loss": 0.5822, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8734004497528076, "rewards/margins": 0.33124423027038574, "rewards/rejected": -2.2046444416046143, "step": 295 }, { "epoch": 0.633245382585752, "grad_norm": 17.118353279558573, "learning_rate": 1.7803695924219814e-07, "logits/chosen": -1.6126632690429688, "logits/rejected": -1.906806230545044, "logps/chosen": -501.42083740234375, "logps/rejected": -519.7081909179688, "loss": 0.5917, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.099165439605713, "rewards/margins": 0.307799756526947, "rewards/rejected": -2.4069650173187256, "step": 300 }, { "epoch": 0.6437994722955145, "grad_norm": 13.624538503432188, "learning_rate": 1.6923922519515067e-07, "logits/chosen": -1.6364351511001587, "logits/rejected": -1.9255473613739014, "logps/chosen": -485.3211975097656, "logps/rejected": -504.00701904296875, "loss": 0.5809, "rewards/accuracies": 0.75, "rewards/chosen": -1.9193140268325806, "rewards/margins": 0.4129720628261566, "rewards/rejected": -2.3322861194610596, "step": 305 }, { "epoch": 0.6543535620052771, "grad_norm": 17.071661718014518, "learning_rate": 1.605518006520924e-07, "logits/chosen": -1.727064847946167, "logits/rejected": -2.0727763175964355, "logps/chosen": -501.14495849609375, "logps/rejected": -513.572509765625, "loss": 0.5871, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1765451431274414, "rewards/margins": 0.34206461906433105, "rewards/rejected": -2.5186100006103516, "step": 310 }, { "epoch": 0.6649076517150396, "grad_norm": 13.617029224965975, "learning_rate": 1.519865515899731e-07, "logits/chosen": -1.722412109375, "logits/rejected": -2.04305362701416, "logps/chosen": -467.9588928222656, "logps/rejected": -480.5577087402344, "loss": 0.5821, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8842096328735352, "rewards/margins": 0.34835028648376465, "rewards/rejected": -2.2325596809387207, "step": 315 }, { "epoch": 0.6754617414248021, "grad_norm": 13.33856540505469, "learning_rate": 1.4355517710873182e-07, "logits/chosen": -1.8616483211517334, "logits/rejected": -2.127676248550415, "logps/chosen": -491.52545166015625, "logps/rejected": -527.18212890625, "loss": 0.5874, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0936801433563232, "rewards/margins": 0.45663338899612427, "rewards/rejected": -2.5503134727478027, "step": 320 }, { "epoch": 0.6860158311345647, "grad_norm": 17.145800349025656, "learning_rate": 1.3526919345173318e-07, "logits/chosen": -1.7799503803253174, "logits/rejected": -2.053417921066284, "logps/chosen": -521.0397338867188, "logps/rejected": -544.9762573242188, "loss": 0.5769, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.4065451622009277, "rewards/margins": 0.452395498752594, "rewards/rejected": -2.858940601348877, "step": 325 }, { "epoch": 0.6965699208443272, "grad_norm": 19.087646634462068, "learning_rate": 1.2713991827596443e-07, "logits/chosen": -1.8048852682113647, "logits/rejected": -2.0732533931732178, "logps/chosen": -538.1304931640625, "logps/rejected": -579.5018310546875, "loss": 0.5753, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.58000111579895, "rewards/margins": 0.5617579221725464, "rewards/rejected": -3.141758680343628, "step": 330 }, { "epoch": 0.7071240105540897, "grad_norm": 16.296965660815633, "learning_rate": 1.191784551934773e-07, "logits/chosen": -1.6937000751495361, "logits/rejected": -2.0096402168273926, "logps/chosen": -490.8270568847656, "logps/rejected": -560.6513671875, "loss": 0.5806, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2391765117645264, "rewards/margins": 0.8371523022651672, "rewards/rejected": -3.076328992843628, "step": 335 }, { "epoch": 0.7176781002638523, "grad_norm": 13.84198150957549, "learning_rate": 1.1139567860518953e-07, "logits/chosen": -1.6130354404449463, "logits/rejected": -1.875739336013794, "logps/chosen": -477.005615234375, "logps/rejected": -505.4608459472656, "loss": 0.5914, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9602851867675781, "rewards/margins": 0.4700210988521576, "rewards/rejected": -2.4303066730499268, "step": 340 }, { "epoch": 0.7282321899736148, "grad_norm": 15.316683752394184, "learning_rate": 1.0380221884776128e-07, "logits/chosen": -1.671500563621521, "logits/rejected": -1.958186149597168, "logps/chosen": -483.4461975097656, "logps/rejected": -497.53643798828125, "loss": 0.5842, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9341436624526978, "rewards/margins": 0.3594801723957062, "rewards/rejected": -2.293623924255371, "step": 345 }, { "epoch": 0.7387862796833773, "grad_norm": 11.225540406360041, "learning_rate": 9.640844767383405e-08, "logits/chosen": -1.7304404973983765, "logits/rejected": -2.0152411460876465, "logps/chosen": -474.5326232910156, "logps/rejected": -519.5494384765625, "loss": 0.5663, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8573243618011475, "rewards/margins": 0.5369530916213989, "rewards/rejected": -2.394277334213257, "step": 350 }, { "epoch": 0.7493403693931399, "grad_norm": 69.37431303110792, "learning_rate": 8.922446408546378e-08, "logits/chosen": -1.636301040649414, "logits/rejected": -1.9108378887176514, "logps/chosen": -474.32769775390625, "logps/rejected": -491.1766052246094, "loss": 0.5914, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.870996117591858, "rewards/margins": 0.4108423590660095, "rewards/rejected": -2.2818384170532227, "step": 355 }, { "epoch": 0.7598944591029023, "grad_norm": 20.752730975509387, "learning_rate": 8.22600805400994e-08, "logits/chosen": -1.597144603729248, "logits/rejected": -1.939162015914917, "logps/chosen": -516.8674926757812, "logps/rejected": -526.4575805664062, "loss": 0.5934, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.107037305831909, "rewards/margins": 0.36362889409065247, "rewards/rejected": -2.4706661701202393, "step": 360 }, { "epoch": 0.7704485488126649, "grad_norm": 17.42422968220554, "learning_rate": 7.552480954794558e-08, "logits/chosen": -1.664350152015686, "logits/rejected": -1.8763881921768188, "logps/chosen": -474.96917724609375, "logps/rejected": -517.1463623046875, "loss": 0.5755, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.939814567565918, "rewards/margins": 0.3745439350605011, "rewards/rejected": -2.3143584728240967, "step": 365 }, { "epoch": 0.7810026385224275, "grad_norm": 14.771602880443869, "learning_rate": 6.902785067901854e-08, "logits/chosen": -1.6192362308502197, "logits/rejected": -1.9148075580596924, "logps/chosen": -488.96221923828125, "logps/rejected": -493.0494689941406, "loss": 0.5705, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.987235426902771, "rewards/margins": 0.29930660128593445, "rewards/rejected": -2.2865424156188965, "step": 370 }, { "epoch": 0.7915567282321899, "grad_norm": 17.979535692288096, "learning_rate": 6.277807799763973e-08, "logits/chosen": -1.739436149597168, "logits/rejected": -1.9250596761703491, "logps/chosen": -524.38720703125, "logps/rejected": -558.7305908203125, "loss": 0.5799, "rewards/accuracies": 0.6875, "rewards/chosen": -2.345944881439209, "rewards/margins": 0.3936893045902252, "rewards/rejected": -2.7396342754364014, "step": 375 }, { "epoch": 0.8021108179419525, "grad_norm": 16.020544985708035, "learning_rate": 5.678402794153145e-08, "logits/chosen": -1.6335742473602295, "logits/rejected": -1.9916164875030518, "logps/chosen": -496.64111328125, "logps/rejected": -516.6607666015625, "loss": 0.5759, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1185414791107178, "rewards/margins": 0.3739583492279053, "rewards/rejected": -2.492499828338623, "step": 380 }, { "epoch": 0.8126649076517151, "grad_norm": 15.483975057559833, "learning_rate": 5.105388766206969e-08, "logits/chosen": -1.7242807149887085, "logits/rejected": -1.9720449447631836, "logps/chosen": -476.0779724121094, "logps/rejected": -498.2892150878906, "loss": 0.5878, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9155553579330444, "rewards/margins": 0.33329516649246216, "rewards/rejected": -2.2488505840301514, "step": 385 }, { "epoch": 0.8232189973614775, "grad_norm": 12.980915706351402, "learning_rate": 4.5595483841620484e-08, "logits/chosen": -1.685105562210083, "logits/rejected": -1.9450676441192627, "logps/chosen": -459.869384765625, "logps/rejected": -495.52069091796875, "loss": 0.5753, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7653785943984985, "rewards/margins": 0.45078420639038086, "rewards/rejected": -2.216163158416748, "step": 390 }, { "epoch": 0.8337730870712401, "grad_norm": 12.943578700815056, "learning_rate": 4.0416272003232526e-08, "logits/chosen": -1.5918303728103638, "logits/rejected": -1.9432264566421509, "logps/chosen": -461.55078125, "logps/rejected": -483.1607971191406, "loss": 0.5828, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7096798419952393, "rewards/margins": 0.43595314025878906, "rewards/rejected": -2.1456329822540283, "step": 395 }, { "epoch": 0.8443271767810027, "grad_norm": 13.529250322769109, "learning_rate": 3.552332632729041e-08, "logits/chosen": -1.676417350769043, "logits/rejected": -1.8683099746704102, "logps/chosen": -448.98809814453125, "logps/rejected": -474.80450439453125, "loss": 0.5696, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.735640525817871, "rewards/margins": 0.3609997630119324, "rewards/rejected": -2.096640110015869, "step": 400 }, { "epoch": 0.8443271767810027, "eval_logits/chosen": -1.8635751008987427, "eval_logits/rejected": -1.727868914604187, "eval_logps/chosen": -464.8841857910156, "eval_logps/rejected": -503.46514892578125, "eval_loss": 0.6257370710372925, "eval_rewards/accuracies": 0.6639676094055176, "eval_rewards/chosen": -1.8789465427398682, "eval_rewards/margins": 0.299042671918869, "eval_rewards/rejected": -2.1779892444610596, "eval_runtime": 316.7001, "eval_samples_per_second": 6.239, "eval_steps_per_second": 1.56, "step": 400 }, { "epoch": 0.8548812664907651, "grad_norm": 16.739492605341695, "learning_rate": 3.092332998903416e-08, "logits/chosen": -1.7163026332855225, "logits/rejected": -2.0801901817321777, "logps/chosen": -481.8212890625, "logps/rejected": -521.2871704101562, "loss": 0.5594, "rewards/accuracies": 0.75, "rewards/chosen": -1.8860801458358765, "rewards/margins": 0.5326521992683411, "rewards/rejected": -2.418732166290283, "step": 405 }, { "epoch": 0.8654353562005277, "grad_norm": 18.511909575910575, "learning_rate": 2.6622566030146455e-08, "logits/chosen": -1.7279727458953857, "logits/rejected": -1.9562079906463623, "logps/chosen": -501.9583435058594, "logps/rejected": -521.0777587890625, "loss": 0.5736, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.089940309524536, "rewards/margins": 0.37453165650367737, "rewards/rejected": -2.4644720554351807, "step": 410 }, { "epoch": 0.8759894459102903, "grad_norm": 13.262757276399812, "learning_rate": 2.26269087768734e-08, "logits/chosen": -1.7813360691070557, "logits/rejected": -1.99080491065979, "logps/chosen": -470.19732666015625, "logps/rejected": -517.9837646484375, "loss": 0.5669, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0514144897460938, "rewards/margins": 0.6286773681640625, "rewards/rejected": -2.680091619491577, "step": 415 }, { "epoch": 0.8865435356200527, "grad_norm": 16.729852500651287, "learning_rate": 1.894181581640106e-08, "logits/chosen": -1.7729663848876953, "logits/rejected": -2.0622265338897705, "logps/chosen": -503.3247985839844, "logps/rejected": -532.9273681640625, "loss": 0.5733, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2469749450683594, "rewards/margins": 0.4464968144893646, "rewards/rejected": -2.6934714317321777, "step": 420 }, { "epoch": 0.8970976253298153, "grad_norm": 15.498959956089978, "learning_rate": 1.5572320542448143e-08, "logits/chosen": -1.8235836029052734, "logits/rejected": -2.0790963172912598, "logps/chosen": -518.3297119140625, "logps/rejected": -555.9387817382812, "loss": 0.5909, "rewards/accuracies": 0.75, "rewards/chosen": -2.182375431060791, "rewards/margins": 0.5672179460525513, "rewards/rejected": -2.7495932579040527, "step": 425 }, { "epoch": 0.9076517150395779, "grad_norm": 13.029691392427118, "learning_rate": 1.2523025280255729e-08, "logits/chosen": -1.7515465021133423, "logits/rejected": -2.0758919715881348, "logps/chosen": -505.37646484375, "logps/rejected": -527.7960815429688, "loss": 0.5682, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.237623691558838, "rewards/margins": 0.4710654616355896, "rewards/rejected": -2.7086894512176514, "step": 430 }, { "epoch": 0.9182058047493403, "grad_norm": 16.269526596286124, "learning_rate": 9.798095000364214e-09, "logits/chosen": -1.7598968744277954, "logits/rejected": -1.9988504648208618, "logps/chosen": -508.0267028808594, "logps/rejected": -554.0763549804688, "loss": 0.5581, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.217205762863159, "rewards/margins": 0.5872582197189331, "rewards/rejected": -2.8044638633728027, "step": 435 }, { "epoch": 0.9287598944591029, "grad_norm": 13.648970556247901, "learning_rate": 7.401251629764876e-09, "logits/chosen": -1.830775499343872, "logits/rejected": -2.0407309532165527, "logps/chosen": -511.0887145996094, "logps/rejected": -543.5230712890625, "loss": 0.5799, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.284379482269287, "rewards/margins": 0.47375327348709106, "rewards/rejected": -2.7581324577331543, "step": 440 }, { "epoch": 0.9393139841688655, "grad_norm": 17.489158193863855, "learning_rate": 5.335768968195098e-09, "logits/chosen": -1.7661769390106201, "logits/rejected": -2.1901516914367676, "logps/chosen": -519.0462646484375, "logps/rejected": -544.9937133789062, "loss": 0.5703, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.308170795440674, "rewards/margins": 0.4751991331577301, "rewards/rejected": -2.783369779586792, "step": 445 }, { "epoch": 0.9498680738786279, "grad_norm": 18.472750585474607, "learning_rate": 3.604468216521883e-09, "logits/chosen": -1.8184922933578491, "logits/rejected": -2.069641590118408, "logps/chosen": -510.5535583496094, "logps/rejected": -536.5929565429688, "loss": 0.5651, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.3444266319274902, "rewards/margins": 0.45197463035583496, "rewards/rejected": -2.7964015007019043, "step": 450 }, { "epoch": 0.9604221635883905, "grad_norm": 19.193548961658735, "learning_rate": 2.2097141233206884e-09, "logits/chosen": -1.7842222452163696, "logits/rejected": -2.0406641960144043, "logps/chosen": -513.885986328125, "logps/rejected": -545.530029296875, "loss": 0.5708, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.247333526611328, "rewards/margins": 0.44551533460617065, "rewards/rejected": -2.6928489208221436, "step": 455 }, { "epoch": 0.9709762532981531, "grad_norm": 15.684871774317772, "learning_rate": 1.1534117549133472e-09, "logits/chosen": -1.8590974807739258, "logits/rejected": -2.08577036857605, "logps/chosen": -512.5687866210938, "logps/rejected": -551.6975708007812, "loss": 0.5662, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2769620418548584, "rewards/margins": 0.5433439016342163, "rewards/rejected": -2.8203060626983643, "step": 460 }, { "epoch": 0.9815303430079155, "grad_norm": 16.324336075352026, "learning_rate": 4.3700389327672173e-10, "logits/chosen": -1.74801504611969, "logits/rejected": -2.0831220149993896, "logps/chosen": -508.1880798339844, "logps/rejected": -548.400390625, "loss": 0.578, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2477779388427734, "rewards/margins": 0.5980393886566162, "rewards/rejected": -2.8458173274993896, "step": 465 }, { "epoch": 0.9920844327176781, "grad_norm": 18.434311800327553, "learning_rate": 6.146906537587982e-11, "logits/chosen": -1.7675012350082397, "logits/rejected": -2.0456321239471436, "logps/chosen": -524.4590454101562, "logps/rejected": -550.3624877929688, "loss": 0.5793, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.3114473819732666, "rewards/margins": 0.4332752823829651, "rewards/rejected": -2.744722366333008, "step": 470 }, { "epoch": 0.9984168865435357, "step": 473, "total_flos": 0.0, "train_loss": 0.6103140643736776, "train_runtime": 23898.8744, "train_samples_per_second": 2.537, "train_steps_per_second": 0.02 } ], "logging_steps": 5, "max_steps": 473, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }