diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 1.48540452899306, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.0685516744852066, + "logits/rejected": 0.14143499732017517, + "logps/chosen": -1.7162926197052002, + "logps/rejected": -1.8897325992584229, + "loss": 0.6976, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.7162926197052002, + "rewards/margins": 0.17344002425670624, + "rewards/rejected": -1.8897325992584229, + "sft_loss": 1.468671202659607, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 1.413805704580881, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": -0.006755639798939228, + "logits/rejected": 0.1146969422698021, + "logps/chosen": -1.8025729656219482, + "logps/rejected": -1.8460617065429688, + "loss": 0.704, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8025729656219482, + "rewards/margins": 0.04348861053586006, + "rewards/rejected": -1.8460617065429688, + "sft_loss": 1.5083630084991455, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 1.1477130633823904, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.04371301457285881, + "logits/rejected": 0.05566522479057312, + "logps/chosen": -1.6346843242645264, + "logps/rejected": -1.765125036239624, + "loss": 0.7082, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6346843242645264, + "rewards/margins": 0.13044048845767975, + "rewards/rejected": -1.765125036239624, + "sft_loss": 1.50040602684021, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 1.3186253450806018, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.04668748378753662, + "logits/rejected": 0.041610319167375565, + "logps/chosen": -1.7238433361053467, + "logps/rejected": -1.805229902267456, + "loss": 0.7121, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7238433361053467, + "rewards/margins": 0.08138636499643326, + "rewards/rejected": -1.805229902267456, + "sft_loss": 1.4999595880508423, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 1.201395274341559, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.04870440810918808, + "logits/rejected": 0.03901376202702522, + "logps/chosen": -1.8682407140731812, + "logps/rejected": -1.7784353494644165, + "loss": 0.7417, + "rewards/accuracies": 0.3812499940395355, + "rewards/chosen": -1.8682407140731812, + "rewards/margins": -0.08980532735586166, + "rewards/rejected": -1.7784353494644165, + "sft_loss": 1.5453672409057617, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 1.1663689864355962, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.08696512877941132, + "logits/rejected": 0.008516276255249977, + "logps/chosen": -1.9079726934432983, + "logps/rejected": -1.8313806056976318, + "loss": 0.7073, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.9079726934432983, + "rewards/margins": -0.07659195363521576, + "rewards/rejected": -1.8313806056976318, + "sft_loss": 1.6459842920303345, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 1.2100627166727136, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.06849979609251022, + "logits/rejected": 0.09280852228403091, + "logps/chosen": -1.846811294555664, + "logps/rejected": -1.9957473278045654, + "loss": 0.724, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.846811294555664, + "rewards/margins": 0.14893609285354614, + "rewards/rejected": -1.9957473278045654, + "sft_loss": 1.5619454383850098, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 1.2220620735018575, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.023500319570302963, + "logits/rejected": 0.1998632699251175, + "logps/chosen": -1.8833141326904297, + "logps/rejected": -1.744641900062561, + "loss": 0.7216, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.8833141326904297, + "rewards/margins": -0.13867226243019104, + "rewards/rejected": -1.744641900062561, + "sft_loss": 1.5195614099502563, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 1.1538644802166662, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.041353605687618256, + "logits/rejected": 0.24557694792747498, + "logps/chosen": -1.8384199142456055, + "logps/rejected": -1.8726599216461182, + "loss": 0.7116, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.8384199142456055, + "rewards/margins": 0.034239742904901505, + "rewards/rejected": -1.8726599216461182, + "sft_loss": 1.5369489192962646, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 1.067987242731931, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.05942578241229057, + "logits/rejected": 0.09370598196983337, + "logps/chosen": -1.8994518518447876, + "logps/rejected": -1.7794386148452759, + "loss": 0.7154, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.8994518518447876, + "rewards/margins": -0.12001317739486694, + "rewards/rejected": -1.7794386148452759, + "sft_loss": 1.5832624435424805, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 1.150489594431797, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.11642004549503326, + "logits/rejected": 0.10642417520284653, + "logps/chosen": -1.8365113735198975, + "logps/rejected": -1.8701823949813843, + "loss": 0.7066, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8365113735198975, + "rewards/margins": 0.0336710661649704, + "rewards/rejected": -1.8701823949813843, + "sft_loss": 1.5844300985336304, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 1.20034100694076, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.08687268197536469, + "logits/rejected": 0.10528527200222015, + "logps/chosen": -1.7937759160995483, + "logps/rejected": -1.8984864950180054, + "loss": 0.7037, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7937759160995483, + "rewards/margins": 0.1047104150056839, + "rewards/rejected": -1.8984864950180054, + "sft_loss": 1.545082688331604, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 1.1774202373517098, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.022284885868430138, + "logits/rejected": 0.12686273455619812, + "logps/chosen": -1.642361044883728, + "logps/rejected": -1.77422297000885, + "loss": 0.703, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.642361044883728, + "rewards/margins": 0.1318618804216385, + "rewards/rejected": -1.77422297000885, + "sft_loss": 1.4764430522918701, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 1.514146379790193, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.07851644605398178, + "logits/rejected": 0.07680389285087585, + "logps/chosen": -1.7728259563446045, + "logps/rejected": -1.8199899196624756, + "loss": 0.7192, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -1.7728259563446045, + "rewards/margins": 0.04716411232948303, + "rewards/rejected": -1.8199899196624756, + "sft_loss": 1.6338894367218018, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 1.1841492258645112, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.07346881926059723, + "logits/rejected": 0.10432298481464386, + "logps/chosen": -1.7868471145629883, + "logps/rejected": -2.049077033996582, + "loss": 0.7099, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7868471145629883, + "rewards/margins": 0.26223024725914, + "rewards/rejected": -2.049077033996582, + "sft_loss": 1.5695842504501343, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 1.4171546244542632, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": 0.011562767438590527, + "logits/rejected": 0.12024722248315811, + "logps/chosen": -1.7296804189682007, + "logps/rejected": -1.7619798183441162, + "loss": 0.7154, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7296804189682007, + "rewards/margins": 0.03229951113462448, + "rewards/rejected": -1.7619798183441162, + "sft_loss": 1.5305752754211426, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 1.2388388222456028, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.14274922013282776, + "logits/rejected": 0.11063234508037567, + "logps/chosen": -1.804538369178772, + "logps/rejected": -1.9845695495605469, + "loss": 0.7123, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.804538369178772, + "rewards/margins": 0.180031418800354, + "rewards/rejected": -1.9845695495605469, + "sft_loss": 1.5006142854690552, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 1.1389220986989186, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.07219815999269485, + "logits/rejected": 0.036963194608688354, + "logps/chosen": -1.7647689580917358, + "logps/rejected": -1.7878549098968506, + "loss": 0.7149, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.7647689580917358, + "rewards/margins": 0.02308591641485691, + "rewards/rejected": -1.7878549098968506, + "sft_loss": 1.4621435403823853, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 1.4273520301005898, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.08188000321388245, + "logits/rejected": 0.06936556100845337, + "logps/chosen": -1.832297921180725, + "logps/rejected": -1.9317655563354492, + "loss": 0.7116, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.832297921180725, + "rewards/margins": 0.09946787357330322, + "rewards/rejected": -1.9317655563354492, + "sft_loss": 1.5347415208816528, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 1.0902790876389015, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.025676894932985306, + "logits/rejected": 0.04155648872256279, + "logps/chosen": -1.7139743566513062, + "logps/rejected": -1.8227088451385498, + "loss": 0.7064, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7139743566513062, + "rewards/margins": 0.10873470455408096, + "rewards/rejected": -1.8227088451385498, + "sft_loss": 1.4991495609283447, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 1.3390278945525504, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.053162623196840286, + "logits/rejected": 0.08006517589092255, + "logps/chosen": -1.6687822341918945, + "logps/rejected": -1.83384108543396, + "loss": 0.6973, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6687822341918945, + "rewards/margins": 0.16505882143974304, + "rewards/rejected": -1.83384108543396, + "sft_loss": 1.4465464353561401, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 1.1380107250036546, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": 0.02342567965388298, + "logits/rejected": 0.12284733355045319, + "logps/chosen": -1.7178386449813843, + "logps/rejected": -1.774283766746521, + "loss": 0.7154, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.7178386449813843, + "rewards/margins": 0.05644518882036209, + "rewards/rejected": -1.774283766746521, + "sft_loss": 1.4845190048217773, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 1.7449537985357941, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.05169694870710373, + "logits/rejected": 0.26776689291000366, + "logps/chosen": -1.7055747509002686, + "logps/rejected": -1.9986652135849, + "loss": 0.6995, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.7055747509002686, + "rewards/margins": 0.2930903434753418, + "rewards/rejected": -1.9986652135849, + "sft_loss": 1.588734745979309, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 1.1567141674033627, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.06300070881843567, + "logits/rejected": 0.11846703290939331, + "logps/chosen": -1.807218313217163, + "logps/rejected": -1.9346367120742798, + "loss": 0.7002, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.807218313217163, + "rewards/margins": 0.12741819024085999, + "rewards/rejected": -1.9346367120742798, + "sft_loss": 1.5945073366165161, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 1.2458548746530076, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.07810702174901962, + "logits/rejected": 0.05697429180145264, + "logps/chosen": -1.7272956371307373, + "logps/rejected": -1.6646478176116943, + "loss": 0.7234, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.7272956371307373, + "rewards/margins": -0.06264790147542953, + "rewards/rejected": -1.6646478176116943, + "sft_loss": 1.5413930416107178, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 1.8740559078645729, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.05399082973599434, + "logits/rejected": 0.19395580887794495, + "logps/chosen": -1.7780125141143799, + "logps/rejected": -1.8939733505249023, + "loss": 0.7074, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7780125141143799, + "rewards/margins": 0.11596081405878067, + "rewards/rejected": -1.8939733505249023, + "sft_loss": 1.6091417074203491, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 1.110779459632305, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.030741384252905846, + "logits/rejected": 0.09419815987348557, + "logps/chosen": -1.8397916555404663, + "logps/rejected": -1.8377549648284912, + "loss": 0.7164, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8397916555404663, + "rewards/margins": -0.002036741469055414, + "rewards/rejected": -1.8377549648284912, + "sft_loss": 1.5586879253387451, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 1.541796808985463, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.016195252537727356, + "logits/rejected": 0.15794073045253754, + "logps/chosen": -1.7996280193328857, + "logps/rejected": -1.9841985702514648, + "loss": 0.7067, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7996280193328857, + "rewards/margins": 0.1845705807209015, + "rewards/rejected": -1.9841985702514648, + "sft_loss": 1.6003118753433228, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 1.258768101937461, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": 0.0032178417313843966, + "logits/rejected": 0.1682175099849701, + "logps/chosen": -1.7254054546356201, + "logps/rejected": -1.8469938039779663, + "loss": 0.7035, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.7254054546356201, + "rewards/margins": 0.12158823013305664, + "rewards/rejected": -1.8469938039779663, + "sft_loss": 1.539015293121338, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 1.4744864735547007, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.019702356308698654, + "logits/rejected": 0.1558084785938263, + "logps/chosen": -1.681099534034729, + "logps/rejected": -1.6747146844863892, + "loss": 0.7081, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.681099534034729, + "rewards/margins": -0.006384936161339283, + "rewards/rejected": -1.6747146844863892, + "sft_loss": 1.3968122005462646, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 1.5789944598469368, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.044391922652721405, + "logits/rejected": 0.009145406074821949, + "logps/chosen": -1.7172826528549194, + "logps/rejected": -1.795111060142517, + "loss": 0.7087, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7172826528549194, + "rewards/margins": 0.07782838493585587, + "rewards/rejected": -1.795111060142517, + "sft_loss": 1.5068585872650146, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 1.3241057971254109, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.11585366725921631, + "logits/rejected": 0.03425910696387291, + "logps/chosen": -1.8618148565292358, + "logps/rejected": -1.8349769115447998, + "loss": 0.726, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.8618148565292358, + "rewards/margins": -0.02683776617050171, + "rewards/rejected": -1.8349769115447998, + "sft_loss": 1.5718270540237427, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 1.2997658998322292, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.053279124200344086, + "logits/rejected": 0.1222684383392334, + "logps/chosen": -1.6682636737823486, + "logps/rejected": -1.8548986911773682, + "loss": 0.7049, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.6682636737823486, + "rewards/margins": 0.1866351068019867, + "rewards/rejected": -1.8548986911773682, + "sft_loss": 1.4499410390853882, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 1.3208734338750652, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.07479271292686462, + "logits/rejected": -0.01702618971467018, + "logps/chosen": -1.8411445617675781, + "logps/rejected": -1.87027108669281, + "loss": 0.714, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.8411445617675781, + "rewards/margins": 0.029126638546586037, + "rewards/rejected": -1.87027108669281, + "sft_loss": 1.558387279510498, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 1.3344718053025082, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": 0.08084265887737274, + "logits/rejected": 0.08330238610506058, + "logps/chosen": -1.707576036453247, + "logps/rejected": -1.7905025482177734, + "loss": 0.7239, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.707576036453247, + "rewards/margins": 0.08292657881975174, + "rewards/rejected": -1.7905025482177734, + "sft_loss": 1.53104567527771, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 1.2266352439696486, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": 0.02214609459042549, + "logits/rejected": 0.024739524349570274, + "logps/chosen": -1.741689682006836, + "logps/rejected": -1.8428306579589844, + "loss": 0.7046, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.741689682006836, + "rewards/margins": 0.1011408120393753, + "rewards/rejected": -1.8428306579589844, + "sft_loss": 1.5208356380462646, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 1.627310649600377, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.11873702704906464, + "logits/rejected": -0.025531206279993057, + "logps/chosen": -1.6886358261108398, + "logps/rejected": -1.7515376806259155, + "loss": 0.7184, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6886358261108398, + "rewards/margins": 0.06290177255868912, + "rewards/rejected": -1.7515376806259155, + "sft_loss": 1.4868758916854858, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 2.2936334873784188, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.04581695422530174, + "logits/rejected": 0.08045488595962524, + "logps/chosen": -1.8337981700897217, + "logps/rejected": -1.8637981414794922, + "loss": 0.7164, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.8337981700897217, + "rewards/margins": 0.030000019818544388, + "rewards/rejected": -1.8637981414794922, + "sft_loss": 1.5478748083114624, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 1.4095395623852622, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": 0.040932267904281616, + "logits/rejected": 0.2044890820980072, + "logps/chosen": -1.554783821105957, + "logps/rejected": -1.685030221939087, + "loss": 0.7067, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.554783821105957, + "rewards/margins": 0.13024640083312988, + "rewards/rejected": -1.685030221939087, + "sft_loss": 1.3969472646713257, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 1.366095338570351, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.07507927715778351, + "logits/rejected": 0.06702321767807007, + "logps/chosen": -1.7471084594726562, + "logps/rejected": -1.6983184814453125, + "loss": 0.7148, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7471084594726562, + "rewards/margins": -0.048790059983730316, + "rewards/rejected": -1.6983184814453125, + "sft_loss": 1.539676547050476, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 1.450732603887787, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": -0.038362838327884674, + "logits/rejected": 0.11528350412845612, + "logps/chosen": -1.7480757236480713, + "logps/rejected": -1.6963634490966797, + "loss": 0.7129, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7480757236480713, + "rewards/margins": -0.05171237513422966, + "rewards/rejected": -1.6963634490966797, + "sft_loss": 1.4396885633468628, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 1.3763776982907177, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.11977557092905045, + "logits/rejected": 0.08482502400875092, + "logps/chosen": -1.7112598419189453, + "logps/rejected": -1.9077314138412476, + "loss": 0.6965, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.7112598419189453, + "rewards/margins": 0.19647178053855896, + "rewards/rejected": -1.9077314138412476, + "sft_loss": 1.4883301258087158, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 1.2054543849961346, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.16571705043315887, + "logits/rejected": 0.08995556831359863, + "logps/chosen": -1.6306493282318115, + "logps/rejected": -1.7074792385101318, + "loss": 0.7, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.6306493282318115, + "rewards/margins": 0.0768299400806427, + "rewards/rejected": -1.7074792385101318, + "sft_loss": 1.4890508651733398, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 1.3114056890831345, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": 0.06645621359348297, + "logits/rejected": 0.17207393050193787, + "logps/chosen": -1.631810188293457, + "logps/rejected": -1.8495622873306274, + "loss": 0.6927, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.631810188293457, + "rewards/margins": 0.21775206923484802, + "rewards/rejected": -1.8495622873306274, + "sft_loss": 1.4851901531219482, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 1.51541200160965, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.11003299802541733, + "logits/rejected": 0.06239504739642143, + "logps/chosen": -1.5596537590026855, + "logps/rejected": -1.6928398609161377, + "loss": 0.697, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5596537590026855, + "rewards/margins": 0.13318602740764618, + "rewards/rejected": -1.6928398609161377, + "sft_loss": 1.406328558921814, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 1.503571872068748, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": -0.015409344807267189, + "logits/rejected": 0.06427817046642303, + "logps/chosen": -1.6280763149261475, + "logps/rejected": -1.769737958908081, + "loss": 0.7017, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.6280763149261475, + "rewards/margins": 0.14166171848773956, + "rewards/rejected": -1.769737958908081, + "sft_loss": 1.3876667022705078, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 1.6288220279547727, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": 0.011006379500031471, + "logits/rejected": 0.15760047733783722, + "logps/chosen": -1.5208041667938232, + "logps/rejected": -1.704429268836975, + "loss": 0.6852, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5208041667938232, + "rewards/margins": 0.183625265955925, + "rewards/rejected": -1.704429268836975, + "sft_loss": 1.3771321773529053, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 1.6522624770561913, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": -0.05322523042559624, + "logits/rejected": 0.0770733654499054, + "logps/chosen": -1.512322187423706, + "logps/rejected": -1.6946868896484375, + "loss": 0.7075, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.512322187423706, + "rewards/margins": 0.1823645383119583, + "rewards/rejected": -1.6946868896484375, + "sft_loss": 1.4470535516738892, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 1.7657529349560406, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": 0.01734323427081108, + "logits/rejected": 0.14011171460151672, + "logps/chosen": -1.56759512424469, + "logps/rejected": -1.6468995809555054, + "loss": 0.7005, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.56759512424469, + "rewards/margins": 0.07930465787649155, + "rewards/rejected": -1.6468995809555054, + "sft_loss": 1.5069551467895508, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 1.859004151149198, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": -0.08213020861148834, + "logits/rejected": 0.08221803605556488, + "logps/chosen": -1.5308703184127808, + "logps/rejected": -1.583105444908142, + "loss": 0.7119, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.5308703184127808, + "rewards/margins": 0.052234966307878494, + "rewards/rejected": -1.583105444908142, + "sft_loss": 1.3916592597961426, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 1.9364919295573748, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -0.06537928432226181, + "logits/rejected": 0.07307229936122894, + "logps/chosen": -1.3984811305999756, + "logps/rejected": -1.5488195419311523, + "loss": 0.6939, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3984811305999756, + "rewards/margins": 0.15033839643001556, + "rewards/rejected": -1.5488195419311523, + "sft_loss": 1.3026387691497803, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 2.1865734300951845, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.2760574221611023, + "logits/rejected": -0.17358729243278503, + "logps/chosen": -1.527931571006775, + "logps/rejected": -1.6576035022735596, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.527931571006775, + "rewards/margins": 0.12967175245285034, + "rewards/rejected": -1.6576035022735596, + "sft_loss": 1.4426121711730957, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 2.5225617128658664, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": -0.11230075359344482, + "logits/rejected": -0.028052741661667824, + "logps/chosen": -1.5118391513824463, + "logps/rejected": -1.6556705236434937, + "loss": 0.7078, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5118391513824463, + "rewards/margins": 0.14383149147033691, + "rewards/rejected": -1.6556705236434937, + "sft_loss": 1.4891895055770874, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 2.923055759654654, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": -0.13042227923870087, + "logits/rejected": -0.0014180898433551192, + "logps/chosen": -1.393897294998169, + "logps/rejected": -1.5104312896728516, + "loss": 0.7049, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.393897294998169, + "rewards/margins": 0.11653389781713486, + "rewards/rejected": -1.5104312896728516, + "sft_loss": 1.3747899532318115, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 3.9939481336962137, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -0.08223985135555267, + "logits/rejected": 0.01174080092459917, + "logps/chosen": -1.3294731378555298, + "logps/rejected": -1.5173444747924805, + "loss": 0.7069, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3294731378555298, + "rewards/margins": 0.1878713071346283, + "rewards/rejected": -1.5173444747924805, + "sft_loss": 1.2925227880477905, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 6.106718295896271, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": -0.15224897861480713, + "logits/rejected": -0.004387478344142437, + "logps/chosen": -1.3839061260223389, + "logps/rejected": -1.49300217628479, + "loss": 0.707, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3839061260223389, + "rewards/margins": 0.1090959757566452, + "rewards/rejected": -1.49300217628479, + "sft_loss": 1.3571009635925293, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 4.959246349891953, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.11805672943592072, + "logits/rejected": 0.016202565282583237, + "logps/chosen": -1.3780156373977661, + "logps/rejected": -1.4832361936569214, + "loss": 0.6954, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3780156373977661, + "rewards/margins": 0.10522061586380005, + "rewards/rejected": -1.4832361936569214, + "sft_loss": 1.417772889137268, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 2.7747854707798947, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": -0.1688564121723175, + "logits/rejected": 0.11571399122476578, + "logps/chosen": -1.400048017501831, + "logps/rejected": -1.5392086505889893, + "loss": 0.6834, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.400048017501831, + "rewards/margins": 0.13916051387786865, + "rewards/rejected": -1.5392086505889893, + "sft_loss": 1.3857918977737427, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 8.877597356625866, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": -0.12207935005426407, + "logits/rejected": -0.06441991776227951, + "logps/chosen": -1.2970856428146362, + "logps/rejected": -1.450477957725525, + "loss": 0.6917, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2970856428146362, + "rewards/margins": 0.15339213609695435, + "rewards/rejected": -1.450477957725525, + "sft_loss": 1.2975612878799438, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 3.318035682630868, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.12218412011861801, + "logits/rejected": 0.03679182007908821, + "logps/chosen": -1.3453871011734009, + "logps/rejected": -1.4296029806137085, + "loss": 0.7139, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3453871011734009, + "rewards/margins": 0.08421595394611359, + "rewards/rejected": -1.4296029806137085, + "sft_loss": 1.3945664167404175, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 4.339443086417133, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": -0.086027592420578, + "logits/rejected": -0.012940932996571064, + "logps/chosen": -1.4613901376724243, + "logps/rejected": -1.4588768482208252, + "loss": 0.7165, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.4613901376724243, + "rewards/margins": -0.0025134205352514982, + "rewards/rejected": -1.4588768482208252, + "sft_loss": 1.4459879398345947, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 3.6558070591746565, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.2582774758338928, + "logits/rejected": -0.16772443056106567, + "logps/chosen": -1.4190332889556885, + "logps/rejected": -1.5252110958099365, + "loss": 0.711, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4190332889556885, + "rewards/margins": 0.10617784410715103, + "rewards/rejected": -1.5252110958099365, + "sft_loss": 1.4055038690567017, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 3.9092243838995517, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.07796318829059601, + "logits/rejected": 0.07571976631879807, + "logps/chosen": -1.4165136814117432, + "logps/rejected": -1.5895618200302124, + "loss": 0.6889, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4165136814117432, + "rewards/margins": 0.17304803431034088, + "rewards/rejected": -1.5895618200302124, + "sft_loss": 1.4236104488372803, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 4.434603741805856, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": -0.12248054891824722, + "logits/rejected": 0.0066294134594500065, + "logps/chosen": -1.3746910095214844, + "logps/rejected": -1.4437358379364014, + "loss": 0.7053, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3746910095214844, + "rewards/margins": 0.06904484331607819, + "rewards/rejected": -1.4437358379364014, + "sft_loss": 1.3798660039901733, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 2.1170803811392447, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.1800215244293213, + "logits/rejected": -0.06589541584253311, + "logps/chosen": -1.4075556993484497, + "logps/rejected": -1.6932398080825806, + "loss": 0.6934, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4075556993484497, + "rewards/margins": 0.285684198141098, + "rewards/rejected": -1.6932398080825806, + "sft_loss": 1.4548895359039307, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 4.476326136428031, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": -0.09052817523479462, + "logits/rejected": 0.0518343523144722, + "logps/chosen": -1.4117047786712646, + "logps/rejected": -1.6468381881713867, + "loss": 0.6942, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4117047786712646, + "rewards/margins": 0.23513329029083252, + "rewards/rejected": -1.6468381881713867, + "sft_loss": 1.4054486751556396, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 3.030168461363324, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": -0.04353749752044678, + "logits/rejected": 0.060181625187397, + "logps/chosen": -1.4509786367416382, + "logps/rejected": -1.5014684200286865, + "loss": 0.7139, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.4509786367416382, + "rewards/margins": 0.050489895045757294, + "rewards/rejected": -1.5014684200286865, + "sft_loss": 1.3967310190200806, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 2.6262695039771744, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": -0.09377865493297577, + "logits/rejected": 0.049373142421245575, + "logps/chosen": -1.5408929586410522, + "logps/rejected": -1.6390235424041748, + "loss": 0.7017, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.5408929586410522, + "rewards/margins": 0.09813062846660614, + "rewards/rejected": -1.6390235424041748, + "sft_loss": 1.4603184461593628, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 4.295652610582466, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": -0.020409051328897476, + "logits/rejected": 0.009363172575831413, + "logps/chosen": -1.4553922414779663, + "logps/rejected": -1.6140689849853516, + "loss": 0.6987, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.4553922414779663, + "rewards/margins": 0.15867677330970764, + "rewards/rejected": -1.6140689849853516, + "sft_loss": 1.4302983283996582, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 3.8217455475304716, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": -0.06983217597007751, + "logits/rejected": 0.019797608256340027, + "logps/chosen": -1.4062483310699463, + "logps/rejected": -1.5235086679458618, + "loss": 0.6957, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4062483310699463, + "rewards/margins": 0.11726043373346329, + "rewards/rejected": -1.5235086679458618, + "sft_loss": 1.4087426662445068, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 6.302535253237734, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": -0.15332934260368347, + "logits/rejected": 0.06179341673851013, + "logps/chosen": -1.4927293062210083, + "logps/rejected": -1.556748867034912, + "loss": 0.7039, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.4927293062210083, + "rewards/margins": 0.06401960551738739, + "rewards/rejected": -1.556748867034912, + "sft_loss": 1.4707249402999878, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 4.822321471397462, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.1483144611120224, + "logits/rejected": -0.07352069765329361, + "logps/chosen": -1.4491080045700073, + "logps/rejected": -1.6051616668701172, + "loss": 0.7084, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4491080045700073, + "rewards/margins": 0.15605367720127106, + "rewards/rejected": -1.6051616668701172, + "sft_loss": 1.3767362833023071, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 9.552981170129502, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": -0.0526859275996685, + "logits/rejected": 0.03089299239218235, + "logps/chosen": -1.4122530221939087, + "logps/rejected": -1.5068573951721191, + "loss": 0.7048, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4122530221939087, + "rewards/margins": 0.09460441023111343, + "rewards/rejected": -1.5068573951721191, + "sft_loss": 1.345879316329956, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 2.0321803233059885, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": -0.08251919597387314, + "logits/rejected": 0.010809054598212242, + "logps/chosen": -1.3975403308868408, + "logps/rejected": -1.4715176820755005, + "loss": 0.7186, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3975403308868408, + "rewards/margins": 0.07397731393575668, + "rewards/rejected": -1.4715176820755005, + "sft_loss": 1.3381938934326172, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 6.659596031603889, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.1440359652042389, + "logits/rejected": 0.005278570111840963, + "logps/chosen": -1.387573003768921, + "logps/rejected": -1.5763928890228271, + "loss": 0.7005, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.387573003768921, + "rewards/margins": 0.18881988525390625, + "rewards/rejected": -1.5763928890228271, + "sft_loss": 1.3912830352783203, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 2.151759579797557, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": -0.11488963663578033, + "logits/rejected": -0.0325799360871315, + "logps/chosen": -1.4236202239990234, + "logps/rejected": -1.6178334951400757, + "loss": 0.6845, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4236202239990234, + "rewards/margins": 0.1942131221294403, + "rewards/rejected": -1.6178334951400757, + "sft_loss": 1.3828890323638916, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 1.3655878297554467, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": -0.08240549266338348, + "logits/rejected": -0.0049137575551867485, + "logps/chosen": -1.4938578605651855, + "logps/rejected": -1.5395147800445557, + "loss": 0.6938, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4938578605651855, + "rewards/margins": 0.04565705358982086, + "rewards/rejected": -1.5395147800445557, + "sft_loss": 1.471738576889038, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 4.660223267244464, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": -0.014319619163870811, + "logits/rejected": 0.1460811048746109, + "logps/chosen": -1.505752444267273, + "logps/rejected": -1.6274387836456299, + "loss": 0.704, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.505752444267273, + "rewards/margins": 0.12168624252080917, + "rewards/rejected": -1.6274387836456299, + "sft_loss": 1.4696409702301025, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 1.4807670413046374, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": -0.12310131639242172, + "logits/rejected": 0.03627241775393486, + "logps/chosen": -1.4809261560440063, + "logps/rejected": -1.5585112571716309, + "loss": 0.7054, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4809261560440063, + "rewards/margins": 0.07758528739213943, + "rewards/rejected": -1.5585112571716309, + "sft_loss": 1.421349048614502, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 2.132106324899655, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": -0.0066552236676216125, + "logits/rejected": 0.08685633540153503, + "logps/chosen": -1.5027748346328735, + "logps/rejected": -1.6426845788955688, + "loss": 0.6889, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5027748346328735, + "rewards/margins": 0.1399095356464386, + "rewards/rejected": -1.6426845788955688, + "sft_loss": 1.400646448135376, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.19446183741092682, + "eval_logits/rejected": 0.2817261815071106, + "eval_logps/chosen": -1.5228804349899292, + "eval_logps/rejected": -1.6954823732376099, + "eval_loss": 0.7002516984939575, + "eval_rewards/accuracies": 0.5578634738922119, + "eval_rewards/chosen": -1.5228804349899292, + "eval_rewards/margins": 0.17260216176509857, + "eval_rewards/rejected": -1.6954823732376099, + "eval_runtime": 43.9468, + "eval_samples_per_second": 30.605, + "eval_sft_loss": 1.438156247138977, + "eval_steps_per_second": 7.668, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 2.238195461356105, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": -0.07115691900253296, + "logits/rejected": 0.022948969155550003, + "logps/chosen": -1.5295543670654297, + "logps/rejected": -1.6470978260040283, + "loss": 0.7159, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.5295543670654297, + "rewards/margins": 0.11754367500543594, + "rewards/rejected": -1.6470978260040283, + "sft_loss": 1.4397932291030884, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 5.739972439145583, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": -0.049605417996644974, + "logits/rejected": 0.07737796008586884, + "logps/chosen": -1.4525340795516968, + "logps/rejected": -1.6043227910995483, + "loss": 0.6972, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4525340795516968, + "rewards/margins": 0.15178880095481873, + "rewards/rejected": -1.6043227910995483, + "sft_loss": 1.4185234308242798, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 1.9055908146961429, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": -0.08537553250789642, + "logits/rejected": -0.04558895155787468, + "logps/chosen": -1.4500477313995361, + "logps/rejected": -1.6230523586273193, + "loss": 0.6948, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4500477313995361, + "rewards/margins": 0.17300477623939514, + "rewards/rejected": -1.6230523586273193, + "sft_loss": 1.4089908599853516, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 3.550660441004718, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": -0.09662449359893799, + "logits/rejected": 0.09209617227315903, + "logps/chosen": -1.3820686340332031, + "logps/rejected": -1.524606704711914, + "loss": 0.7069, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3820686340332031, + "rewards/margins": 0.14253807067871094, + "rewards/rejected": -1.524606704711914, + "sft_loss": 1.3853243589401245, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 3.1473111106901506, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": -0.155962273478508, + "logits/rejected": 0.03296704962849617, + "logps/chosen": -1.4189581871032715, + "logps/rejected": -1.6313225030899048, + "loss": 0.6883, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4189581871032715, + "rewards/margins": 0.2123643457889557, + "rewards/rejected": -1.6313225030899048, + "sft_loss": 1.4540103673934937, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 3.396706695086868, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": -0.17719171941280365, + "logits/rejected": 0.012565260753035545, + "logps/chosen": -1.4164271354675293, + "logps/rejected": -1.6276462078094482, + "loss": 0.6913, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4164271354675293, + "rewards/margins": 0.211218923330307, + "rewards/rejected": -1.6276462078094482, + "sft_loss": 1.4394110441207886, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 3.0261161806756487, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": -0.12973877787590027, + "logits/rejected": -0.046690475195646286, + "logps/chosen": -1.2951653003692627, + "logps/rejected": -1.447550654411316, + "loss": 0.6843, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2951653003692627, + "rewards/margins": 0.15238544344902039, + "rewards/rejected": -1.447550654411316, + "sft_loss": 1.3464162349700928, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 5.1132228847446575, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": -0.12432174384593964, + "logits/rejected": -0.03838655725121498, + "logps/chosen": -1.3688849210739136, + "logps/rejected": -1.5176225900650024, + "loss": 0.6987, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3688849210739136, + "rewards/margins": 0.14873747527599335, + "rewards/rejected": -1.5176225900650024, + "sft_loss": 1.3759212493896484, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 4.131642586210721, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": -0.16325688362121582, + "logits/rejected": -0.05782388523221016, + "logps/chosen": -1.4089410305023193, + "logps/rejected": -1.6085622310638428, + "loss": 0.703, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4089410305023193, + "rewards/margins": 0.19962140917778015, + "rewards/rejected": -1.6085622310638428, + "sft_loss": 1.4247992038726807, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 1.750905776750948, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": -0.10383790731430054, + "logits/rejected": 0.019616033881902695, + "logps/chosen": -1.4496909379959106, + "logps/rejected": -1.6070255041122437, + "loss": 0.7122, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4496909379959106, + "rewards/margins": 0.15733470022678375, + "rewards/rejected": -1.6070255041122437, + "sft_loss": 1.382320523262024, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 3.171172164305393, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": -0.12625646591186523, + "logits/rejected": -0.045686252415180206, + "logps/chosen": -1.4090499877929688, + "logps/rejected": -1.6708507537841797, + "loss": 0.6904, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4090499877929688, + "rewards/margins": 0.2618007957935333, + "rewards/rejected": -1.6708507537841797, + "sft_loss": 1.3707574605941772, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 3.688534309233378, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.2515523135662079, + "logits/rejected": -0.13388481736183167, + "logps/chosen": -1.5547903776168823, + "logps/rejected": -1.6721168756484985, + "loss": 0.7035, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5547903776168823, + "rewards/margins": 0.11732640117406845, + "rewards/rejected": -1.6721168756484985, + "sft_loss": 1.5243290662765503, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 4.4349826049597745, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": -0.016191715374588966, + "logits/rejected": -0.002241746988147497, + "logps/chosen": -1.5643110275268555, + "logps/rejected": -1.755155324935913, + "loss": 0.6948, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5643110275268555, + "rewards/margins": 0.19084429740905762, + "rewards/rejected": -1.755155324935913, + "sft_loss": 1.4712715148925781, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 3.172937410518411, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": 0.01810188591480255, + "logits/rejected": -0.020999742671847343, + "logps/chosen": -1.4785630702972412, + "logps/rejected": -1.6897687911987305, + "loss": 0.6951, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4785630702972412, + "rewards/margins": 0.21120555698871613, + "rewards/rejected": -1.6897687911987305, + "sft_loss": 1.4240328073501587, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 2.3986245435415654, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": -0.18378640711307526, + "logits/rejected": -0.04902447760105133, + "logps/chosen": -1.4604475498199463, + "logps/rejected": -1.8186414241790771, + "loss": 0.6848, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4604475498199463, + "rewards/margins": 0.3581937849521637, + "rewards/rejected": -1.8186414241790771, + "sft_loss": 1.4552308320999146, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 7.88706357415983, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": -0.1797124743461609, + "logits/rejected": 0.012060348875820637, + "logps/chosen": -1.4463261365890503, + "logps/rejected": -1.5902550220489502, + "loss": 0.7187, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4463261365890503, + "rewards/margins": 0.14392876625061035, + "rewards/rejected": -1.5902550220489502, + "sft_loss": 1.420996904373169, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 6.436198819046655, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": -0.1368204802274704, + "logits/rejected": -0.09737800061702728, + "logps/chosen": -1.592976689338684, + "logps/rejected": -1.6954838037490845, + "loss": 0.7061, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.592976689338684, + "rewards/margins": 0.10250727087259293, + "rewards/rejected": -1.6954838037490845, + "sft_loss": 1.4994713068008423, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 3.4782486146378475, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": -0.13630501925945282, + "logits/rejected": -0.06619258970022202, + "logps/chosen": -1.5162378549575806, + "logps/rejected": -1.6546752452850342, + "loss": 0.71, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5162378549575806, + "rewards/margins": 0.13843724131584167, + "rewards/rejected": -1.6546752452850342, + "sft_loss": 1.4422112703323364, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 7.153663720382908, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": -0.19033238291740417, + "logits/rejected": -0.1686793565750122, + "logps/chosen": -1.5363590717315674, + "logps/rejected": -1.6592457294464111, + "loss": 0.7071, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5363590717315674, + "rewards/margins": 0.12288665771484375, + "rewards/rejected": -1.6592457294464111, + "sft_loss": 1.523964762687683, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 3.1275666171203147, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": -0.1973910927772522, + "logits/rejected": -0.09779137372970581, + "logps/chosen": -1.4702038764953613, + "logps/rejected": -1.6763232946395874, + "loss": 0.7079, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4702038764953613, + "rewards/margins": 0.20611953735351562, + "rewards/rejected": -1.6763232946395874, + "sft_loss": 1.4103636741638184, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 1.545590729725742, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": -0.2061690390110016, + "logits/rejected": -0.06640944629907608, + "logps/chosen": -1.579946756362915, + "logps/rejected": -1.6387602090835571, + "loss": 0.7003, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.579946756362915, + "rewards/margins": 0.05881340429186821, + "rewards/rejected": -1.6387602090835571, + "sft_loss": 1.489118218421936, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 1.478907150560264, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": -0.04340224340558052, + "logits/rejected": 0.014481568709015846, + "logps/chosen": -1.561303973197937, + "logps/rejected": -1.7578208446502686, + "loss": 0.6952, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.561303973197937, + "rewards/margins": 0.19651691615581512, + "rewards/rejected": -1.7578208446502686, + "sft_loss": 1.4192326068878174, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 3.6300079780170655, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": -0.11791355907917023, + "logits/rejected": -0.02622136101126671, + "logps/chosen": -1.4685485363006592, + "logps/rejected": -1.6369479894638062, + "loss": 0.6788, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4685485363006592, + "rewards/margins": 0.16839949786663055, + "rewards/rejected": -1.6369479894638062, + "sft_loss": 1.401977300643921, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 1.9461711581313945, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": -0.24227562546730042, + "logits/rejected": -0.10975190252065659, + "logps/chosen": -1.4977983236312866, + "logps/rejected": -1.6738275289535522, + "loss": 0.6954, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4977983236312866, + "rewards/margins": 0.1760290265083313, + "rewards/rejected": -1.6738275289535522, + "sft_loss": 1.5253281593322754, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 4.39745711780097, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": -0.06434062868356705, + "logits/rejected": 0.00504128634929657, + "logps/chosen": -1.5231517553329468, + "logps/rejected": -1.7861477136611938, + "loss": 0.6937, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5231517553329468, + "rewards/margins": 0.2629958689212799, + "rewards/rejected": -1.7861477136611938, + "sft_loss": 1.5069881677627563, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 4.74780941859642, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": -0.08390182256698608, + "logits/rejected": -0.0013960630167275667, + "logps/chosen": -1.410300612449646, + "logps/rejected": -1.5978670120239258, + "loss": 0.7043, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.410300612449646, + "rewards/margins": 0.18756639957427979, + "rewards/rejected": -1.5978670120239258, + "sft_loss": 1.3433904647827148, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 3.656009593259164, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": -0.2296837866306305, + "logits/rejected": 0.028430040925741196, + "logps/chosen": -1.4031928777694702, + "logps/rejected": -1.5763087272644043, + "loss": 0.6837, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4031928777694702, + "rewards/margins": 0.17311576008796692, + "rewards/rejected": -1.5763087272644043, + "sft_loss": 1.34806227684021, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 4.988252598362669, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": -0.1283363401889801, + "logits/rejected": -0.05954523757100105, + "logps/chosen": -1.5781985521316528, + "logps/rejected": -1.723961591720581, + "loss": 0.6985, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5781985521316528, + "rewards/margins": 0.14576300978660583, + "rewards/rejected": -1.723961591720581, + "sft_loss": 1.5613014698028564, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 4.077268940652683, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": -0.21156974136829376, + "logits/rejected": -0.014496455900371075, + "logps/chosen": -1.4874637126922607, + "logps/rejected": -1.6751620769500732, + "loss": 0.6849, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4874637126922607, + "rewards/margins": 0.1876983940601349, + "rewards/rejected": -1.6751620769500732, + "sft_loss": 1.4299863576889038, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 3.843684359548097, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -0.10458724200725555, + "logits/rejected": -0.03413146734237671, + "logps/chosen": -1.4824299812316895, + "logps/rejected": -1.6548147201538086, + "loss": 0.6957, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4824299812316895, + "rewards/margins": 0.17238478362560272, + "rewards/rejected": -1.6548147201538086, + "sft_loss": 1.416551947593689, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 7.776033781083553, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": -0.20451894402503967, + "logits/rejected": -0.08231017738580704, + "logps/chosen": -1.5417841672897339, + "logps/rejected": -1.6531574726104736, + "loss": 0.6956, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5417841672897339, + "rewards/margins": 0.11137330532073975, + "rewards/rejected": -1.6531574726104736, + "sft_loss": 1.5059678554534912, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 5.8735917625409435, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": -0.08917711675167084, + "logits/rejected": -0.07247889041900635, + "logps/chosen": -1.4098436832427979, + "logps/rejected": -1.5879590511322021, + "loss": 0.6897, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4098436832427979, + "rewards/margins": 0.1781153380870819, + "rewards/rejected": -1.5879590511322021, + "sft_loss": 1.4960367679595947, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 1.9362695189538406, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": -0.11081753671169281, + "logits/rejected": -0.059358786791563034, + "logps/chosen": -1.4940464496612549, + "logps/rejected": -1.681850790977478, + "loss": 0.7057, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4940464496612549, + "rewards/margins": 0.1878044307231903, + "rewards/rejected": -1.681850790977478, + "sft_loss": 1.4923908710479736, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 3.875133594534664, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": -0.19271458685398102, + "logits/rejected": 0.00990169309079647, + "logps/chosen": -1.468133807182312, + "logps/rejected": -1.6332943439483643, + "loss": 0.6908, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.468133807182312, + "rewards/margins": 0.16516050696372986, + "rewards/rejected": -1.6332943439483643, + "sft_loss": 1.4964772462844849, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 2.8053986938228075, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": -0.1740867793560028, + "logits/rejected": -0.11315342038869858, + "logps/chosen": -1.4085177183151245, + "logps/rejected": -1.6071035861968994, + "loss": 0.705, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4085177183151245, + "rewards/margins": 0.1985858678817749, + "rewards/rejected": -1.6071035861968994, + "sft_loss": 1.40883469581604, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 2.262322344262053, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": -0.17668426036834717, + "logits/rejected": -0.02388429269194603, + "logps/chosen": -1.4115188121795654, + "logps/rejected": -1.6690114736557007, + "loss": 0.687, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4115188121795654, + "rewards/margins": 0.25749272108078003, + "rewards/rejected": -1.6690114736557007, + "sft_loss": 1.399688959121704, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 2.0619495664128116, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": -0.2010982483625412, + "logits/rejected": -0.10281310975551605, + "logps/chosen": -1.6184282302856445, + "logps/rejected": -1.7165712118148804, + "loss": 0.709, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.6184282302856445, + "rewards/margins": 0.09814301878213882, + "rewards/rejected": -1.7165712118148804, + "sft_loss": 1.5608628988265991, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 17.893057152362953, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": -0.15141572058200836, + "logits/rejected": -0.02625897526741028, + "logps/chosen": -1.518196702003479, + "logps/rejected": -1.5954731702804565, + "loss": 0.7093, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.518196702003479, + "rewards/margins": 0.07727648317813873, + "rewards/rejected": -1.5954731702804565, + "sft_loss": 1.5144684314727783, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 5.458181178019198, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": -0.10900517553091049, + "logits/rejected": 0.010265020653605461, + "logps/chosen": -1.4177100658416748, + "logps/rejected": -1.604640007019043, + "loss": 0.7065, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4177100658416748, + "rewards/margins": 0.18692997097969055, + "rewards/rejected": -1.604640007019043, + "sft_loss": 1.4123425483703613, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 3.2116080209981144, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": -0.18455770611763, + "logits/rejected": -0.041767168790102005, + "logps/chosen": -1.434877872467041, + "logps/rejected": -1.5541597604751587, + "loss": 0.7082, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.434877872467041, + "rewards/margins": 0.11928168684244156, + "rewards/rejected": -1.5541597604751587, + "sft_loss": 1.4335283041000366, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 4.074685283282293, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": -0.20280078053474426, + "logits/rejected": -0.12091977894306183, + "logps/chosen": -1.5426971912384033, + "logps/rejected": -1.742283582687378, + "loss": 0.6872, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5426971912384033, + "rewards/margins": 0.19958636164665222, + "rewards/rejected": -1.742283582687378, + "sft_loss": 1.5260711908340454, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 9.705402694855097, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": -0.10903636366128922, + "logits/rejected": 0.03241968899965286, + "logps/chosen": -1.4781376123428345, + "logps/rejected": -1.6694921255111694, + "loss": 0.6948, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4781376123428345, + "rewards/margins": 0.1913544237613678, + "rewards/rejected": -1.6694921255111694, + "sft_loss": 1.4647178649902344, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 1.7131026025783753, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": -0.18533024191856384, + "logits/rejected": -0.08608406037092209, + "logps/chosen": -1.4794480800628662, + "logps/rejected": -1.7197014093399048, + "loss": 0.6895, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4794480800628662, + "rewards/margins": 0.24025335907936096, + "rewards/rejected": -1.7197014093399048, + "sft_loss": 1.4695863723754883, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 3.8641548703644917, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": -0.09173591434955597, + "logits/rejected": -0.02755848690867424, + "logps/chosen": -1.5675022602081299, + "logps/rejected": -1.7330118417739868, + "loss": 0.6939, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5675022602081299, + "rewards/margins": 0.165509432554245, + "rewards/rejected": -1.7330118417739868, + "sft_loss": 1.5306199789047241, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 3.676252558960378, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": -0.10520021617412567, + "logits/rejected": -0.004293438978493214, + "logps/chosen": -1.4628154039382935, + "logps/rejected": -1.7190496921539307, + "loss": 0.6817, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4628154039382935, + "rewards/margins": 0.25623443722724915, + "rewards/rejected": -1.7190496921539307, + "sft_loss": 1.4512197971343994, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 2.259243380926168, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": -0.19209852814674377, + "logits/rejected": -0.09239007532596588, + "logps/chosen": -1.5089333057403564, + "logps/rejected": -1.741621732711792, + "loss": 0.6898, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5089333057403564, + "rewards/margins": 0.23268857598304749, + "rewards/rejected": -1.741621732711792, + "sft_loss": 1.445467233657837, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 3.8794295408319064, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": -0.14607372879981995, + "logits/rejected": 0.07945680618286133, + "logps/chosen": -1.569690227508545, + "logps/rejected": -1.7385027408599854, + "loss": 0.7069, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.569690227508545, + "rewards/margins": 0.16881242394447327, + "rewards/rejected": -1.7385027408599854, + "sft_loss": 1.5495669841766357, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 3.5814865977797874, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": -0.22887110710144043, + "logits/rejected": -0.044267140328884125, + "logps/chosen": -1.5432608127593994, + "logps/rejected": -1.735375165939331, + "loss": 0.7112, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5432608127593994, + "rewards/margins": 0.1921147108078003, + "rewards/rejected": -1.735375165939331, + "sft_loss": 1.5614808797836304, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 2.5129021167780103, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": -0.22399599850177765, + "logits/rejected": -0.14003901183605194, + "logps/chosen": -1.4522173404693604, + "logps/rejected": -1.73953378200531, + "loss": 0.6995, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4522173404693604, + "rewards/margins": 0.28731635212898254, + "rewards/rejected": -1.73953378200531, + "sft_loss": 1.4069039821624756, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 3.3412286431292646, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": -0.18452927470207214, + "logits/rejected": -0.04418149217963219, + "logps/chosen": -1.625009298324585, + "logps/rejected": -1.868293046951294, + "loss": 0.7021, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.625009298324585, + "rewards/margins": 0.24328365921974182, + "rewards/rejected": -1.868293046951294, + "sft_loss": 1.5876249074935913, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 2.920134399051155, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": -0.11768593639135361, + "logits/rejected": -0.10396134853363037, + "logps/chosen": -1.5662527084350586, + "logps/rejected": -1.814776062965393, + "loss": 0.6946, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5662527084350586, + "rewards/margins": 0.24852335453033447, + "rewards/rejected": -1.814776062965393, + "sft_loss": 1.5303701162338257, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 5.67572610332352, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": -0.12730535864830017, + "logits/rejected": -0.00879682321101427, + "logps/chosen": -1.535307765007019, + "logps/rejected": -1.6664314270019531, + "loss": 0.6939, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.535307765007019, + "rewards/margins": 0.1311238706111908, + "rewards/rejected": -1.6664314270019531, + "sft_loss": 1.5180950164794922, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 4.356303767175146, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": -0.11556844413280487, + "logits/rejected": 0.018486717715859413, + "logps/chosen": -1.43712317943573, + "logps/rejected": -1.6720008850097656, + "loss": 0.6843, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.43712317943573, + "rewards/margins": 0.2348775863647461, + "rewards/rejected": -1.6720008850097656, + "sft_loss": 1.4556975364685059, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 4.5032969292048435, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": -0.1438135802745819, + "logits/rejected": 0.05054495856165886, + "logps/chosen": -1.4716922044754028, + "logps/rejected": -1.6338465213775635, + "loss": 0.6857, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.4716922044754028, + "rewards/margins": 0.16215436160564423, + "rewards/rejected": -1.6338465213775635, + "sft_loss": 1.5467185974121094, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 4.997362924802158, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": -0.13516470789909363, + "logits/rejected": -0.055740244686603546, + "logps/chosen": -1.461431860923767, + "logps/rejected": -1.7734931707382202, + "loss": 0.6841, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.461431860923767, + "rewards/margins": 0.3120613992214203, + "rewards/rejected": -1.7734931707382202, + "sft_loss": 1.4976530075073242, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 12.728909779386376, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": -0.05813581869006157, + "logits/rejected": 0.09768891334533691, + "logps/chosen": -1.521172285079956, + "logps/rejected": -1.6993563175201416, + "loss": 0.7165, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.521172285079956, + "rewards/margins": 0.17818418145179749, + "rewards/rejected": -1.6993563175201416, + "sft_loss": 1.4803167581558228, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 3.256957423420728, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": -0.10296891629695892, + "logits/rejected": 0.04575002193450928, + "logps/chosen": -1.4965869188308716, + "logps/rejected": -1.6728099584579468, + "loss": 0.6928, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4965869188308716, + "rewards/margins": 0.1762230098247528, + "rewards/rejected": -1.6728099584579468, + "sft_loss": 1.4776004552841187, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 3.170059736659928, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": -0.086936354637146, + "logits/rejected": -0.05500447005033493, + "logps/chosen": -1.4945694208145142, + "logps/rejected": -1.7144079208374023, + "loss": 0.677, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4945694208145142, + "rewards/margins": 0.2198384553194046, + "rewards/rejected": -1.7144079208374023, + "sft_loss": 1.5325548648834229, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 2.4094839136415516, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": -0.1298869550228119, + "logits/rejected": -0.045692089945077896, + "logps/chosen": -1.4005720615386963, + "logps/rejected": -1.662145972251892, + "loss": 0.6675, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4005720615386963, + "rewards/margins": 0.26157405972480774, + "rewards/rejected": -1.662145972251892, + "sft_loss": 1.422020673751831, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 3.2382070067603257, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": -0.23172077536582947, + "logits/rejected": -0.1004585400223732, + "logps/chosen": -1.5804080963134766, + "logps/rejected": -1.7512238025665283, + "loss": 0.6935, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5804080963134766, + "rewards/margins": 0.17081551253795624, + "rewards/rejected": -1.7512238025665283, + "sft_loss": 1.5292551517486572, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 2.5855757075376427, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": -0.1970127820968628, + "logits/rejected": -0.03319654241204262, + "logps/chosen": -1.541736364364624, + "logps/rejected": -1.8506510257720947, + "loss": 0.6861, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.541736364364624, + "rewards/margins": 0.30891457200050354, + "rewards/rejected": -1.8506510257720947, + "sft_loss": 1.530510663986206, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 3.4405670225413147, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": -0.06787233799695969, + "logits/rejected": 0.027986615896224976, + "logps/chosen": -1.4952280521392822, + "logps/rejected": -1.7443792819976807, + "loss": 0.6854, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4952280521392822, + "rewards/margins": 0.24915120005607605, + "rewards/rejected": -1.7443792819976807, + "sft_loss": 1.4719597101211548, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 3.0786013057975254, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": -0.13065661489963531, + "logits/rejected": 0.029547732323408127, + "logps/chosen": -1.6301422119140625, + "logps/rejected": -1.8177034854888916, + "loss": 0.6997, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6301422119140625, + "rewards/margins": 0.1875612437725067, + "rewards/rejected": -1.8177034854888916, + "sft_loss": 1.5304896831512451, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 2.5010116709016854, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": -0.06926265358924866, + "logits/rejected": 0.11330119520425797, + "logps/chosen": -1.645268440246582, + "logps/rejected": -1.8578002452850342, + "loss": 0.6989, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.645268440246582, + "rewards/margins": 0.21253187954425812, + "rewards/rejected": -1.8578002452850342, + "sft_loss": 1.623335599899292, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 4.336032649046367, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": -0.1750633716583252, + "logits/rejected": -0.03857022523880005, + "logps/chosen": -1.452271580696106, + "logps/rejected": -1.8203308582305908, + "loss": 0.6765, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.452271580696106, + "rewards/margins": 0.3680591583251953, + "rewards/rejected": -1.8203308582305908, + "sft_loss": 1.4848424196243286, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 6.870768364380895, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": -0.21457481384277344, + "logits/rejected": -0.05655393749475479, + "logps/chosen": -1.5485544204711914, + "logps/rejected": -1.774266004562378, + "loss": 0.6866, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5485544204711914, + "rewards/margins": 0.225711852312088, + "rewards/rejected": -1.774266004562378, + "sft_loss": 1.5377953052520752, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 2.839351971144575, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": -0.1863122284412384, + "logits/rejected": -0.0894741415977478, + "logps/chosen": -1.5549644231796265, + "logps/rejected": -1.7339067459106445, + "loss": 0.6869, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5549644231796265, + "rewards/margins": 0.17894235253334045, + "rewards/rejected": -1.7339067459106445, + "sft_loss": 1.5759687423706055, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 4.456809773344875, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": -0.1679374873638153, + "logits/rejected": -0.0004952967283315957, + "logps/chosen": -1.6225173473358154, + "logps/rejected": -1.7671568393707275, + "loss": 0.6921, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.6225173473358154, + "rewards/margins": 0.14463947713375092, + "rewards/rejected": -1.7671568393707275, + "sft_loss": 1.6077091693878174, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 3.0879758095157652, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": -0.045103929936885834, + "logits/rejected": 0.02557518519461155, + "logps/chosen": -1.6077511310577393, + "logps/rejected": -1.9150081872940063, + "loss": 0.6895, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6077511310577393, + "rewards/margins": 0.3072572350502014, + "rewards/rejected": -1.9150081872940063, + "sft_loss": 1.5629363059997559, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 8.798708800289246, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": -0.12965276837348938, + "logits/rejected": 0.027466658502817154, + "logps/chosen": -1.5537043809890747, + "logps/rejected": -1.8158077001571655, + "loss": 0.6955, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5537043809890747, + "rewards/margins": 0.26210322976112366, + "rewards/rejected": -1.8158077001571655, + "sft_loss": 1.5344383716583252, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 1.8930806235904913, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": -0.09079675376415253, + "logits/rejected": 0.032065801322460175, + "logps/chosen": -1.5689537525177002, + "logps/rejected": -1.8230974674224854, + "loss": 0.6923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5689537525177002, + "rewards/margins": 0.2541435956954956, + "rewards/rejected": -1.8230974674224854, + "sft_loss": 1.5655837059020996, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 1.8032216917676425, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": -0.17655274271965027, + "logits/rejected": -0.1094353199005127, + "logps/chosen": -1.5864803791046143, + "logps/rejected": -1.8785566091537476, + "loss": 0.6962, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5864803791046143, + "rewards/margins": 0.2920762896537781, + "rewards/rejected": -1.8785566091537476, + "sft_loss": 1.5739283561706543, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 2.762762201186071, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": -0.1467718631029129, + "logits/rejected": -0.05782966688275337, + "logps/chosen": -1.569352388381958, + "logps/rejected": -1.7833820581436157, + "loss": 0.691, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.569352388381958, + "rewards/margins": 0.21402959525585175, + "rewards/rejected": -1.7833820581436157, + "sft_loss": 1.5140148401260376, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 3.989204115760969, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": -0.06442215293645859, + "logits/rejected": 0.05450627952814102, + "logps/chosen": -1.5156924724578857, + "logps/rejected": -1.930771827697754, + "loss": 0.6583, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5156924724578857, + "rewards/margins": 0.41507917642593384, + "rewards/rejected": -1.930771827697754, + "sft_loss": 1.4957122802734375, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 1.98743781290134, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": -0.1774439513683319, + "logits/rejected": -0.01137492060661316, + "logps/chosen": -1.548862099647522, + "logps/rejected": -1.8399940729141235, + "loss": 0.6824, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.548862099647522, + "rewards/margins": 0.2911320626735687, + "rewards/rejected": -1.8399940729141235, + "sft_loss": 1.480064034461975, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 3.816846659534653, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": -0.24712248146533966, + "logits/rejected": -0.10505137592554092, + "logps/chosen": -1.4992997646331787, + "logps/rejected": -1.773298978805542, + "loss": 0.6846, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4992997646331787, + "rewards/margins": 0.27399933338165283, + "rewards/rejected": -1.773298978805542, + "sft_loss": 1.51784348487854, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 4.540593941105179, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": -0.11738238483667374, + "logits/rejected": 0.006606881506741047, + "logps/chosen": -1.467149019241333, + "logps/rejected": -1.7952121496200562, + "loss": 0.6702, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.467149019241333, + "rewards/margins": 0.32806330919265747, + "rewards/rejected": -1.7952121496200562, + "sft_loss": 1.474399447441101, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 7.143739814074999, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": -0.16249966621398926, + "logits/rejected": -0.037639666348695755, + "logps/chosen": -1.5038591623306274, + "logps/rejected": -1.7525460720062256, + "loss": 0.69, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5038591623306274, + "rewards/margins": 0.24868711829185486, + "rewards/rejected": -1.7525460720062256, + "sft_loss": 1.490281343460083, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 1.8637758614434734, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": -0.18016555905342102, + "logits/rejected": -0.002284090965986252, + "logps/chosen": -1.4759074449539185, + "logps/rejected": -1.8262622356414795, + "loss": 0.6782, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4759074449539185, + "rewards/margins": 0.3503546118736267, + "rewards/rejected": -1.8262622356414795, + "sft_loss": 1.491576075553894, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 3.489424716877646, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": -0.12133590877056122, + "logits/rejected": -0.039609938859939575, + "logps/chosen": -1.502015471458435, + "logps/rejected": -1.7361009120941162, + "loss": 0.6916, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.502015471458435, + "rewards/margins": 0.23408547043800354, + "rewards/rejected": -1.7361009120941162, + "sft_loss": 1.4773706197738647, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.2000727504491806, + "eval_logits/rejected": 0.2874969244003296, + "eval_logps/chosen": -1.5414361953735352, + "eval_logps/rejected": -1.8469265699386597, + "eval_loss": 0.6821562647819519, + "eval_rewards/accuracies": 0.607566773891449, + "eval_rewards/chosen": -1.5414361953735352, + "eval_rewards/margins": 0.3054904639720917, + "eval_rewards/rejected": -1.8469265699386597, + "eval_runtime": 43.0492, + "eval_samples_per_second": 31.243, + "eval_sft_loss": 1.5282062292099, + "eval_steps_per_second": 7.828, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 1.475474768839584, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": -0.1691408008337021, + "logits/rejected": -0.016671936959028244, + "logps/chosen": -1.5347111225128174, + "logps/rejected": -1.9066162109375, + "loss": 0.6851, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5347111225128174, + "rewards/margins": 0.3719049096107483, + "rewards/rejected": -1.9066162109375, + "sft_loss": 1.5340166091918945, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 3.0932573552854667, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": -0.0883767232298851, + "logits/rejected": -0.014658985659480095, + "logps/chosen": -1.4988377094268799, + "logps/rejected": -1.7675899267196655, + "loss": 0.6771, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4988377094268799, + "rewards/margins": 0.2687521278858185, + "rewards/rejected": -1.7675899267196655, + "sft_loss": 1.4959461688995361, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 3.792863765233088, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": -0.07489411532878876, + "logits/rejected": 0.04157022386789322, + "logps/chosen": -1.438208818435669, + "logps/rejected": -1.8774745464324951, + "loss": 0.6486, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.438208818435669, + "rewards/margins": 0.4392658770084381, + "rewards/rejected": -1.8774745464324951, + "sft_loss": 1.5085432529449463, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 3.5907793135097483, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": -0.06854422390460968, + "logits/rejected": -0.012159738689661026, + "logps/chosen": -1.5659635066986084, + "logps/rejected": -1.865256905555725, + "loss": 0.695, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5659635066986084, + "rewards/margins": 0.2992933392524719, + "rewards/rejected": -1.865256905555725, + "sft_loss": 1.5442118644714355, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 2.69942669661372, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": -0.11319144070148468, + "logits/rejected": 0.010987621732056141, + "logps/chosen": -1.4739656448364258, + "logps/rejected": -1.768318772315979, + "loss": 0.6781, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4739656448364258, + "rewards/margins": 0.29435327649116516, + "rewards/rejected": -1.768318772315979, + "sft_loss": 1.5512921810150146, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 3.602350790071803, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": -0.023851171135902405, + "logits/rejected": 0.13682684302330017, + "logps/chosen": -1.5443211793899536, + "logps/rejected": -1.7456066608428955, + "loss": 0.6904, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.5443211793899536, + "rewards/margins": 0.20128539204597473, + "rewards/rejected": -1.7456066608428955, + "sft_loss": 1.4974839687347412, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 8.314425129967058, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": -0.06683783233165741, + "logits/rejected": 0.05744323879480362, + "logps/chosen": -1.4640414714813232, + "logps/rejected": -1.708734154701233, + "loss": 0.6802, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4640414714813232, + "rewards/margins": 0.24469268321990967, + "rewards/rejected": -1.708734154701233, + "sft_loss": 1.5117323398590088, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 5.804501641676646, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": -0.08640140295028687, + "logits/rejected": 0.07182900607585907, + "logps/chosen": -1.510978102684021, + "logps/rejected": -1.8227230310440063, + "loss": 0.696, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.510978102684021, + "rewards/margins": 0.3117448687553406, + "rewards/rejected": -1.8227230310440063, + "sft_loss": 1.5465790033340454, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 1.7114648252088038, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": -0.028147101402282715, + "logits/rejected": 0.03421204164624214, + "logps/chosen": -1.5054690837860107, + "logps/rejected": -1.6977351903915405, + "loss": 0.6932, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5054690837860107, + "rewards/margins": 0.19226618111133575, + "rewards/rejected": -1.6977351903915405, + "sft_loss": 1.5626037120819092, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 5.095358497485958, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": -0.07386619597673416, + "logits/rejected": -0.007664171047508717, + "logps/chosen": -1.4631752967834473, + "logps/rejected": -1.7233270406723022, + "loss": 0.678, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4631752967834473, + "rewards/margins": 0.2601519227027893, + "rewards/rejected": -1.7233270406723022, + "sft_loss": 1.4765675067901611, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 2.177709752758681, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": -0.10711731761693954, + "logits/rejected": 0.027393508702516556, + "logps/chosen": -1.527491569519043, + "logps/rejected": -1.841475248336792, + "loss": 0.715, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.527491569519043, + "rewards/margins": 0.31398382782936096, + "rewards/rejected": -1.841475248336792, + "sft_loss": 1.520607829093933, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 1.599740709379549, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": -0.1450614035129547, + "logits/rejected": 0.01107205729931593, + "logps/chosen": -1.5596612691879272, + "logps/rejected": -1.9219337701797485, + "loss": 0.6953, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5596612691879272, + "rewards/margins": 0.3622724413871765, + "rewards/rejected": -1.9219337701797485, + "sft_loss": 1.571467399597168, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 4.7574966687692966, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": -0.04354889690876007, + "logits/rejected": 0.05113743618130684, + "logps/chosen": -1.4938322305679321, + "logps/rejected": -1.8625303506851196, + "loss": 0.6779, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4938322305679321, + "rewards/margins": 0.36869820952415466, + "rewards/rejected": -1.8625303506851196, + "sft_loss": 1.4814262390136719, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 2.319826048394682, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": -0.16301113367080688, + "logits/rejected": 0.08414186537265778, + "logps/chosen": -1.5025361776351929, + "logps/rejected": -1.8213615417480469, + "loss": 0.6724, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5025361776351929, + "rewards/margins": 0.31882524490356445, + "rewards/rejected": -1.8213615417480469, + "sft_loss": 1.561790108680725, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 2.5724445594754957, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": -0.02423214539885521, + "logits/rejected": 0.05520091578364372, + "logps/chosen": -1.5861196517944336, + "logps/rejected": -1.8330475091934204, + "loss": 0.7036, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5861196517944336, + "rewards/margins": 0.24692782759666443, + "rewards/rejected": -1.8330475091934204, + "sft_loss": 1.5468577146530151, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 1.9383173865476142, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": -0.010085579939186573, + "logits/rejected": 0.16605985164642334, + "logps/chosen": -1.6291682720184326, + "logps/rejected": -1.936402678489685, + "loss": 0.6845, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6291682720184326, + "rewards/margins": 0.3072342574596405, + "rewards/rejected": -1.936402678489685, + "sft_loss": 1.5209325551986694, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 5.317339130080701, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": -0.10790582746267319, + "logits/rejected": -0.022575518116354942, + "logps/chosen": -1.5415394306182861, + "logps/rejected": -1.8893182277679443, + "loss": 0.6734, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5415394306182861, + "rewards/margins": 0.3477786183357239, + "rewards/rejected": -1.8893182277679443, + "sft_loss": 1.5552213191986084, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 3.0959199433553253, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": -0.20399203896522522, + "logits/rejected": -0.07375577837228775, + "logps/chosen": -1.563641905784607, + "logps/rejected": -1.8401918411254883, + "loss": 0.678, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.563641905784607, + "rewards/margins": 0.2765499949455261, + "rewards/rejected": -1.8401918411254883, + "sft_loss": 1.5999377965927124, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 2.5676282566774216, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": 0.009197077713906765, + "logits/rejected": 0.12456144392490387, + "logps/chosen": -1.4496952295303345, + "logps/rejected": -1.7056611776351929, + "loss": 0.6805, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4496952295303345, + "rewards/margins": 0.2559662461280823, + "rewards/rejected": -1.7056611776351929, + "sft_loss": 1.4831942319869995, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 2.751949084255799, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": -0.12211020290851593, + "logits/rejected": -0.04600541293621063, + "logps/chosen": -1.4672808647155762, + "logps/rejected": -1.8262830972671509, + "loss": 0.68, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4672808647155762, + "rewards/margins": 0.35900211334228516, + "rewards/rejected": -1.8262830972671509, + "sft_loss": 1.4929606914520264, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 3.6068251039747845, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": -0.025395523756742477, + "logits/rejected": 0.16361042857170105, + "logps/chosen": -1.5420572757720947, + "logps/rejected": -1.9077465534210205, + "loss": 0.6829, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5420572757720947, + "rewards/margins": 0.36568912863731384, + "rewards/rejected": -1.9077465534210205, + "sft_loss": 1.5835001468658447, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 2.4716053388381574, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": -0.07407438009977341, + "logits/rejected": 0.02558054029941559, + "logps/chosen": -1.5568764209747314, + "logps/rejected": -1.822037696838379, + "loss": 0.6959, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5568764209747314, + "rewards/margins": 0.2651612162590027, + "rewards/rejected": -1.822037696838379, + "sft_loss": 1.5309042930603027, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 2.6328372724664586, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": -0.0730830505490303, + "logits/rejected": 0.10969231277704239, + "logps/chosen": -1.48826003074646, + "logps/rejected": -1.7999156713485718, + "loss": 0.678, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.48826003074646, + "rewards/margins": 0.3116556406021118, + "rewards/rejected": -1.7999156713485718, + "sft_loss": 1.4933485984802246, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 4.032568675460934, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": 0.06822212785482407, + "logits/rejected": 0.20987817645072937, + "logps/chosen": -1.4264925718307495, + "logps/rejected": -1.7993030548095703, + "loss": 0.6783, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4264925718307495, + "rewards/margins": 0.372810423374176, + "rewards/rejected": -1.7993030548095703, + "sft_loss": 1.4660489559173584, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 2.776730542659782, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": -0.09571783244609833, + "logits/rejected": 0.06038772314786911, + "logps/chosen": -1.5659586191177368, + "logps/rejected": -1.7870457172393799, + "loss": 0.6844, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5659586191177368, + "rewards/margins": 0.22108721733093262, + "rewards/rejected": -1.7870457172393799, + "sft_loss": 1.5310592651367188, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 9.725427285571993, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": -0.10902807861566544, + "logits/rejected": 0.005884545389562845, + "logps/chosen": -1.4849127531051636, + "logps/rejected": -1.9125696420669556, + "loss": 0.6933, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4849127531051636, + "rewards/margins": 0.4276568293571472, + "rewards/rejected": -1.9125696420669556, + "sft_loss": 1.4972728490829468, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 3.6272589230670826, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": -0.019013594835996628, + "logits/rejected": 0.030416369438171387, + "logps/chosen": -1.462626576423645, + "logps/rejected": -1.7875211238861084, + "loss": 0.6793, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.462626576423645, + "rewards/margins": 0.3248947858810425, + "rewards/rejected": -1.7875211238861084, + "sft_loss": 1.503143072128296, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 2.956874199257772, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": -0.04178246855735779, + "logits/rejected": 0.06810219585895538, + "logps/chosen": -1.5072715282440186, + "logps/rejected": -1.7970716953277588, + "loss": 0.6726, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5072715282440186, + "rewards/margins": 0.2898002564907074, + "rewards/rejected": -1.7970716953277588, + "sft_loss": 1.5333386659622192, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 3.264190471072544, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": -0.10239378362894058, + "logits/rejected": 0.022822603583335876, + "logps/chosen": -1.488178014755249, + "logps/rejected": -1.878474235534668, + "loss": 0.675, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.488178014755249, + "rewards/margins": 0.39029616117477417, + "rewards/rejected": -1.878474235534668, + "sft_loss": 1.497314214706421, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 3.043037021365146, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": -0.08505845814943314, + "logits/rejected": 0.07929755002260208, + "logps/chosen": -1.5903935432434082, + "logps/rejected": -1.8115625381469727, + "loss": 0.6943, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.5903935432434082, + "rewards/margins": 0.22116902470588684, + "rewards/rejected": -1.8115625381469727, + "sft_loss": 1.5833137035369873, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 1.7750056065969102, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": -0.039474982768297195, + "logits/rejected": 0.17847755551338196, + "logps/chosen": -1.6058772802352905, + "logps/rejected": -1.8520838022232056, + "loss": 0.6964, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.6058772802352905, + "rewards/margins": 0.24620631337165833, + "rewards/rejected": -1.8520838022232056, + "sft_loss": 1.644765853881836, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 3.8779332519956418, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": -0.08051494508981705, + "logits/rejected": 0.058378733694553375, + "logps/chosen": -1.557374119758606, + "logps/rejected": -1.7982490062713623, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.557374119758606, + "rewards/margins": 0.2408749759197235, + "rewards/rejected": -1.7982490062713623, + "sft_loss": 1.567808747291565, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 5.284646234885677, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": 0.021580982953310013, + "logits/rejected": 0.025302177295088768, + "logps/chosen": -1.6421302556991577, + "logps/rejected": -1.9848263263702393, + "loss": 0.6849, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6421302556991577, + "rewards/margins": 0.3426961302757263, + "rewards/rejected": -1.9848263263702393, + "sft_loss": 1.6509084701538086, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 6.072098892973339, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": -0.004926004912704229, + "logits/rejected": 0.003906804136931896, + "logps/chosen": -1.5009397268295288, + "logps/rejected": -1.800026535987854, + "loss": 0.6973, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5009397268295288, + "rewards/margins": 0.29908668994903564, + "rewards/rejected": -1.800026535987854, + "sft_loss": 1.487810730934143, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 2.0130232142621383, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": -0.13648810982704163, + "logits/rejected": 0.041145212948322296, + "logps/chosen": -1.5612967014312744, + "logps/rejected": -1.838753342628479, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5612967014312744, + "rewards/margins": 0.277456670999527, + "rewards/rejected": -1.838753342628479, + "sft_loss": 1.5941799879074097, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 10.066727538058064, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": -0.13426437973976135, + "logits/rejected": 0.055471908301115036, + "logps/chosen": -1.6321967840194702, + "logps/rejected": -1.9175608158111572, + "loss": 0.7019, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6321967840194702, + "rewards/margins": 0.28536421060562134, + "rewards/rejected": -1.9175608158111572, + "sft_loss": 1.6166508197784424, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 4.760869708002439, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": -0.025986677035689354, + "logits/rejected": 0.16338834166526794, + "logps/chosen": -1.5891669988632202, + "logps/rejected": -1.8759765625, + "loss": 0.6823, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5891669988632202, + "rewards/margins": 0.28680968284606934, + "rewards/rejected": -1.8759765625, + "sft_loss": 1.6372543573379517, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 3.9398474753155375, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": 0.008121741004288197, + "logits/rejected": 0.047742851078510284, + "logps/chosen": -1.3868433237075806, + "logps/rejected": -1.7650667428970337, + "loss": 0.6691, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3868433237075806, + "rewards/margins": 0.3782234787940979, + "rewards/rejected": -1.7650667428970337, + "sft_loss": 1.4816190004348755, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 6.487750700925487, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": 0.007622921373695135, + "logits/rejected": 0.08368368446826935, + "logps/chosen": -1.5163058042526245, + "logps/rejected": -1.7254890203475952, + "loss": 0.7072, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5163058042526245, + "rewards/margins": 0.20918314158916473, + "rewards/rejected": -1.7254890203475952, + "sft_loss": 1.573646068572998, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 2.4530057893357196, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": 0.004174326546490192, + "logits/rejected": 0.11828531324863434, + "logps/chosen": -1.5501234531402588, + "logps/rejected": -1.928701400756836, + "loss": 0.6713, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5501234531402588, + "rewards/margins": 0.3785778284072876, + "rewards/rejected": -1.928701400756836, + "sft_loss": 1.5593101978302002, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 7.3330298167234, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": -0.007203756831586361, + "logits/rejected": 0.14894136786460876, + "logps/chosen": -1.604166030883789, + "logps/rejected": -1.933411955833435, + "loss": 0.6907, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.604166030883789, + "rewards/margins": 0.32924580574035645, + "rewards/rejected": -1.933411955833435, + "sft_loss": 1.6360177993774414, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 1.9102616369453456, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": -0.05284330993890762, + "logits/rejected": 0.12526333332061768, + "logps/chosen": -1.4626752138137817, + "logps/rejected": -1.8857272863388062, + "loss": 0.6693, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4626752138137817, + "rewards/margins": 0.4230521619319916, + "rewards/rejected": -1.8857272863388062, + "sft_loss": 1.4868780374526978, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 4.945281070667856, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": -0.06405460834503174, + "logits/rejected": 0.10280958563089371, + "logps/chosen": -1.4857017993927002, + "logps/rejected": -1.8511368036270142, + "loss": 0.6692, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4857017993927002, + "rewards/margins": 0.3654349148273468, + "rewards/rejected": -1.8511368036270142, + "sft_loss": 1.5392990112304688, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 3.4180212766462117, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": -0.035744160413742065, + "logits/rejected": 0.0958310067653656, + "logps/chosen": -1.5413570404052734, + "logps/rejected": -1.9201008081436157, + "loss": 0.6724, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5413570404052734, + "rewards/margins": 0.37874364852905273, + "rewards/rejected": -1.9201008081436157, + "sft_loss": 1.590633511543274, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 3.377588865739187, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": -0.08251983672380447, + "logits/rejected": 0.03991267830133438, + "logps/chosen": -1.5043127536773682, + "logps/rejected": -1.9541893005371094, + "loss": 0.6588, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5043127536773682, + "rewards/margins": 0.44987648725509644, + "rewards/rejected": -1.9541893005371094, + "sft_loss": 1.514143705368042, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 3.632024377290369, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": -0.021055901423096657, + "logits/rejected": 0.06645959615707397, + "logps/chosen": -1.5638939142227173, + "logps/rejected": -1.8698257207870483, + "loss": 0.68, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5638939142227173, + "rewards/margins": 0.3059318959712982, + "rewards/rejected": -1.8698257207870483, + "sft_loss": 1.5990447998046875, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 2.447338930418058, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": -0.03157895803451538, + "logits/rejected": 0.0898062214255333, + "logps/chosen": -1.646627426147461, + "logps/rejected": -1.8698803186416626, + "loss": 0.6907, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.646627426147461, + "rewards/margins": 0.22325296700000763, + "rewards/rejected": -1.8698803186416626, + "sft_loss": 1.629865050315857, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 1.9963882491988056, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": 0.0005223065381869674, + "logits/rejected": 0.16107748448848724, + "logps/chosen": -1.5309419631958008, + "logps/rejected": -1.9671926498413086, + "loss": 0.6754, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5309419631958008, + "rewards/margins": 0.4362506866455078, + "rewards/rejected": -1.9671926498413086, + "sft_loss": 1.5817630290985107, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 3.1737320220160146, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": -0.06241927295923233, + "logits/rejected": 0.08840295672416687, + "logps/chosen": -1.5109721422195435, + "logps/rejected": -1.9459927082061768, + "loss": 0.6927, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5109721422195435, + "rewards/margins": 0.4350206255912781, + "rewards/rejected": -1.9459927082061768, + "sft_loss": 1.587691307067871, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 17.451231917179324, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": -0.07376951724290848, + "logits/rejected": 0.03201202303171158, + "logps/chosen": -1.6037285327911377, + "logps/rejected": -2.057170867919922, + "loss": 0.6891, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6037285327911377, + "rewards/margins": 0.4534422755241394, + "rewards/rejected": -2.057170867919922, + "sft_loss": 1.6328027248382568, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 9.148526949692766, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": -0.02030854858458042, + "logits/rejected": 0.06003781035542488, + "logps/chosen": -1.5734418630599976, + "logps/rejected": -1.9668676853179932, + "loss": 0.6757, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5734418630599976, + "rewards/margins": 0.3934256434440613, + "rewards/rejected": -1.9668676853179932, + "sft_loss": 1.5984394550323486, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 3.534136389187126, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": 0.031088626012206078, + "logits/rejected": 0.21325087547302246, + "logps/chosen": -1.666304588317871, + "logps/rejected": -1.995410680770874, + "loss": 0.6915, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.666304588317871, + "rewards/margins": 0.3291061818599701, + "rewards/rejected": -1.995410680770874, + "sft_loss": 1.626892328262329, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 3.465326478762225, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": 0.017907222732901573, + "logits/rejected": 0.14953218400478363, + "logps/chosen": -1.5562763214111328, + "logps/rejected": -2.0238089561462402, + "loss": 0.6665, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5562763214111328, + "rewards/margins": 0.4675326943397522, + "rewards/rejected": -2.0238089561462402, + "sft_loss": 1.6350345611572266, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 2.1403087762463056, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": -0.11527495086193085, + "logits/rejected": 0.11920982599258423, + "logps/chosen": -1.592320203781128, + "logps/rejected": -1.9931793212890625, + "loss": 0.6708, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.592320203781128, + "rewards/margins": 0.4008590579032898, + "rewards/rejected": -1.9931793212890625, + "sft_loss": 1.589946985244751, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 2.840207538309126, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": -0.07293342798948288, + "logits/rejected": 0.04848995804786682, + "logps/chosen": -1.5443998575210571, + "logps/rejected": -1.9821274280548096, + "loss": 0.6806, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5443998575210571, + "rewards/margins": 0.4377274513244629, + "rewards/rejected": -1.9821274280548096, + "sft_loss": 1.5755616426467896, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 3.4426385094229985, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": -0.14516997337341309, + "logits/rejected": -0.037937816232442856, + "logps/chosen": -1.703258752822876, + "logps/rejected": -1.9618574380874634, + "loss": 0.693, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.703258752822876, + "rewards/margins": 0.2585986852645874, + "rewards/rejected": -1.9618574380874634, + "sft_loss": 1.6879968643188477, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 3.3916420368703224, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": -0.06839573383331299, + "logits/rejected": 0.09993009269237518, + "logps/chosen": -1.6345523595809937, + "logps/rejected": -1.83871328830719, + "loss": 0.704, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6345523595809937, + "rewards/margins": 0.20416107773780823, + "rewards/rejected": -1.83871328830719, + "sft_loss": 1.6142337322235107, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 2.1757545678128323, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": -0.07336665689945221, + "logits/rejected": 0.030426010489463806, + "logps/chosen": -1.6222292184829712, + "logps/rejected": -1.806684136390686, + "loss": 0.6954, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6222292184829712, + "rewards/margins": 0.18445488810539246, + "rewards/rejected": -1.806684136390686, + "sft_loss": 1.560254693031311, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 3.103314219915245, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": -0.1993558704853058, + "logits/rejected": -0.06145832687616348, + "logps/chosen": -1.6323795318603516, + "logps/rejected": -2.0493452548980713, + "loss": 0.6879, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6323795318603516, + "rewards/margins": 0.4169657826423645, + "rewards/rejected": -2.0493452548980713, + "sft_loss": 1.6026633977890015, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 5.244140816418331, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": -0.0671815425157547, + "logits/rejected": 0.04157133400440216, + "logps/chosen": -1.5560808181762695, + "logps/rejected": -2.0572400093078613, + "loss": 0.6703, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5560808181762695, + "rewards/margins": 0.5011593103408813, + "rewards/rejected": -2.0572400093078613, + "sft_loss": 1.509490966796875, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 3.360600229736125, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": -0.13002343475818634, + "logits/rejected": -0.04660915583372116, + "logps/chosen": -1.6290388107299805, + "logps/rejected": -1.9336614608764648, + "loss": 0.6895, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.6290388107299805, + "rewards/margins": 0.304622620344162, + "rewards/rejected": -1.9336614608764648, + "sft_loss": 1.5408756732940674, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 4.119757221087749, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": -0.0894903913140297, + "logits/rejected": 0.03421594947576523, + "logps/chosen": -1.5715105533599854, + "logps/rejected": -1.8307205438613892, + "loss": 0.6916, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5715105533599854, + "rewards/margins": 0.2592098116874695, + "rewards/rejected": -1.8307205438613892, + "sft_loss": 1.5369572639465332, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 9.965157899426226, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": -0.00039502381696365774, + "logits/rejected": 0.12465800344944, + "logps/chosen": -1.50301194190979, + "logps/rejected": -1.9628212451934814, + "loss": 0.6709, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.50301194190979, + "rewards/margins": 0.45980945229530334, + "rewards/rejected": -1.9628212451934814, + "sft_loss": 1.4338786602020264, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 3.5833047947801533, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": -0.17405982315540314, + "logits/rejected": -0.06815730035305023, + "logps/chosen": -1.4872372150421143, + "logps/rejected": -1.8442754745483398, + "loss": 0.6695, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4872372150421143, + "rewards/margins": 0.3570381700992584, + "rewards/rejected": -1.8442754745483398, + "sft_loss": 1.5191495418548584, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 3.9352368286368864, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": -0.1691105216741562, + "logits/rejected": 0.017339913174510002, + "logps/chosen": -1.5508992671966553, + "logps/rejected": -1.9966919422149658, + "loss": 0.6686, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5508992671966553, + "rewards/margins": 0.4457928240299225, + "rewards/rejected": -1.9966919422149658, + "sft_loss": 1.5330065488815308, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 2.761349442479446, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": -0.0050021009519696236, + "logits/rejected": 0.06278308480978012, + "logps/chosen": -1.5469844341278076, + "logps/rejected": -1.8885329961776733, + "loss": 0.6849, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5469844341278076, + "rewards/margins": 0.34154844284057617, + "rewards/rejected": -1.8885329961776733, + "sft_loss": 1.4816900491714478, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 3.9949752862983052, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": -0.06258795410394669, + "logits/rejected": 0.09723483771085739, + "logps/chosen": -1.6089904308319092, + "logps/rejected": -1.987370252609253, + "loss": 0.6762, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6089904308319092, + "rewards/margins": 0.37837955355644226, + "rewards/rejected": -1.987370252609253, + "sft_loss": 1.4937976598739624, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 6.5844448907344395, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": -0.07619436085224152, + "logits/rejected": -0.03522288054227829, + "logps/chosen": -1.5906662940979004, + "logps/rejected": -1.7915910482406616, + "loss": 0.6957, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5906662940979004, + "rewards/margins": 0.20092466473579407, + "rewards/rejected": -1.7915910482406616, + "sft_loss": 1.6237761974334717, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 5.35326679908279, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": -0.2236470729112625, + "logits/rejected": -0.10575082153081894, + "logps/chosen": -1.545088291168213, + "logps/rejected": -1.9672361612319946, + "loss": 0.6871, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.545088291168213, + "rewards/margins": 0.42214781045913696, + "rewards/rejected": -1.9672361612319946, + "sft_loss": 1.5959243774414062, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 4.0563338921443535, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": -0.2021702527999878, + "logits/rejected": -0.0658130794763565, + "logps/chosen": -1.4679720401763916, + "logps/rejected": -1.866217851638794, + "loss": 0.672, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4679720401763916, + "rewards/margins": 0.39824575185775757, + "rewards/rejected": -1.866217851638794, + "sft_loss": 1.447820782661438, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 3.0364513987292607, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": -0.05491810292005539, + "logits/rejected": 0.03884059563279152, + "logps/chosen": -1.555903673171997, + "logps/rejected": -1.8066275119781494, + "loss": 0.7026, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.555903673171997, + "rewards/margins": 0.25072377920150757, + "rewards/rejected": -1.8066275119781494, + "sft_loss": 1.578338384628296, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 3.3942088776852763, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": -0.0013744793832302094, + "logits/rejected": 0.10036492347717285, + "logps/chosen": -1.4689775705337524, + "logps/rejected": -1.851769208908081, + "loss": 0.6687, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4689775705337524, + "rewards/margins": 0.3827916979789734, + "rewards/rejected": -1.851769208908081, + "sft_loss": 1.533111333847046, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 3.560423950376706, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": -0.06372959911823273, + "logits/rejected": 0.011961914598941803, + "logps/chosen": -1.4778475761413574, + "logps/rejected": -1.7782748937606812, + "loss": 0.6678, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4778475761413574, + "rewards/margins": 0.3004273772239685, + "rewards/rejected": -1.7782748937606812, + "sft_loss": 1.4882819652557373, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 3.47458523618669, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": -0.10142596065998077, + "logits/rejected": 0.052701033651828766, + "logps/chosen": -1.748448133468628, + "logps/rejected": -2.0238280296325684, + "loss": 0.6941, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.748448133468628, + "rewards/margins": 0.2753797173500061, + "rewards/rejected": -2.0238280296325684, + "sft_loss": 1.6605145931243896, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 2.4810129789735975, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": -0.07748343050479889, + "logits/rejected": 0.10826456546783447, + "logps/chosen": -1.5186102390289307, + "logps/rejected": -1.8455289602279663, + "loss": 0.6808, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5186102390289307, + "rewards/margins": 0.32691866159439087, + "rewards/rejected": -1.8455289602279663, + "sft_loss": 1.5482877492904663, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 1.5032379010693973, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": -0.10568971931934357, + "logits/rejected": 0.030995279550552368, + "logps/chosen": -1.6142991781234741, + "logps/rejected": -1.9687931537628174, + "loss": 0.6828, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6142991781234741, + "rewards/margins": 0.3544939458370209, + "rewards/rejected": -1.9687931537628174, + "sft_loss": 1.591963529586792, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 2.1484957476697897, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": -0.042309779673814774, + "logits/rejected": 0.09029584378004074, + "logps/chosen": -1.5777010917663574, + "logps/rejected": -2.0055551528930664, + "loss": 0.6825, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5777010917663574, + "rewards/margins": 0.4278542399406433, + "rewards/rejected": -2.0055551528930664, + "sft_loss": 1.558754324913025, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 3.3425717204320717, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": -0.12501846253871918, + "logits/rejected": 0.015322742983698845, + "logps/chosen": -1.6131792068481445, + "logps/rejected": -2.024101734161377, + "loss": 0.6636, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6131792068481445, + "rewards/margins": 0.4109226167201996, + "rewards/rejected": -2.024101734161377, + "sft_loss": 1.6081043481826782, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 5.308022190457999, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": -0.07765809446573257, + "logits/rejected": 0.03634956479072571, + "logps/chosen": -1.4784530401229858, + "logps/rejected": -1.9112812280654907, + "loss": 0.6714, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4784530401229858, + "rewards/margins": 0.43282780051231384, + "rewards/rejected": -1.9112812280654907, + "sft_loss": 1.542213797569275, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 8.271258182294176, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": -0.13535340130329132, + "logits/rejected": -0.06341275572776794, + "logps/chosen": -1.5599576234817505, + "logps/rejected": -1.8556772470474243, + "loss": 0.6757, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5599576234817505, + "rewards/margins": 0.29571956396102905, + "rewards/rejected": -1.8556772470474243, + "sft_loss": 1.5496803522109985, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.20428466796875, + "eval_logits/rejected": 0.2921755313873291, + "eval_logps/chosen": -1.5600173473358154, + "eval_logps/rejected": -1.9539307355880737, + "eval_loss": 0.6770769357681274, + "eval_rewards/accuracies": 0.6216617226600647, + "eval_rewards/chosen": -1.5600173473358154, + "eval_rewards/margins": 0.39391323924064636, + "eval_rewards/rejected": -1.9539307355880737, + "eval_runtime": 43.3031, + "eval_samples_per_second": 31.06, + "eval_sft_loss": 1.5573872327804565, + "eval_steps_per_second": 7.782, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 7.702822915063556, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": -0.2256489247083664, + "logits/rejected": -0.05405454710125923, + "logps/chosen": -1.5026648044586182, + "logps/rejected": -1.8729171752929688, + "loss": 0.6732, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5026648044586182, + "rewards/margins": 0.37025216221809387, + "rewards/rejected": -1.8729171752929688, + "sft_loss": 1.5461900234222412, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 6.35721552375853, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": -0.12963464856147766, + "logits/rejected": 0.05106347054243088, + "logps/chosen": -1.5107980966567993, + "logps/rejected": -1.8580278158187866, + "loss": 0.6868, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5107980966567993, + "rewards/margins": 0.3472297489643097, + "rewards/rejected": -1.8580278158187866, + "sft_loss": 1.5099369287490845, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 1.866771016177349, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": -0.20429301261901855, + "logits/rejected": -0.09392275661230087, + "logps/chosen": -1.5623438358306885, + "logps/rejected": -1.8264286518096924, + "loss": 0.6989, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5623438358306885, + "rewards/margins": 0.26408472657203674, + "rewards/rejected": -1.8264286518096924, + "sft_loss": 1.5577224493026733, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 6.372271471250699, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": 0.006414422299712896, + "logits/rejected": 0.10992386192083359, + "logps/chosen": -1.464158058166504, + "logps/rejected": -1.7496296167373657, + "loss": 0.6895, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.464158058166504, + "rewards/margins": 0.28547143936157227, + "rewards/rejected": -1.7496296167373657, + "sft_loss": 1.4565626382827759, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 3.3112098860934522, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": -0.22578708827495575, + "logits/rejected": -0.021116072311997414, + "logps/chosen": -1.5680522918701172, + "logps/rejected": -1.8750545978546143, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5680522918701172, + "rewards/margins": 0.3070022463798523, + "rewards/rejected": -1.8750545978546143, + "sft_loss": 1.5702797174453735, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 5.484447391905437, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": -0.11542798578739166, + "logits/rejected": -0.012674192897975445, + "logps/chosen": -1.5529954433441162, + "logps/rejected": -1.849373459815979, + "loss": 0.6977, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5529954433441162, + "rewards/margins": 0.2963777482509613, + "rewards/rejected": -1.849373459815979, + "sft_loss": 1.498388648033142, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 2.6988517762160065, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": -0.08795280009508133, + "logits/rejected": 0.031953997910022736, + "logps/chosen": -1.6094152927398682, + "logps/rejected": -1.8868507146835327, + "loss": 0.6931, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6094152927398682, + "rewards/margins": 0.2774355411529541, + "rewards/rejected": -1.8868507146835327, + "sft_loss": 1.6308940649032593, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 3.1410372474228634, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": -0.23440854251384735, + "logits/rejected": -0.10367898643016815, + "logps/chosen": -1.6422284841537476, + "logps/rejected": -1.9875221252441406, + "loss": 0.6862, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6422284841537476, + "rewards/margins": 0.34529370069503784, + "rewards/rejected": -1.9875221252441406, + "sft_loss": 1.636867880821228, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 3.3956525710849648, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": -0.14539551734924316, + "logits/rejected": 0.03338898345828056, + "logps/chosen": -1.6730928421020508, + "logps/rejected": -1.9549732208251953, + "loss": 0.6896, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.6730928421020508, + "rewards/margins": 0.28188052773475647, + "rewards/rejected": -1.9549732208251953, + "sft_loss": 1.6566932201385498, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 7.15480911411517, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": -0.1386650800704956, + "logits/rejected": 0.05835481360554695, + "logps/chosen": -1.7480783462524414, + "logps/rejected": -2.050711154937744, + "loss": 0.6999, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7480783462524414, + "rewards/margins": 0.3026331067085266, + "rewards/rejected": -2.050711154937744, + "sft_loss": 1.7057584524154663, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 5.122868397150163, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": -0.13555511832237244, + "logits/rejected": -0.009329566732048988, + "logps/chosen": -1.6446784734725952, + "logps/rejected": -2.0092029571533203, + "loss": 0.6889, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6446784734725952, + "rewards/margins": 0.3645244836807251, + "rewards/rejected": -2.0092029571533203, + "sft_loss": 1.6743896007537842, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 4.072451407401296, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": -0.19094884395599365, + "logits/rejected": -0.06241123750805855, + "logps/chosen": -1.5872955322265625, + "logps/rejected": -2.1329524517059326, + "loss": 0.6851, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5872955322265625, + "rewards/margins": 0.5456571578979492, + "rewards/rejected": -2.1329524517059326, + "sft_loss": 1.610181450843811, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 3.5520563975970263, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": -0.08579371869564056, + "logits/rejected": 0.05589524656534195, + "logps/chosen": -1.5661556720733643, + "logps/rejected": -2.0218825340270996, + "loss": 0.6791, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5661556720733643, + "rewards/margins": 0.45572710037231445, + "rewards/rejected": -2.0218825340270996, + "sft_loss": 1.5690847635269165, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 2.5863182066589667, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": -0.1772998571395874, + "logits/rejected": -0.07537718117237091, + "logps/chosen": -1.5737218856811523, + "logps/rejected": -2.018108606338501, + "loss": 0.674, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5737218856811523, + "rewards/margins": 0.4443867802619934, + "rewards/rejected": -2.018108606338501, + "sft_loss": 1.608443021774292, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 1.7393209176572886, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": -0.17091652750968933, + "logits/rejected": -0.06616854667663574, + "logps/chosen": -1.5976166725158691, + "logps/rejected": -1.964280366897583, + "loss": 0.6779, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5976166725158691, + "rewards/margins": 0.3666638135910034, + "rewards/rejected": -1.964280366897583, + "sft_loss": 1.543792486190796, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 8.241710987478688, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": -0.1482701599597931, + "logits/rejected": 0.001603972166776657, + "logps/chosen": -1.6374969482421875, + "logps/rejected": -1.916908621788025, + "loss": 0.695, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6374969482421875, + "rewards/margins": 0.2794113755226135, + "rewards/rejected": -1.916908621788025, + "sft_loss": 1.6501718759536743, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 7.128832532182297, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": -0.12469017505645752, + "logits/rejected": -0.0011989653576165438, + "logps/chosen": -1.5048984289169312, + "logps/rejected": -1.7811921834945679, + "loss": 0.674, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5048984289169312, + "rewards/margins": 0.2762937545776367, + "rewards/rejected": -1.7811921834945679, + "sft_loss": 1.4971561431884766, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 2.5701358301942276, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": -0.2623763978481293, + "logits/rejected": -0.11838710308074951, + "logps/chosen": -1.5606590509414673, + "logps/rejected": -1.7689409255981445, + "loss": 0.6898, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5606590509414673, + "rewards/margins": 0.20828208327293396, + "rewards/rejected": -1.7689409255981445, + "sft_loss": 1.5570552349090576, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 2.995443118039142, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": -0.15557818114757538, + "logits/rejected": -0.13853155076503754, + "logps/chosen": -1.557213544845581, + "logps/rejected": -1.975672721862793, + "loss": 0.6714, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.557213544845581, + "rewards/margins": 0.4184592366218567, + "rewards/rejected": -1.975672721862793, + "sft_loss": 1.5470623970031738, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 3.1857631474864787, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": -0.15887439250946045, + "logits/rejected": 0.08108071982860565, + "logps/chosen": -1.5032182931900024, + "logps/rejected": -1.9598249197006226, + "loss": 0.6751, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5032182931900024, + "rewards/margins": 0.4566067159175873, + "rewards/rejected": -1.9598249197006226, + "sft_loss": 1.5519423484802246, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 2.9846126418031127, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": -0.020437534898519516, + "logits/rejected": 0.03360765427350998, + "logps/chosen": -1.5451034307479858, + "logps/rejected": -1.8681617975234985, + "loss": 0.6791, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5451034307479858, + "rewards/margins": 0.323058158159256, + "rewards/rejected": -1.8681617975234985, + "sft_loss": 1.5251609086990356, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 2.7939468269777143, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": -0.04316322132945061, + "logits/rejected": 0.040403760969638824, + "logps/chosen": -1.49526047706604, + "logps/rejected": -1.839127779006958, + "loss": 0.6796, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.49526047706604, + "rewards/margins": 0.34386715292930603, + "rewards/rejected": -1.839127779006958, + "sft_loss": 1.5057623386383057, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 2.012780081071824, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": -0.12500867247581482, + "logits/rejected": 0.04848942905664444, + "logps/chosen": -1.4530186653137207, + "logps/rejected": -1.8870136737823486, + "loss": 0.6543, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4530186653137207, + "rewards/margins": 0.43399494886398315, + "rewards/rejected": -1.8870136737823486, + "sft_loss": 1.4522100687026978, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 2.3132974309008896, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": -0.2322191447019577, + "logits/rejected": -0.04070020467042923, + "logps/chosen": -1.467179536819458, + "logps/rejected": -1.8701632022857666, + "loss": 0.6749, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.467179536819458, + "rewards/margins": 0.4029836058616638, + "rewards/rejected": -1.8701632022857666, + "sft_loss": 1.488556146621704, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 5.872677785280684, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": -0.08441654592752457, + "logits/rejected": 0.020516756922006607, + "logps/chosen": -1.551811933517456, + "logps/rejected": -1.8700692653656006, + "loss": 0.6895, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.551811933517456, + "rewards/margins": 0.31825751066207886, + "rewards/rejected": -1.8700692653656006, + "sft_loss": 1.5086562633514404, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 2.6113187087154888, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": -0.1095929890871048, + "logits/rejected": 0.0020243481267243624, + "logps/chosen": -1.5338222980499268, + "logps/rejected": -1.770532250404358, + "loss": 0.6847, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5338222980499268, + "rewards/margins": 0.23670975863933563, + "rewards/rejected": -1.770532250404358, + "sft_loss": 1.525867223739624, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 3.6022868512669715, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": -0.09152556955814362, + "logits/rejected": -0.01481366716325283, + "logps/chosen": -1.6264232397079468, + "logps/rejected": -1.9128234386444092, + "loss": 0.6819, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6264232397079468, + "rewards/margins": 0.2864004373550415, + "rewards/rejected": -1.9128234386444092, + "sft_loss": 1.6053024530410767, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 2.9818283371238694, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": -0.09121497720479965, + "logits/rejected": -0.053193580359220505, + "logps/chosen": -1.5617108345031738, + "logps/rejected": -1.851728081703186, + "loss": 0.6968, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5617108345031738, + "rewards/margins": 0.290017306804657, + "rewards/rejected": -1.851728081703186, + "sft_loss": 1.541918158531189, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 3.2254024286317704, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": -0.05244135856628418, + "logits/rejected": 0.13419947028160095, + "logps/chosen": -1.527730941772461, + "logps/rejected": -1.9169437885284424, + "loss": 0.6789, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.527730941772461, + "rewards/margins": 0.38921260833740234, + "rewards/rejected": -1.9169437885284424, + "sft_loss": 1.5584609508514404, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 3.184994861200645, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": -0.04931309074163437, + "logits/rejected": 0.011732319369912148, + "logps/chosen": -1.5444519519805908, + "logps/rejected": -1.713204026222229, + "loss": 0.6983, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.5444519519805908, + "rewards/margins": 0.1687517762184143, + "rewards/rejected": -1.713204026222229, + "sft_loss": 1.5547934770584106, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 2.3196954532425895, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": -0.07705724984407425, + "logits/rejected": 0.20211537182331085, + "logps/chosen": -1.6409122943878174, + "logps/rejected": -1.981188416481018, + "loss": 0.6877, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6409122943878174, + "rewards/margins": 0.34027615189552307, + "rewards/rejected": -1.981188416481018, + "sft_loss": 1.6130729913711548, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 4.604948427878151, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": -0.17080985009670258, + "logits/rejected": -0.026930373162031174, + "logps/chosen": -1.6719783544540405, + "logps/rejected": -2.09201979637146, + "loss": 0.6854, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6719783544540405, + "rewards/margins": 0.4200412631034851, + "rewards/rejected": -2.09201979637146, + "sft_loss": 1.6854654550552368, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 1.9140637251332389, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": -0.07638595253229141, + "logits/rejected": 0.06088540703058243, + "logps/chosen": -1.5613833665847778, + "logps/rejected": -1.8865737915039062, + "loss": 0.6861, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5613833665847778, + "rewards/margins": 0.32519054412841797, + "rewards/rejected": -1.8865737915039062, + "sft_loss": 1.5651633739471436, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 2.003458713392101, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": -0.06622512638568878, + "logits/rejected": 0.049443237483501434, + "logps/chosen": -1.6531356573104858, + "logps/rejected": -2.1463818550109863, + "loss": 0.6654, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6531356573104858, + "rewards/margins": 0.49324607849121094, + "rewards/rejected": -2.1463818550109863, + "sft_loss": 1.633424997329712, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 3.2176370131268883, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": -0.11674849689006805, + "logits/rejected": 0.030340248718857765, + "logps/chosen": -1.5437183380126953, + "logps/rejected": -2.0395216941833496, + "loss": 0.6638, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5437183380126953, + "rewards/margins": 0.49580326676368713, + "rewards/rejected": -2.0395216941833496, + "sft_loss": 1.5829532146453857, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 4.8124347775749605, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": -0.15775957703590393, + "logits/rejected": -0.0332581028342247, + "logps/chosen": -1.5887658596038818, + "logps/rejected": -1.8498271703720093, + "loss": 0.6951, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5887658596038818, + "rewards/margins": 0.26106134057044983, + "rewards/rejected": -1.8498271703720093, + "sft_loss": 1.5464975833892822, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 8.86609371083185, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": -0.1439005434513092, + "logits/rejected": -0.015216085128486156, + "logps/chosen": -1.4790098667144775, + "logps/rejected": -1.861185073852539, + "loss": 0.6906, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4790098667144775, + "rewards/margins": 0.382175087928772, + "rewards/rejected": -1.861185073852539, + "sft_loss": 1.5054775476455688, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 5.132404998842138, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": -0.048094429075717926, + "logits/rejected": 0.07193388044834137, + "logps/chosen": -1.5926799774169922, + "logps/rejected": -1.9508346319198608, + "loss": 0.6881, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5926799774169922, + "rewards/margins": 0.358154833316803, + "rewards/rejected": -1.9508346319198608, + "sft_loss": 1.6146783828735352, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 3.231833246170535, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": 0.00042394845513626933, + "logits/rejected": 0.09290830790996552, + "logps/chosen": -1.5559685230255127, + "logps/rejected": -1.8194761276245117, + "loss": 0.6892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5559685230255127, + "rewards/margins": 0.26350778341293335, + "rewards/rejected": -1.8194761276245117, + "sft_loss": 1.539088487625122, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 1.4119295681940902, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": -0.21388454735279083, + "logits/rejected": -0.07617992907762527, + "logps/chosen": -1.6378538608551025, + "logps/rejected": -2.004629373550415, + "loss": 0.6852, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6378538608551025, + "rewards/margins": 0.36677560210227966, + "rewards/rejected": -2.004629373550415, + "sft_loss": 1.642905831336975, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 2.6573615191238322, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": -0.11038383096456528, + "logits/rejected": 0.070514015853405, + "logps/chosen": -1.437596321105957, + "logps/rejected": -1.957358717918396, + "loss": 0.6812, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.437596321105957, + "rewards/margins": 0.5197626948356628, + "rewards/rejected": -1.957358717918396, + "sft_loss": 1.4881653785705566, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 3.3860319941892083, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": -0.16537480056285858, + "logits/rejected": 0.008626123890280724, + "logps/chosen": -1.59507155418396, + "logps/rejected": -1.9980379343032837, + "loss": 0.6697, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.59507155418396, + "rewards/margins": 0.4029662609100342, + "rewards/rejected": -1.9980379343032837, + "sft_loss": 1.6412235498428345, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 5.364636442909648, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": -0.058947961777448654, + "logits/rejected": 0.08670911192893982, + "logps/chosen": -1.609555959701538, + "logps/rejected": -1.789323091506958, + "loss": 0.6968, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.609555959701538, + "rewards/margins": 0.17976680397987366, + "rewards/rejected": -1.789323091506958, + "sft_loss": 1.5581715106964111, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 2.6038674841340193, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": -0.045800067484378815, + "logits/rejected": 0.08673261106014252, + "logps/chosen": -1.5158113241195679, + "logps/rejected": -2.089888334274292, + "loss": 0.6823, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5158113241195679, + "rewards/margins": 0.5740770697593689, + "rewards/rejected": -2.089888334274292, + "sft_loss": 1.545422911643982, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 1.9927918386030432, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": -0.007310047745704651, + "logits/rejected": 0.07895542681217194, + "logps/chosen": -1.5619844198226929, + "logps/rejected": -2.0922622680664062, + "loss": 0.6698, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5619844198226929, + "rewards/margins": 0.5302778482437134, + "rewards/rejected": -2.0922622680664062, + "sft_loss": 1.5573543310165405, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 4.490498215579793, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": -0.13646551966667175, + "logits/rejected": 0.0039533572271466255, + "logps/chosen": -1.6439111232757568, + "logps/rejected": -1.954990029335022, + "loss": 0.6815, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6439111232757568, + "rewards/margins": 0.31107890605926514, + "rewards/rejected": -1.954990029335022, + "sft_loss": 1.572951078414917, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 2.801289868801277, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": 0.009105369448661804, + "logits/rejected": 0.07025135308504105, + "logps/chosen": -1.5027073621749878, + "logps/rejected": -2.024282693862915, + "loss": 0.6839, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5027073621749878, + "rewards/margins": 0.5215753316879272, + "rewards/rejected": -2.024282693862915, + "sft_loss": 1.5148102045059204, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 3.395666019700634, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": -0.0959697738289833, + "logits/rejected": 0.015020926482975483, + "logps/chosen": -1.5443953275680542, + "logps/rejected": -1.8177766799926758, + "loss": 0.686, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5443953275680542, + "rewards/margins": 0.27338117361068726, + "rewards/rejected": -1.8177766799926758, + "sft_loss": 1.5885334014892578, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 5.468030761727132, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": -0.14049549400806427, + "logits/rejected": 0.07801493257284164, + "logps/chosen": -1.670945167541504, + "logps/rejected": -2.048828363418579, + "loss": 0.676, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.670945167541504, + "rewards/margins": 0.3778831958770752, + "rewards/rejected": -2.048828363418579, + "sft_loss": 1.6721107959747314, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 26.45492077745353, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": -0.01423375029116869, + "logits/rejected": 0.08405263721942902, + "logps/chosen": -1.73532235622406, + "logps/rejected": -1.9594646692276, + "loss": 0.6902, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.73532235622406, + "rewards/margins": 0.22414252161979675, + "rewards/rejected": -1.9594646692276, + "sft_loss": 1.6139285564422607, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 1.6370612121298826, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": -0.0711984857916832, + "logits/rejected": 0.09706473350524902, + "logps/chosen": -1.7322229146957397, + "logps/rejected": -2.187854051589966, + "loss": 0.676, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7322229146957397, + "rewards/margins": 0.4556312561035156, + "rewards/rejected": -2.187854051589966, + "sft_loss": 1.666243314743042, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 1.3578427220948361, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": -0.02941352128982544, + "logits/rejected": 0.1412985622882843, + "logps/chosen": -1.897125244140625, + "logps/rejected": -2.1788127422332764, + "loss": 0.688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.897125244140625, + "rewards/margins": 0.2816876471042633, + "rewards/rejected": -2.1788127422332764, + "sft_loss": 1.6443207263946533, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 1.1439248265728157, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": -0.016499606892466545, + "logits/rejected": 0.09893260896205902, + "logps/chosen": -1.961816430091858, + "logps/rejected": -2.18733549118042, + "loss": 0.698, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.961816430091858, + "rewards/margins": 0.22551891207695007, + "rewards/rejected": -2.18733549118042, + "sft_loss": 1.7368465662002563, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 1.2935551708071662, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": -0.10018036514520645, + "logits/rejected": -0.0658569410443306, + "logps/chosen": -2.0715017318725586, + "logps/rejected": -2.278127431869507, + "loss": 0.7069, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.0715017318725586, + "rewards/margins": 0.20662562549114227, + "rewards/rejected": -2.278127431869507, + "sft_loss": 1.8013073205947876, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 1.3289250577714493, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": -0.12351454794406891, + "logits/rejected": 0.05726455897092819, + "logps/chosen": -1.958917260169983, + "logps/rejected": -2.243590831756592, + "loss": 0.6972, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.958917260169983, + "rewards/margins": 0.28467339277267456, + "rewards/rejected": -2.243590831756592, + "sft_loss": 1.6803343296051025, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 1.482748397618818, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": -0.02373456209897995, + "logits/rejected": 0.0855960100889206, + "logps/chosen": -2.0908074378967285, + "logps/rejected": -2.3277366161346436, + "loss": 0.7072, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.0908074378967285, + "rewards/margins": 0.23692938685417175, + "rewards/rejected": -2.3277366161346436, + "sft_loss": 1.735292673110962, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 1.643821120380416, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": -0.027933578938245773, + "logits/rejected": 0.1411551535129547, + "logps/chosen": -1.9436454772949219, + "logps/rejected": -2.229135036468506, + "loss": 0.6911, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9436454772949219, + "rewards/margins": 0.2854893207550049, + "rewards/rejected": -2.229135036468506, + "sft_loss": 1.614961862564087, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 1.893708518906089, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": -0.1228971853852272, + "logits/rejected": -0.0020554482471197844, + "logps/chosen": -1.8631207942962646, + "logps/rejected": -2.0909080505371094, + "loss": 0.7057, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.8631207942962646, + "rewards/margins": 0.2277870923280716, + "rewards/rejected": -2.0909080505371094, + "sft_loss": 1.654308557510376, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 1.9499816452217278, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": -0.11362478882074356, + "logits/rejected": -0.0030496090184897184, + "logps/chosen": -2.018123149871826, + "logps/rejected": -2.1381494998931885, + "loss": 0.7083, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.018123149871826, + "rewards/margins": 0.12002629041671753, + "rewards/rejected": -2.1381494998931885, + "sft_loss": 1.8265842199325562, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 1.6020532710600257, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": -0.12686367332935333, + "logits/rejected": -0.023370999842882156, + "logps/chosen": -1.9039256572723389, + "logps/rejected": -2.173767566680908, + "loss": 0.7034, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.9039256572723389, + "rewards/margins": 0.2698422074317932, + "rewards/rejected": -2.173767566680908, + "sft_loss": 1.7249925136566162, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 2.7728240416119223, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": -0.07930402457714081, + "logits/rejected": -0.011636780574917793, + "logps/chosen": -1.7571052312850952, + "logps/rejected": -2.135777711868286, + "loss": 0.6826, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7571052312850952, + "rewards/margins": 0.378672331571579, + "rewards/rejected": -2.135777711868286, + "sft_loss": 1.5940500497817993, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 1.6316113332161346, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": -0.15546007454395294, + "logits/rejected": 0.11401049792766571, + "logps/chosen": -1.739991545677185, + "logps/rejected": -2.1746387481689453, + "loss": 0.6812, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.739991545677185, + "rewards/margins": 0.4346471428871155, + "rewards/rejected": -2.1746387481689453, + "sft_loss": 1.6523933410644531, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 2.8595704944653146, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": -0.08268715441226959, + "logits/rejected": -0.013263854198157787, + "logps/chosen": -1.6454627513885498, + "logps/rejected": -2.0212502479553223, + "loss": 0.689, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6454627513885498, + "rewards/margins": 0.3757875859737396, + "rewards/rejected": -2.0212502479553223, + "sft_loss": 1.592815637588501, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 4.485806385390208, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": -0.06977352499961853, + "logits/rejected": 0.022602787241339684, + "logps/chosen": -1.679253339767456, + "logps/rejected": -1.99455988407135, + "loss": 0.6847, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.679253339767456, + "rewards/margins": 0.3153064250946045, + "rewards/rejected": -1.99455988407135, + "sft_loss": 1.725643515586853, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 1.5757725922095123, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": -0.1212872862815857, + "logits/rejected": 0.056601427495479584, + "logps/chosen": -1.6112515926361084, + "logps/rejected": -1.9420926570892334, + "loss": 0.6834, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6112515926361084, + "rewards/margins": 0.3308410048484802, + "rewards/rejected": -1.9420926570892334, + "sft_loss": 1.5881706476211548, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 2.800830308826896, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": -0.12238524109125137, + "logits/rejected": -0.00532518932595849, + "logps/chosen": -1.6283352375030518, + "logps/rejected": -1.928149938583374, + "loss": 0.6924, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6283352375030518, + "rewards/margins": 0.2998148798942566, + "rewards/rejected": -1.928149938583374, + "sft_loss": 1.5983941555023193, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 2.0165413656926114, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": -0.12431217730045319, + "logits/rejected": 0.05323215574026108, + "logps/chosen": -1.5410443544387817, + "logps/rejected": -1.9448493719100952, + "loss": 0.6646, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5410443544387817, + "rewards/margins": 0.4038047790527344, + "rewards/rejected": -1.9448493719100952, + "sft_loss": 1.5656566619873047, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 3.6027657530567905, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": -0.09710733592510223, + "logits/rejected": 0.0640190914273262, + "logps/chosen": -1.5832998752593994, + "logps/rejected": -2.163719892501831, + "loss": 0.6669, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5832998752593994, + "rewards/margins": 0.5804203748703003, + "rewards/rejected": -2.163719892501831, + "sft_loss": 1.5556138753890991, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 4.383434026201945, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": -0.05685956031084061, + "logits/rejected": 0.10388834774494171, + "logps/chosen": -1.5632593631744385, + "logps/rejected": -1.8989818096160889, + "loss": 0.6938, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5632593631744385, + "rewards/margins": 0.33572250604629517, + "rewards/rejected": -1.8989818096160889, + "sft_loss": 1.5883595943450928, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 3.579654924226385, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": -0.21645978093147278, + "logits/rejected": 0.0461578443646431, + "logps/chosen": -1.6038854122161865, + "logps/rejected": -2.0388576984405518, + "loss": 0.6847, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6038854122161865, + "rewards/margins": 0.43497198820114136, + "rewards/rejected": -2.0388576984405518, + "sft_loss": 1.6251455545425415, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 1.449373832558233, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": -0.17601588368415833, + "logits/rejected": 0.07178305834531784, + "logps/chosen": -1.627812147140503, + "logps/rejected": -2.0813026428222656, + "loss": 0.6906, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.627812147140503, + "rewards/margins": 0.4534904360771179, + "rewards/rejected": -2.0813026428222656, + "sft_loss": 1.5838168859481812, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 2.3986835135715947, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": -0.053179144859313965, + "logits/rejected": 0.019892878830432892, + "logps/chosen": -1.6524696350097656, + "logps/rejected": -1.9607833623886108, + "loss": 0.67, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6524696350097656, + "rewards/margins": 0.30831378698349, + "rewards/rejected": -1.9607833623886108, + "sft_loss": 1.6692003011703491, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 1.8093814907030827, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": -0.05682498216629028, + "logits/rejected": 0.041855137795209885, + "logps/chosen": -1.638074278831482, + "logps/rejected": -1.9424121379852295, + "loss": 0.6873, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.638074278831482, + "rewards/margins": 0.30433768033981323, + "rewards/rejected": -1.9424121379852295, + "sft_loss": 1.6524169445037842, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 3.979211925428682, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": -0.025512274354696274, + "logits/rejected": -0.005897931754589081, + "logps/chosen": -1.5600687265396118, + "logps/rejected": -1.8334335088729858, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5600687265396118, + "rewards/margins": 0.273364782333374, + "rewards/rejected": -1.8334335088729858, + "sft_loss": 1.5935394763946533, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 5.017621580214614, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": -0.07320438325405121, + "logits/rejected": 0.03042033314704895, + "logps/chosen": -1.5760712623596191, + "logps/rejected": -1.838953971862793, + "loss": 0.6916, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5760712623596191, + "rewards/margins": 0.2628825902938843, + "rewards/rejected": -1.838953971862793, + "sft_loss": 1.5679020881652832, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 2.770920413022084, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": -0.027145802974700928, + "logits/rejected": 0.2162061482667923, + "logps/chosen": -1.6395962238311768, + "logps/rejected": -2.0249321460723877, + "loss": 0.6722, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6395962238311768, + "rewards/margins": 0.3853360712528229, + "rewards/rejected": -2.0249321460723877, + "sft_loss": 1.6166515350341797, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 2.6484303604459223, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": -0.08917608112096786, + "logits/rejected": 0.06780953705310822, + "logps/chosen": -1.50400972366333, + "logps/rejected": -2.0031304359436035, + "loss": 0.6684, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.50400972366333, + "rewards/margins": 0.4991206228733063, + "rewards/rejected": -2.0031304359436035, + "sft_loss": 1.498430609703064, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 10.533324718921243, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": -0.11556092649698257, + "logits/rejected": -0.02518743835389614, + "logps/chosen": -1.5511956214904785, + "logps/rejected": -1.8880646228790283, + "loss": 0.6763, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5511956214904785, + "rewards/margins": 0.33686885237693787, + "rewards/rejected": -1.8880646228790283, + "sft_loss": 1.6106898784637451, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 4.531173815612611, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": -0.03725462406873703, + "logits/rejected": 0.05171867460012436, + "logps/chosen": -1.7447688579559326, + "logps/rejected": -1.9502300024032593, + "loss": 0.7021, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7447688579559326, + "rewards/margins": 0.20546121895313263, + "rewards/rejected": -1.9502300024032593, + "sft_loss": 1.691070556640625, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 2.398422908898425, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": -0.16255098581314087, + "logits/rejected": 0.03137350454926491, + "logps/chosen": -1.6325353384017944, + "logps/rejected": -2.1362509727478027, + "loss": 0.6744, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6325353384017944, + "rewards/margins": 0.5037158727645874, + "rewards/rejected": -2.1362509727478027, + "sft_loss": 1.6324422359466553, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": 0.1912696212530136, + "eval_logits/rejected": 0.28272679448127747, + "eval_logps/chosen": -1.6092780828475952, + "eval_logps/rejected": -2.0407536029815674, + "eval_loss": 0.6739305257797241, + "eval_rewards/accuracies": 0.6335311532020569, + "eval_rewards/chosen": -1.6092780828475952, + "eval_rewards/margins": 0.4314754903316498, + "eval_rewards/rejected": -2.0407536029815674, + "eval_runtime": 46.9151, + "eval_samples_per_second": 28.669, + "eval_sft_loss": 1.595874309539795, + "eval_steps_per_second": 7.183, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 2.325941888707781, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": -0.15885820984840393, + "logits/rejected": 0.061292119324207306, + "logps/chosen": -1.6002050638198853, + "logps/rejected": -2.0625171661376953, + "loss": 0.6704, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6002050638198853, + "rewards/margins": 0.46231213212013245, + "rewards/rejected": -2.0625171661376953, + "sft_loss": 1.5941084623336792, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 2.6887603502483137, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": -0.16491663455963135, + "logits/rejected": 0.005011633038520813, + "logps/chosen": -1.5596110820770264, + "logps/rejected": -2.1306350231170654, + "loss": 0.6768, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5596110820770264, + "rewards/margins": 0.5710240602493286, + "rewards/rejected": -2.1306350231170654, + "sft_loss": 1.5888437032699585, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 3.5086793422707188, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": -0.11782260239124298, + "logits/rejected": -0.012947884388267994, + "logps/chosen": -1.6803264617919922, + "logps/rejected": -2.164822816848755, + "loss": 0.6761, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6803264617919922, + "rewards/margins": 0.48449650406837463, + "rewards/rejected": -2.164822816848755, + "sft_loss": 1.6974128484725952, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 2.736107063051043, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": -0.06576915830373764, + "logits/rejected": 0.08024446666240692, + "logps/chosen": -1.509300947189331, + "logps/rejected": -1.94968581199646, + "loss": 0.6684, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.509300947189331, + "rewards/margins": 0.44038495421409607, + "rewards/rejected": -1.94968581199646, + "sft_loss": 1.513148546218872, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 2.1306604254638954, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": -0.003141905413940549, + "logits/rejected": 0.04678082466125488, + "logps/chosen": -1.6815744638442993, + "logps/rejected": -1.9233620166778564, + "loss": 0.6948, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6815744638442993, + "rewards/margins": 0.24178750813007355, + "rewards/rejected": -1.9233620166778564, + "sft_loss": 1.6074094772338867, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 5.170181882532501, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": -0.0599781759083271, + "logits/rejected": 0.08817584812641144, + "logps/chosen": -1.5599714517593384, + "logps/rejected": -1.941277265548706, + "loss": 0.6729, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5599714517593384, + "rewards/margins": 0.381305992603302, + "rewards/rejected": -1.941277265548706, + "sft_loss": 1.5579124689102173, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 5.875083105284614, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": -0.16240473091602325, + "logits/rejected": -0.04438484460115433, + "logps/chosen": -1.4770807027816772, + "logps/rejected": -1.9241431951522827, + "loss": 0.6772, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4770807027816772, + "rewards/margins": 0.4470624029636383, + "rewards/rejected": -1.9241431951522827, + "sft_loss": 1.5148780345916748, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 2.29849411000814, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": -0.14882834255695343, + "logits/rejected": -0.03332848101854324, + "logps/chosen": -1.491604208946228, + "logps/rejected": -1.9753957986831665, + "loss": 0.6773, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.491604208946228, + "rewards/margins": 0.4837915301322937, + "rewards/rejected": -1.9753957986831665, + "sft_loss": 1.5202728509902954, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 5.056281533703873, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": -0.0903366208076477, + "logits/rejected": -0.03441023826599121, + "logps/chosen": -1.5394830703735352, + "logps/rejected": -1.837863564491272, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5394830703735352, + "rewards/margins": 0.2983805239200592, + "rewards/rejected": -1.837863564491272, + "sft_loss": 1.5584405660629272, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 5.480908431753782, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": -0.01808544062077999, + "logits/rejected": 0.051542092114686966, + "logps/chosen": -1.515172004699707, + "logps/rejected": -1.8816455602645874, + "loss": 0.6712, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.515172004699707, + "rewards/margins": 0.36647361516952515, + "rewards/rejected": -1.8816455602645874, + "sft_loss": 1.4930051565170288, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 4.592937357483597, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": -0.14737102389335632, + "logits/rejected": -0.03177300840616226, + "logps/chosen": -1.5563279390335083, + "logps/rejected": -1.9180982112884521, + "loss": 0.7019, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5563279390335083, + "rewards/margins": 0.3617701232433319, + "rewards/rejected": -1.9180982112884521, + "sft_loss": 1.5393438339233398, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 3.821892841718335, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": -0.031492680311203, + "logits/rejected": -0.0020574121735990047, + "logps/chosen": -1.4627248048782349, + "logps/rejected": -1.8346326351165771, + "loss": 0.6765, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4627248048782349, + "rewards/margins": 0.37190794944763184, + "rewards/rejected": -1.8346326351165771, + "sft_loss": 1.411965012550354, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 3.712972200243005, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": -0.07665841281414032, + "logits/rejected": -0.0004884630325250328, + "logps/chosen": -1.6424614191055298, + "logps/rejected": -1.8931677341461182, + "loss": 0.6884, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6424614191055298, + "rewards/margins": 0.2507062554359436, + "rewards/rejected": -1.8931677341461182, + "sft_loss": 1.5956447124481201, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 3.8721602308327197, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": -0.042667657136917114, + "logits/rejected": 0.1281338334083557, + "logps/chosen": -1.6979026794433594, + "logps/rejected": -2.0013856887817383, + "loss": 0.7094, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.6979026794433594, + "rewards/margins": 0.3034830689430237, + "rewards/rejected": -2.0013856887817383, + "sft_loss": 1.6016740798950195, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 2.855384143369643, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": -0.07494497299194336, + "logits/rejected": 0.1362690031528473, + "logps/chosen": -1.6354612112045288, + "logps/rejected": -2.150341749191284, + "loss": 0.6934, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6354612112045288, + "rewards/margins": 0.514880359172821, + "rewards/rejected": -2.150341749191284, + "sft_loss": 1.6575100421905518, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 5.104972393628471, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": -0.1633204072713852, + "logits/rejected": -0.05091344565153122, + "logps/chosen": -1.6553776264190674, + "logps/rejected": -2.027411699295044, + "loss": 0.6862, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6553776264190674, + "rewards/margins": 0.3720341622829437, + "rewards/rejected": -2.027411699295044, + "sft_loss": 1.6414705514907837, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 3.8592230555707423, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": -0.12714692950248718, + "logits/rejected": 0.11072331666946411, + "logps/chosen": -1.6420742273330688, + "logps/rejected": -2.115112781524658, + "loss": 0.6898, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.6420742273330688, + "rewards/margins": 0.47303861379623413, + "rewards/rejected": -2.115112781524658, + "sft_loss": 1.635709524154663, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 2.9593871241841145, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": -0.1302056759595871, + "logits/rejected": 0.061131738126277924, + "logps/chosen": -1.5277483463287354, + "logps/rejected": -2.047550916671753, + "loss": 0.6678, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5277483463287354, + "rewards/margins": 0.5198026895523071, + "rewards/rejected": -2.047550916671753, + "sft_loss": 1.588708519935608, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 5.614753547947297, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": -0.2522553503513336, + "logits/rejected": -0.10259109735488892, + "logps/chosen": -1.6107975244522095, + "logps/rejected": -2.0465805530548096, + "loss": 0.6766, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6107975244522095, + "rewards/margins": 0.4357830584049225, + "rewards/rejected": -2.0465805530548096, + "sft_loss": 1.6689703464508057, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 7.2355387166733465, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": -0.07041691988706589, + "logits/rejected": -0.10695245116949081, + "logps/chosen": -1.628130316734314, + "logps/rejected": -1.8095182180404663, + "loss": 0.6951, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.628130316734314, + "rewards/margins": 0.18138787150382996, + "rewards/rejected": -1.8095182180404663, + "sft_loss": 1.6410319805145264, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 2.962926044065382, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": -0.07322728633880615, + "logits/rejected": 0.07346437871456146, + "logps/chosen": -1.5739026069641113, + "logps/rejected": -1.9100176095962524, + "loss": 0.6844, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5739026069641113, + "rewards/margins": 0.3361150622367859, + "rewards/rejected": -1.9100176095962524, + "sft_loss": 1.5770145654678345, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 2.9598461510264396, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": -0.11749941110610962, + "logits/rejected": -0.03795120120048523, + "logps/chosen": -1.5487481355667114, + "logps/rejected": -1.897014856338501, + "loss": 0.681, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5487481355667114, + "rewards/margins": 0.3482665419578552, + "rewards/rejected": -1.897014856338501, + "sft_loss": 1.5692429542541504, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 2.147733896750781, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": -0.11868778616189957, + "logits/rejected": 0.09065614640712738, + "logps/chosen": -1.6962509155273438, + "logps/rejected": -2.0103964805603027, + "loss": 0.6931, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6962509155273438, + "rewards/margins": 0.3141458034515381, + "rewards/rejected": -2.0103964805603027, + "sft_loss": 1.630626916885376, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 3.1619078999272006, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": -0.09264856576919556, + "logits/rejected": -0.01981327496469021, + "logps/chosen": -1.5092804431915283, + "logps/rejected": -1.969607949256897, + "loss": 0.6664, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5092804431915283, + "rewards/margins": 0.46032723784446716, + "rewards/rejected": -1.969607949256897, + "sft_loss": 1.4860968589782715, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 2.8083488278583673, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": -0.0894165113568306, + "logits/rejected": 0.018659692257642746, + "logps/chosen": -1.6026160717010498, + "logps/rejected": -2.1429691314697266, + "loss": 0.6692, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6026160717010498, + "rewards/margins": 0.5403528213500977, + "rewards/rejected": -2.1429691314697266, + "sft_loss": 1.633073091506958, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 7.867167915315164, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": -0.0508013479411602, + "logits/rejected": 0.010919039137661457, + "logps/chosen": -1.5411659479141235, + "logps/rejected": -1.8381420373916626, + "loss": 0.6846, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5411659479141235, + "rewards/margins": 0.2969761788845062, + "rewards/rejected": -1.8381420373916626, + "sft_loss": 1.5529801845550537, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 2.748657377959956, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": -0.18835203349590302, + "logits/rejected": -0.023120930418372154, + "logps/chosen": -1.5490851402282715, + "logps/rejected": -2.185800075531006, + "loss": 0.6654, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5490851402282715, + "rewards/margins": 0.6367148160934448, + "rewards/rejected": -2.185800075531006, + "sft_loss": 1.473778486251831, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 1.7240456217497198, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": -0.14346204698085785, + "logits/rejected": -0.001293714391067624, + "logps/chosen": -1.5688560009002686, + "logps/rejected": -2.0389304161071777, + "loss": 0.6728, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5688560009002686, + "rewards/margins": 0.4700745642185211, + "rewards/rejected": -2.0389304161071777, + "sft_loss": 1.500208854675293, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 3.8100275253920333, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": -0.09344641864299774, + "logits/rejected": -0.05481022596359253, + "logps/chosen": -1.5765666961669922, + "logps/rejected": -1.9928195476531982, + "loss": 0.6807, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5765666961669922, + "rewards/margins": 0.41625308990478516, + "rewards/rejected": -1.9928195476531982, + "sft_loss": 1.609368920326233, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 3.2646387720028565, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": -0.12338890880346298, + "logits/rejected": -0.06230727955698967, + "logps/chosen": -1.6930673122406006, + "logps/rejected": -2.1141300201416016, + "loss": 0.6792, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6930673122406006, + "rewards/margins": 0.42106279730796814, + "rewards/rejected": -2.1141300201416016, + "sft_loss": 1.6491073369979858, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 2.559943077562952, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": -0.1306913197040558, + "logits/rejected": -0.03029678203165531, + "logps/chosen": -1.5664455890655518, + "logps/rejected": -1.947819709777832, + "loss": 0.6754, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5664455890655518, + "rewards/margins": 0.38137391209602356, + "rewards/rejected": -1.947819709777832, + "sft_loss": 1.540327548980713, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 2.22133408064078, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": -0.1324535310268402, + "logits/rejected": -0.015176964923739433, + "logps/chosen": -1.5364208221435547, + "logps/rejected": -1.9308210611343384, + "loss": 0.6769, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5364208221435547, + "rewards/margins": 0.394400417804718, + "rewards/rejected": -1.9308210611343384, + "sft_loss": 1.5744932889938354, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 3.5146639998653613, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": -0.05859723687171936, + "logits/rejected": -0.03487427160143852, + "logps/chosen": -1.556701898574829, + "logps/rejected": -1.9352022409439087, + "loss": 0.6918, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.556701898574829, + "rewards/margins": 0.37850040197372437, + "rewards/rejected": -1.9352022409439087, + "sft_loss": 1.5356924533843994, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 2.3927352220671403, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": -0.07654085010290146, + "logits/rejected": 0.0713915079832077, + "logps/chosen": -1.6044076681137085, + "logps/rejected": -1.892507553100586, + "loss": 0.6759, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6044076681137085, + "rewards/margins": 0.28809982538223267, + "rewards/rejected": -1.892507553100586, + "sft_loss": 1.644179105758667, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 5.443172287517489, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": -0.12157295644283295, + "logits/rejected": -0.029175758361816406, + "logps/chosen": -1.4955778121948242, + "logps/rejected": -1.8605883121490479, + "loss": 0.6741, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4955778121948242, + "rewards/margins": 0.36501047015190125, + "rewards/rejected": -1.8605883121490479, + "sft_loss": 1.5303313732147217, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 3.711177108393341, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": -0.14020344614982605, + "logits/rejected": -0.01927454024553299, + "logps/chosen": -1.5508153438568115, + "logps/rejected": -1.9805715084075928, + "loss": 0.6793, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5508153438568115, + "rewards/margins": 0.4297560751438141, + "rewards/rejected": -1.9805715084075928, + "sft_loss": 1.4936689138412476, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 2.444443823624952, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": -0.027949964627623558, + "logits/rejected": 0.04656906798481941, + "logps/chosen": -1.5221688747406006, + "logps/rejected": -1.7465623617172241, + "loss": 0.6931, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5221688747406006, + "rewards/margins": 0.22439360618591309, + "rewards/rejected": -1.7465623617172241, + "sft_loss": 1.4782495498657227, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 4.113717811033466, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": -0.008868610486388206, + "logits/rejected": 0.04976072162389755, + "logps/chosen": -1.5966230630874634, + "logps/rejected": -1.966228723526001, + "loss": 0.679, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5966230630874634, + "rewards/margins": 0.36960554122924805, + "rewards/rejected": -1.966228723526001, + "sft_loss": 1.589078664779663, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 3.257044099078646, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": -0.1507941633462906, + "logits/rejected": -0.010530698113143444, + "logps/chosen": -1.4972823858261108, + "logps/rejected": -2.0315799713134766, + "loss": 0.6675, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4972823858261108, + "rewards/margins": 0.5342975854873657, + "rewards/rejected": -2.0315799713134766, + "sft_loss": 1.5237815380096436, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 2.2342951425617086, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": -0.10327861458063126, + "logits/rejected": -0.022930169478058815, + "logps/chosen": -1.5314559936523438, + "logps/rejected": -1.8455177545547485, + "loss": 0.6798, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5314559936523438, + "rewards/margins": 0.31406185030937195, + "rewards/rejected": -1.8455177545547485, + "sft_loss": 1.5912697315216064, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 3.4486990877578068, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": -0.23526854813098907, + "logits/rejected": -0.1062394380569458, + "logps/chosen": -1.4703363180160522, + "logps/rejected": -2.0053153038024902, + "loss": 0.6678, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4703363180160522, + "rewards/margins": 0.5349791646003723, + "rewards/rejected": -2.0053153038024902, + "sft_loss": 1.5218839645385742, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 5.274171248765637, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": -0.10490190982818604, + "logits/rejected": 0.04200034216046333, + "logps/chosen": -1.5042073726654053, + "logps/rejected": -1.8845596313476562, + "loss": 0.6852, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5042073726654053, + "rewards/margins": 0.3803521990776062, + "rewards/rejected": -1.8845596313476562, + "sft_loss": 1.5251801013946533, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 3.384379823608095, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": -0.14682337641716003, + "logits/rejected": -0.017906129360198975, + "logps/chosen": -1.5452371835708618, + "logps/rejected": -1.9544401168823242, + "loss": 0.6747, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5452371835708618, + "rewards/margins": 0.40920290350914, + "rewards/rejected": -1.9544401168823242, + "sft_loss": 1.5843769311904907, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 2.1764985463320174, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": -0.12845490872859955, + "logits/rejected": -0.053113799542188644, + "logps/chosen": -1.5860878229141235, + "logps/rejected": -1.8734443187713623, + "loss": 0.6819, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5860878229141235, + "rewards/margins": 0.287356436252594, + "rewards/rejected": -1.8734443187713623, + "sft_loss": 1.5250016450881958, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 2.975329281636474, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": -0.19761589169502258, + "logits/rejected": -0.060870569199323654, + "logps/chosen": -1.5228192806243896, + "logps/rejected": -2.0270698070526123, + "loss": 0.6821, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5228192806243896, + "rewards/margins": 0.5042504668235779, + "rewards/rejected": -2.0270698070526123, + "sft_loss": 1.5352400541305542, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 3.5373981714487654, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": -0.05435476452112198, + "logits/rejected": -0.021604064851999283, + "logps/chosen": -1.5740587711334229, + "logps/rejected": -1.8411388397216797, + "loss": 0.6843, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5740587711334229, + "rewards/margins": 0.267080157995224, + "rewards/rejected": -1.8411388397216797, + "sft_loss": 1.5605738162994385, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 2.5979519570662237, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": -0.14526249468326569, + "logits/rejected": -0.08094936609268188, + "logps/chosen": -1.5423524379730225, + "logps/rejected": -1.7739944458007812, + "loss": 0.6863, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5423524379730225, + "rewards/margins": 0.23164193332195282, + "rewards/rejected": -1.7739944458007812, + "sft_loss": 1.5814307928085327, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 3.4902212698196577, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": -0.08723638206720352, + "logits/rejected": 0.01722792722284794, + "logps/chosen": -1.5690443515777588, + "logps/rejected": -2.032050132751465, + "loss": 0.6752, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5690443515777588, + "rewards/margins": 0.46300578117370605, + "rewards/rejected": -2.032050132751465, + "sft_loss": 1.6070778369903564, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 4.70587093676687, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": -0.15140441060066223, + "logits/rejected": -0.05931438133120537, + "logps/chosen": -1.5859469175338745, + "logps/rejected": -1.8986486196517944, + "loss": 0.6807, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5859469175338745, + "rewards/margins": 0.31270185112953186, + "rewards/rejected": -1.8986486196517944, + "sft_loss": 1.5984251499176025, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 1.6964966931429128, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": -0.26614508032798767, + "logits/rejected": -0.1424245536327362, + "logps/chosen": -1.6399457454681396, + "logps/rejected": -1.8817602396011353, + "loss": 0.7036, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.6399457454681396, + "rewards/margins": 0.241814523935318, + "rewards/rejected": -1.8817602396011353, + "sft_loss": 1.5788856744766235, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 3.3134105939667897, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": -0.16021369397640228, + "logits/rejected": -0.05174224451184273, + "logps/chosen": -1.5435289144515991, + "logps/rejected": -1.9880918264389038, + "loss": 0.6745, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5435289144515991, + "rewards/margins": 0.4445629119873047, + "rewards/rejected": -1.9880918264389038, + "sft_loss": 1.5799753665924072, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 3.543600343763058, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": -0.14376217126846313, + "logits/rejected": -0.06231715530157089, + "logps/chosen": -1.702498435974121, + "logps/rejected": -2.086392879486084, + "loss": 0.6769, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.702498435974121, + "rewards/margins": 0.38389450311660767, + "rewards/rejected": -2.086392879486084, + "sft_loss": 1.6510069370269775, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 3.735182065545256, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": -0.1657623052597046, + "logits/rejected": -0.08684898167848587, + "logps/chosen": -1.603272795677185, + "logps/rejected": -2.073019504547119, + "loss": 0.6675, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.603272795677185, + "rewards/margins": 0.46974676847457886, + "rewards/rejected": -2.073019504547119, + "sft_loss": 1.5904573202133179, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 3.0067181950688338, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": -0.2079596072435379, + "logits/rejected": -0.051432013511657715, + "logps/chosen": -1.779923677444458, + "logps/rejected": -2.2986772060394287, + "loss": 0.6911, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.779923677444458, + "rewards/margins": 0.5187537670135498, + "rewards/rejected": -2.2986772060394287, + "sft_loss": 1.7402454614639282, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 1.8008577817804836, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": -0.24255767464637756, + "logits/rejected": -0.10895420610904694, + "logps/chosen": -1.8133251667022705, + "logps/rejected": -2.1065123081207275, + "loss": 0.69, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.8133251667022705, + "rewards/margins": 0.2931869328022003, + "rewards/rejected": -2.1065123081207275, + "sft_loss": 1.6673797369003296, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 3.214189820214113, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": -0.11197970062494278, + "logits/rejected": 0.06118257716298103, + "logps/chosen": -1.7502024173736572, + "logps/rejected": -2.4015679359436035, + "loss": 0.6733, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7502024173736572, + "rewards/margins": 0.6513655185699463, + "rewards/rejected": -2.4015679359436035, + "sft_loss": 1.679987907409668, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 1.843152406956296, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": -0.14984950423240662, + "logits/rejected": -0.06991840898990631, + "logps/chosen": -1.6768118143081665, + "logps/rejected": -2.119256019592285, + "loss": 0.6709, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6768118143081665, + "rewards/margins": 0.44244417548179626, + "rewards/rejected": -2.119256019592285, + "sft_loss": 1.6225124597549438, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 3.3615092610738317, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": -0.15404468774795532, + "logits/rejected": 0.045191358774900436, + "logps/chosen": -1.6460826396942139, + "logps/rejected": -2.1567981243133545, + "loss": 0.6888, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6460826396942139, + "rewards/margins": 0.5107154846191406, + "rewards/rejected": -2.1567981243133545, + "sft_loss": 1.706637978553772, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 6.204257382868284, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": -0.08804275840520859, + "logits/rejected": -0.029958754777908325, + "logps/chosen": -1.678462028503418, + "logps/rejected": -2.050363779067993, + "loss": 0.6745, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.678462028503418, + "rewards/margins": 0.37190166115760803, + "rewards/rejected": -2.050363779067993, + "sft_loss": 1.5845019817352295, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 3.0052606984490824, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": -0.23166868090629578, + "logits/rejected": 0.019547026604413986, + "logps/chosen": -1.6740500926971436, + "logps/rejected": -2.0996651649475098, + "loss": 0.6714, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6740500926971436, + "rewards/margins": 0.42561525106430054, + "rewards/rejected": -2.0996651649475098, + "sft_loss": 1.651079773902893, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 2.599228606225825, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": -0.129378542304039, + "logits/rejected": 0.02050795778632164, + "logps/chosen": -1.6524471044540405, + "logps/rejected": -1.946677565574646, + "loss": 0.6851, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6524471044540405, + "rewards/margins": 0.2942304313182831, + "rewards/rejected": -1.946677565574646, + "sft_loss": 1.5889383554458618, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 2.7196530986636303, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": -0.21800541877746582, + "logits/rejected": -0.09155451506376266, + "logps/chosen": -1.5629202127456665, + "logps/rejected": -1.9020274877548218, + "loss": 0.689, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5629202127456665, + "rewards/margins": 0.33910730481147766, + "rewards/rejected": -1.9020274877548218, + "sft_loss": 1.5543124675750732, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 2.389383825205269, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": -0.14448337256908417, + "logits/rejected": -0.0616963729262352, + "logps/chosen": -1.5610359907150269, + "logps/rejected": -1.8518545627593994, + "loss": 0.6822, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5610359907150269, + "rewards/margins": 0.2908182442188263, + "rewards/rejected": -1.8518545627593994, + "sft_loss": 1.5730682611465454, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 2.519521640106637, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": -0.1483561098575592, + "logits/rejected": -0.06898792088031769, + "logps/chosen": -1.4000225067138672, + "logps/rejected": -1.8333406448364258, + "loss": 0.654, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4000225067138672, + "rewards/margins": 0.4333181381225586, + "rewards/rejected": -1.8333406448364258, + "sft_loss": 1.432531714439392, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 2.827697197982811, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": -0.2625359892845154, + "logits/rejected": -0.055143196135759354, + "logps/chosen": -1.5686523914337158, + "logps/rejected": -1.9178880453109741, + "loss": 0.6765, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5686523914337158, + "rewards/margins": 0.3492355942726135, + "rewards/rejected": -1.9178880453109741, + "sft_loss": 1.5924729108810425, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 3.874785154702929, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": -0.18027618527412415, + "logits/rejected": -0.041930388659238815, + "logps/chosen": -1.440549612045288, + "logps/rejected": -1.9213539361953735, + "loss": 0.6565, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.440549612045288, + "rewards/margins": 0.48080435395240784, + "rewards/rejected": -1.9213539361953735, + "sft_loss": 1.4900104999542236, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 3.0687794402615913, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": -0.22330304980278015, + "logits/rejected": -0.04363471269607544, + "logps/chosen": -1.5808871984481812, + "logps/rejected": -2.0564162731170654, + "loss": 0.6773, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5808871984481812, + "rewards/margins": 0.47552910447120667, + "rewards/rejected": -2.0564162731170654, + "sft_loss": 1.5744086503982544, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 3.3563856521239184, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": -0.13265278935432434, + "logits/rejected": -0.011842799372971058, + "logps/chosen": -1.5311095714569092, + "logps/rejected": -2.0399792194366455, + "loss": 0.6708, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5311095714569092, + "rewards/margins": 0.5088695287704468, + "rewards/rejected": -2.0399792194366455, + "sft_loss": 1.5216306447982788, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 3.216162602204211, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": -0.22170314192771912, + "logits/rejected": -0.04994767904281616, + "logps/chosen": -1.5834558010101318, + "logps/rejected": -2.1816983222961426, + "loss": 0.6626, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5834558010101318, + "rewards/margins": 0.5982425212860107, + "rewards/rejected": -2.1816983222961426, + "sft_loss": 1.614985704421997, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 2.184799152816952, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": -0.1607782393693924, + "logits/rejected": 0.047008074820041656, + "logps/chosen": -1.622387170791626, + "logps/rejected": -2.0742850303649902, + "loss": 0.6761, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.622387170791626, + "rewards/margins": 0.4518980085849762, + "rewards/rejected": -2.0742850303649902, + "sft_loss": 1.6672132015228271, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 6.250157245132641, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": -0.14060020446777344, + "logits/rejected": -0.0696086436510086, + "logps/chosen": -1.472314476966858, + "logps/rejected": -1.8883994817733765, + "loss": 0.6626, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.472314476966858, + "rewards/margins": 0.4160851538181305, + "rewards/rejected": -1.8883994817733765, + "sft_loss": 1.487228512763977, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 1.6480790156412097, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": -0.19000086188316345, + "logits/rejected": -0.15001313388347626, + "logps/chosen": -1.524418592453003, + "logps/rejected": -1.8568360805511475, + "loss": 0.6727, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.524418592453003, + "rewards/margins": 0.3324173390865326, + "rewards/rejected": -1.8568360805511475, + "sft_loss": 1.5695242881774902, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 4.760518771846394, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": -0.1314769834280014, + "logits/rejected": -0.06649098545312881, + "logps/chosen": -1.4572066068649292, + "logps/rejected": -2.0760457515716553, + "loss": 0.6541, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4572066068649292, + "rewards/margins": 0.6188389658927917, + "rewards/rejected": -2.0760457515716553, + "sft_loss": 1.508234977722168, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 3.4784089052305083, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": -0.1936931610107422, + "logits/rejected": -0.057973574846982956, + "logps/chosen": -1.4674887657165527, + "logps/rejected": -1.873656988143921, + "loss": 0.677, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4674887657165527, + "rewards/margins": 0.40616822242736816, + "rewards/rejected": -1.873656988143921, + "sft_loss": 1.5056754350662231, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 2.5732562782095068, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": -0.10193010419607162, + "logits/rejected": -0.017011495307087898, + "logps/chosen": -1.4385443925857544, + "logps/rejected": -1.8158655166625977, + "loss": 0.672, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4385443925857544, + "rewards/margins": 0.3773210942745209, + "rewards/rejected": -1.8158655166625977, + "sft_loss": 1.4791452884674072, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 3.3913885943645914, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": -0.08598263561725616, + "logits/rejected": 0.0331895537674427, + "logps/chosen": -1.5553724765777588, + "logps/rejected": -1.9766775369644165, + "loss": 0.6723, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5553724765777588, + "rewards/margins": 0.4213050901889801, + "rewards/rejected": -1.9766775369644165, + "sft_loss": 1.550366759300232, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 3.052299834948276, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": -0.14798256754875183, + "logits/rejected": 0.005163169465959072, + "logps/chosen": -1.654259443283081, + "logps/rejected": -1.9187091588974, + "loss": 0.6827, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.654259443283081, + "rewards/margins": 0.2644497752189636, + "rewards/rejected": -1.9187091588974, + "sft_loss": 1.562561273574829, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 2.957691286330317, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": -0.20001676678657532, + "logits/rejected": -0.10823854058980942, + "logps/chosen": -1.6180994510650635, + "logps/rejected": -1.9572150707244873, + "loss": 0.676, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6180994510650635, + "rewards/margins": 0.3391154706478119, + "rewards/rejected": -1.9572150707244873, + "sft_loss": 1.6023613214492798, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 3.5448089859794485, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": -0.08415170013904572, + "logits/rejected": 0.06598736345767975, + "logps/chosen": -1.5128848552703857, + "logps/rejected": -2.073805093765259, + "loss": 0.6516, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5128848552703857, + "rewards/margins": 0.5609201192855835, + "rewards/rejected": -2.073805093765259, + "sft_loss": 1.5354398488998413, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 11.781259970594936, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": -0.17054155468940735, + "logits/rejected": -0.01642870530486107, + "logps/chosen": -1.6963222026824951, + "logps/rejected": -2.1025562286376953, + "loss": 0.714, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6963222026824951, + "rewards/margins": 0.40623408555984497, + "rewards/rejected": -2.1025562286376953, + "sft_loss": 1.662977933883667, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": 0.11042266339063644, + "eval_logits/rejected": 0.19898873567581177, + "eval_logps/chosen": -1.562518835067749, + "eval_logps/rejected": -2.0465967655181885, + "eval_loss": 0.6718986630439758, + "eval_rewards/accuracies": 0.6268545985221863, + "eval_rewards/chosen": -1.562518835067749, + "eval_rewards/margins": 0.48407796025276184, + "eval_rewards/rejected": -2.0465967655181885, + "eval_runtime": 47.4282, + "eval_samples_per_second": 28.359, + "eval_sft_loss": 1.5563881397247314, + "eval_steps_per_second": 7.105, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 3.513893008993406, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": -0.1328616440296173, + "logits/rejected": -0.04587624594569206, + "logps/chosen": -1.6134157180786133, + "logps/rejected": -2.0356030464172363, + "loss": 0.6805, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.6134157180786133, + "rewards/margins": 0.4221871793270111, + "rewards/rejected": -2.0356030464172363, + "sft_loss": 1.6057875156402588, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 4.802954296253678, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": -0.10831431299448013, + "logits/rejected": -0.0031776546966284513, + "logps/chosen": -1.4763798713684082, + "logps/rejected": -1.8952471017837524, + "loss": 0.672, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4763798713684082, + "rewards/margins": 0.41886717081069946, + "rewards/rejected": -1.8952471017837524, + "sft_loss": 1.5348972082138062, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 2.858256991560063, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": -0.18605680763721466, + "logits/rejected": -0.04593934118747711, + "logps/chosen": -1.5805212259292603, + "logps/rejected": -2.0479989051818848, + "loss": 0.6793, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5805212259292603, + "rewards/margins": 0.4674775004386902, + "rewards/rejected": -2.0479989051818848, + "sft_loss": 1.506347417831421, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 4.329723234751204, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": -0.24087968468666077, + "logits/rejected": -0.062106020748615265, + "logps/chosen": -1.6259753704071045, + "logps/rejected": -2.1355271339416504, + "loss": 0.6662, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6259753704071045, + "rewards/margins": 0.5095517039299011, + "rewards/rejected": -2.1355271339416504, + "sft_loss": 1.5672664642333984, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 3.495146311508932, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": -0.12188255786895752, + "logits/rejected": 0.0037569478154182434, + "logps/chosen": -1.6243022680282593, + "logps/rejected": -2.1850533485412598, + "loss": 0.6581, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6243022680282593, + "rewards/margins": 0.5607510805130005, + "rewards/rejected": -2.1850533485412598, + "sft_loss": 1.6390899419784546, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 3.039671083603455, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": -0.19984665513038635, + "logits/rejected": -0.16852447390556335, + "logps/chosen": -1.5463429689407349, + "logps/rejected": -1.9309475421905518, + "loss": 0.6711, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5463429689407349, + "rewards/margins": 0.3846047520637512, + "rewards/rejected": -1.9309475421905518, + "sft_loss": 1.5829219818115234, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 2.790160745985205, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": -0.22109492123126984, + "logits/rejected": -0.05157407373189926, + "logps/chosen": -1.6182200908660889, + "logps/rejected": -2.0302791595458984, + "loss": 0.6748, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6182200908660889, + "rewards/margins": 0.4120589792728424, + "rewards/rejected": -2.0302791595458984, + "sft_loss": 1.6120506525039673, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 3.315920683366051, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": -0.2908003330230713, + "logits/rejected": -0.2197912484407425, + "logps/chosen": -1.6452051401138306, + "logps/rejected": -1.9569673538208008, + "loss": 0.6658, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6452051401138306, + "rewards/margins": 0.3117622435092926, + "rewards/rejected": -1.9569673538208008, + "sft_loss": 1.6148096323013306, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 5.462304473351597, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": -0.14959219098091125, + "logits/rejected": -0.11621057987213135, + "logps/chosen": -1.5611058473587036, + "logps/rejected": -1.9687639474868774, + "loss": 0.6683, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5611058473587036, + "rewards/margins": 0.40765801072120667, + "rewards/rejected": -1.9687639474868774, + "sft_loss": 1.4946987628936768, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 8.662994976379249, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": -0.24751749634742737, + "logits/rejected": -0.1236846074461937, + "logps/chosen": -1.6202363967895508, + "logps/rejected": -1.980786919593811, + "loss": 0.6921, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6202363967895508, + "rewards/margins": 0.36055055260658264, + "rewards/rejected": -1.980786919593811, + "sft_loss": 1.6707451343536377, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 4.456899179225936, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": -0.17690253257751465, + "logits/rejected": -0.0510096549987793, + "logps/chosen": -1.5945512056350708, + "logps/rejected": -2.002166271209717, + "loss": 0.6654, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5945512056350708, + "rewards/margins": 0.40761512517929077, + "rewards/rejected": -2.002166271209717, + "sft_loss": 1.628100037574768, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 4.65806576514881, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": -0.1397233009338379, + "logits/rejected": -0.09292219579219818, + "logps/chosen": -1.5926469564437866, + "logps/rejected": -2.1180152893066406, + "loss": 0.6667, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5926469564437866, + "rewards/margins": 0.5253681540489197, + "rewards/rejected": -2.1180152893066406, + "sft_loss": 1.6516544818878174, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 3.5342850756940387, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": -0.2090418040752411, + "logits/rejected": -0.07881750911474228, + "logps/chosen": -1.5486667156219482, + "logps/rejected": -1.9510724544525146, + "loss": 0.6652, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5486667156219482, + "rewards/margins": 0.40240558981895447, + "rewards/rejected": -1.9510724544525146, + "sft_loss": 1.530504584312439, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 2.832539801553843, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": -0.20894074440002441, + "logits/rejected": -0.06344683468341827, + "logps/chosen": -1.530551791191101, + "logps/rejected": -2.070535182952881, + "loss": 0.6511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.530551791191101, + "rewards/margins": 0.5399833917617798, + "rewards/rejected": -2.070535182952881, + "sft_loss": 1.517613172531128, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 3.38191618292415, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": -0.23570159077644348, + "logits/rejected": -0.13040268421173096, + "logps/chosen": -1.644474744796753, + "logps/rejected": -2.1541659832000732, + "loss": 0.6699, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.644474744796753, + "rewards/margins": 0.5096911191940308, + "rewards/rejected": -2.1541659832000732, + "sft_loss": 1.7060950994491577, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 2.364002055632971, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": -0.25074702501296997, + "logits/rejected": -0.0874941349029541, + "logps/chosen": -1.4881782531738281, + "logps/rejected": -2.0220913887023926, + "loss": 0.6526, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4881782531738281, + "rewards/margins": 0.5339129567146301, + "rewards/rejected": -2.0220913887023926, + "sft_loss": 1.530760407447815, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 6.815710847699838, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": -0.20786544680595398, + "logits/rejected": -0.0966242104768753, + "logps/chosen": -1.5392001867294312, + "logps/rejected": -2.1013989448547363, + "loss": 0.6623, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5392001867294312, + "rewards/margins": 0.5621985793113708, + "rewards/rejected": -2.1013989448547363, + "sft_loss": 1.576633334159851, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 4.49999671804145, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": -0.259671151638031, + "logits/rejected": -0.24447309970855713, + "logps/chosen": -1.5871320962905884, + "logps/rejected": -1.8874315023422241, + "loss": 0.6689, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5871320962905884, + "rewards/margins": 0.3002995252609253, + "rewards/rejected": -1.8874315023422241, + "sft_loss": 1.626317024230957, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 2.0323666230061845, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": -0.24338272213935852, + "logits/rejected": -0.10547523200511932, + "logps/chosen": -1.7075008153915405, + "logps/rejected": -2.0710959434509277, + "loss": 0.6956, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7075008153915405, + "rewards/margins": 0.36359527707099915, + "rewards/rejected": -2.0710959434509277, + "sft_loss": 1.6951372623443604, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 6.765873292332573, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": -0.16340097784996033, + "logits/rejected": -0.11491282284259796, + "logps/chosen": -1.7300260066986084, + "logps/rejected": -2.051795482635498, + "loss": 0.6792, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7300260066986084, + "rewards/margins": 0.3217691481113434, + "rewards/rejected": -2.051795482635498, + "sft_loss": 1.6949584484100342, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 3.199587380600216, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": -0.10508096218109131, + "logits/rejected": 0.014907196164131165, + "logps/chosen": -1.619755506515503, + "logps/rejected": -2.115018606185913, + "loss": 0.6639, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.619755506515503, + "rewards/margins": 0.4952631890773773, + "rewards/rejected": -2.115018606185913, + "sft_loss": 1.5901477336883545, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 8.121098754677774, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": -0.19740596413612366, + "logits/rejected": -0.04385928064584732, + "logps/chosen": -1.5759496688842773, + "logps/rejected": -2.2588329315185547, + "loss": 0.6571, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5759496688842773, + "rewards/margins": 0.682883083820343, + "rewards/rejected": -2.2588329315185547, + "sft_loss": 1.6312938928604126, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 4.349147420252072, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": -0.08879465609788895, + "logits/rejected": -0.03234269469976425, + "logps/chosen": -1.6562553644180298, + "logps/rejected": -2.0740723609924316, + "loss": 0.6553, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6562553644180298, + "rewards/margins": 0.41781702637672424, + "rewards/rejected": -2.0740723609924316, + "sft_loss": 1.6364749670028687, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 3.943676183821946, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": -0.17019066214561462, + "logits/rejected": -0.126693457365036, + "logps/chosen": -1.5662453174591064, + "logps/rejected": -1.9887056350708008, + "loss": 0.6629, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5662453174591064, + "rewards/margins": 0.4224603772163391, + "rewards/rejected": -1.9887056350708008, + "sft_loss": 1.633723497390747, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 5.473259104854902, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": -0.14707279205322266, + "logits/rejected": -0.08167170733213425, + "logps/chosen": -1.516506552696228, + "logps/rejected": -2.081937789916992, + "loss": 0.6483, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.516506552696228, + "rewards/margins": 0.5654313564300537, + "rewards/rejected": -2.081937789916992, + "sft_loss": 1.5508047342300415, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 4.218506802957272, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": -0.10259685665369034, + "logits/rejected": 0.1375846266746521, + "logps/chosen": -1.573190689086914, + "logps/rejected": -2.189675807952881, + "loss": 0.6609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.573190689086914, + "rewards/margins": 0.6164848804473877, + "rewards/rejected": -2.189675807952881, + "sft_loss": 1.6187585592269897, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 5.6719658150784396, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": -0.11972524225711823, + "logits/rejected": 0.0455777645111084, + "logps/chosen": -1.6146034002304077, + "logps/rejected": -2.1865992546081543, + "loss": 0.6688, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6146034002304077, + "rewards/margins": 0.5719958543777466, + "rewards/rejected": -2.1865992546081543, + "sft_loss": 1.6162227392196655, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 3.953793622265437, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": -0.15830545127391815, + "logits/rejected": -0.015896636992692947, + "logps/chosen": -1.6073856353759766, + "logps/rejected": -2.148798704147339, + "loss": 0.676, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6073856353759766, + "rewards/margins": 0.5414127707481384, + "rewards/rejected": -2.148798704147339, + "sft_loss": 1.6097323894500732, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 2.7461565163394446, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": -0.1359499990940094, + "logits/rejected": -0.04135057330131531, + "logps/chosen": -1.536054253578186, + "logps/rejected": -2.0973961353302, + "loss": 0.6463, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.536054253578186, + "rewards/margins": 0.5613418817520142, + "rewards/rejected": -2.0973961353302, + "sft_loss": 1.5942299365997314, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 1.772647002953551, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": -0.21985526382923126, + "logits/rejected": -0.16376951336860657, + "logps/chosen": -1.7838491201400757, + "logps/rejected": -2.25213885307312, + "loss": 0.6791, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7838491201400757, + "rewards/margins": 0.46828994154930115, + "rewards/rejected": -2.25213885307312, + "sft_loss": 1.699902892112732, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 1.8222620144441337, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": -0.1366005688905716, + "logits/rejected": -0.07194123417139053, + "logps/chosen": -1.7815555334091187, + "logps/rejected": -2.1918790340423584, + "loss": 0.6683, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.7815555334091187, + "rewards/margins": 0.4103233218193054, + "rewards/rejected": -2.1918790340423584, + "sft_loss": 1.7492650747299194, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 7.720740241090713, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": -0.1920839548110962, + "logits/rejected": -0.08554677665233612, + "logps/chosen": -1.6699411869049072, + "logps/rejected": -2.223794937133789, + "loss": 0.6466, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6699411869049072, + "rewards/margins": 0.5538536310195923, + "rewards/rejected": -2.223794937133789, + "sft_loss": 1.6859760284423828, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 1.815984222290429, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": -0.18362686038017273, + "logits/rejected": -0.0012839033734053373, + "logps/chosen": -1.6417992115020752, + "logps/rejected": -2.2869229316711426, + "loss": 0.6707, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6417992115020752, + "rewards/margins": 0.6451237797737122, + "rewards/rejected": -2.2869229316711426, + "sft_loss": 1.6056686639785767, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 2.811239975892915, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": -0.17116299271583557, + "logits/rejected": -0.00892619788646698, + "logps/chosen": -1.5159931182861328, + "logps/rejected": -2.1569645404815674, + "loss": 0.6551, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5159931182861328, + "rewards/margins": 0.640971302986145, + "rewards/rejected": -2.1569645404815674, + "sft_loss": 1.5590221881866455, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 3.8606992562661486, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": -0.2119617462158203, + "logits/rejected": -0.04030895233154297, + "logps/chosen": -1.575308084487915, + "logps/rejected": -2.019559144973755, + "loss": 0.67, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.575308084487915, + "rewards/margins": 0.4442509114742279, + "rewards/rejected": -2.019559144973755, + "sft_loss": 1.589772343635559, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 5.051142678262766, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": -0.17688782513141632, + "logits/rejected": -0.09141966700553894, + "logps/chosen": -1.5610404014587402, + "logps/rejected": -2.0049726963043213, + "loss": 0.656, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5610404014587402, + "rewards/margins": 0.4439323842525482, + "rewards/rejected": -2.0049726963043213, + "sft_loss": 1.5945446491241455, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 3.7442822333080703, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": -0.11327006667852402, + "logits/rejected": 0.024983903393149376, + "logps/chosen": -1.6136016845703125, + "logps/rejected": -1.9793922901153564, + "loss": 0.673, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6136016845703125, + "rewards/margins": 0.3657905161380768, + "rewards/rejected": -1.9793922901153564, + "sft_loss": 1.566320538520813, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 3.412670516498388, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": -0.050362229347229004, + "logits/rejected": 0.006541428156197071, + "logps/chosen": -1.5444839000701904, + "logps/rejected": -2.030472993850708, + "loss": 0.6607, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5444839000701904, + "rewards/margins": 0.48598918318748474, + "rewards/rejected": -2.030472993850708, + "sft_loss": 1.6165252923965454, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 3.6855124550664695, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": -0.12177852541208267, + "logits/rejected": -0.029640281572937965, + "logps/chosen": -1.5576467514038086, + "logps/rejected": -2.102085590362549, + "loss": 0.6617, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5576467514038086, + "rewards/margins": 0.5444390177726746, + "rewards/rejected": -2.102085590362549, + "sft_loss": 1.6117064952850342, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 3.0318364406998137, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": -0.17540039122104645, + "logits/rejected": -0.18315133452415466, + "logps/chosen": -1.6013386249542236, + "logps/rejected": -2.041822910308838, + "loss": 0.6668, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6013386249542236, + "rewards/margins": 0.4404842257499695, + "rewards/rejected": -2.041822910308838, + "sft_loss": 1.660768747329712, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 4.709270443883367, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": -0.224747896194458, + "logits/rejected": -0.038868196308612823, + "logps/chosen": -1.5257173776626587, + "logps/rejected": -2.0317251682281494, + "loss": 0.667, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5257173776626587, + "rewards/margins": 0.506007969379425, + "rewards/rejected": -2.0317251682281494, + "sft_loss": 1.5080684423446655, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 3.7929751048578484, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": -0.2617315649986267, + "logits/rejected": -0.14034771919250488, + "logps/chosen": -1.3956866264343262, + "logps/rejected": -2.0973429679870605, + "loss": 0.6389, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3956866264343262, + "rewards/margins": 0.7016563415527344, + "rewards/rejected": -2.0973429679870605, + "sft_loss": 1.4621050357818604, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 3.3729236083614635, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": -0.21176567673683167, + "logits/rejected": -0.13093852996826172, + "logps/chosen": -1.7197602987289429, + "logps/rejected": -2.2689871788024902, + "loss": 0.6712, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7197602987289429, + "rewards/margins": 0.5492271184921265, + "rewards/rejected": -2.2689871788024902, + "sft_loss": 1.6264690160751343, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 3.1390038372846467, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": -0.26345133781433105, + "logits/rejected": -0.1286747008562088, + "logps/chosen": -1.534954309463501, + "logps/rejected": -1.9984760284423828, + "loss": 0.6728, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.534954309463501, + "rewards/margins": 0.46352171897888184, + "rewards/rejected": -1.9984760284423828, + "sft_loss": 1.5593990087509155, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 3.360746336759213, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": -0.2391415536403656, + "logits/rejected": -0.035399384796619415, + "logps/chosen": -1.5200650691986084, + "logps/rejected": -2.197875499725342, + "loss": 0.6529, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5200650691986084, + "rewards/margins": 0.677810549736023, + "rewards/rejected": -2.197875499725342, + "sft_loss": 1.5431331396102905, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 21.355656846058746, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": -0.2608916461467743, + "logits/rejected": -0.11797686666250229, + "logps/chosen": -1.5888410806655884, + "logps/rejected": -2.375009059906006, + "loss": 0.6799, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5888410806655884, + "rewards/margins": 0.7861679196357727, + "rewards/rejected": -2.375009059906006, + "sft_loss": 1.5629527568817139, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 5.423272631066221, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": -0.2376869022846222, + "logits/rejected": -0.05963951349258423, + "logps/chosen": -1.5007435083389282, + "logps/rejected": -2.112490177154541, + "loss": 0.6556, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5007435083389282, + "rewards/margins": 0.6117470264434814, + "rewards/rejected": -2.112490177154541, + "sft_loss": 1.5177980661392212, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 4.5684604787024155, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": -0.19655776023864746, + "logits/rejected": -0.038988981395959854, + "logps/chosen": -1.5234743356704712, + "logps/rejected": -2.0442726612091064, + "loss": 0.65, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5234743356704712, + "rewards/margins": 0.5207983255386353, + "rewards/rejected": -2.0442726612091064, + "sft_loss": 1.541851282119751, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 5.618983740559261, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": -0.1544310748577118, + "logits/rejected": -0.10041675716638565, + "logps/chosen": -1.4842172861099243, + "logps/rejected": -1.9160614013671875, + "loss": 0.6643, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4842172861099243, + "rewards/margins": 0.4318443238735199, + "rewards/rejected": -1.9160614013671875, + "sft_loss": 1.5359599590301514, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 3.503081298811264, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": -0.08281540125608444, + "logits/rejected": -0.056744955480098724, + "logps/chosen": -1.470792293548584, + "logps/rejected": -2.0276589393615723, + "loss": 0.6547, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.470792293548584, + "rewards/margins": 0.5568663477897644, + "rewards/rejected": -2.0276589393615723, + "sft_loss": 1.491330862045288, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 2.6902293197930613, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": -0.16725048422813416, + "logits/rejected": -0.05352597311139107, + "logps/chosen": -1.603641152381897, + "logps/rejected": -1.9243053197860718, + "loss": 0.6768, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.603641152381897, + "rewards/margins": 0.3206642270088196, + "rewards/rejected": -1.9243053197860718, + "sft_loss": 1.5715465545654297, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 3.188819818525931, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": -0.09861128032207489, + "logits/rejected": -0.10165087878704071, + "logps/chosen": -1.4987351894378662, + "logps/rejected": -1.8199284076690674, + "loss": 0.6648, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4987351894378662, + "rewards/margins": 0.3211931884288788, + "rewards/rejected": -1.8199284076690674, + "sft_loss": 1.538835883140564, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 3.759510277724772, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": -0.17928043007850647, + "logits/rejected": -0.07855254411697388, + "logps/chosen": -1.5587952136993408, + "logps/rejected": -1.9538202285766602, + "loss": 0.6721, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5587952136993408, + "rewards/margins": 0.3950250744819641, + "rewards/rejected": -1.9538202285766602, + "sft_loss": 1.632261872291565, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 4.262150922630274, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": -0.26483815908432007, + "logits/rejected": -0.09061449021100998, + "logps/chosen": -1.5146794319152832, + "logps/rejected": -2.103245735168457, + "loss": 0.6441, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5146794319152832, + "rewards/margins": 0.5885661840438843, + "rewards/rejected": -2.103245735168457, + "sft_loss": 1.484770655632019, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 6.369910309273914, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": -0.15041914582252502, + "logits/rejected": -0.01669810339808464, + "logps/chosen": -1.4834332466125488, + "logps/rejected": -2.0982134342193604, + "loss": 0.641, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4834332466125488, + "rewards/margins": 0.614780068397522, + "rewards/rejected": -2.0982134342193604, + "sft_loss": 1.4943815469741821, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 3.8965884076523145, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": -0.200050950050354, + "logits/rejected": -0.024930734187364578, + "logps/chosen": -1.3915150165557861, + "logps/rejected": -1.8986610174179077, + "loss": 0.6561, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3915150165557861, + "rewards/margins": 0.5071460008621216, + "rewards/rejected": -1.8986610174179077, + "sft_loss": 1.4466291666030884, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 3.1816520286868695, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": -0.12159286439418793, + "logits/rejected": 0.08534251153469086, + "logps/chosen": -1.5690300464630127, + "logps/rejected": -2.113259792327881, + "loss": 0.659, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5690300464630127, + "rewards/margins": 0.5442299246788025, + "rewards/rejected": -2.113259792327881, + "sft_loss": 1.5607188940048218, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 10.016477380540637, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": -0.2426680326461792, + "logits/rejected": -0.07869232445955276, + "logps/chosen": -1.4299724102020264, + "logps/rejected": -1.9123938083648682, + "loss": 0.6885, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4299724102020264, + "rewards/margins": 0.4824213981628418, + "rewards/rejected": -1.9123938083648682, + "sft_loss": 1.4660427570343018, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 4.9025494746692875, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": -0.16157862544059753, + "logits/rejected": -0.06997451931238174, + "logps/chosen": -1.5286527872085571, + "logps/rejected": -2.041487216949463, + "loss": 0.6687, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5286527872085571, + "rewards/margins": 0.5128344893455505, + "rewards/rejected": -2.041487216949463, + "sft_loss": 1.6155637502670288, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 4.784052292853247, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": -0.1238420233130455, + "logits/rejected": 0.01755066215991974, + "logps/chosen": -1.559314489364624, + "logps/rejected": -1.9448274374008179, + "loss": 0.6812, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.559314489364624, + "rewards/margins": 0.385513037443161, + "rewards/rejected": -1.9448274374008179, + "sft_loss": 1.6126524209976196, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 2.9649760356128496, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": -0.18504095077514648, + "logits/rejected": -0.03801523894071579, + "logps/chosen": -1.7024242877960205, + "logps/rejected": -2.318399429321289, + "loss": 0.659, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7024242877960205, + "rewards/margins": 0.6159749627113342, + "rewards/rejected": -2.318399429321289, + "sft_loss": 1.6507160663604736, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 9.849081860091095, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": -0.15487918257713318, + "logits/rejected": 0.11469636857509613, + "logps/chosen": -1.5609652996063232, + "logps/rejected": -2.2454323768615723, + "loss": 0.6696, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5609652996063232, + "rewards/margins": 0.6844668388366699, + "rewards/rejected": -2.2454323768615723, + "sft_loss": 1.538551688194275, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 5.162481401345401, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": -0.1427270919084549, + "logits/rejected": 0.0529114305973053, + "logps/chosen": -1.6295175552368164, + "logps/rejected": -2.1949503421783447, + "loss": 0.6645, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6295175552368164, + "rewards/margins": 0.5654329061508179, + "rewards/rejected": -2.1949503421783447, + "sft_loss": 1.6471545696258545, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 3.5235516282761807, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": -0.047027476131916046, + "logits/rejected": -0.007574816700071096, + "logps/chosen": -1.6011667251586914, + "logps/rejected": -2.0129342079162598, + "loss": 0.6819, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6011667251586914, + "rewards/margins": 0.41176754236221313, + "rewards/rejected": -2.0129342079162598, + "sft_loss": 1.5609718561172485, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 3.186771606317666, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": -0.04279576614499092, + "logits/rejected": 0.03906116262078285, + "logps/chosen": -1.6204216480255127, + "logps/rejected": -2.166551351547241, + "loss": 0.673, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6204216480255127, + "rewards/margins": 0.5461298227310181, + "rewards/rejected": -2.166551351547241, + "sft_loss": 1.5434439182281494, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 3.401960753855456, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": -0.19755621254444122, + "logits/rejected": -0.07851502299308777, + "logps/chosen": -1.576912522315979, + "logps/rejected": -2.0294346809387207, + "loss": 0.6715, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.576912522315979, + "rewards/margins": 0.452522337436676, + "rewards/rejected": -2.0294346809387207, + "sft_loss": 1.6274635791778564, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 4.701880233448562, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": -0.08824042975902557, + "logits/rejected": -0.016690267249941826, + "logps/chosen": -1.6092437505722046, + "logps/rejected": -2.20479679107666, + "loss": 0.6621, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6092437505722046, + "rewards/margins": 0.5955528020858765, + "rewards/rejected": -2.20479679107666, + "sft_loss": 1.614297866821289, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 2.51524692247004, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": -0.25718605518341064, + "logits/rejected": -0.06741656363010406, + "logps/chosen": -1.6128227710723877, + "logps/rejected": -2.1121630668640137, + "loss": 0.6764, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6128227710723877, + "rewards/margins": 0.49934014678001404, + "rewards/rejected": -2.1121630668640137, + "sft_loss": 1.599364995956421, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 4.293545314379525, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": -0.1881771683692932, + "logits/rejected": -0.01125810481607914, + "logps/chosen": -1.5785601139068604, + "logps/rejected": -2.2719714641571045, + "loss": 0.652, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5785601139068604, + "rewards/margins": 0.6934112310409546, + "rewards/rejected": -2.2719714641571045, + "sft_loss": 1.5927412509918213, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 3.0130648130551183, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": -0.17986498773097992, + "logits/rejected": -0.025739211589097977, + "logps/chosen": -1.6297643184661865, + "logps/rejected": -2.0613436698913574, + "loss": 0.674, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6297643184661865, + "rewards/margins": 0.43157950043678284, + "rewards/rejected": -2.0613436698913574, + "sft_loss": 1.6227772235870361, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 3.5574036891629355, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": -0.14336520433425903, + "logits/rejected": 0.007078066468238831, + "logps/chosen": -1.4993739128112793, + "logps/rejected": -2.249967336654663, + "loss": 0.6512, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4993739128112793, + "rewards/margins": 0.7505934238433838, + "rewards/rejected": -2.249967336654663, + "sft_loss": 1.539982557296753, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 4.607600533757477, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": -0.14995525777339935, + "logits/rejected": -0.007469749543815851, + "logps/chosen": -1.5812904834747314, + "logps/rejected": -2.2936625480651855, + "loss": 0.6543, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5812904834747314, + "rewards/margins": 0.7123721241950989, + "rewards/rejected": -2.2936625480651855, + "sft_loss": 1.6329816579818726, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 2.202312490868307, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": -0.1573416292667389, + "logits/rejected": -0.022275719791650772, + "logps/chosen": -1.5800076723098755, + "logps/rejected": -2.2222046852111816, + "loss": 0.6606, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5800076723098755, + "rewards/margins": 0.6421971321105957, + "rewards/rejected": -2.2222046852111816, + "sft_loss": 1.6377489566802979, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 5.428563326051036, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": -0.11440400034189224, + "logits/rejected": 0.021290892735123634, + "logps/chosen": -1.6060960292816162, + "logps/rejected": -2.099410057067871, + "loss": 0.664, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6060960292816162, + "rewards/margins": 0.49331387877464294, + "rewards/rejected": -2.099410057067871, + "sft_loss": 1.6460119485855103, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 2.9292725092051186, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": -0.2844286561012268, + "logits/rejected": -0.10208000987768173, + "logps/chosen": -1.4620226621627808, + "logps/rejected": -1.8907419443130493, + "loss": 0.6757, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4620226621627808, + "rewards/margins": 0.4287194609642029, + "rewards/rejected": -1.8907419443130493, + "sft_loss": 1.501758098602295, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 4.760229882974814, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": -0.23014874756336212, + "logits/rejected": -0.02316945232450962, + "logps/chosen": -1.621044397354126, + "logps/rejected": -2.260775327682495, + "loss": 0.6514, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.621044397354126, + "rewards/margins": 0.639731228351593, + "rewards/rejected": -2.260775327682495, + "sft_loss": 1.6766067743301392, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 4.284439527519819, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": -0.11533300578594208, + "logits/rejected": -0.005840751342475414, + "logps/chosen": -1.590293288230896, + "logps/rejected": -2.0981833934783936, + "loss": 0.6667, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.590293288230896, + "rewards/margins": 0.5078902244567871, + "rewards/rejected": -2.0981833934783936, + "sft_loss": 1.5520540475845337, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 3.2520408120851583, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": -0.13598333299160004, + "logits/rejected": -0.06311875581741333, + "logps/chosen": -1.5608255863189697, + "logps/rejected": -2.1643850803375244, + "loss": 0.6505, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5608255863189697, + "rewards/margins": 0.6035594940185547, + "rewards/rejected": -2.1643850803375244, + "sft_loss": 1.5832816362380981, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 4.450140568534266, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": -0.29020220041275024, + "logits/rejected": -0.15658116340637207, + "logps/chosen": -1.4511228799819946, + "logps/rejected": -2.0163347721099854, + "loss": 0.6668, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4511228799819946, + "rewards/margins": 0.5652118921279907, + "rewards/rejected": -2.0163347721099854, + "sft_loss": 1.5070421695709229, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 4.555054953431903, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": -0.1606305092573166, + "logits/rejected": -0.038159389048814774, + "logps/chosen": -1.4936621189117432, + "logps/rejected": -1.953710913658142, + "loss": 0.6715, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4936621189117432, + "rewards/margins": 0.46004876494407654, + "rewards/rejected": -1.953710913658142, + "sft_loss": 1.5176284313201904, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": 0.15357214212417603, + "eval_logits/rejected": 0.24874404072761536, + "eval_logps/chosen": -1.5844873189926147, + "eval_logps/rejected": -2.108262300491333, + "eval_loss": 0.6719112396240234, + "eval_rewards/accuracies": 0.637982189655304, + "eval_rewards/chosen": -1.5844873189926147, + "eval_rewards/margins": 0.5237749814987183, + "eval_rewards/rejected": -2.108262300491333, + "eval_runtime": 46.9387, + "eval_samples_per_second": 28.654, + "eval_sft_loss": 1.5798624753952026, + "eval_steps_per_second": 7.18, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 3.2817263234158642, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": -0.13636358082294464, + "logits/rejected": 0.016568128019571304, + "logps/chosen": -1.5544657707214355, + "logps/rejected": -2.173511266708374, + "loss": 0.6612, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5544657707214355, + "rewards/margins": 0.6190455555915833, + "rewards/rejected": -2.173511266708374, + "sft_loss": 1.5676701068878174, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 2.4873833817716045, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": -0.2956455647945404, + "logits/rejected": -0.09363798052072525, + "logps/chosen": -1.5647292137145996, + "logps/rejected": -1.8906043767929077, + "loss": 0.685, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5647292137145996, + "rewards/margins": 0.3258754014968872, + "rewards/rejected": -1.8906043767929077, + "sft_loss": 1.5583795309066772, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 2.266065266546848, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": -0.10976632684469223, + "logits/rejected": -0.03128939867019653, + "logps/chosen": -1.4546763896942139, + "logps/rejected": -2.143770694732666, + "loss": 0.6437, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4546763896942139, + "rewards/margins": 0.6890941858291626, + "rewards/rejected": -2.143770694732666, + "sft_loss": 1.453282356262207, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 4.142183019556512, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": -0.2382328063249588, + "logits/rejected": -0.10706863552331924, + "logps/chosen": -1.585900068283081, + "logps/rejected": -2.180056095123291, + "loss": 0.6464, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.585900068283081, + "rewards/margins": 0.5941557884216309, + "rewards/rejected": -2.180056095123291, + "sft_loss": 1.6415297985076904, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 3.1261348241576195, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": -0.13553068041801453, + "logits/rejected": -0.054775677621364594, + "logps/chosen": -1.4355140924453735, + "logps/rejected": -1.8778979778289795, + "loss": 0.6544, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4355140924453735, + "rewards/margins": 0.4423840641975403, + "rewards/rejected": -1.8778979778289795, + "sft_loss": 1.5382192134857178, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 1.567158682723937, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": -0.11800148338079453, + "logits/rejected": -0.0526716485619545, + "logps/chosen": -1.696176290512085, + "logps/rejected": -2.1083648204803467, + "loss": 0.6821, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.696176290512085, + "rewards/margins": 0.41218847036361694, + "rewards/rejected": -2.1083648204803467, + "sft_loss": 1.6392204761505127, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 1.957028998509613, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": -0.10932836681604385, + "logits/rejected": 0.0219273678958416, + "logps/chosen": -1.5512540340423584, + "logps/rejected": -1.9477107524871826, + "loss": 0.6736, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5512540340423584, + "rewards/margins": 0.39645668864250183, + "rewards/rejected": -1.9477107524871826, + "sft_loss": 1.5412626266479492, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 3.552787449098246, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": -0.22610628604888916, + "logits/rejected": -0.1075693815946579, + "logps/chosen": -1.5580403804779053, + "logps/rejected": -2.112811326980591, + "loss": 0.6763, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5580403804779053, + "rewards/margins": 0.5547709465026855, + "rewards/rejected": -2.112811326980591, + "sft_loss": 1.492222547531128, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 2.9339544586198723, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": -0.1766672432422638, + "logits/rejected": -0.027664726600050926, + "logps/chosen": -1.6293308734893799, + "logps/rejected": -2.0644872188568115, + "loss": 0.6705, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6293308734893799, + "rewards/margins": 0.4351561665534973, + "rewards/rejected": -2.0644872188568115, + "sft_loss": 1.6100572347640991, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 2.6433000494373906, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": -0.17485761642456055, + "logits/rejected": -0.015716422349214554, + "logps/chosen": -1.568189024925232, + "logps/rejected": -2.212991714477539, + "loss": 0.6637, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.568189024925232, + "rewards/margins": 0.6448027491569519, + "rewards/rejected": -2.212991714477539, + "sft_loss": 1.5443557500839233, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 1.5803897852634907, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": -0.18889057636260986, + "logits/rejected": 0.06050186604261398, + "logps/chosen": -1.6265077590942383, + "logps/rejected": -2.2065577507019043, + "loss": 0.6627, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6265077590942383, + "rewards/margins": 0.580049991607666, + "rewards/rejected": -2.2065577507019043, + "sft_loss": 1.6225849390029907, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 2.343820078187948, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": -0.15496715903282166, + "logits/rejected": -0.040690433233976364, + "logps/chosen": -1.6261556148529053, + "logps/rejected": -2.021845579147339, + "loss": 0.666, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6261556148529053, + "rewards/margins": 0.3956899642944336, + "rewards/rejected": -2.021845579147339, + "sft_loss": 1.609708547592163, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 4.181663506103672, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": -0.14380675554275513, + "logits/rejected": -0.05919576808810234, + "logps/chosen": -1.571504831314087, + "logps/rejected": -2.152155876159668, + "loss": 0.6648, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.571504831314087, + "rewards/margins": 0.5806511044502258, + "rewards/rejected": -2.152155876159668, + "sft_loss": 1.526630163192749, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 6.117249686412447, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": -0.28105050325393677, + "logits/rejected": -0.14320190250873566, + "logps/chosen": -1.5202324390411377, + "logps/rejected": -1.9660730361938477, + "loss": 0.6713, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5202324390411377, + "rewards/margins": 0.4458405375480652, + "rewards/rejected": -1.9660730361938477, + "sft_loss": 1.5822843313217163, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 3.4734110417559187, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": -0.16598021984100342, + "logits/rejected": -0.0934697836637497, + "logps/chosen": -1.5609283447265625, + "logps/rejected": -2.0777783393859863, + "loss": 0.6628, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5609283447265625, + "rewards/margins": 0.5168498754501343, + "rewards/rejected": -2.0777783393859863, + "sft_loss": 1.577129602432251, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 2.954704936714105, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": -0.30001145601272583, + "logits/rejected": -0.11963419616222382, + "logps/chosen": -1.5922280550003052, + "logps/rejected": -2.2104010581970215, + "loss": 0.6462, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5922280550003052, + "rewards/margins": 0.6181727647781372, + "rewards/rejected": -2.2104010581970215, + "sft_loss": 1.666888952255249, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 2.448905033086133, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": -0.11784724146127701, + "logits/rejected": 0.05674583837389946, + "logps/chosen": -1.5779105424880981, + "logps/rejected": -2.2519729137420654, + "loss": 0.6741, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5779105424880981, + "rewards/margins": 0.6740623712539673, + "rewards/rejected": -2.2519729137420654, + "sft_loss": 1.6603368520736694, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 4.414698044694562, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": -0.22511418163776398, + "logits/rejected": -0.16420219838619232, + "logps/chosen": -1.6116619110107422, + "logps/rejected": -2.096935749053955, + "loss": 0.6599, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6116619110107422, + "rewards/margins": 0.48527368903160095, + "rewards/rejected": -2.096935749053955, + "sft_loss": 1.6449838876724243, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 3.648026469136519, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": -0.12327933311462402, + "logits/rejected": -0.10511080920696259, + "logps/chosen": -1.6098442077636719, + "logps/rejected": -2.1295418739318848, + "loss": 0.6748, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6098442077636719, + "rewards/margins": 0.5196977853775024, + "rewards/rejected": -2.1295418739318848, + "sft_loss": 1.6177031993865967, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 3.976663031956452, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": -0.12157398462295532, + "logits/rejected": 0.034795455634593964, + "logps/chosen": -1.5347206592559814, + "logps/rejected": -2.0359296798706055, + "loss": 0.6675, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5347206592559814, + "rewards/margins": 0.5012091398239136, + "rewards/rejected": -2.0359296798706055, + "sft_loss": 1.5952433347702026, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 3.8336725585450404, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": -0.14082691073417664, + "logits/rejected": 0.03473823145031929, + "logps/chosen": -1.5047399997711182, + "logps/rejected": -2.0923469066619873, + "loss": 0.654, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5047399997711182, + "rewards/margins": 0.5876072645187378, + "rewards/rejected": -2.0923469066619873, + "sft_loss": 1.5038917064666748, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 7.052185955639947, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": -0.21331937611103058, + "logits/rejected": -0.07680104672908783, + "logps/chosen": -1.4796664714813232, + "logps/rejected": -2.0214037895202637, + "loss": 0.6581, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4796664714813232, + "rewards/margins": 0.5417372584342957, + "rewards/rejected": -2.0214037895202637, + "sft_loss": 1.5362671613693237, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 3.397500264067202, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": -0.1217583566904068, + "logits/rejected": 0.0007851526024751365, + "logps/chosen": -1.579552412033081, + "logps/rejected": -2.0797359943389893, + "loss": 0.6742, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.579552412033081, + "rewards/margins": 0.5001831650733948, + "rewards/rejected": -2.0797359943389893, + "sft_loss": 1.6408942937850952, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 1.8049247352380564, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": -0.2258315086364746, + "logits/rejected": -0.1402186155319214, + "logps/chosen": -1.5952908992767334, + "logps/rejected": -2.016566514968872, + "loss": 0.6693, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5952908992767334, + "rewards/margins": 0.4212755560874939, + "rewards/rejected": -2.016566514968872, + "sft_loss": 1.4963607788085938, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 7.382090470127641, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": -0.20450296998023987, + "logits/rejected": -0.1239570751786232, + "logps/chosen": -1.602783203125, + "logps/rejected": -2.1566102504730225, + "loss": 0.6528, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.602783203125, + "rewards/margins": 0.553827166557312, + "rewards/rejected": -2.1566102504730225, + "sft_loss": 1.7019307613372803, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 9.4844763217049, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": -0.22892682254314423, + "logits/rejected": -0.17251577973365784, + "logps/chosen": -1.6106479167938232, + "logps/rejected": -2.0148985385894775, + "loss": 0.6758, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6106479167938232, + "rewards/margins": 0.4042506217956543, + "rewards/rejected": -2.0148985385894775, + "sft_loss": 1.6035655736923218, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 3.352412444441813, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": -0.26767629384994507, + "logits/rejected": -0.11004316806793213, + "logps/chosen": -1.5166822671890259, + "logps/rejected": -2.0801236629486084, + "loss": 0.6687, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5166822671890259, + "rewards/margins": 0.5634415745735168, + "rewards/rejected": -2.0801236629486084, + "sft_loss": 1.5298277139663696, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 14.55625660152246, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": -0.19522763788700104, + "logits/rejected": -0.014734324999153614, + "logps/chosen": -1.7307504415512085, + "logps/rejected": -2.087800979614258, + "loss": 0.6776, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.7307504415512085, + "rewards/margins": 0.3570505976676941, + "rewards/rejected": -2.087800979614258, + "sft_loss": 1.6963192224502563, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 4.187645415508513, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": -0.18527694046497345, + "logits/rejected": 0.0016733307857066393, + "logps/chosen": -1.5385725498199463, + "logps/rejected": -2.181457281112671, + "loss": 0.6652, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5385725498199463, + "rewards/margins": 0.6428850293159485, + "rewards/rejected": -2.181457281112671, + "sft_loss": 1.5796239376068115, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 2.373501532859784, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": -0.19420263171195984, + "logits/rejected": -0.027106571942567825, + "logps/chosen": -1.5866892337799072, + "logps/rejected": -1.9564670324325562, + "loss": 0.6795, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5866892337799072, + "rewards/margins": 0.3697775602340698, + "rewards/rejected": -1.9564670324325562, + "sft_loss": 1.5613502264022827, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 3.7834706241934524, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": -0.15926238894462585, + "logits/rejected": -0.0661858469247818, + "logps/chosen": -1.4821274280548096, + "logps/rejected": -2.1808743476867676, + "loss": 0.6528, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4821274280548096, + "rewards/margins": 0.698746919631958, + "rewards/rejected": -2.1808743476867676, + "sft_loss": 1.5234520435333252, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 3.583960186561033, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": -0.1691628098487854, + "logits/rejected": -0.04530046880245209, + "logps/chosen": -1.5531885623931885, + "logps/rejected": -2.1100735664367676, + "loss": 0.6622, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5531885623931885, + "rewards/margins": 0.5568851828575134, + "rewards/rejected": -2.1100735664367676, + "sft_loss": 1.5481550693511963, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 13.56875223974927, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": -0.18169988691806793, + "logits/rejected": -0.05722617357969284, + "logps/chosen": -1.5489635467529297, + "logps/rejected": -2.208829402923584, + "loss": 0.6621, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5489635467529297, + "rewards/margins": 0.6598660349845886, + "rewards/rejected": -2.208829402923584, + "sft_loss": 1.5578101873397827, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 3.269721055203262, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": -0.12570646405220032, + "logits/rejected": -0.04687528684735298, + "logps/chosen": -1.5527559518814087, + "logps/rejected": -2.0612478256225586, + "loss": 0.665, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5527559518814087, + "rewards/margins": 0.5084918737411499, + "rewards/rejected": -2.0612478256225586, + "sft_loss": 1.535414457321167, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 7.6401000061639195, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": -0.2719722390174866, + "logits/rejected": -0.1282080113887787, + "logps/chosen": -1.5063210725784302, + "logps/rejected": -2.0476253032684326, + "loss": 0.6635, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5063210725784302, + "rewards/margins": 0.5413042306900024, + "rewards/rejected": -2.0476253032684326, + "sft_loss": 1.5388996601104736, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 5.53193179030059, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": -0.2048530876636505, + "logits/rejected": -0.12116770446300507, + "logps/chosen": -1.559245228767395, + "logps/rejected": -1.8791849613189697, + "loss": 0.6719, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.559245228767395, + "rewards/margins": 0.31993982195854187, + "rewards/rejected": -1.8791849613189697, + "sft_loss": 1.5968759059906006, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 5.8336608420197775, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": -0.11024793237447739, + "logits/rejected": 0.04365754872560501, + "logps/chosen": -1.4740869998931885, + "logps/rejected": -2.0004656314849854, + "loss": 0.6473, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4740869998931885, + "rewards/margins": 0.5263785719871521, + "rewards/rejected": -2.0004656314849854, + "sft_loss": 1.4342414140701294, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 4.838704544675485, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": -0.21626317501068115, + "logits/rejected": -0.07905010879039764, + "logps/chosen": -1.4662044048309326, + "logps/rejected": -1.9638229608535767, + "loss": 0.667, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4662044048309326, + "rewards/margins": 0.49761828780174255, + "rewards/rejected": -1.9638229608535767, + "sft_loss": 1.5521126985549927, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 5.6321716050921475, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": -0.1686750203371048, + "logits/rejected": -0.11953876912593842, + "logps/chosen": -1.5056140422821045, + "logps/rejected": -1.9153554439544678, + "loss": 0.6725, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5056140422821045, + "rewards/margins": 0.4097414016723633, + "rewards/rejected": -1.9153554439544678, + "sft_loss": 1.4823278188705444, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 5.256141485844498, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": -0.07481982558965683, + "logits/rejected": 0.05477939918637276, + "logps/chosen": -1.5007654428482056, + "logps/rejected": -1.840966820716858, + "loss": 0.6668, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5007654428482056, + "rewards/margins": 0.340201199054718, + "rewards/rejected": -1.840966820716858, + "sft_loss": 1.5125921964645386, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 2.4415775706067118, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": -0.0970289334654808, + "logits/rejected": -0.04464380070567131, + "logps/chosen": -1.5349600315093994, + "logps/rejected": -1.9589964151382446, + "loss": 0.6791, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5349600315093994, + "rewards/margins": 0.42403656244277954, + "rewards/rejected": -1.9589964151382446, + "sft_loss": 1.5748927593231201, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 4.649699563839675, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": -0.07690870761871338, + "logits/rejected": -0.000336170953232795, + "logps/chosen": -1.5294123888015747, + "logps/rejected": -1.9836593866348267, + "loss": 0.6744, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5294123888015747, + "rewards/margins": 0.4542468190193176, + "rewards/rejected": -1.9836593866348267, + "sft_loss": 1.5801916122436523, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 6.0153698780292, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": -0.1915212720632553, + "logits/rejected": -0.04982220381498337, + "logps/chosen": -1.463560938835144, + "logps/rejected": -2.0906307697296143, + "loss": 0.6569, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.463560938835144, + "rewards/margins": 0.6270699501037598, + "rewards/rejected": -2.0906307697296143, + "sft_loss": 1.5472733974456787, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 4.624166029738389, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": -0.18850019574165344, + "logits/rejected": -0.19020205736160278, + "logps/chosen": -1.5914949178695679, + "logps/rejected": -1.8966413736343384, + "loss": 0.6831, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5914949178695679, + "rewards/margins": 0.3051464855670929, + "rewards/rejected": -1.8966413736343384, + "sft_loss": 1.6427654027938843, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 2.0745035306044537, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": -0.15994513034820557, + "logits/rejected": -0.00917014293372631, + "logps/chosen": -1.6022281646728516, + "logps/rejected": -2.0769033432006836, + "loss": 0.6677, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6022281646728516, + "rewards/margins": 0.47467517852783203, + "rewards/rejected": -2.0769033432006836, + "sft_loss": 1.5459787845611572, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 4.20200577999301, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": -0.31244519352912903, + "logits/rejected": -0.11458522081375122, + "logps/chosen": -1.5718451738357544, + "logps/rejected": -2.1408886909484863, + "loss": 0.6771, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5718451738357544, + "rewards/margins": 0.5690435171127319, + "rewards/rejected": -2.1408886909484863, + "sft_loss": 1.5779874324798584, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 3.4210463268097926, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": -0.241429403424263, + "logits/rejected": -0.1481015533208847, + "logps/chosen": -1.5252244472503662, + "logps/rejected": -2.0902552604675293, + "loss": 0.6601, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5252244472503662, + "rewards/margins": 0.5650309324264526, + "rewards/rejected": -2.0902552604675293, + "sft_loss": 1.5835373401641846, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 4.097976614384087, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": -0.16206932067871094, + "logits/rejected": -0.07781902700662613, + "logps/chosen": -1.6320310831069946, + "logps/rejected": -2.0864644050598145, + "loss": 0.6729, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6320310831069946, + "rewards/margins": 0.4544333517551422, + "rewards/rejected": -2.0864644050598145, + "sft_loss": 1.5919148921966553, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 5.291704044136723, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": -0.1960441768169403, + "logits/rejected": -0.11005222797393799, + "logps/chosen": -1.5638240575790405, + "logps/rejected": -2.1008219718933105, + "loss": 0.6674, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5638240575790405, + "rewards/margins": 0.5369978547096252, + "rewards/rejected": -2.1008219718933105, + "sft_loss": 1.5387166738510132, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 5.5685439824404845, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": -0.2559325098991394, + "logits/rejected": -0.10011889785528183, + "logps/chosen": -1.5849932432174683, + "logps/rejected": -2.3768601417541504, + "loss": 0.6778, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5849932432174683, + "rewards/margins": 0.791866660118103, + "rewards/rejected": -2.3768601417541504, + "sft_loss": 1.5990684032440186, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 8.078595435314279, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": -0.16147413849830627, + "logits/rejected": -0.006514549255371094, + "logps/chosen": -1.505885362625122, + "logps/rejected": -2.0246315002441406, + "loss": 0.6516, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.505885362625122, + "rewards/margins": 0.5187458992004395, + "rewards/rejected": -2.0246315002441406, + "sft_loss": 1.5155709981918335, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 5.341300478947878, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": -0.11378873884677887, + "logits/rejected": -0.06429148465394974, + "logps/chosen": -1.5142958164215088, + "logps/rejected": -2.185347080230713, + "loss": 0.6616, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5142958164215088, + "rewards/margins": 0.6710509061813354, + "rewards/rejected": -2.185347080230713, + "sft_loss": 1.5100260972976685, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 3.176988277206947, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": -0.2651599049568176, + "logits/rejected": -0.13692238926887512, + "logps/chosen": -1.6227686405181885, + "logps/rejected": -2.251343011856079, + "loss": 0.6522, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6227686405181885, + "rewards/margins": 0.6285744309425354, + "rewards/rejected": -2.251343011856079, + "sft_loss": 1.6871635913848877, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 2.883049496689386, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": -0.12162493169307709, + "logits/rejected": -0.021850673481822014, + "logps/chosen": -1.6583884954452515, + "logps/rejected": -2.1100821495056152, + "loss": 0.6593, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6583884954452515, + "rewards/margins": 0.45169371366500854, + "rewards/rejected": -2.1100821495056152, + "sft_loss": 1.6291186809539795, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 12.871054148089197, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": -0.2270089089870453, + "logits/rejected": -0.07251622527837753, + "logps/chosen": -1.7089569568634033, + "logps/rejected": -2.04795503616333, + "loss": 0.6807, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.7089569568634033, + "rewards/margins": 0.3389982581138611, + "rewards/rejected": -2.04795503616333, + "sft_loss": 1.5236625671386719, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 6.283613019960803, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": -0.2575947046279907, + "logits/rejected": -0.08951815217733383, + "logps/chosen": -1.582740068435669, + "logps/rejected": -2.117541551589966, + "loss": 0.6733, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.582740068435669, + "rewards/margins": 0.5348014831542969, + "rewards/rejected": -2.117541551589966, + "sft_loss": 1.5981295108795166, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 6.264530990495215, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": -0.20818033814430237, + "logits/rejected": 0.002076371107250452, + "logps/chosen": -1.5417582988739014, + "logps/rejected": -2.0970585346221924, + "loss": 0.6658, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5417582988739014, + "rewards/margins": 0.5553001761436462, + "rewards/rejected": -2.0970585346221924, + "sft_loss": 1.5650211572647095, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 1.9064429797923519, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": -0.3717069625854492, + "logits/rejected": -0.1375175565481186, + "logps/chosen": -1.6472127437591553, + "logps/rejected": -2.391467332839966, + "loss": 0.6576, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6472127437591553, + "rewards/margins": 0.7442543506622314, + "rewards/rejected": -2.391467332839966, + "sft_loss": 1.5960744619369507, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 2.5469317246859338, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": -0.18331435322761536, + "logits/rejected": -0.15281887352466583, + "logps/chosen": -1.5887569189071655, + "logps/rejected": -2.1782190799713135, + "loss": 0.67, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5887569189071655, + "rewards/margins": 0.5894622206687927, + "rewards/rejected": -2.1782190799713135, + "sft_loss": 1.5789341926574707, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 5.17353516640267, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": -0.2531064450740814, + "logits/rejected": -0.06976678222417831, + "logps/chosen": -1.6449716091156006, + "logps/rejected": -2.1559338569641113, + "loss": 0.6767, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6449716091156006, + "rewards/margins": 0.5109620690345764, + "rewards/rejected": -2.1559338569641113, + "sft_loss": 1.663458228111267, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 2.741290627931337, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": -0.25593826174736023, + "logits/rejected": -0.043508801609277725, + "logps/chosen": -1.453940749168396, + "logps/rejected": -2.434394598007202, + "loss": 0.6342, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.453940749168396, + "rewards/margins": 0.9804538488388062, + "rewards/rejected": -2.434394598007202, + "sft_loss": 1.415795087814331, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 3.1028823700547408, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": -0.14798538386821747, + "logits/rejected": -0.11400546878576279, + "logps/chosen": -1.5622608661651611, + "logps/rejected": -2.07262921333313, + "loss": 0.6499, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5622608661651611, + "rewards/margins": 0.5103680491447449, + "rewards/rejected": -2.07262921333313, + "sft_loss": 1.5270684957504272, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 4.332292650790028, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": -0.29937106370925903, + "logits/rejected": -0.19251208007335663, + "logps/chosen": -1.4233131408691406, + "logps/rejected": -1.9493812322616577, + "loss": 0.6512, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4233131408691406, + "rewards/margins": 0.5260680913925171, + "rewards/rejected": -1.9493812322616577, + "sft_loss": 1.5380337238311768, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 7.4402227586030065, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": -0.25485068559646606, + "logits/rejected": -0.13746242225170135, + "logps/chosen": -1.5280907154083252, + "logps/rejected": -1.8951940536499023, + "loss": 0.6741, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5280907154083252, + "rewards/margins": 0.3671031594276428, + "rewards/rejected": -1.8951940536499023, + "sft_loss": 1.4867727756500244, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 4.465298263051977, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": -0.2098223716020584, + "logits/rejected": -0.10190926492214203, + "logps/chosen": -1.4999598264694214, + "logps/rejected": -2.030771255493164, + "loss": 0.6535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4999598264694214, + "rewards/margins": 0.5308112502098083, + "rewards/rejected": -2.030771255493164, + "sft_loss": 1.4926955699920654, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 5.828776774520263, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": -0.2677415609359741, + "logits/rejected": -0.09359410405158997, + "logps/chosen": -1.4975014925003052, + "logps/rejected": -2.195223093032837, + "loss": 0.66, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4975014925003052, + "rewards/margins": 0.6977213621139526, + "rewards/rejected": -2.195223093032837, + "sft_loss": 1.507609486579895, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 6.085674622259741, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": -0.25986337661743164, + "logits/rejected": -0.06538162380456924, + "logps/chosen": -1.4231359958648682, + "logps/rejected": -2.107517719268799, + "loss": 0.6484, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4231359958648682, + "rewards/margins": 0.6843816637992859, + "rewards/rejected": -2.107517719268799, + "sft_loss": 1.4849355220794678, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 5.793144292379904, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": -0.2694844603538513, + "logits/rejected": -0.1628873646259308, + "logps/chosen": -1.5163061618804932, + "logps/rejected": -2.3208680152893066, + "loss": 0.6526, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5163061618804932, + "rewards/margins": 0.804561972618103, + "rewards/rejected": -2.3208680152893066, + "sft_loss": 1.5444607734680176, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 4.465041295648728, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": -0.2704235911369324, + "logits/rejected": -0.14123213291168213, + "logps/chosen": -1.4960787296295166, + "logps/rejected": -1.9699077606201172, + "loss": 0.6543, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4960787296295166, + "rewards/margins": 0.4738289713859558, + "rewards/rejected": -1.9699077606201172, + "sft_loss": 1.4975306987762451, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 3.7597880435407323, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": -0.24096722900867462, + "logits/rejected": -0.13594664633274078, + "logps/chosen": -1.5836585760116577, + "logps/rejected": -2.232564687728882, + "loss": 0.6445, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5836585760116577, + "rewards/margins": 0.6489061117172241, + "rewards/rejected": -2.232564687728882, + "sft_loss": 1.5925681591033936, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 8.458017615838024, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": -0.1675444394350052, + "logits/rejected": -0.02611861564218998, + "logps/chosen": -1.4565662145614624, + "logps/rejected": -1.983507513999939, + "loss": 0.6629, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4565662145614624, + "rewards/margins": 0.5269410610198975, + "rewards/rejected": -1.983507513999939, + "sft_loss": 1.5315477848052979, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 3.2438935797505413, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": -0.2979353070259094, + "logits/rejected": -0.12887240946292877, + "logps/chosen": -1.5524100065231323, + "logps/rejected": -2.2004122734069824, + "loss": 0.6605, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5524100065231323, + "rewards/margins": 0.6480022072792053, + "rewards/rejected": -2.2004122734069824, + "sft_loss": 1.546685814857483, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 4.32323300694089, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": -0.20667262375354767, + "logits/rejected": -0.07070144265890121, + "logps/chosen": -1.4959765672683716, + "logps/rejected": -1.9927799701690674, + "loss": 0.6459, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4959765672683716, + "rewards/margins": 0.4968033730983734, + "rewards/rejected": -1.9927799701690674, + "sft_loss": 1.5539121627807617, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 8.115148746232252, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": -0.28089088201522827, + "logits/rejected": -0.2090112864971161, + "logps/chosen": -1.451894760131836, + "logps/rejected": -1.7988488674163818, + "loss": 0.6661, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.451894760131836, + "rewards/margins": 0.34695425629615784, + "rewards/rejected": -1.7988488674163818, + "sft_loss": 1.5190246105194092, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 2.9528466265049724, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": -0.24385952949523926, + "logits/rejected": -0.19614621996879578, + "logps/chosen": -1.504969835281372, + "logps/rejected": -2.081859588623047, + "loss": 0.6539, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.504969835281372, + "rewards/margins": 0.5768897533416748, + "rewards/rejected": -2.081859588623047, + "sft_loss": 1.5464751720428467, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 2.556932366962606, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": -0.23676061630249023, + "logits/rejected": -0.1409497857093811, + "logps/chosen": -1.6686662435531616, + "logps/rejected": -2.109144449234009, + "loss": 0.6907, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.6686662435531616, + "rewards/margins": 0.44047823548316956, + "rewards/rejected": -2.109144449234009, + "sft_loss": 1.6145015954971313, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 6.677454504540487, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": -0.28398019075393677, + "logits/rejected": -0.1465751677751541, + "logps/chosen": -1.6540460586547852, + "logps/rejected": -2.3016133308410645, + "loss": 0.6763, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6540460586547852, + "rewards/margins": 0.6475671529769897, + "rewards/rejected": -2.3016133308410645, + "sft_loss": 1.6323429346084595, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 5.837506585015335, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": -0.21055534482002258, + "logits/rejected": -0.22708992660045624, + "logps/chosen": -1.5476744174957275, + "logps/rejected": -2.052155017852783, + "loss": 0.6651, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5476744174957275, + "rewards/margins": 0.5044804215431213, + "rewards/rejected": -2.052155017852783, + "sft_loss": 1.6932910680770874, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 3.794310744789843, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": -0.11941961199045181, + "logits/rejected": -0.1020720824599266, + "logps/chosen": -1.5011706352233887, + "logps/rejected": -2.019202709197998, + "loss": 0.672, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5011706352233887, + "rewards/margins": 0.5180321931838989, + "rewards/rejected": -2.019202709197998, + "sft_loss": 1.484691858291626, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 3.05795063441273, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": -0.28632354736328125, + "logits/rejected": -0.19636675715446472, + "logps/chosen": -1.623683214187622, + "logps/rejected": -2.1551148891448975, + "loss": 0.6658, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.623683214187622, + "rewards/margins": 0.5314315557479858, + "rewards/rejected": -2.1551148891448975, + "sft_loss": 1.6839704513549805, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": 0.025673363357782364, + "eval_logits/rejected": 0.11081632226705551, + "eval_logps/chosen": -1.6197056770324707, + "eval_logps/rejected": -2.181790590286255, + "eval_loss": 0.6707143783569336, + "eval_rewards/accuracies": 0.6454005837440491, + "eval_rewards/chosen": -1.6197056770324707, + "eval_rewards/margins": 0.5620848536491394, + "eval_rewards/rejected": -2.181790590286255, + "eval_runtime": 46.4594, + "eval_samples_per_second": 28.95, + "eval_sft_loss": 1.6055032014846802, + "eval_steps_per_second": 7.254, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 2.0226207098379314, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": -0.39112040400505066, + "logits/rejected": -0.24808260798454285, + "logps/chosen": -1.4235303401947021, + "logps/rejected": -1.9543339014053345, + "loss": 0.6509, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4235303401947021, + "rewards/margins": 0.5308033227920532, + "rewards/rejected": -1.9543339014053345, + "sft_loss": 1.4845623970031738, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 9.915134760660942, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": -0.22110262513160706, + "logits/rejected": -0.17764215171337128, + "logps/chosen": -1.5744935274124146, + "logps/rejected": -2.0595247745513916, + "loss": 0.6595, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5744935274124146, + "rewards/margins": 0.4850311875343323, + "rewards/rejected": -2.0595247745513916, + "sft_loss": 1.6815903186798096, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 3.022713285905784, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": -0.1872178614139557, + "logits/rejected": -0.06712029874324799, + "logps/chosen": -1.5897722244262695, + "logps/rejected": -2.178109645843506, + "loss": 0.6587, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5897722244262695, + "rewards/margins": 0.5883373022079468, + "rewards/rejected": -2.178109645843506, + "sft_loss": 1.5976924896240234, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 3.716607098661702, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": -0.21846675872802734, + "logits/rejected": -0.046916164457798004, + "logps/chosen": -1.58573317527771, + "logps/rejected": -2.0916264057159424, + "loss": 0.6722, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.58573317527771, + "rewards/margins": 0.5058929324150085, + "rewards/rejected": -2.0916264057159424, + "sft_loss": 1.5854836702346802, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 3.4725896292535148, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": -0.26726624369621277, + "logits/rejected": -0.08334805816411972, + "logps/chosen": -1.6209615468978882, + "logps/rejected": -2.3004868030548096, + "loss": 0.6695, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6209615468978882, + "rewards/margins": 0.6795251965522766, + "rewards/rejected": -2.3004868030548096, + "sft_loss": 1.5951423645019531, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 9.544540172332685, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": -0.21872110664844513, + "logits/rejected": -0.06208853796124458, + "logps/chosen": -1.643139123916626, + "logps/rejected": -2.0365586280822754, + "loss": 0.695, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.643139123916626, + "rewards/margins": 0.39341965317726135, + "rewards/rejected": -2.0365586280822754, + "sft_loss": 1.676015853881836, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 5.649184349410335, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": -0.2260381281375885, + "logits/rejected": -0.06770346313714981, + "logps/chosen": -1.5730047225952148, + "logps/rejected": -2.150031089782715, + "loss": 0.6663, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5730047225952148, + "rewards/margins": 0.5770264863967896, + "rewards/rejected": -2.150031089782715, + "sft_loss": 1.583595871925354, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 7.166373629229626, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": -0.20473213493824005, + "logits/rejected": -0.10485156625509262, + "logps/chosen": -1.5947294235229492, + "logps/rejected": -1.987549066543579, + "loss": 0.6681, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5947294235229492, + "rewards/margins": 0.39281970262527466, + "rewards/rejected": -1.987549066543579, + "sft_loss": 1.6280778646469116, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 5.79631646232252, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": -0.17899790406227112, + "logits/rejected": -0.0887211412191391, + "logps/chosen": -1.6026455163955688, + "logps/rejected": -2.146700859069824, + "loss": 0.6727, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6026455163955688, + "rewards/margins": 0.5440553426742554, + "rewards/rejected": -2.146700859069824, + "sft_loss": 1.540010690689087, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 11.689193206305001, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": -0.2543259561061859, + "logits/rejected": -0.17087538540363312, + "logps/chosen": -1.5911014080047607, + "logps/rejected": -2.1432793140411377, + "loss": 0.6798, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5911014080047607, + "rewards/margins": 0.5521779656410217, + "rewards/rejected": -2.1432793140411377, + "sft_loss": 1.6015312671661377, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 2.7080491563427267, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": -0.2648487687110901, + "logits/rejected": -0.09542088210582733, + "logps/chosen": -1.7155059576034546, + "logps/rejected": -2.2239832878112793, + "loss": 0.6788, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.7155059576034546, + "rewards/margins": 0.5084772109985352, + "rewards/rejected": -2.2239832878112793, + "sft_loss": 1.7024424076080322, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 2.878698675596129, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": -0.3109549880027771, + "logits/rejected": -0.12017013877630234, + "logps/chosen": -1.6655937433242798, + "logps/rejected": -2.110398769378662, + "loss": 0.6693, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6655937433242798, + "rewards/margins": 0.44480523467063904, + "rewards/rejected": -2.110398769378662, + "sft_loss": 1.5922762155532837, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 6.031810915102935, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": -0.31219014525413513, + "logits/rejected": -0.16434085369110107, + "logps/chosen": -1.5579341650009155, + "logps/rejected": -2.12554669380188, + "loss": 0.6829, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5579341650009155, + "rewards/margins": 0.56761234998703, + "rewards/rejected": -2.12554669380188, + "sft_loss": 1.6126997470855713, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 6.298021089512181, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": -0.22607514262199402, + "logits/rejected": -0.09791434556245804, + "logps/chosen": -1.5956169366836548, + "logps/rejected": -1.9712276458740234, + "loss": 0.6828, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5956169366836548, + "rewards/margins": 0.3756106197834015, + "rewards/rejected": -1.9712276458740234, + "sft_loss": 1.5619713068008423, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 3.0670744673020494, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": -0.23713460564613342, + "logits/rejected": -0.12866348028182983, + "logps/chosen": -1.5899370908737183, + "logps/rejected": -2.066519260406494, + "loss": 0.6664, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5899370908737183, + "rewards/margins": 0.47658228874206543, + "rewards/rejected": -2.066519260406494, + "sft_loss": 1.5455102920532227, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 4.091790425211791, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": -0.10790850222110748, + "logits/rejected": 0.008631653152406216, + "logps/chosen": -1.606191635131836, + "logps/rejected": -2.1437809467315674, + "loss": 0.6585, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.606191635131836, + "rewards/margins": 0.5375891923904419, + "rewards/rejected": -2.1437809467315674, + "sft_loss": 1.5784814357757568, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 3.9005798756672756, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": -0.2036016881465912, + "logits/rejected": -0.07130200415849686, + "logps/chosen": -1.640968680381775, + "logps/rejected": -2.1466243267059326, + "loss": 0.6689, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.640968680381775, + "rewards/margins": 0.5056557655334473, + "rewards/rejected": -2.1466243267059326, + "sft_loss": 1.650861144065857, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 4.24708693894575, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": -0.2895212471485138, + "logits/rejected": -0.09275839477777481, + "logps/chosen": -1.5821725130081177, + "logps/rejected": -2.12876558303833, + "loss": 0.6735, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5821725130081177, + "rewards/margins": 0.5465930700302124, + "rewards/rejected": -2.12876558303833, + "sft_loss": 1.551035761833191, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 4.237934129093345, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": -0.28006845712661743, + "logits/rejected": -0.1313776671886444, + "logps/chosen": -1.5778753757476807, + "logps/rejected": -2.209226131439209, + "loss": 0.6642, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5778753757476807, + "rewards/margins": 0.6313506364822388, + "rewards/rejected": -2.209226131439209, + "sft_loss": 1.5569813251495361, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 5.009728052455516, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": -0.20519180595874786, + "logits/rejected": -0.09929974377155304, + "logps/chosen": -1.58689284324646, + "logps/rejected": -2.2199573516845703, + "loss": 0.6682, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.58689284324646, + "rewards/margins": 0.6330643892288208, + "rewards/rejected": -2.2199573516845703, + "sft_loss": 1.5671353340148926, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 2.918493022779605, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": -0.12294672429561615, + "logits/rejected": -0.09764774888753891, + "logps/chosen": -1.5841805934906006, + "logps/rejected": -2.107809543609619, + "loss": 0.6577, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5841805934906006, + "rewards/margins": 0.5236291885375977, + "rewards/rejected": -2.107809543609619, + "sft_loss": 1.6166483163833618, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 2.5726111441709683, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": -0.1962536871433258, + "logits/rejected": -0.08937899023294449, + "logps/chosen": -1.4637311697006226, + "logps/rejected": -1.9759318828582764, + "loss": 0.6521, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4637311697006226, + "rewards/margins": 0.5122007131576538, + "rewards/rejected": -1.9759318828582764, + "sft_loss": 1.5529893636703491, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 2.5475667771729413, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": -0.16151954233646393, + "logits/rejected": 0.012316593900322914, + "logps/chosen": -1.6571800708770752, + "logps/rejected": -2.189068555831909, + "loss": 0.6674, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6571800708770752, + "rewards/margins": 0.5318886041641235, + "rewards/rejected": -2.189068555831909, + "sft_loss": 1.7012519836425781, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 1.52301957259393, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": -0.14033019542694092, + "logits/rejected": -0.015837164595723152, + "logps/chosen": -1.6222553253173828, + "logps/rejected": -2.169674873352051, + "loss": 0.6757, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6222553253173828, + "rewards/margins": 0.5474194288253784, + "rewards/rejected": -2.169674873352051, + "sft_loss": 1.618524193763733, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 4.221517759195312, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": -0.15154433250427246, + "logits/rejected": -0.08596460521221161, + "logps/chosen": -1.591717004776001, + "logps/rejected": -2.138441324234009, + "loss": 0.6492, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.591717004776001, + "rewards/margins": 0.5467241406440735, + "rewards/rejected": -2.138441324234009, + "sft_loss": 1.571844458580017, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 2.5282174445739853, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": -0.29474323987960815, + "logits/rejected": -0.11456756293773651, + "logps/chosen": -1.5818030834197998, + "logps/rejected": -2.5438389778137207, + "loss": 0.6563, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5818030834197998, + "rewards/margins": 0.9620355367660522, + "rewards/rejected": -2.5438389778137207, + "sft_loss": 1.6236823797225952, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 3.3302415118771487, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": -0.24398522078990936, + "logits/rejected": -0.09933225810527802, + "logps/chosen": -1.6292442083358765, + "logps/rejected": -2.180849552154541, + "loss": 0.662, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6292442083358765, + "rewards/margins": 0.5516052842140198, + "rewards/rejected": -2.180849552154541, + "sft_loss": 1.6737045049667358, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 2.447525860866938, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": -0.24691593647003174, + "logits/rejected": -0.07334215939044952, + "logps/chosen": -1.5735365152359009, + "logps/rejected": -2.3229589462280273, + "loss": 0.6428, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5735365152359009, + "rewards/margins": 0.7494224309921265, + "rewards/rejected": -2.3229589462280273, + "sft_loss": 1.6313596963882446, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 6.579022510254535, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": -0.23249594867229462, + "logits/rejected": -0.0880482941865921, + "logps/chosen": -1.6138916015625, + "logps/rejected": -2.014530897140503, + "loss": 0.6691, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6138916015625, + "rewards/margins": 0.40063929557800293, + "rewards/rejected": -2.014530897140503, + "sft_loss": 1.5987266302108765, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 4.445224356828172, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": -0.11989017575979233, + "logits/rejected": 0.1379929631948471, + "logps/chosen": -1.487082600593567, + "logps/rejected": -2.2836666107177734, + "loss": 0.6409, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.487082600593567, + "rewards/margins": 0.7965839505195618, + "rewards/rejected": -2.2836666107177734, + "sft_loss": 1.5135908126831055, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 4.411394972956897, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": -0.10843691974878311, + "logits/rejected": 0.0039287833496928215, + "logps/chosen": -1.5481619834899902, + "logps/rejected": -2.1295180320739746, + "loss": 0.6628, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5481619834899902, + "rewards/margins": 0.5813560485839844, + "rewards/rejected": -2.1295180320739746, + "sft_loss": 1.5984420776367188, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 2.813638099032081, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": -0.23725099861621857, + "logits/rejected": -0.0956241562962532, + "logps/chosen": -1.5859758853912354, + "logps/rejected": -2.1242527961730957, + "loss": 0.6539, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5859758853912354, + "rewards/margins": 0.5382769703865051, + "rewards/rejected": -2.1242527961730957, + "sft_loss": 1.5380891561508179, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 5.144347674500833, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": -0.18725113570690155, + "logits/rejected": -0.08537117391824722, + "logps/chosen": -1.5215387344360352, + "logps/rejected": -2.1008687019348145, + "loss": 0.652, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5215387344360352, + "rewards/margins": 0.5793299078941345, + "rewards/rejected": -2.1008687019348145, + "sft_loss": 1.5025901794433594, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 6.205747754627465, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": -0.18871183693408966, + "logits/rejected": 0.00811255443841219, + "logps/chosen": -1.598693609237671, + "logps/rejected": -2.211864709854126, + "loss": 0.6744, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.598693609237671, + "rewards/margins": 0.6131712794303894, + "rewards/rejected": -2.211864709854126, + "sft_loss": 1.6509023904800415, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 4.2464036589658045, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": -0.17433153092861176, + "logits/rejected": -0.02227271907031536, + "logps/chosen": -1.5661596059799194, + "logps/rejected": -2.295907497406006, + "loss": 0.655, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5661596059799194, + "rewards/margins": 0.729748010635376, + "rewards/rejected": -2.295907497406006, + "sft_loss": 1.6411842107772827, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 6.976785428121013, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": -0.22953324019908905, + "logits/rejected": -0.05818880721926689, + "logps/chosen": -1.6724097728729248, + "logps/rejected": -2.1595005989074707, + "loss": 0.6828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6724097728729248, + "rewards/margins": 0.48709067702293396, + "rewards/rejected": -2.1595005989074707, + "sft_loss": 1.6616108417510986, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 3.5408258715654886, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": -0.21573512256145477, + "logits/rejected": -0.017917849123477936, + "logps/chosen": -1.65218186378479, + "logps/rejected": -2.1986420154571533, + "loss": 0.6765, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.65218186378479, + "rewards/margins": 0.5464602708816528, + "rewards/rejected": -2.1986420154571533, + "sft_loss": 1.6009705066680908, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 6.592240750655103, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": -0.19919119775295258, + "logits/rejected": -0.005335810594260693, + "logps/chosen": -1.5355892181396484, + "logps/rejected": -2.2174465656280518, + "loss": 0.6627, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5355892181396484, + "rewards/margins": 0.6818572282791138, + "rewards/rejected": -2.2174465656280518, + "sft_loss": 1.5363190174102783, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 3.47366638355546, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": -0.14296242594718933, + "logits/rejected": -0.01892923191189766, + "logps/chosen": -1.6547492742538452, + "logps/rejected": -2.495971202850342, + "loss": 0.6519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6547492742538452, + "rewards/margins": 0.8412219882011414, + "rewards/rejected": -2.495971202850342, + "sft_loss": 1.6823389530181885, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 4.8846488670917925, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": -0.19661283493041992, + "logits/rejected": -0.03337302431464195, + "logps/chosen": -1.6535663604736328, + "logps/rejected": -2.164750576019287, + "loss": 0.6891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6535663604736328, + "rewards/margins": 0.5111840963363647, + "rewards/rejected": -2.164750576019287, + "sft_loss": 1.5775151252746582, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 5.160745165468197, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": -0.2481531798839569, + "logits/rejected": -0.08857151120901108, + "logps/chosen": -1.5439527034759521, + "logps/rejected": -2.017119884490967, + "loss": 0.6696, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5439527034759521, + "rewards/margins": 0.47316694259643555, + "rewards/rejected": -2.017119884490967, + "sft_loss": 1.4927449226379395, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 4.381947958181414, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": -0.2527027428150177, + "logits/rejected": -0.12018553167581558, + "logps/chosen": -1.5270473957061768, + "logps/rejected": -2.1087257862091064, + "loss": 0.6744, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5270473957061768, + "rewards/margins": 0.5816782116889954, + "rewards/rejected": -2.1087257862091064, + "sft_loss": 1.545583724975586, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 2.9727093199437515, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": -0.05318199470639229, + "logits/rejected": 0.0316009446978569, + "logps/chosen": -1.644573450088501, + "logps/rejected": -2.1216087341308594, + "loss": 0.6565, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.644573450088501, + "rewards/margins": 0.4770355224609375, + "rewards/rejected": -2.1216087341308594, + "sft_loss": 1.639564871788025, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 3.7309407734006514, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": -0.2446902096271515, + "logits/rejected": -0.08602052927017212, + "logps/chosen": -1.5557916164398193, + "logps/rejected": -2.083207368850708, + "loss": 0.6657, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5557916164398193, + "rewards/margins": 0.5274157524108887, + "rewards/rejected": -2.083207368850708, + "sft_loss": 1.6140753030776978, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 3.9303733965572953, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": -0.19570858776569366, + "logits/rejected": -0.02743830345571041, + "logps/chosen": -1.4865481853485107, + "logps/rejected": -1.9939115047454834, + "loss": 0.6748, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4865481853485107, + "rewards/margins": 0.5073633193969727, + "rewards/rejected": -1.9939115047454834, + "sft_loss": 1.554677963256836, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 3.2539139154960326, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": -0.18052852153778076, + "logits/rejected": -0.03774655982851982, + "logps/chosen": -1.686672568321228, + "logps/rejected": -2.1763644218444824, + "loss": 0.6646, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.686672568321228, + "rewards/margins": 0.4896920323371887, + "rewards/rejected": -2.1763644218444824, + "sft_loss": 1.6342140436172485, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 4.422880196611135, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": -0.054852940142154694, + "logits/rejected": 0.07409236580133438, + "logps/chosen": -1.5826961994171143, + "logps/rejected": -1.96904718875885, + "loss": 0.6725, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5826961994171143, + "rewards/margins": 0.3863510191440582, + "rewards/rejected": -1.96904718875885, + "sft_loss": 1.553628921508789, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 6.7335124089664795, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": -0.17560145258903503, + "logits/rejected": -0.10570497810840607, + "logps/chosen": -1.4192087650299072, + "logps/rejected": -2.121539831161499, + "loss": 0.6416, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4192087650299072, + "rewards/margins": 0.7023310661315918, + "rewards/rejected": -2.121539831161499, + "sft_loss": 1.4606475830078125, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 4.315507698172223, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": -0.20051035284996033, + "logits/rejected": -0.09910713881254196, + "logps/chosen": -1.5228033065795898, + "logps/rejected": -1.9885295629501343, + "loss": 0.6692, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5228033065795898, + "rewards/margins": 0.46572622656822205, + "rewards/rejected": -1.9885295629501343, + "sft_loss": 1.5164625644683838, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 4.409635070267732, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": -0.231184720993042, + "logits/rejected": -0.0673697367310524, + "logps/chosen": -1.635107398033142, + "logps/rejected": -2.1991419792175293, + "loss": 0.6605, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.635107398033142, + "rewards/margins": 0.5640343427658081, + "rewards/rejected": -2.1991419792175293, + "sft_loss": 1.6451671123504639, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 2.2495017944096625, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": -0.2617799639701843, + "logits/rejected": -0.12326967716217041, + "logps/chosen": -1.5735450983047485, + "logps/rejected": -2.2101778984069824, + "loss": 0.6563, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5735450983047485, + "rewards/margins": 0.6366329789161682, + "rewards/rejected": -2.2101778984069824, + "sft_loss": 1.5666234493255615, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 3.8372559186929984, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": -0.23476651310920715, + "logits/rejected": -0.052711568772792816, + "logps/chosen": -1.5595481395721436, + "logps/rejected": -2.0955264568328857, + "loss": 0.6704, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5595481395721436, + "rewards/margins": 0.5359782576560974, + "rewards/rejected": -2.0955264568328857, + "sft_loss": 1.60107421875, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 5.453924102853809, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": -0.12843522429466248, + "logits/rejected": -0.07940709590911865, + "logps/chosen": -1.5772250890731812, + "logps/rejected": -2.2718729972839355, + "loss": 0.6515, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5772250890731812, + "rewards/margins": 0.6946476697921753, + "rewards/rejected": -2.2718729972839355, + "sft_loss": 1.6094601154327393, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 3.152373845146598, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": -0.2801273763179779, + "logits/rejected": -0.1516799032688141, + "logps/chosen": -1.5942035913467407, + "logps/rejected": -2.177546262741089, + "loss": 0.6678, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5942035913467407, + "rewards/margins": 0.5833428502082825, + "rewards/rejected": -2.177546262741089, + "sft_loss": 1.6442228555679321, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 4.23043494021146, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": -0.26326099038124084, + "logits/rejected": -0.1866808533668518, + "logps/chosen": -1.5465459823608398, + "logps/rejected": -2.1609504222869873, + "loss": 0.6618, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5465459823608398, + "rewards/margins": 0.6144044399261475, + "rewards/rejected": -2.1609504222869873, + "sft_loss": 1.4961479902267456, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 6.4660259398849345, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": -0.3211483359336853, + "logits/rejected": -0.11951088905334473, + "logps/chosen": -1.4363352060317993, + "logps/rejected": -2.0495729446411133, + "loss": 0.66, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4363352060317993, + "rewards/margins": 0.613237738609314, + "rewards/rejected": -2.0495729446411133, + "sft_loss": 1.4899227619171143, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 11.36741274310511, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": -0.1367950141429901, + "logits/rejected": -0.1153026595711708, + "logps/chosen": -1.4984586238861084, + "logps/rejected": -2.107638359069824, + "loss": 0.6633, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4984586238861084, + "rewards/margins": 0.6091797351837158, + "rewards/rejected": -2.107638359069824, + "sft_loss": 1.534341812133789, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 4.08127766292726, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": -0.11941780894994736, + "logits/rejected": -0.041212014853954315, + "logps/chosen": -1.5258735418319702, + "logps/rejected": -1.9783833026885986, + "loss": 0.6681, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5258735418319702, + "rewards/margins": 0.45250973105430603, + "rewards/rejected": -1.9783833026885986, + "sft_loss": 1.6122392416000366, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 4.142980500759543, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": -0.16154876351356506, + "logits/rejected": -0.008436007425189018, + "logps/chosen": -1.5421125888824463, + "logps/rejected": -2.1379971504211426, + "loss": 0.6642, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5421125888824463, + "rewards/margins": 0.5958844423294067, + "rewards/rejected": -2.1379971504211426, + "sft_loss": 1.5496423244476318, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 3.3827559336720023, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": -0.28748637437820435, + "logits/rejected": -0.137380450963974, + "logps/chosen": -1.5870611667633057, + "logps/rejected": -2.0440049171447754, + "loss": 0.691, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5870611667633057, + "rewards/margins": 0.4569437503814697, + "rewards/rejected": -2.0440049171447754, + "sft_loss": 1.6794652938842773, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 3.8765153027358497, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": -0.2404676377773285, + "logits/rejected": -0.04566841199994087, + "logps/chosen": -1.5718721151351929, + "logps/rejected": -2.0481221675872803, + "loss": 0.6709, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5718721151351929, + "rewards/margins": 0.4762501120567322, + "rewards/rejected": -2.0481221675872803, + "sft_loss": 1.6210445165634155, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 2.7125040680828545, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": -0.17511853575706482, + "logits/rejected": -0.059401821345090866, + "logps/chosen": -1.5637508630752563, + "logps/rejected": -2.0859580039978027, + "loss": 0.6644, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5637508630752563, + "rewards/margins": 0.5222072601318359, + "rewards/rejected": -2.0859580039978027, + "sft_loss": 1.4825503826141357, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 1.837498071457063, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": -0.1399889588356018, + "logits/rejected": -0.04128175228834152, + "logps/chosen": -1.5877896547317505, + "logps/rejected": -2.170602321624756, + "loss": 0.6475, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5877896547317505, + "rewards/margins": 0.5828126072883606, + "rewards/rejected": -2.170602321624756, + "sft_loss": 1.6309263706207275, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 3.0047021986668674, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": -0.26999109983444214, + "logits/rejected": -0.1168588176369667, + "logps/chosen": -1.6641719341278076, + "logps/rejected": -2.2500786781311035, + "loss": 0.6924, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6641719341278076, + "rewards/margins": 0.585906445980072, + "rewards/rejected": -2.2500786781311035, + "sft_loss": 1.6384944915771484, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 2.5450039000979245, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": -0.1542874127626419, + "logits/rejected": -0.03618238866329193, + "logps/chosen": -1.7048299312591553, + "logps/rejected": -2.316401481628418, + "loss": 0.675, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7048299312591553, + "rewards/margins": 0.6115714311599731, + "rewards/rejected": -2.316401481628418, + "sft_loss": 1.5900487899780273, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 4.731359554444461, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": -0.17984583973884583, + "logits/rejected": -0.029300883412361145, + "logps/chosen": -1.642860770225525, + "logps/rejected": -2.2364511489868164, + "loss": 0.6764, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.642860770225525, + "rewards/margins": 0.5935903787612915, + "rewards/rejected": -2.2364511489868164, + "sft_loss": 1.6760437488555908, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 3.5207430962242725, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": -0.14204739034175873, + "logits/rejected": -0.07755346596240997, + "logps/chosen": -1.6895830631256104, + "logps/rejected": -2.2165913581848145, + "loss": 0.6697, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6895830631256104, + "rewards/margins": 0.527008593082428, + "rewards/rejected": -2.2165913581848145, + "sft_loss": 1.5765527486801147, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 3.6629297139978796, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": -0.4186740815639496, + "logits/rejected": -0.26618799567222595, + "logps/chosen": -1.502715826034546, + "logps/rejected": -1.992854356765747, + "loss": 0.6598, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.502715826034546, + "rewards/margins": 0.4901384711265564, + "rewards/rejected": -1.992854356765747, + "sft_loss": 1.5291118621826172, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 5.915396790393318, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": -0.18335725367069244, + "logits/rejected": -0.04013916105031967, + "logps/chosen": -1.5186660289764404, + "logps/rejected": -2.1200027465820312, + "loss": 0.663, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5186660289764404, + "rewards/margins": 0.6013368964195251, + "rewards/rejected": -2.1200027465820312, + "sft_loss": 1.5608304738998413, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 4.3467671734022355, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": -0.3105720281600952, + "logits/rejected": -0.18775293231010437, + "logps/chosen": -1.608131766319275, + "logps/rejected": -2.0446836948394775, + "loss": 0.6691, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.608131766319275, + "rewards/margins": 0.4365520477294922, + "rewards/rejected": -2.0446836948394775, + "sft_loss": 1.558866262435913, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 8.05953814997876, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": -0.3036400079727173, + "logits/rejected": -0.17522230744361877, + "logps/chosen": -1.4665193557739258, + "logps/rejected": -2.0359203815460205, + "loss": 0.6463, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4665193557739258, + "rewards/margins": 0.5694009065628052, + "rewards/rejected": -2.0359203815460205, + "sft_loss": 1.4616729021072388, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 4.6680877708152435, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": -0.23240847885608673, + "logits/rejected": -0.21550750732421875, + "logps/chosen": -1.5334768295288086, + "logps/rejected": -2.082609176635742, + "loss": 0.6575, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5334768295288086, + "rewards/margins": 0.5491322875022888, + "rewards/rejected": -2.082609176635742, + "sft_loss": 1.6290900707244873, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 2.779228534406303, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": -0.318700909614563, + "logits/rejected": -0.2146882265806198, + "logps/chosen": -1.573927640914917, + "logps/rejected": -1.9949705600738525, + "loss": 0.6794, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.573927640914917, + "rewards/margins": 0.4210427701473236, + "rewards/rejected": -1.9949705600738525, + "sft_loss": 1.6389739513397217, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 3.4045042632324374, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": -0.2660903334617615, + "logits/rejected": -0.1637762188911438, + "logps/chosen": -1.5688062906265259, + "logps/rejected": -2.2330851554870605, + "loss": 0.6723, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5688062906265259, + "rewards/margins": 0.664279043674469, + "rewards/rejected": -2.2330851554870605, + "sft_loss": 1.6072860956192017, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 6.639672136295296, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": -0.29735398292541504, + "logits/rejected": -0.1823483258485794, + "logps/chosen": -1.6251757144927979, + "logps/rejected": -1.9575645923614502, + "loss": 0.6946, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6251757144927979, + "rewards/margins": 0.33238908648490906, + "rewards/rejected": -1.9575645923614502, + "sft_loss": 1.6332619190216064, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 1.7769204924020565, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": -0.1574326455593109, + "logits/rejected": -0.10058436542749405, + "logps/chosen": -1.608525037765503, + "logps/rejected": -2.324282169342041, + "loss": 0.6644, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.608525037765503, + "rewards/margins": 0.7157570123672485, + "rewards/rejected": -2.324282169342041, + "sft_loss": 1.601483941078186, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 5.238907987535943, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": -0.20695853233337402, + "logits/rejected": -0.1775607317686081, + "logps/chosen": -1.6417983770370483, + "logps/rejected": -1.962201476097107, + "loss": 0.6726, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6417983770370483, + "rewards/margins": 0.3204033374786377, + "rewards/rejected": -1.962201476097107, + "sft_loss": 1.6719862222671509, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 3.2650118759743316, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": -0.3084166646003723, + "logits/rejected": -0.1510084867477417, + "logps/chosen": -1.5061914920806885, + "logps/rejected": -2.324479579925537, + "loss": 0.6452, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5061914920806885, + "rewards/margins": 0.8182878494262695, + "rewards/rejected": -2.324479579925537, + "sft_loss": 1.5005475282669067, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 3.1120595174991936, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": -0.31429943442344666, + "logits/rejected": -0.205793097615242, + "logps/chosen": -1.5214614868164062, + "logps/rejected": -2.128938913345337, + "loss": 0.6549, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5214614868164062, + "rewards/margins": 0.6074774265289307, + "rewards/rejected": -2.128938913345337, + "sft_loss": 1.4796510934829712, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 3.7733075424203335, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": -0.1961269974708557, + "logits/rejected": -0.05360075831413269, + "logps/chosen": -1.6085929870605469, + "logps/rejected": -2.3059115409851074, + "loss": 0.6709, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6085929870605469, + "rewards/margins": 0.6973183751106262, + "rewards/rejected": -2.3059115409851074, + "sft_loss": 1.6058056354522705, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": 0.05022057890892029, + "eval_logits/rejected": 0.13729551434516907, + "eval_logps/chosen": -1.5941189527511597, + "eval_logps/rejected": -2.1720833778381348, + "eval_loss": 0.6700656414031982, + "eval_rewards/accuracies": 0.6476261019706726, + "eval_rewards/chosen": -1.5941189527511597, + "eval_rewards/margins": 0.577964723110199, + "eval_rewards/rejected": -2.1720833778381348, + "eval_runtime": 43.8762, + "eval_samples_per_second": 30.654, + "eval_sft_loss": 1.5844956636428833, + "eval_steps_per_second": 7.681, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 2.3281305995229844, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": -0.3298807144165039, + "logits/rejected": -0.20993852615356445, + "logps/chosen": -1.5723832845687866, + "logps/rejected": -2.127077102661133, + "loss": 0.6648, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5723832845687866, + "rewards/margins": 0.5546938180923462, + "rewards/rejected": -2.127077102661133, + "sft_loss": 1.5659022331237793, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 4.646094569181479, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": -0.29386386275291443, + "logits/rejected": -0.1871393471956253, + "logps/chosen": -1.5456677675247192, + "logps/rejected": -2.1093719005584717, + "loss": 0.665, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5456677675247192, + "rewards/margins": 0.5637043118476868, + "rewards/rejected": -2.1093719005584717, + "sft_loss": 1.5557401180267334, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 11.118365684823452, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": -0.2562563717365265, + "logits/rejected": -0.1333739459514618, + "logps/chosen": -1.5714125633239746, + "logps/rejected": -2.154006242752075, + "loss": 0.67, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5714125633239746, + "rewards/margins": 0.5825936198234558, + "rewards/rejected": -2.154006242752075, + "sft_loss": 1.5748682022094727, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 6.593994824049016, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": -0.2744905352592468, + "logits/rejected": -0.05123863369226456, + "logps/chosen": -1.7629566192626953, + "logps/rejected": -2.3401918411254883, + "loss": 0.6708, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7629566192626953, + "rewards/margins": 0.5772350430488586, + "rewards/rejected": -2.3401918411254883, + "sft_loss": 1.6627269983291626, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 4.439427646218787, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": -0.2580259442329407, + "logits/rejected": -0.1263856738805771, + "logps/chosen": -1.4843460321426392, + "logps/rejected": -2.166602849960327, + "loss": 0.657, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4843460321426392, + "rewards/margins": 0.6822569370269775, + "rewards/rejected": -2.166602849960327, + "sft_loss": 1.5099159479141235, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 4.3990206098793045, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": -0.28104525804519653, + "logits/rejected": -0.11986222118139267, + "logps/chosen": -1.6033084392547607, + "logps/rejected": -2.4513392448425293, + "loss": 0.6509, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6033084392547607, + "rewards/margins": 0.8480307459831238, + "rewards/rejected": -2.4513392448425293, + "sft_loss": 1.5755976438522339, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 4.822929273440541, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": -0.2546038031578064, + "logits/rejected": -0.17843613028526306, + "logps/chosen": -1.5555105209350586, + "logps/rejected": -2.129605770111084, + "loss": 0.6641, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5555105209350586, + "rewards/margins": 0.5740953683853149, + "rewards/rejected": -2.129605770111084, + "sft_loss": 1.5992151498794556, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 2.790602526801848, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": -0.1786082237958908, + "logits/rejected": -0.07207518815994263, + "logps/chosen": -1.6006673574447632, + "logps/rejected": -2.2379143238067627, + "loss": 0.6486, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6006673574447632, + "rewards/margins": 0.6372469067573547, + "rewards/rejected": -2.2379143238067627, + "sft_loss": 1.4736950397491455, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 3.527773904971378, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": -0.35635730624198914, + "logits/rejected": -0.21600675582885742, + "logps/chosen": -1.6099275350570679, + "logps/rejected": -2.149055242538452, + "loss": 0.6725, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6099275350570679, + "rewards/margins": 0.5391277074813843, + "rewards/rejected": -2.149055242538452, + "sft_loss": 1.566629409790039, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 3.5194697362592726, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": -0.12450122833251953, + "logits/rejected": 0.024506190791726112, + "logps/chosen": -1.541261076927185, + "logps/rejected": -2.20589017868042, + "loss": 0.6624, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.541261076927185, + "rewards/margins": 0.6646289229393005, + "rewards/rejected": -2.20589017868042, + "sft_loss": 1.5440499782562256, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 2.0734042744332535, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": -0.17247922718524933, + "logits/rejected": -0.0935923159122467, + "logps/chosen": -1.6456283330917358, + "logps/rejected": -2.0860137939453125, + "loss": 0.6677, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6456283330917358, + "rewards/margins": 0.44038552045822144, + "rewards/rejected": -2.0860137939453125, + "sft_loss": 1.630672812461853, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 2.6240527895390082, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": -0.28131115436553955, + "logits/rejected": -0.13464775681495667, + "logps/chosen": -1.540771245956421, + "logps/rejected": -2.0464205741882324, + "loss": 0.6561, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.540771245956421, + "rewards/margins": 0.5056491494178772, + "rewards/rejected": -2.0464205741882324, + "sft_loss": 1.5565450191497803, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 6.75352646316694, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": -0.15234871208667755, + "logits/rejected": -0.037437863647937775, + "logps/chosen": -1.5263092517852783, + "logps/rejected": -2.2033650875091553, + "loss": 0.656, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5263092517852783, + "rewards/margins": 0.6770559549331665, + "rewards/rejected": -2.2033650875091553, + "sft_loss": 1.5696706771850586, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 6.6376623109556405, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": -0.16901914775371552, + "logits/rejected": -0.023870373144745827, + "logps/chosen": -1.4356871843338013, + "logps/rejected": -2.0282955169677734, + "loss": 0.647, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4356871843338013, + "rewards/margins": 0.5926082134246826, + "rewards/rejected": -2.0282955169677734, + "sft_loss": 1.4541518688201904, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 3.6683261434355945, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": -0.23975209891796112, + "logits/rejected": 0.0791751816868782, + "logps/chosen": -1.5243529081344604, + "logps/rejected": -2.145411252975464, + "loss": 0.6541, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5243529081344604, + "rewards/margins": 0.6210582852363586, + "rewards/rejected": -2.145411252975464, + "sft_loss": 1.5562708377838135, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 4.898270621041198, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": -0.20579198002815247, + "logits/rejected": -0.10366296768188477, + "logps/chosen": -1.4743316173553467, + "logps/rejected": -2.0218100547790527, + "loss": 0.6529, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4743316173553467, + "rewards/margins": 0.5474783182144165, + "rewards/rejected": -2.0218100547790527, + "sft_loss": 1.5755841732025146, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 3.861413615441136, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": -0.22842450439929962, + "logits/rejected": 0.0003024935722351074, + "logps/chosen": -1.4898465871810913, + "logps/rejected": -2.079397201538086, + "loss": 0.6545, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4898465871810913, + "rewards/margins": 0.5895504355430603, + "rewards/rejected": -2.079397201538086, + "sft_loss": 1.526920199394226, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 4.130081611622876, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": -0.18690963089466095, + "logits/rejected": -0.06916667520999908, + "logps/chosen": -1.5715512037277222, + "logps/rejected": -2.125609874725342, + "loss": 0.6739, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5715512037277222, + "rewards/margins": 0.5540584921836853, + "rewards/rejected": -2.125609874725342, + "sft_loss": 1.5971730947494507, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 5.680075231329591, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": -0.1997305452823639, + "logits/rejected": -0.12079212814569473, + "logps/chosen": -1.6892131567001343, + "logps/rejected": -2.155735492706299, + "loss": 0.6799, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6892131567001343, + "rewards/margins": 0.4665220379829407, + "rewards/rejected": -2.155735492706299, + "sft_loss": 1.6960713863372803, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 2.931451514787955, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": -0.30347341299057007, + "logits/rejected": -0.15210089087486267, + "logps/chosen": -1.7506462335586548, + "logps/rejected": -2.338987112045288, + "loss": 0.6733, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7506462335586548, + "rewards/margins": 0.5883409380912781, + "rewards/rejected": -2.338987112045288, + "sft_loss": 1.6921055316925049, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 5.547422306866258, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": -0.314720094203949, + "logits/rejected": -0.16665521264076233, + "logps/chosen": -1.5950864553451538, + "logps/rejected": -2.1158535480499268, + "loss": 0.6813, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5950864553451538, + "rewards/margins": 0.520767092704773, + "rewards/rejected": -2.1158535480499268, + "sft_loss": 1.5926302671432495, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 5.088090669899126, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": -0.12834122776985168, + "logits/rejected": -0.07507321983575821, + "logps/chosen": -1.6144376993179321, + "logps/rejected": -2.0841517448425293, + "loss": 0.67, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6144376993179321, + "rewards/margins": 0.4697140157222748, + "rewards/rejected": -2.0841517448425293, + "sft_loss": 1.6303374767303467, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 6.267169922611252, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": -0.32739704847335815, + "logits/rejected": -0.17024415731430054, + "logps/chosen": -1.5832575559616089, + "logps/rejected": -2.1148574352264404, + "loss": 0.6732, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5832575559616089, + "rewards/margins": 0.5315998196601868, + "rewards/rejected": -2.1148574352264404, + "sft_loss": 1.5724600553512573, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 3.414609963561676, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": -0.24356892704963684, + "logits/rejected": -0.049546681344509125, + "logps/chosen": -1.5284088850021362, + "logps/rejected": -2.3344571590423584, + "loss": 0.66, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5284088850021362, + "rewards/margins": 0.8060482740402222, + "rewards/rejected": -2.3344571590423584, + "sft_loss": 1.6087051630020142, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 6.679784628976017, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": -0.3191532492637634, + "logits/rejected": -0.13674207031726837, + "logps/chosen": -1.6017239093780518, + "logps/rejected": -2.1782643795013428, + "loss": 0.6558, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6017239093780518, + "rewards/margins": 0.5765405893325806, + "rewards/rejected": -2.1782643795013428, + "sft_loss": 1.6856014728546143, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 11.060334378583923, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": -0.20316722989082336, + "logits/rejected": -0.08586695045232773, + "logps/chosen": -1.4768259525299072, + "logps/rejected": -2.1830265522003174, + "loss": 0.6401, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4768259525299072, + "rewards/margins": 0.7062004208564758, + "rewards/rejected": -2.1830265522003174, + "sft_loss": 1.5168195962905884, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 4.1087649701684885, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": -0.2536730170249939, + "logits/rejected": -0.09729192405939102, + "logps/chosen": -1.6899553537368774, + "logps/rejected": -2.173895835876465, + "loss": 0.6668, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6899553537368774, + "rewards/margins": 0.4839404225349426, + "rewards/rejected": -2.173895835876465, + "sft_loss": 1.6380395889282227, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 3.157861243925943, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": -0.21639016270637512, + "logits/rejected": -0.07234219461679459, + "logps/chosen": -1.4712656736373901, + "logps/rejected": -2.205264091491699, + "loss": 0.6525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4712656736373901, + "rewards/margins": 0.7339984178543091, + "rewards/rejected": -2.205264091491699, + "sft_loss": 1.5232994556427002, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 6.890344570453813, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": -0.21207043528556824, + "logits/rejected": -0.03308100253343582, + "logps/chosen": -1.424570083618164, + "logps/rejected": -2.0149474143981934, + "loss": 0.6476, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.424570083618164, + "rewards/margins": 0.5903773307800293, + "rewards/rejected": -2.0149474143981934, + "sft_loss": 1.4861271381378174, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 4.1095339352997335, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": -0.3412007689476013, + "logits/rejected": -0.19217181205749512, + "logps/chosen": -1.64543879032135, + "logps/rejected": -2.1718764305114746, + "loss": 0.6755, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.64543879032135, + "rewards/margins": 0.5264378786087036, + "rewards/rejected": -2.1718764305114746, + "sft_loss": 1.64764404296875, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 9.631527390186543, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": -0.12420465797185898, + "logits/rejected": -0.07888902723789215, + "logps/chosen": -1.750036597251892, + "logps/rejected": -2.160815715789795, + "loss": 0.6941, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.750036597251892, + "rewards/margins": 0.4107791781425476, + "rewards/rejected": -2.160815715789795, + "sft_loss": 1.6819007396697998, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 4.0032765855071375, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": -0.2751021087169647, + "logits/rejected": -0.052364956587553024, + "logps/chosen": -1.493459701538086, + "logps/rejected": -2.2093310356140137, + "loss": 0.6606, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.493459701538086, + "rewards/margins": 0.7158714532852173, + "rewards/rejected": -2.2093310356140137, + "sft_loss": 1.5034425258636475, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 3.4612579048779377, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": -0.25418537855148315, + "logits/rejected": -0.09127441793680191, + "logps/chosen": -1.59048330783844, + "logps/rejected": -2.2793405055999756, + "loss": 0.6625, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.59048330783844, + "rewards/margins": 0.6888570189476013, + "rewards/rejected": -2.2793405055999756, + "sft_loss": 1.6225292682647705, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 3.612183359211995, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": -0.22967293858528137, + "logits/rejected": -0.06436924636363983, + "logps/chosen": -1.604569673538208, + "logps/rejected": -2.123711109161377, + "loss": 0.6642, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.604569673538208, + "rewards/margins": 0.5191417932510376, + "rewards/rejected": -2.123711109161377, + "sft_loss": 1.5751338005065918, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 3.7546805011871975, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": -0.0654725581407547, + "logits/rejected": -0.0029380307532846928, + "logps/chosen": -1.7108027935028076, + "logps/rejected": -2.1099960803985596, + "loss": 0.6938, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.7108027935028076, + "rewards/margins": 0.3991931080818176, + "rewards/rejected": -2.1099960803985596, + "sft_loss": 1.6306812763214111, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 2.6074496359152914, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": -0.3023606538772583, + "logits/rejected": -0.15595746040344238, + "logps/chosen": -1.5669963359832764, + "logps/rejected": -2.1247708797454834, + "loss": 0.6666, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5669963359832764, + "rewards/margins": 0.5577744245529175, + "rewards/rejected": -2.1247708797454834, + "sft_loss": 1.5600146055221558, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 3.5661741439601697, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": -0.2698851525783539, + "logits/rejected": -0.23949381709098816, + "logps/chosen": -1.617048978805542, + "logps/rejected": -2.055820941925049, + "loss": 0.6773, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.617048978805542, + "rewards/margins": 0.43877163529396057, + "rewards/rejected": -2.055820941925049, + "sft_loss": 1.6467010974884033, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 3.7044403708843396, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": -0.2357718050479889, + "logits/rejected": -0.1072898730635643, + "logps/chosen": -1.6923530101776123, + "logps/rejected": -2.2098324298858643, + "loss": 0.6778, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6923530101776123, + "rewards/margins": 0.517479419708252, + "rewards/rejected": -2.2098324298858643, + "sft_loss": 1.6890838146209717, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 6.973444286465607, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": -0.24866466224193573, + "logits/rejected": -0.13903850317001343, + "logps/chosen": -1.4910849332809448, + "logps/rejected": -2.1087069511413574, + "loss": 0.6534, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4910849332809448, + "rewards/margins": 0.6176217794418335, + "rewards/rejected": -2.1087069511413574, + "sft_loss": 1.4826654195785522, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 4.253394673156057, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": -0.2572954297065735, + "logits/rejected": -0.1391439139842987, + "logps/chosen": -1.6212129592895508, + "logps/rejected": -2.2261252403259277, + "loss": 0.6597, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6212129592895508, + "rewards/margins": 0.6049124002456665, + "rewards/rejected": -2.2261252403259277, + "sft_loss": 1.6586673259735107, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 6.2400965553296155, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": -0.2455395758152008, + "logits/rejected": -0.08400936424732208, + "logps/chosen": -1.509826898574829, + "logps/rejected": -2.068021774291992, + "loss": 0.6773, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.509826898574829, + "rewards/margins": 0.5581950545310974, + "rewards/rejected": -2.068021774291992, + "sft_loss": 1.5645593404769897, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 4.447093603110371, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": -0.25325924158096313, + "logits/rejected": -0.03236902132630348, + "logps/chosen": -1.681740164756775, + "logps/rejected": -2.2139956951141357, + "loss": 0.6854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.681740164756775, + "rewards/margins": 0.5322555303573608, + "rewards/rejected": -2.2139956951141357, + "sft_loss": 1.654750108718872, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 4.005222117059965, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": -0.22943027317523956, + "logits/rejected": -0.11498402059078217, + "logps/chosen": -1.5746691226959229, + "logps/rejected": -2.2544422149658203, + "loss": 0.6758, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5746691226959229, + "rewards/margins": 0.679773211479187, + "rewards/rejected": -2.2544422149658203, + "sft_loss": 1.6747443675994873, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 4.146766720366405, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": -0.17182201147079468, + "logits/rejected": -0.042574524879455566, + "logps/chosen": -1.6990025043487549, + "logps/rejected": -2.359184741973877, + "loss": 0.6712, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.6990025043487549, + "rewards/margins": 0.6601821780204773, + "rewards/rejected": -2.359184741973877, + "sft_loss": 1.7276928424835205, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 3.1039010392795388, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": -0.3011739253997803, + "logits/rejected": -0.249053955078125, + "logps/chosen": -1.559574842453003, + "logps/rejected": -2.173431873321533, + "loss": 0.6685, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.559574842453003, + "rewards/margins": 0.6138567924499512, + "rewards/rejected": -2.173431873321533, + "sft_loss": 1.5596319437026978, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 16.226665192040024, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": -0.2586892247200012, + "logits/rejected": -0.1233101636171341, + "logps/chosen": -1.565425157546997, + "logps/rejected": -2.539271354675293, + "loss": 0.6813, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.565425157546997, + "rewards/margins": 0.9738461375236511, + "rewards/rejected": -2.539271354675293, + "sft_loss": 1.5736205577850342, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 6.192896673655969, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": -0.2718811631202698, + "logits/rejected": -0.16693784296512604, + "logps/chosen": -1.4871734380722046, + "logps/rejected": -2.1045596599578857, + "loss": 0.6433, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4871734380722046, + "rewards/margins": 0.6173862218856812, + "rewards/rejected": -2.1045596599578857, + "sft_loss": 1.5424994230270386, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 5.251785598075094, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": -0.33662140369415283, + "logits/rejected": -0.2634265422821045, + "logps/chosen": -1.4642804861068726, + "logps/rejected": -2.2976937294006348, + "loss": 0.6679, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4642804861068726, + "rewards/margins": 0.833413302898407, + "rewards/rejected": -2.2976937294006348, + "sft_loss": 1.498628854751587, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 5.977642868911891, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": -0.29837948083877563, + "logits/rejected": -0.07646293193101883, + "logps/chosen": -1.5494577884674072, + "logps/rejected": -2.1560275554656982, + "loss": 0.6637, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5494577884674072, + "rewards/margins": 0.6065697073936462, + "rewards/rejected": -2.1560275554656982, + "sft_loss": 1.5897536277770996, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 12.649656965505333, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": -0.36968958377838135, + "logits/rejected": -0.17792022228240967, + "logps/chosen": -1.5691545009613037, + "logps/rejected": -2.228562355041504, + "loss": 0.6608, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5691545009613037, + "rewards/margins": 0.6594076156616211, + "rewards/rejected": -2.228562355041504, + "sft_loss": 1.635911226272583, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 3.260610776171183, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": -0.23752427101135254, + "logits/rejected": -0.06146562844514847, + "logps/chosen": -1.7419025897979736, + "logps/rejected": -2.3571693897247314, + "loss": 0.6827, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7419025897979736, + "rewards/margins": 0.6152670979499817, + "rewards/rejected": -2.3571693897247314, + "sft_loss": 1.741065263748169, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 9.52227662285607, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": -0.12958386540412903, + "logits/rejected": -0.0022026679944247007, + "logps/chosen": -1.5865254402160645, + "logps/rejected": -2.2228541374206543, + "loss": 0.6833, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5865254402160645, + "rewards/margins": 0.6363285779953003, + "rewards/rejected": -2.2228541374206543, + "sft_loss": 1.5762921571731567, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 2.263229018573152, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": -0.30748385190963745, + "logits/rejected": -0.1827574074268341, + "logps/chosen": -1.4545913934707642, + "logps/rejected": -1.9467980861663818, + "loss": 0.6628, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4545913934707642, + "rewards/margins": 0.49220672249794006, + "rewards/rejected": -1.9467980861663818, + "sft_loss": 1.4630444049835205, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 2.5218768620989045, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": -0.31609639525413513, + "logits/rejected": -0.21111683547496796, + "logps/chosen": -1.7103404998779297, + "logps/rejected": -2.197878122329712, + "loss": 0.6742, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7103404998779297, + "rewards/margins": 0.48753732442855835, + "rewards/rejected": -2.197878122329712, + "sft_loss": 1.681815505027771, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 3.065626489258694, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": -0.3215027451515198, + "logits/rejected": -0.2551218867301941, + "logps/chosen": -1.5578937530517578, + "logps/rejected": -2.097822666168213, + "loss": 0.6535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5578937530517578, + "rewards/margins": 0.5399289727210999, + "rewards/rejected": -2.097822666168213, + "sft_loss": 1.5477797985076904, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 3.2673106603359585, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": -0.18195316195487976, + "logits/rejected": -0.1400514543056488, + "logps/chosen": -1.6033868789672852, + "logps/rejected": -2.114414930343628, + "loss": 0.6608, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6033868789672852, + "rewards/margins": 0.5110281705856323, + "rewards/rejected": -2.114414930343628, + "sft_loss": 1.5997934341430664, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 5.970318175906083, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": -0.22843074798583984, + "logits/rejected": -0.10632483661174774, + "logps/chosen": -1.5803484916687012, + "logps/rejected": -2.1380927562713623, + "loss": 0.6746, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5803484916687012, + "rewards/margins": 0.5577442049980164, + "rewards/rejected": -2.1380927562713623, + "sft_loss": 1.6084749698638916, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 3.184778905784191, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": -0.20204658806324005, + "logits/rejected": -0.07352867722511292, + "logps/chosen": -1.4979311227798462, + "logps/rejected": -2.0505619049072266, + "loss": 0.6623, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4979311227798462, + "rewards/margins": 0.5526308417320251, + "rewards/rejected": -2.0505619049072266, + "sft_loss": 1.604762077331543, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 3.286481646732373, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": -0.29412025213241577, + "logits/rejected": -0.12024509906768799, + "logps/chosen": -1.647534728050232, + "logps/rejected": -2.1293785572052, + "loss": 0.6664, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.647534728050232, + "rewards/margins": 0.48184362053871155, + "rewards/rejected": -2.1293785572052, + "sft_loss": 1.665687918663025, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 3.353615127880277, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": -0.19847139716148376, + "logits/rejected": -0.034717656672000885, + "logps/chosen": -1.6773347854614258, + "logps/rejected": -2.419734239578247, + "loss": 0.6503, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6773347854614258, + "rewards/margins": 0.7423990964889526, + "rewards/rejected": -2.419734239578247, + "sft_loss": 1.6499170064926147, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 5.210551530691637, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": -0.22429361939430237, + "logits/rejected": -0.0938660278916359, + "logps/chosen": -1.628684401512146, + "logps/rejected": -2.2493913173675537, + "loss": 0.6737, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.628684401512146, + "rewards/margins": 0.6207069158554077, + "rewards/rejected": -2.2493913173675537, + "sft_loss": 1.616787314414978, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 5.361114817879042, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": -0.23827588558197021, + "logits/rejected": -0.1531572937965393, + "logps/chosen": -1.5106375217437744, + "logps/rejected": -2.0941097736358643, + "loss": 0.6549, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5106375217437744, + "rewards/margins": 0.5834720134735107, + "rewards/rejected": -2.0941097736358643, + "sft_loss": 1.5395634174346924, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 6.650010887041627, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": -0.2544296681880951, + "logits/rejected": -0.05539902299642563, + "logps/chosen": -1.5374723672866821, + "logps/rejected": -2.1580734252929688, + "loss": 0.6823, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5374723672866821, + "rewards/margins": 0.6206012964248657, + "rewards/rejected": -2.1580734252929688, + "sft_loss": 1.525309443473816, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 4.947389267072906, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": -0.299444317817688, + "logits/rejected": -0.1548498123884201, + "logps/chosen": -1.580509901046753, + "logps/rejected": -2.2148938179016113, + "loss": 0.6664, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.580509901046753, + "rewards/margins": 0.6343838572502136, + "rewards/rejected": -2.2148938179016113, + "sft_loss": 1.6169801950454712, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 4.308547829856648, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": -0.22670379281044006, + "logits/rejected": -0.029366493225097656, + "logps/chosen": -1.557122826576233, + "logps/rejected": -2.1413769721984863, + "loss": 0.6637, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.557122826576233, + "rewards/margins": 0.584254264831543, + "rewards/rejected": -2.1413769721984863, + "sft_loss": 1.6209783554077148, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 5.075572874613157, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": -0.2328222543001175, + "logits/rejected": -0.1593681126832962, + "logps/chosen": -1.5033090114593506, + "logps/rejected": -1.9975850582122803, + "loss": 0.6678, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5033090114593506, + "rewards/margins": 0.4942760467529297, + "rewards/rejected": -1.9975850582122803, + "sft_loss": 1.599522352218628, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 4.517737430236565, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": -0.31304430961608887, + "logits/rejected": -0.17922961711883545, + "logps/chosen": -1.4516620635986328, + "logps/rejected": -2.0089287757873535, + "loss": 0.6662, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4516620635986328, + "rewards/margins": 0.5572668313980103, + "rewards/rejected": -2.0089287757873535, + "sft_loss": 1.5084701776504517, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 12.721843344506373, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": -0.29551082849502563, + "logits/rejected": -0.06938707828521729, + "logps/chosen": -1.5136016607284546, + "logps/rejected": -1.977852463722229, + "loss": 0.6633, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5136016607284546, + "rewards/margins": 0.46425071358680725, + "rewards/rejected": -1.977852463722229, + "sft_loss": 1.5562989711761475, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 12.960155392502875, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": -0.4133186936378479, + "logits/rejected": -0.15416371822357178, + "logps/chosen": -1.5097110271453857, + "logps/rejected": -2.037909507751465, + "loss": 0.6673, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5097110271453857, + "rewards/margins": 0.5281984806060791, + "rewards/rejected": -2.037909507751465, + "sft_loss": 1.5186493396759033, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 3.071835516804881, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": -0.22974996268749237, + "logits/rejected": -0.1381615251302719, + "logps/chosen": -1.5370875597000122, + "logps/rejected": -2.025336503982544, + "loss": 0.671, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5370875597000122, + "rewards/margins": 0.488248735666275, + "rewards/rejected": -2.025336503982544, + "sft_loss": 1.4795281887054443, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 2.9337359048630907, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": -0.3561546206474304, + "logits/rejected": -0.22161278128623962, + "logps/chosen": -1.4861180782318115, + "logps/rejected": -2.141692638397217, + "loss": 0.6721, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4861180782318115, + "rewards/margins": 0.6555744409561157, + "rewards/rejected": -2.141692638397217, + "sft_loss": 1.4962797164916992, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 5.309838370949358, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": -0.25040486454963684, + "logits/rejected": -0.16132552921772003, + "logps/chosen": -1.5923874378204346, + "logps/rejected": -2.195833683013916, + "loss": 0.6695, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5923874378204346, + "rewards/margins": 0.6034458875656128, + "rewards/rejected": -2.195833683013916, + "sft_loss": 1.6128263473510742, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 3.3420280599413, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": -0.22517654299736023, + "logits/rejected": -0.14259059727191925, + "logps/chosen": -1.5689737796783447, + "logps/rejected": -2.3011300563812256, + "loss": 0.6617, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5689737796783447, + "rewards/margins": 0.7321562767028809, + "rewards/rejected": -2.3011300563812256, + "sft_loss": 1.6068799495697021, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 3.4535664969591804, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": -0.2836065888404846, + "logits/rejected": -0.1251402199268341, + "logps/chosen": -1.5470378398895264, + "logps/rejected": -2.167536973953247, + "loss": 0.6596, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5470378398895264, + "rewards/margins": 0.6204993724822998, + "rewards/rejected": -2.167536973953247, + "sft_loss": 1.5432517528533936, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 3.875237419980098, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": -0.19648095965385437, + "logits/rejected": -0.07422200590372086, + "logps/chosen": -1.5497782230377197, + "logps/rejected": -2.1988131999969482, + "loss": 0.6753, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5497782230377197, + "rewards/margins": 0.6490351557731628, + "rewards/rejected": -2.1988131999969482, + "sft_loss": 1.528440237045288, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 5.110834671223678, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": -0.2754908502101898, + "logits/rejected": -0.221980482339859, + "logps/chosen": -1.5026241540908813, + "logps/rejected": -2.2867393493652344, + "loss": 0.6447, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5026241540908813, + "rewards/margins": 0.7841153144836426, + "rewards/rejected": -2.2867393493652344, + "sft_loss": 1.5356286764144897, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 3.5772522043441386, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": -0.18519827723503113, + "logits/rejected": -0.12182275950908661, + "logps/chosen": -1.6455638408660889, + "logps/rejected": -2.175966739654541, + "loss": 0.6552, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6455638408660889, + "rewards/margins": 0.5304029583930969, + "rewards/rejected": -2.175966739654541, + "sft_loss": 1.693975806236267, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 2.7451754612170163, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": -0.22382116317749023, + "logits/rejected": -0.1571996957063675, + "logps/chosen": -1.6252641677856445, + "logps/rejected": -1.9908462762832642, + "loss": 0.6766, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6252641677856445, + "rewards/margins": 0.36558184027671814, + "rewards/rejected": -1.9908462762832642, + "sft_loss": 1.6530452966690063, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 3.577943615855644, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": -0.3366868495941162, + "logits/rejected": -0.12997911870479584, + "logps/chosen": -1.5786283016204834, + "logps/rejected": -2.2249855995178223, + "loss": 0.6663, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5786283016204834, + "rewards/margins": 0.6463571786880493, + "rewards/rejected": -2.2249855995178223, + "sft_loss": 1.5080569982528687, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 4.96797129383118, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": -0.18899324536323547, + "logits/rejected": -0.011387032456696033, + "logps/chosen": -1.5805315971374512, + "logps/rejected": -2.2807295322418213, + "loss": 0.659, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5805315971374512, + "rewards/margins": 0.7001979947090149, + "rewards/rejected": -2.2807295322418213, + "sft_loss": 1.5917775630950928, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": 0.033155109733343124, + "eval_logits/rejected": 0.11886825412511826, + "eval_logps/chosen": -1.5548663139343262, + "eval_logps/rejected": -2.1383421421051025, + "eval_loss": 0.6686433553695679, + "eval_rewards/accuracies": 0.6454005837440491, + "eval_rewards/chosen": -1.5548663139343262, + "eval_rewards/margins": 0.583476185798645, + "eval_rewards/rejected": -2.1383421421051025, + "eval_runtime": 51.671, + "eval_samples_per_second": 26.03, + "eval_sft_loss": 1.5567699670791626, + "eval_steps_per_second": 6.522, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 4.6127131848133995, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": -0.29335904121398926, + "logits/rejected": -0.25090211629867554, + "logps/chosen": -1.569379210472107, + "logps/rejected": -1.9675638675689697, + "loss": 0.6762, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.569379210472107, + "rewards/margins": 0.3981846868991852, + "rewards/rejected": -1.9675638675689697, + "sft_loss": 1.5933501720428467, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 3.2191921337472507, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": -0.23906013369560242, + "logits/rejected": -0.11687849462032318, + "logps/chosen": -1.5674329996109009, + "logps/rejected": -2.1900410652160645, + "loss": 0.6544, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5674329996109009, + "rewards/margins": 0.6226081252098083, + "rewards/rejected": -2.1900410652160645, + "sft_loss": 1.6243540048599243, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 3.14323697314147, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": -0.1652398258447647, + "logits/rejected": -0.047273315489292145, + "logps/chosen": -1.5660088062286377, + "logps/rejected": -2.0418858528137207, + "loss": 0.6756, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5660088062286377, + "rewards/margins": 0.47587698698043823, + "rewards/rejected": -2.0418858528137207, + "sft_loss": 1.5622031688690186, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 3.5297502350856966, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": -0.23163394629955292, + "logits/rejected": -0.1616125851869583, + "logps/chosen": -1.597439169883728, + "logps/rejected": -2.0517027378082275, + "loss": 0.6636, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.597439169883728, + "rewards/margins": 0.45426350831985474, + "rewards/rejected": -2.0517027378082275, + "sft_loss": 1.6499122381210327, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 2.8994360995806447, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": -0.15500815212726593, + "logits/rejected": -0.12774141132831573, + "logps/chosen": -1.534929633140564, + "logps/rejected": -1.972755789756775, + "loss": 0.6625, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.534929633140564, + "rewards/margins": 0.437826007604599, + "rewards/rejected": -1.972755789756775, + "sft_loss": 1.5774520635604858, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 5.11590460585965, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": -0.25896185636520386, + "logits/rejected": -0.2015591561794281, + "logps/chosen": -1.6104027032852173, + "logps/rejected": -2.128175735473633, + "loss": 0.6787, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6104027032852173, + "rewards/margins": 0.5177728533744812, + "rewards/rejected": -2.128175735473633, + "sft_loss": 1.7070982456207275, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 6.88697763601533, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": -0.2818135619163513, + "logits/rejected": -0.18838824331760406, + "logps/chosen": -1.5450584888458252, + "logps/rejected": -2.018775463104248, + "loss": 0.6674, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5450584888458252, + "rewards/margins": 0.4737167954444885, + "rewards/rejected": -2.018775463104248, + "sft_loss": 1.5849950313568115, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 3.560060583655417, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": -0.23087699711322784, + "logits/rejected": -0.11774490773677826, + "logps/chosen": -1.6925204992294312, + "logps/rejected": -1.9472980499267578, + "loss": 0.6799, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6925204992294312, + "rewards/margins": 0.2547776997089386, + "rewards/rejected": -1.9472980499267578, + "sft_loss": 1.6750530004501343, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 3.4389483248324906, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": -0.30328264832496643, + "logits/rejected": -0.1819484382867813, + "logps/chosen": -1.6709623336791992, + "logps/rejected": -1.9654725790023804, + "loss": 0.6793, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6709623336791992, + "rewards/margins": 0.29451045393943787, + "rewards/rejected": -1.9654725790023804, + "sft_loss": 1.6666209697723389, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 4.228768748209242, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": -0.2856011390686035, + "logits/rejected": -0.1793937236070633, + "logps/chosen": -1.5447874069213867, + "logps/rejected": -2.0078554153442383, + "loss": 0.6554, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5447874069213867, + "rewards/margins": 0.4630679488182068, + "rewards/rejected": -2.0078554153442383, + "sft_loss": 1.6015489101409912, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 10.273668179191976, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": -0.25895020365715027, + "logits/rejected": -0.12569567561149597, + "logps/chosen": -1.7206480503082275, + "logps/rejected": -2.194127321243286, + "loss": 0.6877, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7206480503082275, + "rewards/margins": 0.47347909212112427, + "rewards/rejected": -2.194127321243286, + "sft_loss": 1.6576461791992188, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 4.1471744569265425, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": -0.32585427165031433, + "logits/rejected": -0.20504073798656464, + "logps/chosen": -1.5201367139816284, + "logps/rejected": -2.1893112659454346, + "loss": 0.6626, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5201367139816284, + "rewards/margins": 0.6691744327545166, + "rewards/rejected": -2.1893112659454346, + "sft_loss": 1.5931421518325806, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 4.687182181868727, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": -0.2540055215358734, + "logits/rejected": -0.13636021316051483, + "logps/chosen": -1.5124223232269287, + "logps/rejected": -2.206188201904297, + "loss": 0.6558, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5124223232269287, + "rewards/margins": 0.6937659978866577, + "rewards/rejected": -2.206188201904297, + "sft_loss": 1.4735386371612549, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 2.3357998647710394, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": -0.3001280725002289, + "logits/rejected": -0.16386644542217255, + "logps/chosen": -1.5323134660720825, + "logps/rejected": -2.2306857109069824, + "loss": 0.6475, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5323134660720825, + "rewards/margins": 0.6983723640441895, + "rewards/rejected": -2.2306857109069824, + "sft_loss": 1.612134337425232, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 4.424587556968458, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": -0.28842639923095703, + "logits/rejected": -0.1421942263841629, + "logps/chosen": -1.6424224376678467, + "logps/rejected": -2.4093194007873535, + "loss": 0.6581, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6424224376678467, + "rewards/margins": 0.7668969035148621, + "rewards/rejected": -2.4093194007873535, + "sft_loss": 1.6991245746612549, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 5.685577126456062, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": -0.28574010729789734, + "logits/rejected": -0.1689317226409912, + "logps/chosen": -1.4668850898742676, + "logps/rejected": -2.0467896461486816, + "loss": 0.6454, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4668850898742676, + "rewards/margins": 0.5799044370651245, + "rewards/rejected": -2.0467896461486816, + "sft_loss": 1.4965256452560425, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 4.113095336765455, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": -0.30331823229789734, + "logits/rejected": -0.2203049212694168, + "logps/chosen": -1.5306308269500732, + "logps/rejected": -2.2132725715637207, + "loss": 0.6466, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5306308269500732, + "rewards/margins": 0.6826415061950684, + "rewards/rejected": -2.2132725715637207, + "sft_loss": 1.593216896057129, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 2.9720597069657355, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": -0.2730967104434967, + "logits/rejected": -0.1127859354019165, + "logps/chosen": -1.5642287731170654, + "logps/rejected": -2.2336513996124268, + "loss": 0.6702, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5642287731170654, + "rewards/margins": 0.6694225072860718, + "rewards/rejected": -2.2336513996124268, + "sft_loss": 1.5468215942382812, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 2.1520133586927157, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": -0.23968365788459778, + "logits/rejected": -0.18710513412952423, + "logps/chosen": -1.6503604650497437, + "logps/rejected": -2.2875962257385254, + "loss": 0.6614, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6503604650497437, + "rewards/margins": 0.6372357606887817, + "rewards/rejected": -2.2875962257385254, + "sft_loss": 1.695789098739624, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 5.8010732155099225, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": -0.21912772953510284, + "logits/rejected": -0.10847660154104233, + "logps/chosen": -1.5232489109039307, + "logps/rejected": -2.078616142272949, + "loss": 0.6552, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5232489109039307, + "rewards/margins": 0.5553671717643738, + "rewards/rejected": -2.078616142272949, + "sft_loss": 1.5513193607330322, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 4.062617243352844, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": -0.26170676946640015, + "logits/rejected": -0.18718138337135315, + "logps/chosen": -1.5802628993988037, + "logps/rejected": -2.1678214073181152, + "loss": 0.668, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5802628993988037, + "rewards/margins": 0.5875582098960876, + "rewards/rejected": -2.1678214073181152, + "sft_loss": 1.5466351509094238, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 4.399706829417717, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": -0.11614084243774414, + "logits/rejected": -0.049933575093746185, + "logps/chosen": -1.5520092248916626, + "logps/rejected": -2.0661747455596924, + "loss": 0.6619, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5520092248916626, + "rewards/margins": 0.5141655206680298, + "rewards/rejected": -2.0661747455596924, + "sft_loss": 1.6254345178604126, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 3.131404415148893, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": -0.2017994225025177, + "logits/rejected": -0.12663564085960388, + "logps/chosen": -1.5812351703643799, + "logps/rejected": -2.2547194957733154, + "loss": 0.6603, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5812351703643799, + "rewards/margins": 0.6734842658042908, + "rewards/rejected": -2.2547194957733154, + "sft_loss": 1.6243197917938232, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 3.4885571992111593, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": -0.2505125403404236, + "logits/rejected": -0.1667691022157669, + "logps/chosen": -1.5788905620574951, + "logps/rejected": -2.0330827236175537, + "loss": 0.6847, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5788905620574951, + "rewards/margins": 0.45419207215309143, + "rewards/rejected": -2.0330827236175537, + "sft_loss": 1.519217848777771, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 3.5935337638493237, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": -0.2921850085258484, + "logits/rejected": -0.1891443431377411, + "logps/chosen": -1.5669046640396118, + "logps/rejected": -2.1437056064605713, + "loss": 0.6698, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5669046640396118, + "rewards/margins": 0.576801061630249, + "rewards/rejected": -2.1437056064605713, + "sft_loss": 1.6299469470977783, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 3.83568295626339, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": -0.20006486773490906, + "logits/rejected": -0.07037347555160522, + "logps/chosen": -1.504563570022583, + "logps/rejected": -2.0518879890441895, + "loss": 0.6536, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.504563570022583, + "rewards/margins": 0.5473244786262512, + "rewards/rejected": -2.0518879890441895, + "sft_loss": 1.5179142951965332, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 4.007642145196086, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": -0.17184345424175262, + "logits/rejected": -0.08726246654987335, + "logps/chosen": -1.6674957275390625, + "logps/rejected": -1.9999866485595703, + "loss": 0.691, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6674957275390625, + "rewards/margins": 0.3324908912181854, + "rewards/rejected": -1.9999866485595703, + "sft_loss": 1.6672461032867432, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 2.6207362127955203, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": -0.1908237785100937, + "logits/rejected": -0.13750803470611572, + "logps/chosen": -1.671579360961914, + "logps/rejected": -2.0861868858337402, + "loss": 0.6707, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.671579360961914, + "rewards/margins": 0.41460782289505005, + "rewards/rejected": -2.0861868858337402, + "sft_loss": 1.5895977020263672, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 3.4651381445485896, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": -0.24086709320545197, + "logits/rejected": -0.149921253323555, + "logps/chosen": -1.6103401184082031, + "logps/rejected": -2.1344754695892334, + "loss": 0.6629, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6103401184082031, + "rewards/margins": 0.5241352915763855, + "rewards/rejected": -2.1344754695892334, + "sft_loss": 1.5900582075119019, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 2.9554036013361262, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": -0.30406874418258667, + "logits/rejected": -0.21627768874168396, + "logps/chosen": -1.4910343885421753, + "logps/rejected": -2.131004810333252, + "loss": 0.6288, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4910343885421753, + "rewards/margins": 0.6399704217910767, + "rewards/rejected": -2.131004810333252, + "sft_loss": 1.4939466714859009, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 5.576360948284245, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": -0.25901031494140625, + "logits/rejected": -0.1560325026512146, + "logps/chosen": -1.5033503770828247, + "logps/rejected": -2.2636451721191406, + "loss": 0.6319, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5033503770828247, + "rewards/margins": 0.7602945566177368, + "rewards/rejected": -2.2636451721191406, + "sft_loss": 1.546751618385315, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 3.1028811255439384, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": -0.34134721755981445, + "logits/rejected": -0.30794721841812134, + "logps/chosen": -1.512274980545044, + "logps/rejected": -2.315218687057495, + "loss": 0.638, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.512274980545044, + "rewards/margins": 0.8029438853263855, + "rewards/rejected": -2.315218687057495, + "sft_loss": 1.582011342048645, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 11.087816656768059, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": -0.20825648307800293, + "logits/rejected": -0.02212923765182495, + "logps/chosen": -1.614130973815918, + "logps/rejected": -2.3036086559295654, + "loss": 0.6606, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.614130973815918, + "rewards/margins": 0.6894776821136475, + "rewards/rejected": -2.3036086559295654, + "sft_loss": 1.5822635889053345, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 4.456838947656932, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": -0.35738760232925415, + "logits/rejected": -0.19936344027519226, + "logps/chosen": -1.4727087020874023, + "logps/rejected": -2.3376715183258057, + "loss": 0.6227, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4727087020874023, + "rewards/margins": 0.8649626970291138, + "rewards/rejected": -2.3376715183258057, + "sft_loss": 1.5102074146270752, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 3.416057918617279, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": -0.2814391255378723, + "logits/rejected": -0.2211610972881317, + "logps/chosen": -1.594720721244812, + "logps/rejected": -2.1970386505126953, + "loss": 0.6653, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.594720721244812, + "rewards/margins": 0.6023179292678833, + "rewards/rejected": -2.1970386505126953, + "sft_loss": 1.5907560586929321, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 4.021249068139565, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": -0.21834734082221985, + "logits/rejected": -0.17740324139595032, + "logps/chosen": -1.569536566734314, + "logps/rejected": -2.200033664703369, + "loss": 0.6593, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.569536566734314, + "rewards/margins": 0.6304970979690552, + "rewards/rejected": -2.200033664703369, + "sft_loss": 1.605539321899414, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 3.621760161352139, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": -0.28344467282295227, + "logits/rejected": -0.1576431542634964, + "logps/chosen": -1.5895583629608154, + "logps/rejected": -2.3175249099731445, + "loss": 0.6516, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5895583629608154, + "rewards/margins": 0.7279663681983948, + "rewards/rejected": -2.3175249099731445, + "sft_loss": 1.5516364574432373, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 3.4096565110748682, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": -0.3263254165649414, + "logits/rejected": -0.15622875094413757, + "logps/chosen": -1.5148825645446777, + "logps/rejected": -2.298409938812256, + "loss": 0.6225, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5148825645446777, + "rewards/margins": 0.7835271954536438, + "rewards/rejected": -2.298409938812256, + "sft_loss": 1.5991499423980713, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 3.0479720277715945, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": -0.24460425972938538, + "logits/rejected": -0.2142985314130783, + "logps/chosen": -1.5362884998321533, + "logps/rejected": -2.007601261138916, + "loss": 0.6638, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5362884998321533, + "rewards/margins": 0.47131237387657166, + "rewards/rejected": -2.007601261138916, + "sft_loss": 1.5806416273117065, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 3.5161509907695585, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": -0.27440527081489563, + "logits/rejected": -0.15014337003231049, + "logps/chosen": -1.641608476638794, + "logps/rejected": -2.256495237350464, + "loss": 0.646, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.641608476638794, + "rewards/margins": 0.6148865818977356, + "rewards/rejected": -2.256495237350464, + "sft_loss": 1.6694062948226929, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 10.42396913663668, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": -0.2544073164463043, + "logits/rejected": -0.14254014194011688, + "logps/chosen": -1.496002435684204, + "logps/rejected": -2.034226417541504, + "loss": 0.6467, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.496002435684204, + "rewards/margins": 0.5382241010665894, + "rewards/rejected": -2.034226417541504, + "sft_loss": 1.5158672332763672, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 4.383190990614828, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": -0.16585926711559296, + "logits/rejected": -0.07460494339466095, + "logps/chosen": -1.492472767829895, + "logps/rejected": -2.326233386993408, + "loss": 0.6423, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.492472767829895, + "rewards/margins": 0.8337606191635132, + "rewards/rejected": -2.326233386993408, + "sft_loss": 1.4879860877990723, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 3.167380791186394, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": -0.3844323754310608, + "logits/rejected": -0.26187294721603394, + "logps/chosen": -1.649530053138733, + "logps/rejected": -2.1097311973571777, + "loss": 0.6691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.649530053138733, + "rewards/margins": 0.46020111441612244, + "rewards/rejected": -2.1097311973571777, + "sft_loss": 1.5694371461868286, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 2.1309857741540585, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": -0.3505423665046692, + "logits/rejected": -0.15348538756370544, + "logps/chosen": -1.6408374309539795, + "logps/rejected": -2.3227627277374268, + "loss": 0.6696, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6408374309539795, + "rewards/margins": 0.6819251775741577, + "rewards/rejected": -2.3227627277374268, + "sft_loss": 1.650193452835083, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 6.177280648723578, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": -0.2825758159160614, + "logits/rejected": -0.17822478711605072, + "logps/chosen": -1.496321439743042, + "logps/rejected": -2.104548931121826, + "loss": 0.6507, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.496321439743042, + "rewards/margins": 0.608227550983429, + "rewards/rejected": -2.104548931121826, + "sft_loss": 1.5814179182052612, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 3.385853799454223, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": -0.3522363603115082, + "logits/rejected": -0.20810142159461975, + "logps/chosen": -1.589411973953247, + "logps/rejected": -2.328275680541992, + "loss": 0.6449, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.589411973953247, + "rewards/margins": 0.7388638854026794, + "rewards/rejected": -2.328275680541992, + "sft_loss": 1.5857503414154053, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 5.855398115427702, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": -0.2700735330581665, + "logits/rejected": -0.11895928531885147, + "logps/chosen": -1.561927080154419, + "logps/rejected": -2.117443561553955, + "loss": 0.6631, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.561927080154419, + "rewards/margins": 0.5555165410041809, + "rewards/rejected": -2.117443561553955, + "sft_loss": 1.5860936641693115, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 3.8412171916270657, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": -0.23423747718334198, + "logits/rejected": -0.12965528666973114, + "logps/chosen": -1.4988791942596436, + "logps/rejected": -2.19246768951416, + "loss": 0.6334, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4988791942596436, + "rewards/margins": 0.6935886144638062, + "rewards/rejected": -2.19246768951416, + "sft_loss": 1.516981840133667, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 3.476287578265999, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": -0.3897637724876404, + "logits/rejected": -0.16315045952796936, + "logps/chosen": -1.5861146450042725, + "logps/rejected": -2.204552412033081, + "loss": 0.6569, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5861146450042725, + "rewards/margins": 0.618437647819519, + "rewards/rejected": -2.204552412033081, + "sft_loss": 1.6054118871688843, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 4.297658138812751, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": -0.28043732047080994, + "logits/rejected": -0.11735528707504272, + "logps/chosen": -1.5826841592788696, + "logps/rejected": -2.1900174617767334, + "loss": 0.6648, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5826841592788696, + "rewards/margins": 0.6073335409164429, + "rewards/rejected": -2.1900174617767334, + "sft_loss": 1.5941132307052612, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 3.9597984184611454, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": -0.2457316368818283, + "logits/rejected": -0.06879222393035889, + "logps/chosen": -1.5499293804168701, + "logps/rejected": -2.3246564865112305, + "loss": 0.6629, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5499293804168701, + "rewards/margins": 0.7747267484664917, + "rewards/rejected": -2.3246564865112305, + "sft_loss": 1.531903624534607, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 2.403023847709752, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": -0.27923932671546936, + "logits/rejected": -0.1388339400291443, + "logps/chosen": -1.5517905950546265, + "logps/rejected": -2.3513119220733643, + "loss": 0.6612, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5517905950546265, + "rewards/margins": 0.7995215654373169, + "rewards/rejected": -2.3513119220733643, + "sft_loss": 1.5326491594314575, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 4.016783042699588, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": -0.3711521625518799, + "logits/rejected": -0.19031788408756256, + "logps/chosen": -1.6756954193115234, + "logps/rejected": -2.082862615585327, + "loss": 0.6621, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6756954193115234, + "rewards/margins": 0.40716734528541565, + "rewards/rejected": -2.082862615585327, + "sft_loss": 1.6878407001495361, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 3.024186762527514, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": -0.28381603956222534, + "logits/rejected": -0.22497475147247314, + "logps/chosen": -1.4522531032562256, + "logps/rejected": -2.0683815479278564, + "loss": 0.6459, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4522531032562256, + "rewards/margins": 0.6161283850669861, + "rewards/rejected": -2.0683815479278564, + "sft_loss": 1.4766418933868408, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 4.156505428484555, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": -0.2825126647949219, + "logits/rejected": -0.17621538043022156, + "logps/chosen": -1.5356099605560303, + "logps/rejected": -2.134596586227417, + "loss": 0.6389, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5356099605560303, + "rewards/margins": 0.5989863276481628, + "rewards/rejected": -2.134596586227417, + "sft_loss": 1.6044524908065796, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 2.3562440156593265, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": -0.29220303893089294, + "logits/rejected": -0.130177840590477, + "logps/chosen": -1.3875385522842407, + "logps/rejected": -2.1555233001708984, + "loss": 0.636, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3875385522842407, + "rewards/margins": 0.7679846882820129, + "rewards/rejected": -2.1555233001708984, + "sft_loss": 1.4205989837646484, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 3.358840986599196, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": -0.327506422996521, + "logits/rejected": -0.18542982637882233, + "logps/chosen": -1.4863008260726929, + "logps/rejected": -2.287219762802124, + "loss": 0.6454, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4863008260726929, + "rewards/margins": 0.8009187579154968, + "rewards/rejected": -2.287219762802124, + "sft_loss": 1.52559494972229, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 5.410217880305303, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": -0.22475166618824005, + "logits/rejected": -0.12966808676719666, + "logps/chosen": -1.5296286344528198, + "logps/rejected": -2.1420018672943115, + "loss": 0.6536, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5296286344528198, + "rewards/margins": 0.6123733520507812, + "rewards/rejected": -2.1420018672943115, + "sft_loss": 1.5668294429779053, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 3.9021667690936757, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": -0.31892210245132446, + "logits/rejected": -0.08537305891513824, + "logps/chosen": -1.5146350860595703, + "logps/rejected": -2.2054097652435303, + "loss": 0.6532, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5146350860595703, + "rewards/margins": 0.6907747983932495, + "rewards/rejected": -2.2054097652435303, + "sft_loss": 1.554931640625, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 3.063283540434889, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": -0.180180162191391, + "logits/rejected": -0.1619834005832672, + "logps/chosen": -1.5252013206481934, + "logps/rejected": -2.1712911128997803, + "loss": 0.6483, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5252013206481934, + "rewards/margins": 0.6460900902748108, + "rewards/rejected": -2.1712911128997803, + "sft_loss": 1.5667892694473267, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 14.199748316575338, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": -0.246128648519516, + "logits/rejected": -0.15304933488368988, + "logps/chosen": -1.6168506145477295, + "logps/rejected": -2.1350083351135254, + "loss": 0.6645, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6168506145477295, + "rewards/margins": 0.518157422542572, + "rewards/rejected": -2.1350083351135254, + "sft_loss": 1.5538456439971924, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 2.8839854578578246, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": -0.22317853569984436, + "logits/rejected": -0.10486292839050293, + "logps/chosen": -1.4757004976272583, + "logps/rejected": -2.0389535427093506, + "loss": 0.6426, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4757004976272583, + "rewards/margins": 0.5632533431053162, + "rewards/rejected": -2.0389535427093506, + "sft_loss": 1.4846004247665405, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 2.2850158307723807, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": -0.30444300174713135, + "logits/rejected": -0.14454534649848938, + "logps/chosen": -1.5955278873443604, + "logps/rejected": -2.248922824859619, + "loss": 0.6674, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5955278873443604, + "rewards/margins": 0.6533945798873901, + "rewards/rejected": -2.248922824859619, + "sft_loss": 1.6477130651474, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 2.2209000145296645, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": -0.3080112934112549, + "logits/rejected": -0.15310274064540863, + "logps/chosen": -1.5924137830734253, + "logps/rejected": -2.4808449745178223, + "loss": 0.6519, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5924137830734253, + "rewards/margins": 0.8884310722351074, + "rewards/rejected": -2.4808449745178223, + "sft_loss": 1.5781221389770508, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 5.045439828682632, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": -0.2884170711040497, + "logits/rejected": -0.18335895240306854, + "logps/chosen": -1.4607820510864258, + "logps/rejected": -2.164994716644287, + "loss": 0.6311, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4607820510864258, + "rewards/margins": 0.7042129039764404, + "rewards/rejected": -2.164994716644287, + "sft_loss": 1.5072251558303833, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 2.3941074380032408, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": -0.3501512110233307, + "logits/rejected": -0.2043345868587494, + "logps/chosen": -1.5407905578613281, + "logps/rejected": -2.1239986419677734, + "loss": 0.6568, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5407905578613281, + "rewards/margins": 0.5832081437110901, + "rewards/rejected": -2.1239986419677734, + "sft_loss": 1.5432384014129639, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 3.2289831570641385, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": -0.23360323905944824, + "logits/rejected": -0.16891071200370789, + "logps/chosen": -1.556929349899292, + "logps/rejected": -2.2463905811309814, + "loss": 0.6374, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.556929349899292, + "rewards/margins": 0.689461350440979, + "rewards/rejected": -2.2463905811309814, + "sft_loss": 1.5343983173370361, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 6.043689603162791, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": -0.2577422857284546, + "logits/rejected": -0.20512935519218445, + "logps/chosen": -1.484359860420227, + "logps/rejected": -2.2721476554870605, + "loss": 0.6306, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.484359860420227, + "rewards/margins": 0.7877878546714783, + "rewards/rejected": -2.2721476554870605, + "sft_loss": 1.422821283340454, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 3.7634708186300014, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": -0.26054519414901733, + "logits/rejected": -0.09344630688428879, + "logps/chosen": -1.6033008098602295, + "logps/rejected": -2.1131608486175537, + "loss": 0.6645, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6033008098602295, + "rewards/margins": 0.5098596215248108, + "rewards/rejected": -2.1131608486175537, + "sft_loss": 1.6177575588226318, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 5.82072101056835, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": -0.20669682323932648, + "logits/rejected": -0.05159702152013779, + "logps/chosen": -1.501251459121704, + "logps/rejected": -2.1043949127197266, + "loss": 0.6557, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.501251459121704, + "rewards/margins": 0.6031434535980225, + "rewards/rejected": -2.1043949127197266, + "sft_loss": 1.5311315059661865, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 3.6290893610895165, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": -0.3596315085887909, + "logits/rejected": -0.27861496806144714, + "logps/chosen": -1.4975078105926514, + "logps/rejected": -2.2116639614105225, + "loss": 0.6398, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4975078105926514, + "rewards/margins": 0.7141561508178711, + "rewards/rejected": -2.2116639614105225, + "sft_loss": 1.5466364622116089, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 5.50609314622216, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": -0.23720446228981018, + "logits/rejected": -0.12143871933221817, + "logps/chosen": -1.564563274383545, + "logps/rejected": -2.4740467071533203, + "loss": 0.6358, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.564563274383545, + "rewards/margins": 0.9094836115837097, + "rewards/rejected": -2.4740467071533203, + "sft_loss": 1.6424548625946045, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 5.574264917815816, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": -0.21673135459423065, + "logits/rejected": -0.10895593464374542, + "logps/chosen": -1.4948537349700928, + "logps/rejected": -2.264178514480591, + "loss": 0.6406, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4948537349700928, + "rewards/margins": 0.769324779510498, + "rewards/rejected": -2.264178514480591, + "sft_loss": 1.5858509540557861, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 2.995368540835511, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": -0.26554349064826965, + "logits/rejected": -0.17646172642707825, + "logps/chosen": -1.5605789422988892, + "logps/rejected": -2.2075772285461426, + "loss": 0.6429, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5605789422988892, + "rewards/margins": 0.6469985842704773, + "rewards/rejected": -2.2075772285461426, + "sft_loss": 1.6384786367416382, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 4.42950380392317, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": -0.19056487083435059, + "logits/rejected": -0.092097207903862, + "logps/chosen": -1.4302465915679932, + "logps/rejected": -1.9939210414886475, + "loss": 0.6379, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4302465915679932, + "rewards/margins": 0.5636744499206543, + "rewards/rejected": -1.9939210414886475, + "sft_loss": 1.4193260669708252, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 3.2045560843805427, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": -0.29241254925727844, + "logits/rejected": -0.217814639210701, + "logps/chosen": -1.4759756326675415, + "logps/rejected": -2.192446231842041, + "loss": 0.6197, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4759756326675415, + "rewards/margins": 0.7164705395698547, + "rewards/rejected": -2.192446231842041, + "sft_loss": 1.5651328563690186, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 3.104383102470415, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": -0.33143699169158936, + "logits/rejected": -0.2266085147857666, + "logps/chosen": -1.6010643243789673, + "logps/rejected": -2.2874233722686768, + "loss": 0.6492, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6010643243789673, + "rewards/margins": 0.6863590478897095, + "rewards/rejected": -2.2874233722686768, + "sft_loss": 1.561039686203003, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 4.58052162712318, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": -0.36959215998649597, + "logits/rejected": -0.23951435089111328, + "logps/chosen": -1.376029372215271, + "logps/rejected": -2.1417298316955566, + "loss": 0.6264, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.376029372215271, + "rewards/margins": 0.7657004594802856, + "rewards/rejected": -2.1417298316955566, + "sft_loss": 1.4588136672973633, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 2.7058154520054187, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": -0.417828232049942, + "logits/rejected": -0.17566026747226715, + "logps/chosen": -1.5764210224151611, + "logps/rejected": -2.2656896114349365, + "loss": 0.6617, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5764210224151611, + "rewards/margins": 0.6892686486244202, + "rewards/rejected": -2.2656896114349365, + "sft_loss": 1.5459423065185547, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 12.57124495807514, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": -0.27160823345184326, + "logits/rejected": -0.17148330807685852, + "logps/chosen": -1.4522453546524048, + "logps/rejected": -2.23818302154541, + "loss": 0.6241, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4522453546524048, + "rewards/margins": 0.7859378457069397, + "rewards/rejected": -2.23818302154541, + "sft_loss": 1.5470503568649292, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": 0.09168983995914459, + "eval_logits/rejected": 0.1840367317199707, + "eval_logps/chosen": -1.5837377309799194, + "eval_logps/rejected": -2.1770241260528564, + "eval_loss": 0.6688703298568726, + "eval_rewards/accuracies": 0.6454005837440491, + "eval_rewards/chosen": -1.5837377309799194, + "eval_rewards/margins": 0.5932866930961609, + "eval_rewards/rejected": -2.1770241260528564, + "eval_runtime": 43.8362, + "eval_samples_per_second": 30.682, + "eval_sft_loss": 1.5858901739120483, + "eval_steps_per_second": 7.688, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 2.5281695533267596, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": -0.2702171206474304, + "logits/rejected": -0.056992627680301666, + "logps/chosen": -1.7079509496688843, + "logps/rejected": -2.2261626720428467, + "loss": 0.6732, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7079509496688843, + "rewards/margins": 0.5182119607925415, + "rewards/rejected": -2.2261626720428467, + "sft_loss": 1.642205834388733, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 3.0027996441018887, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": -0.30804362893104553, + "logits/rejected": -0.21121926605701447, + "logps/chosen": -1.5518161058425903, + "logps/rejected": -2.1368958950042725, + "loss": 0.6447, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5518161058425903, + "rewards/margins": 0.5850798487663269, + "rewards/rejected": -2.1368958950042725, + "sft_loss": 1.5524580478668213, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 4.040083493049222, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": -0.3009427487850189, + "logits/rejected": -0.051604628562927246, + "logps/chosen": -1.5372031927108765, + "logps/rejected": -2.121778964996338, + "loss": 0.662, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5372031927108765, + "rewards/margins": 0.5845755338668823, + "rewards/rejected": -2.121778964996338, + "sft_loss": 1.578824758529663, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 3.234270915114087, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": -0.3834315538406372, + "logits/rejected": -0.23764514923095703, + "logps/chosen": -1.5670106410980225, + "logps/rejected": -2.0982117652893066, + "loss": 0.667, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5670106410980225, + "rewards/margins": 0.5312010049819946, + "rewards/rejected": -2.0982117652893066, + "sft_loss": 1.6243221759796143, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 3.270374268099964, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": -0.2835990786552429, + "logits/rejected": -0.18782752752304077, + "logps/chosen": -1.4929298162460327, + "logps/rejected": -2.2040629386901855, + "loss": 0.654, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4929298162460327, + "rewards/margins": 0.7111331820487976, + "rewards/rejected": -2.2040629386901855, + "sft_loss": 1.5205652713775635, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 4.252244719400194, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": -0.3299658000469208, + "logits/rejected": -0.2453681230545044, + "logps/chosen": -1.530896782875061, + "logps/rejected": -2.152971029281616, + "loss": 0.6482, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.530896782875061, + "rewards/margins": 0.6220741868019104, + "rewards/rejected": -2.152971029281616, + "sft_loss": 1.5590530633926392, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 8.025842772881083, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": -0.2988958954811096, + "logits/rejected": -0.25589337944984436, + "logps/chosen": -1.4339474439620972, + "logps/rejected": -1.829405426979065, + "loss": 0.6578, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4339474439620972, + "rewards/margins": 0.3954579532146454, + "rewards/rejected": -1.829405426979065, + "sft_loss": 1.5186035633087158, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 3.6713172453330567, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": -0.30180278420448303, + "logits/rejected": -0.2629424035549164, + "logps/chosen": -1.4053932428359985, + "logps/rejected": -2.0193417072296143, + "loss": 0.6537, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4053932428359985, + "rewards/margins": 0.613948404788971, + "rewards/rejected": -2.0193417072296143, + "sft_loss": 1.430471658706665, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 3.6517684799723407, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": -0.22194473445415497, + "logits/rejected": -0.08967797458171844, + "logps/chosen": -1.540501356124878, + "logps/rejected": -2.0949809551239014, + "loss": 0.6628, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.540501356124878, + "rewards/margins": 0.554479718208313, + "rewards/rejected": -2.0949809551239014, + "sft_loss": 1.6057672500610352, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 3.402377140438689, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": -0.32742300629615784, + "logits/rejected": -0.22545237839221954, + "logps/chosen": -1.543937087059021, + "logps/rejected": -1.9740203619003296, + "loss": 0.655, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.543937087059021, + "rewards/margins": 0.43008336424827576, + "rewards/rejected": -1.9740203619003296, + "sft_loss": 1.6045637130737305, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 2.534355319341989, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": -0.21653883159160614, + "logits/rejected": -0.16043604910373688, + "logps/chosen": -1.5343761444091797, + "logps/rejected": -1.8853505849838257, + "loss": 0.6673, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5343761444091797, + "rewards/margins": 0.3509746193885803, + "rewards/rejected": -1.8853505849838257, + "sft_loss": 1.550243854522705, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 3.171312253268627, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": -0.12519961595535278, + "logits/rejected": -0.020199311897158623, + "logps/chosen": -1.4559218883514404, + "logps/rejected": -2.0748279094696045, + "loss": 0.6433, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4559218883514404, + "rewards/margins": 0.6189061999320984, + "rewards/rejected": -2.0748279094696045, + "sft_loss": 1.5062953233718872, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 2.7594609214548056, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": -0.358981192111969, + "logits/rejected": -0.2713465690612793, + "logps/chosen": -1.5313631296157837, + "logps/rejected": -2.1783242225646973, + "loss": 0.6448, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5313631296157837, + "rewards/margins": 0.6469610333442688, + "rewards/rejected": -2.1783242225646973, + "sft_loss": 1.5525610446929932, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 6.121162843444098, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": -0.2529239058494568, + "logits/rejected": -0.05141264200210571, + "logps/chosen": -1.5103384256362915, + "logps/rejected": -2.1851370334625244, + "loss": 0.6413, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5103384256362915, + "rewards/margins": 0.6747983694076538, + "rewards/rejected": -2.1851370334625244, + "sft_loss": 1.5632578134536743, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 2.9500682885294665, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": -0.300545334815979, + "logits/rejected": -0.12688735127449036, + "logps/chosen": -1.6066920757293701, + "logps/rejected": -2.146780014038086, + "loss": 0.6537, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6066920757293701, + "rewards/margins": 0.5400879383087158, + "rewards/rejected": -2.146780014038086, + "sft_loss": 1.584996223449707, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 5.603556259094692, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": -0.29074740409851074, + "logits/rejected": -0.15702712535858154, + "logps/chosen": -1.6081063747406006, + "logps/rejected": -2.3623788356781006, + "loss": 0.6541, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6081063747406006, + "rewards/margins": 0.7542725801467896, + "rewards/rejected": -2.3623788356781006, + "sft_loss": 1.6760278940200806, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 4.607702897541227, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": -0.24949641525745392, + "logits/rejected": -0.14471934735774994, + "logps/chosen": -1.5316954851150513, + "logps/rejected": -2.07235050201416, + "loss": 0.648, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5316954851150513, + "rewards/margins": 0.540655255317688, + "rewards/rejected": -2.07235050201416, + "sft_loss": 1.5349326133728027, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 3.898894706432375, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": -0.3402459919452667, + "logits/rejected": -0.14765772223472595, + "logps/chosen": -1.5174437761306763, + "logps/rejected": -2.1393513679504395, + "loss": 0.6519, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5174437761306763, + "rewards/margins": 0.6219078302383423, + "rewards/rejected": -2.1393513679504395, + "sft_loss": 1.5297417640686035, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 3.829847486281691, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": -0.30611926317214966, + "logits/rejected": -0.22039365768432617, + "logps/chosen": -1.5752049684524536, + "logps/rejected": -2.075282335281372, + "loss": 0.6605, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5752049684524536, + "rewards/margins": 0.5000771284103394, + "rewards/rejected": -2.075282335281372, + "sft_loss": 1.624929428100586, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 4.689857036556933, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": -0.22249610722064972, + "logits/rejected": -0.08447030931711197, + "logps/chosen": -1.3815150260925293, + "logps/rejected": -2.1207103729248047, + "loss": 0.6157, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3815150260925293, + "rewards/margins": 0.7391951680183411, + "rewards/rejected": -2.1207103729248047, + "sft_loss": 1.515148401260376, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 3.130096858285971, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": -0.2264455258846283, + "logits/rejected": -0.024552499875426292, + "logps/chosen": -1.628535270690918, + "logps/rejected": -2.1242098808288574, + "loss": 0.6589, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.628535270690918, + "rewards/margins": 0.4956747591495514, + "rewards/rejected": -2.1242098808288574, + "sft_loss": 1.5905323028564453, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 6.957922549826785, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": -0.2159980833530426, + "logits/rejected": -0.037352751940488815, + "logps/chosen": -1.5755895376205444, + "logps/rejected": -2.074293375015259, + "loss": 0.6552, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5755895376205444, + "rewards/margins": 0.4987037777900696, + "rewards/rejected": -2.074293375015259, + "sft_loss": 1.5881279706954956, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 4.253885036940023, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": -0.20722968876361847, + "logits/rejected": -0.1629769206047058, + "logps/chosen": -1.5342538356781006, + "logps/rejected": -2.29413104057312, + "loss": 0.6493, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5342538356781006, + "rewards/margins": 0.7598771452903748, + "rewards/rejected": -2.29413104057312, + "sft_loss": 1.5792230367660522, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 3.799657751145327, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": -0.26215869188308716, + "logits/rejected": -0.1872785985469818, + "logps/chosen": -1.4091601371765137, + "logps/rejected": -2.145296812057495, + "loss": 0.6212, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4091601371765137, + "rewards/margins": 0.736136794090271, + "rewards/rejected": -2.145296812057495, + "sft_loss": 1.5159685611724854, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 3.359631802849689, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": -0.3127399981021881, + "logits/rejected": -0.10671690851449966, + "logps/chosen": -1.4820116758346558, + "logps/rejected": -2.274005174636841, + "loss": 0.6364, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4820116758346558, + "rewards/margins": 0.7919936180114746, + "rewards/rejected": -2.274005174636841, + "sft_loss": 1.5597707033157349, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 4.013079286547791, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": -0.2609347999095917, + "logits/rejected": -0.17760607600212097, + "logps/chosen": -1.5475456714630127, + "logps/rejected": -2.2146785259246826, + "loss": 0.6566, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5475456714630127, + "rewards/margins": 0.6671331524848938, + "rewards/rejected": -2.2146785259246826, + "sft_loss": 1.6043469905853271, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 5.388820016376569, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": -0.25289368629455566, + "logits/rejected": -0.2507743835449219, + "logps/chosen": -1.6624103784561157, + "logps/rejected": -2.370983600616455, + "loss": 0.6575, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6624103784561157, + "rewards/margins": 0.7085734605789185, + "rewards/rejected": -2.370983600616455, + "sft_loss": 1.6707814931869507, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 3.4805850054249725, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": -0.3757517337799072, + "logits/rejected": -0.29435932636260986, + "logps/chosen": -1.5222690105438232, + "logps/rejected": -2.18015718460083, + "loss": 0.6552, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5222690105438232, + "rewards/margins": 0.6578881740570068, + "rewards/rejected": -2.18015718460083, + "sft_loss": 1.5597175359725952, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 4.618202865883473, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": -0.15316875278949738, + "logits/rejected": -0.01105407066643238, + "logps/chosen": -1.7164844274520874, + "logps/rejected": -2.389507293701172, + "loss": 0.6654, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7164844274520874, + "rewards/margins": 0.6730228662490845, + "rewards/rejected": -2.389507293701172, + "sft_loss": 1.6216520071029663, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 4.228533631865109, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": -0.2290269434452057, + "logits/rejected": -0.138050839304924, + "logps/chosen": -1.4907031059265137, + "logps/rejected": -2.2492947578430176, + "loss": 0.6385, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4907031059265137, + "rewards/margins": 0.7585914731025696, + "rewards/rejected": -2.2492947578430176, + "sft_loss": 1.5580991506576538, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 3.554780898138054, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": -0.19468359649181366, + "logits/rejected": -0.20953765511512756, + "logps/chosen": -1.5502973794937134, + "logps/rejected": -2.0763301849365234, + "loss": 0.6427, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5502973794937134, + "rewards/margins": 0.5260329246520996, + "rewards/rejected": -2.0763301849365234, + "sft_loss": 1.5064983367919922, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 3.635594365625625, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": -0.27188238501548767, + "logits/rejected": -0.12990307807922363, + "logps/chosen": -1.6696773767471313, + "logps/rejected": -2.155397415161133, + "loss": 0.6537, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6696773767471313, + "rewards/margins": 0.48572009801864624, + "rewards/rejected": -2.155397415161133, + "sft_loss": 1.6960710287094116, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 3.9050399786833174, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": -0.3017478287220001, + "logits/rejected": -0.20494893193244934, + "logps/chosen": -1.5754562616348267, + "logps/rejected": -2.265209197998047, + "loss": 0.6462, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5754562616348267, + "rewards/margins": 0.6897529363632202, + "rewards/rejected": -2.265209197998047, + "sft_loss": 1.5912220478057861, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 4.382291005981211, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": -0.19715535640716553, + "logits/rejected": -0.13701540231704712, + "logps/chosen": -1.5412023067474365, + "logps/rejected": -2.240354061126709, + "loss": 0.6442, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5412023067474365, + "rewards/margins": 0.6991516947746277, + "rewards/rejected": -2.240354061126709, + "sft_loss": 1.5862661600112915, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 4.050601551719255, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": -0.312357634305954, + "logits/rejected": -0.21406638622283936, + "logps/chosen": -1.5535626411437988, + "logps/rejected": -1.9893906116485596, + "loss": 0.6473, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5535626411437988, + "rewards/margins": 0.4358278214931488, + "rewards/rejected": -1.9893906116485596, + "sft_loss": 1.5592749118804932, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 5.3727559137210115, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": -0.3214295506477356, + "logits/rejected": -0.11621556431055069, + "logps/chosen": -1.5065670013427734, + "logps/rejected": -2.185896396636963, + "loss": 0.6436, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5065670013427734, + "rewards/margins": 0.679329514503479, + "rewards/rejected": -2.185896396636963, + "sft_loss": 1.5351276397705078, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 12.061432178974616, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": -0.34014302492141724, + "logits/rejected": -0.3240959048271179, + "logps/chosen": -1.446756362915039, + "logps/rejected": -2.0364651679992676, + "loss": 0.6389, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.446756362915039, + "rewards/margins": 0.589708685874939, + "rewards/rejected": -2.0364651679992676, + "sft_loss": 1.487903356552124, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 6.0753641029472485, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": -0.20700252056121826, + "logits/rejected": -0.11087401211261749, + "logps/chosen": -1.531902551651001, + "logps/rejected": -2.245913505554199, + "loss": 0.6461, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.531902551651001, + "rewards/margins": 0.7140110731124878, + "rewards/rejected": -2.245913505554199, + "sft_loss": 1.5783272981643677, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 2.6050887282150343, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": -0.3839908242225647, + "logits/rejected": -0.23379337787628174, + "logps/chosen": -1.4995805025100708, + "logps/rejected": -2.030235767364502, + "loss": 0.6446, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4995805025100708, + "rewards/margins": 0.5306550860404968, + "rewards/rejected": -2.030235767364502, + "sft_loss": 1.5718003511428833, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 2.2359474207528915, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": -0.17385700345039368, + "logits/rejected": -0.183371439576149, + "logps/chosen": -1.6142679452896118, + "logps/rejected": -2.1658880710601807, + "loss": 0.6418, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6142679452896118, + "rewards/margins": 0.5516203045845032, + "rewards/rejected": -2.1658880710601807, + "sft_loss": 1.6339337825775146, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 2.9839216622314373, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": -0.3360547423362732, + "logits/rejected": -0.25803929567337036, + "logps/chosen": -1.5377719402313232, + "logps/rejected": -2.0626683235168457, + "loss": 0.646, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5377719402313232, + "rewards/margins": 0.5248963832855225, + "rewards/rejected": -2.0626683235168457, + "sft_loss": 1.5281660556793213, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 5.017980829186697, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": -0.3281833529472351, + "logits/rejected": -0.21899166703224182, + "logps/chosen": -1.560179591178894, + "logps/rejected": -2.4935262203216553, + "loss": 0.6466, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.560179591178894, + "rewards/margins": 0.9333463907241821, + "rewards/rejected": -2.4935262203216553, + "sft_loss": 1.6043787002563477, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 3.5454291356920855, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": -0.28248411417007446, + "logits/rejected": -0.14311420917510986, + "logps/chosen": -1.6289985179901123, + "logps/rejected": -2.2365944385528564, + "loss": 0.6501, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6289985179901123, + "rewards/margins": 0.607595682144165, + "rewards/rejected": -2.2365944385528564, + "sft_loss": 1.574914574623108, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 3.0073974431466333, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": -0.34595996141433716, + "logits/rejected": -0.15653648972511292, + "logps/chosen": -1.5226856470108032, + "logps/rejected": -2.2048676013946533, + "loss": 0.641, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5226856470108032, + "rewards/margins": 0.6821819543838501, + "rewards/rejected": -2.2048676013946533, + "sft_loss": 1.5713951587677002, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 4.209084915617624, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": -0.3309343755245209, + "logits/rejected": -0.15454891324043274, + "logps/chosen": -1.5566303730010986, + "logps/rejected": -2.306165933609009, + "loss": 0.6392, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5566303730010986, + "rewards/margins": 0.7495354413986206, + "rewards/rejected": -2.306165933609009, + "sft_loss": 1.5894476175308228, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 5.114930891343891, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": -0.3384936451911926, + "logits/rejected": -0.25529247522354126, + "logps/chosen": -1.7155176401138306, + "logps/rejected": -2.2585268020629883, + "loss": 0.6576, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7155176401138306, + "rewards/margins": 0.5430089831352234, + "rewards/rejected": -2.2585268020629883, + "sft_loss": 1.6163198947906494, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 2.306857615936622, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": -0.2622672915458679, + "logits/rejected": -0.1470358669757843, + "logps/chosen": -1.6400104761123657, + "logps/rejected": -2.070843458175659, + "loss": 0.6734, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6400104761123657, + "rewards/margins": 0.43083301186561584, + "rewards/rejected": -2.070843458175659, + "sft_loss": 1.6886297464370728, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 2.8919252340955417, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": -0.28206485509872437, + "logits/rejected": -0.16051602363586426, + "logps/chosen": -1.50666344165802, + "logps/rejected": -2.0285463333129883, + "loss": 0.6582, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.50666344165802, + "rewards/margins": 0.5218832492828369, + "rewards/rejected": -2.0285463333129883, + "sft_loss": 1.5844346284866333, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 4.013146099551089, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": -0.3432479500770569, + "logits/rejected": -0.22454170882701874, + "logps/chosen": -1.561184287071228, + "logps/rejected": -2.172863721847534, + "loss": 0.6492, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.561184287071228, + "rewards/margins": 0.6116796135902405, + "rewards/rejected": -2.172863721847534, + "sft_loss": 1.589215636253357, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 1.7215037172081569, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": -0.26819589734077454, + "logits/rejected": -0.09419532120227814, + "logps/chosen": -1.6141561269760132, + "logps/rejected": -2.2761244773864746, + "loss": 0.6564, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6141561269760132, + "rewards/margins": 0.6619683504104614, + "rewards/rejected": -2.2761244773864746, + "sft_loss": 1.5889050960540771, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 3.426816636652408, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": -0.29647761583328247, + "logits/rejected": -0.06999702751636505, + "logps/chosen": -1.6192327737808228, + "logps/rejected": -2.3246865272521973, + "loss": 0.6454, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6192327737808228, + "rewards/margins": 0.7054537534713745, + "rewards/rejected": -2.3246865272521973, + "sft_loss": 1.6010668277740479, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 4.059369893080733, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": -0.2631566524505615, + "logits/rejected": -0.15566667914390564, + "logps/chosen": -1.4722065925598145, + "logps/rejected": -2.1321847438812256, + "loss": 0.6334, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4722065925598145, + "rewards/margins": 0.6599782705307007, + "rewards/rejected": -2.1321847438812256, + "sft_loss": 1.4477260112762451, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 2.8264964738448075, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": -0.38454973697662354, + "logits/rejected": -0.2207675725221634, + "logps/chosen": -1.5005874633789062, + "logps/rejected": -2.2709789276123047, + "loss": 0.6458, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5005874633789062, + "rewards/margins": 0.770391583442688, + "rewards/rejected": -2.2709789276123047, + "sft_loss": 1.544205665588379, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 3.7044371198908705, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": -0.20141199231147766, + "logits/rejected": 0.038046397268772125, + "logps/chosen": -1.5491031408309937, + "logps/rejected": -2.337373733520508, + "loss": 0.6471, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5491031408309937, + "rewards/margins": 0.7882707715034485, + "rewards/rejected": -2.337373733520508, + "sft_loss": 1.5153697729110718, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 4.801419996115539, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": -0.2652028203010559, + "logits/rejected": -0.18785782158374786, + "logps/chosen": -1.613633394241333, + "logps/rejected": -2.23420786857605, + "loss": 0.6655, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.613633394241333, + "rewards/margins": 0.6205744743347168, + "rewards/rejected": -2.23420786857605, + "sft_loss": 1.6279270648956299, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 4.468703765027423, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": -0.29716956615448, + "logits/rejected": -0.20758691430091858, + "logps/chosen": -1.5046494007110596, + "logps/rejected": -2.1827492713928223, + "loss": 0.6342, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5046494007110596, + "rewards/margins": 0.6780996322631836, + "rewards/rejected": -2.1827492713928223, + "sft_loss": 1.498509407043457, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 6.282108459199361, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": -0.19836069643497467, + "logits/rejected": -0.20581555366516113, + "logps/chosen": -1.507200002670288, + "logps/rejected": -2.1410250663757324, + "loss": 0.6457, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.507200002670288, + "rewards/margins": 0.6338250041007996, + "rewards/rejected": -2.1410250663757324, + "sft_loss": 1.5645170211791992, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 4.765230507949718, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": -0.3440048098564148, + "logits/rejected": -0.18142908811569214, + "logps/chosen": -1.604383111000061, + "logps/rejected": -2.225277900695801, + "loss": 0.6648, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.604383111000061, + "rewards/margins": 0.6208949089050293, + "rewards/rejected": -2.225277900695801, + "sft_loss": 1.6145012378692627, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 3.376587341931551, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": -0.31696051359176636, + "logits/rejected": -0.2621343433856964, + "logps/chosen": -1.5680984258651733, + "logps/rejected": -2.136096954345703, + "loss": 0.668, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5680984258651733, + "rewards/margins": 0.5679982900619507, + "rewards/rejected": -2.136096954345703, + "sft_loss": 1.5849583148956299, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 3.195937393196168, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": -0.25740814208984375, + "logits/rejected": -0.13126549124717712, + "logps/chosen": -1.63578200340271, + "logps/rejected": -2.066324472427368, + "loss": 0.6535, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.63578200340271, + "rewards/margins": 0.4305424094200134, + "rewards/rejected": -2.066324472427368, + "sft_loss": 1.5332118272781372, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 3.299213352267835, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": -0.3346925675868988, + "logits/rejected": -0.21511289477348328, + "logps/chosen": -1.5720036029815674, + "logps/rejected": -2.3347842693328857, + "loss": 0.6268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5720036029815674, + "rewards/margins": 0.7627805471420288, + "rewards/rejected": -2.3347842693328857, + "sft_loss": 1.6377277374267578, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 5.568185476548528, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": -0.28449270129203796, + "logits/rejected": -0.08388768136501312, + "logps/chosen": -1.519416093826294, + "logps/rejected": -2.3788251876831055, + "loss": 0.6451, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.519416093826294, + "rewards/margins": 0.8594093322753906, + "rewards/rejected": -2.3788251876831055, + "sft_loss": 1.5618746280670166, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 3.8036289931086484, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": -0.2648676931858063, + "logits/rejected": -0.10690250247716904, + "logps/chosen": -1.6507768630981445, + "logps/rejected": -2.0753867626190186, + "loss": 0.6671, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6507768630981445, + "rewards/margins": 0.4246101379394531, + "rewards/rejected": -2.0753867626190186, + "sft_loss": 1.6561870574951172, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 3.505938051751801, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": -0.29961520433425903, + "logits/rejected": -0.16704608500003815, + "logps/chosen": -1.6481430530548096, + "logps/rejected": -2.1766676902770996, + "loss": 0.6463, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6481430530548096, + "rewards/margins": 0.5285248756408691, + "rewards/rejected": -2.1766676902770996, + "sft_loss": 1.6171963214874268, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 5.308196788764888, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": -0.2850746810436249, + "logits/rejected": -0.12523740530014038, + "logps/chosen": -1.6003822088241577, + "logps/rejected": -2.354501247406006, + "loss": 0.6449, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6003822088241577, + "rewards/margins": 0.7541190385818481, + "rewards/rejected": -2.354501247406006, + "sft_loss": 1.6053550243377686, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 3.782695455254811, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": -0.26460617780685425, + "logits/rejected": -0.1535172164440155, + "logps/chosen": -1.7259595394134521, + "logps/rejected": -2.2482681274414062, + "loss": 0.6807, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7259595394134521, + "rewards/margins": 0.5223087072372437, + "rewards/rejected": -2.2482681274414062, + "sft_loss": 1.6770446300506592, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 4.0863537745622045, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": -0.24232418835163116, + "logits/rejected": -0.17023006081581116, + "logps/chosen": -1.721494436264038, + "logps/rejected": -2.3115336894989014, + "loss": 0.6655, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.721494436264038, + "rewards/margins": 0.5900392532348633, + "rewards/rejected": -2.3115336894989014, + "sft_loss": 1.5996655225753784, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 5.08775407845278, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": -0.28603631258010864, + "logits/rejected": -0.11851084232330322, + "logps/chosen": -1.4622247219085693, + "logps/rejected": -2.409731149673462, + "loss": 0.6344, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4622247219085693, + "rewards/margins": 0.9475065469741821, + "rewards/rejected": -2.409731149673462, + "sft_loss": 1.5201201438903809, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 3.3750509387088394, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": -0.31978458166122437, + "logits/rejected": -0.24338774383068085, + "logps/chosen": -1.8174854516983032, + "logps/rejected": -2.1474390029907227, + "loss": 0.6798, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.8174854516983032, + "rewards/margins": 0.329953670501709, + "rewards/rejected": -2.1474390029907227, + "sft_loss": 1.6977739334106445, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 3.0296914825809864, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": -0.2576034665107727, + "logits/rejected": -0.19173723459243774, + "logps/chosen": -1.6291992664337158, + "logps/rejected": -2.3262884616851807, + "loss": 0.6595, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6291992664337158, + "rewards/margins": 0.6970890760421753, + "rewards/rejected": -2.3262884616851807, + "sft_loss": 1.6249767541885376, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 3.385482536388645, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": -0.32519760727882385, + "logits/rejected": -0.2157323807477951, + "logps/chosen": -1.5996949672698975, + "logps/rejected": -2.1245455741882324, + "loss": 0.6469, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5996949672698975, + "rewards/margins": 0.5248507261276245, + "rewards/rejected": -2.1245455741882324, + "sft_loss": 1.616973876953125, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 3.036633704742196, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": -0.38313087821006775, + "logits/rejected": -0.1867683082818985, + "logps/chosen": -1.6491420269012451, + "logps/rejected": -2.2258849143981934, + "loss": 0.6628, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6491420269012451, + "rewards/margins": 0.5767428278923035, + "rewards/rejected": -2.2258849143981934, + "sft_loss": 1.6371958255767822, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 3.1469095206902136, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": -0.21445195376873016, + "logits/rejected": -0.216725155711174, + "logps/chosen": -1.5153484344482422, + "logps/rejected": -2.1543962955474854, + "loss": 0.6457, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5153484344482422, + "rewards/margins": 0.6390475630760193, + "rewards/rejected": -2.1543962955474854, + "sft_loss": 1.5688116550445557, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 4.9960979584716325, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": -0.2531622350215912, + "logits/rejected": -0.13175725936889648, + "logps/chosen": -1.4258439540863037, + "logps/rejected": -2.2913336753845215, + "loss": 0.625, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4258439540863037, + "rewards/margins": 0.8654899597167969, + "rewards/rejected": -2.2913336753845215, + "sft_loss": 1.4867799282073975, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 5.347358695623874, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": -0.3808794617652893, + "logits/rejected": -0.19544386863708496, + "logps/chosen": -1.6057853698730469, + "logps/rejected": -2.25998854637146, + "loss": 0.6425, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6057853698730469, + "rewards/margins": 0.6542031168937683, + "rewards/rejected": -2.25998854637146, + "sft_loss": 1.6914764642715454, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 3.692579308133713, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": -0.27982017397880554, + "logits/rejected": -0.08442778885364532, + "logps/chosen": -1.5698671340942383, + "logps/rejected": -2.2227370738983154, + "loss": 0.6657, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5698671340942383, + "rewards/margins": 0.6528701782226562, + "rewards/rejected": -2.2227370738983154, + "sft_loss": 1.5974746942520142, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 4.484161356916124, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": -0.3287569582462311, + "logits/rejected": -0.1841241419315338, + "logps/chosen": -1.5247882604599, + "logps/rejected": -2.231912136077881, + "loss": 0.6458, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5247882604599, + "rewards/margins": 0.7071238160133362, + "rewards/rejected": -2.231912136077881, + "sft_loss": 1.5711970329284668, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 2.888037570851758, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": -0.2781091034412384, + "logits/rejected": -0.14470918476581573, + "logps/chosen": -1.5172996520996094, + "logps/rejected": -2.0534207820892334, + "loss": 0.6601, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5172996520996094, + "rewards/margins": 0.536121129989624, + "rewards/rejected": -2.0534207820892334, + "sft_loss": 1.5588241815567017, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 3.4537566419987793, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": -0.2543310523033142, + "logits/rejected": -0.10968288034200668, + "logps/chosen": -1.5393258333206177, + "logps/rejected": -2.4675350189208984, + "loss": 0.6527, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5393258333206177, + "rewards/margins": 0.9282093048095703, + "rewards/rejected": -2.4675350189208984, + "sft_loss": 1.5896894931793213, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 4.520510904522744, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": -0.3551029562950134, + "logits/rejected": -0.2078694850206375, + "logps/chosen": -1.5725879669189453, + "logps/rejected": -2.226203441619873, + "loss": 0.6443, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5725879669189453, + "rewards/margins": 0.6536153554916382, + "rewards/rejected": -2.226203441619873, + "sft_loss": 1.5988163948059082, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": -0.03984348848462105, + "eval_logits/rejected": 0.04256761446595192, + "eval_logps/chosen": -1.6001262664794922, + "eval_logps/rejected": -2.216757297515869, + "eval_loss": 0.66923588514328, + "eval_rewards/accuracies": 0.6461424231529236, + "eval_rewards/chosen": -1.6001262664794922, + "eval_rewards/margins": 0.6166310906410217, + "eval_rewards/rejected": -2.216757297515869, + "eval_runtime": 44.4182, + "eval_samples_per_second": 30.28, + "eval_sft_loss": 1.5919395685195923, + "eval_steps_per_second": 7.587, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 5.03762010664131, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": -0.36141958832740784, + "logits/rejected": -0.3205726444721222, + "logps/chosen": -1.536834478378296, + "logps/rejected": -2.015439510345459, + "loss": 0.6594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.536834478378296, + "rewards/margins": 0.4786049425601959, + "rewards/rejected": -2.015439510345459, + "sft_loss": 1.5660918951034546, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 2.9750365763093645, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": -0.27190545201301575, + "logits/rejected": -0.22115862369537354, + "logps/chosen": -1.6559785604476929, + "logps/rejected": -2.380772829055786, + "loss": 0.6574, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6559785604476929, + "rewards/margins": 0.724794328212738, + "rewards/rejected": -2.380772829055786, + "sft_loss": 1.6743500232696533, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 3.183733452919812, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": -0.2622893750667572, + "logits/rejected": -0.09581082314252853, + "logps/chosen": -1.6069536209106445, + "logps/rejected": -2.1633763313293457, + "loss": 0.6639, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6069536209106445, + "rewards/margins": 0.5564228296279907, + "rewards/rejected": -2.1633763313293457, + "sft_loss": 1.6053905487060547, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 4.6850878320413205, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": -0.27317818999290466, + "logits/rejected": -0.18975523114204407, + "logps/chosen": -1.6047519445419312, + "logps/rejected": -2.055290699005127, + "loss": 0.6648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6047519445419312, + "rewards/margins": 0.45053917169570923, + "rewards/rejected": -2.055290699005127, + "sft_loss": 1.58820641040802, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 4.483761518462918, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": -0.2564404010772705, + "logits/rejected": -0.15235671401023865, + "logps/chosen": -1.4038350582122803, + "logps/rejected": -2.0266623497009277, + "loss": 0.6178, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4038350582122803, + "rewards/margins": 0.6228272318840027, + "rewards/rejected": -2.0266623497009277, + "sft_loss": 1.5363707542419434, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 3.053354746961691, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": -0.31015071272850037, + "logits/rejected": -0.14849421381950378, + "logps/chosen": -1.5916285514831543, + "logps/rejected": -2.3258919715881348, + "loss": 0.6485, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5916285514831543, + "rewards/margins": 0.7342632412910461, + "rewards/rejected": -2.3258919715881348, + "sft_loss": 1.6185725927352905, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 2.828988858883003, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": -0.38335493206977844, + "logits/rejected": -0.17762592434883118, + "logps/chosen": -1.4943480491638184, + "logps/rejected": -2.273847818374634, + "loss": 0.6411, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4943480491638184, + "rewards/margins": 0.7794996500015259, + "rewards/rejected": -2.273847818374634, + "sft_loss": 1.5764858722686768, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 5.391975532843501, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": -0.36591753363609314, + "logits/rejected": -0.14763380587100983, + "logps/chosen": -1.5593401193618774, + "logps/rejected": -2.1390182971954346, + "loss": 0.6635, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5593401193618774, + "rewards/margins": 0.5796783566474915, + "rewards/rejected": -2.1390182971954346, + "sft_loss": 1.6117656230926514, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 6.870915257727768, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": -0.33582472801208496, + "logits/rejected": -0.15217497944831848, + "logps/chosen": -1.5762287378311157, + "logps/rejected": -2.1126303672790527, + "loss": 0.6549, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5762287378311157, + "rewards/margins": 0.5364019274711609, + "rewards/rejected": -2.1126303672790527, + "sft_loss": 1.6227718591690063, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 3.9406011916032324, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": -0.2725081443786621, + "logits/rejected": -0.13785557448863983, + "logps/chosen": -1.7564218044281006, + "logps/rejected": -2.3719990253448486, + "loss": 0.6558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7564218044281006, + "rewards/margins": 0.6155771017074585, + "rewards/rejected": -2.3719990253448486, + "sft_loss": 1.6522209644317627, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 2.935103695680747, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": -0.36804503202438354, + "logits/rejected": -0.2704111337661743, + "logps/chosen": -1.593177080154419, + "logps/rejected": -2.316232442855835, + "loss": 0.6544, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.593177080154419, + "rewards/margins": 0.7230552434921265, + "rewards/rejected": -2.316232442855835, + "sft_loss": 1.633183240890503, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 3.7772885916093477, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": -0.3856295943260193, + "logits/rejected": -0.16961362957954407, + "logps/chosen": -1.5195658206939697, + "logps/rejected": -2.4012515544891357, + "loss": 0.6425, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5195658206939697, + "rewards/margins": 0.8816859126091003, + "rewards/rejected": -2.4012515544891357, + "sft_loss": 1.5922685861587524, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 2.457379225337726, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": -0.38077300786972046, + "logits/rejected": -0.2020655870437622, + "logps/chosen": -1.6155993938446045, + "logps/rejected": -2.188157796859741, + "loss": 0.6811, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6155993938446045, + "rewards/margins": 0.5725584626197815, + "rewards/rejected": -2.188157796859741, + "sft_loss": 1.614628553390503, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 4.384140130616499, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": -0.2263101041316986, + "logits/rejected": -0.14471349120140076, + "logps/chosen": -1.472354769706726, + "logps/rejected": -2.0752902030944824, + "loss": 0.6482, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.472354769706726, + "rewards/margins": 0.6029354929924011, + "rewards/rejected": -2.0752902030944824, + "sft_loss": 1.5009453296661377, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 2.914679887842728, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": -0.3159586489200592, + "logits/rejected": -0.10197826474905014, + "logps/chosen": -1.5932174921035767, + "logps/rejected": -2.2627813816070557, + "loss": 0.6444, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5932174921035767, + "rewards/margins": 0.669563889503479, + "rewards/rejected": -2.2627813816070557, + "sft_loss": 1.5354959964752197, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 2.401984695844185, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": -0.30270200967788696, + "logits/rejected": -0.21523793041706085, + "logps/chosen": -1.456754446029663, + "logps/rejected": -2.046170473098755, + "loss": 0.6512, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.456754446029663, + "rewards/margins": 0.5894161462783813, + "rewards/rejected": -2.046170473098755, + "sft_loss": 1.4913976192474365, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 6.470755790004308, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": -0.30092233419418335, + "logits/rejected": -0.20879137516021729, + "logps/chosen": -1.5819059610366821, + "logps/rejected": -2.2939720153808594, + "loss": 0.6551, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5819059610366821, + "rewards/margins": 0.7120662331581116, + "rewards/rejected": -2.2939720153808594, + "sft_loss": 1.5738495588302612, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 3.2800575699036085, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": -0.35377007722854614, + "logits/rejected": -0.14289887249469757, + "logps/chosen": -1.5346969366073608, + "logps/rejected": -2.4079833030700684, + "loss": 0.6284, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5346969366073608, + "rewards/margins": 0.8732865452766418, + "rewards/rejected": -2.4079833030700684, + "sft_loss": 1.5910435914993286, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 4.253805285650443, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": -0.39720767736434937, + "logits/rejected": -0.1897422969341278, + "logps/chosen": -1.523134469985962, + "logps/rejected": -2.223334312438965, + "loss": 0.6535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.523134469985962, + "rewards/margins": 0.7001999020576477, + "rewards/rejected": -2.223334312438965, + "sft_loss": 1.5863049030303955, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 3.4154369606317903, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": -0.21695657074451447, + "logits/rejected": -0.21344704926013947, + "logps/chosen": -1.5514976978302002, + "logps/rejected": -2.1682536602020264, + "loss": 0.6479, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5514976978302002, + "rewards/margins": 0.6167561411857605, + "rewards/rejected": -2.1682536602020264, + "sft_loss": 1.590196132659912, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 5.205115407141752, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": -0.35822954773902893, + "logits/rejected": -0.21628502011299133, + "logps/chosen": -1.5886913537979126, + "logps/rejected": -2.2281494140625, + "loss": 0.6517, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5886913537979126, + "rewards/margins": 0.6394580006599426, + "rewards/rejected": -2.2281494140625, + "sft_loss": 1.6002895832061768, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 7.7046151520557355, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": -0.37813323736190796, + "logits/rejected": -0.251869797706604, + "logps/chosen": -1.553252935409546, + "logps/rejected": -2.095944404602051, + "loss": 0.654, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.553252935409546, + "rewards/margins": 0.5426915287971497, + "rewards/rejected": -2.095944404602051, + "sft_loss": 1.5301573276519775, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 3.344080421762281, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": -0.34831443428993225, + "logits/rejected": -0.2738765478134155, + "logps/chosen": -1.5086171627044678, + "logps/rejected": -2.204338788986206, + "loss": 0.6431, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5086171627044678, + "rewards/margins": 0.6957216858863831, + "rewards/rejected": -2.204338788986206, + "sft_loss": 1.5479824542999268, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 3.905727187770449, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": -0.2951086461544037, + "logits/rejected": -0.2081606686115265, + "logps/chosen": -1.4997297525405884, + "logps/rejected": -2.3100061416625977, + "loss": 0.6229, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4997297525405884, + "rewards/margins": 0.8102763891220093, + "rewards/rejected": -2.3100061416625977, + "sft_loss": 1.5412787199020386, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 3.289702012753996, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": -0.36814767122268677, + "logits/rejected": -0.1877470761537552, + "logps/chosen": -1.6343269348144531, + "logps/rejected": -2.331125497817993, + "loss": 0.6513, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6343269348144531, + "rewards/margins": 0.6967986226081848, + "rewards/rejected": -2.331125497817993, + "sft_loss": 1.6918836832046509, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 3.1599141021190102, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": -0.3575204014778137, + "logits/rejected": -0.2661263942718506, + "logps/chosen": -1.6031525135040283, + "logps/rejected": -2.1195476055145264, + "loss": 0.6732, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6031525135040283, + "rewards/margins": 0.5163952708244324, + "rewards/rejected": -2.1195476055145264, + "sft_loss": 1.5328352451324463, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 5.177464157174504, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": -0.19036388397216797, + "logits/rejected": -0.07992489635944366, + "logps/chosen": -1.512872338294983, + "logps/rejected": -2.2547543048858643, + "loss": 0.6504, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.512872338294983, + "rewards/margins": 0.7418821454048157, + "rewards/rejected": -2.2547543048858643, + "sft_loss": 1.5622663497924805, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 3.7895314021271957, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": -0.2304213047027588, + "logits/rejected": -0.15390145778656006, + "logps/chosen": -1.4937970638275146, + "logps/rejected": -2.169123649597168, + "loss": 0.6532, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4937970638275146, + "rewards/margins": 0.6753265261650085, + "rewards/rejected": -2.169123649597168, + "sft_loss": 1.5163295269012451, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 3.9449040606215453, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": -0.2762044072151184, + "logits/rejected": -0.1484527587890625, + "logps/chosen": -1.5513827800750732, + "logps/rejected": -2.183565855026245, + "loss": 0.6705, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5513827800750732, + "rewards/margins": 0.6321829557418823, + "rewards/rejected": -2.183565855026245, + "sft_loss": 1.5500465631484985, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 5.804362394565838, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": -0.3430117964744568, + "logits/rejected": -0.23104672133922577, + "logps/chosen": -1.4433432817459106, + "logps/rejected": -2.170539379119873, + "loss": 0.6434, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4433432817459106, + "rewards/margins": 0.727196455001831, + "rewards/rejected": -2.170539379119873, + "sft_loss": 1.4824066162109375, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 2.827950745351294, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": -0.31212273240089417, + "logits/rejected": -0.24187254905700684, + "logps/chosen": -1.669054627418518, + "logps/rejected": -2.145447254180908, + "loss": 0.663, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.669054627418518, + "rewards/margins": 0.4763926565647125, + "rewards/rejected": -2.145447254180908, + "sft_loss": 1.6982654333114624, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 4.434657094438286, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": -0.3376582860946655, + "logits/rejected": -0.16643747687339783, + "logps/chosen": -1.5501635074615479, + "logps/rejected": -2.198604106903076, + "loss": 0.646, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5501635074615479, + "rewards/margins": 0.6484406590461731, + "rewards/rejected": -2.198604106903076, + "sft_loss": 1.4974991083145142, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 6.878050594650564, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": -0.2798961102962494, + "logits/rejected": -0.18283778429031372, + "logps/chosen": -1.583674669265747, + "logps/rejected": -2.2180368900299072, + "loss": 0.6497, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.583674669265747, + "rewards/margins": 0.6343621015548706, + "rewards/rejected": -2.2180368900299072, + "sft_loss": 1.567760944366455, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 3.5233828625827037, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": -0.22706007957458496, + "logits/rejected": -0.0819624587893486, + "logps/chosen": -1.5315229892730713, + "logps/rejected": -2.269984245300293, + "loss": 0.6477, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5315229892730713, + "rewards/margins": 0.7384613752365112, + "rewards/rejected": -2.269984245300293, + "sft_loss": 1.5649573802947998, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 3.1923233685494954, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": -0.2299167662858963, + "logits/rejected": -0.17413330078125, + "logps/chosen": -1.5149710178375244, + "logps/rejected": -2.1643295288085938, + "loss": 0.6469, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5149710178375244, + "rewards/margins": 0.6493586301803589, + "rewards/rejected": -2.1643295288085938, + "sft_loss": 1.5522096157073975, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 3.8568871271134166, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": -0.2824332118034363, + "logits/rejected": -0.2337716519832611, + "logps/chosen": -1.5122730731964111, + "logps/rejected": -2.1140122413635254, + "loss": 0.6332, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5122730731964111, + "rewards/margins": 0.6017390489578247, + "rewards/rejected": -2.1140122413635254, + "sft_loss": 1.5447158813476562, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 3.2764156874574533, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": -0.23593628406524658, + "logits/rejected": -0.09226083010435104, + "logps/chosen": -1.5803115367889404, + "logps/rejected": -2.2210307121276855, + "loss": 0.6574, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5803115367889404, + "rewards/margins": 0.6407192945480347, + "rewards/rejected": -2.2210307121276855, + "sft_loss": 1.6641523838043213, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 5.246348662800345, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": -0.35039710998535156, + "logits/rejected": -0.1667160540819168, + "logps/chosen": -1.4865039587020874, + "logps/rejected": -2.0124218463897705, + "loss": 0.6528, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4865039587020874, + "rewards/margins": 0.5259180665016174, + "rewards/rejected": -2.0124218463897705, + "sft_loss": 1.5617542266845703, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 3.664602000177623, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": -0.3106224834918976, + "logits/rejected": -0.190143883228302, + "logps/chosen": -1.510183572769165, + "logps/rejected": -2.31964111328125, + "loss": 0.6512, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.510183572769165, + "rewards/margins": 0.8094574809074402, + "rewards/rejected": -2.31964111328125, + "sft_loss": 1.547892451286316, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 4.362679956691847, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": -0.3063965439796448, + "logits/rejected": -0.0520622618496418, + "logps/chosen": -1.5510295629501343, + "logps/rejected": -2.333533763885498, + "loss": 0.6483, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5510295629501343, + "rewards/margins": 0.7825039029121399, + "rewards/rejected": -2.333533763885498, + "sft_loss": 1.6380821466445923, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 4.317027417531272, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": -0.2685374617576599, + "logits/rejected": -0.127393901348114, + "logps/chosen": -1.544651746749878, + "logps/rejected": -2.279839277267456, + "loss": 0.6565, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.544651746749878, + "rewards/margins": 0.7351876497268677, + "rewards/rejected": -2.279839277267456, + "sft_loss": 1.5730217695236206, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 4.669673690973595, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": -0.3165430426597595, + "logits/rejected": -0.1727561503648758, + "logps/chosen": -1.566685676574707, + "logps/rejected": -2.318009376525879, + "loss": 0.6452, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.566685676574707, + "rewards/margins": 0.7513237595558167, + "rewards/rejected": -2.318009376525879, + "sft_loss": 1.5769126415252686, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 4.072456844305052, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": -0.25783801078796387, + "logits/rejected": -0.12196172773838043, + "logps/chosen": -1.5721653699874878, + "logps/rejected": -2.546551465988159, + "loss": 0.6502, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5721653699874878, + "rewards/margins": 0.9743859171867371, + "rewards/rejected": -2.546551465988159, + "sft_loss": 1.6333370208740234, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 4.227785346418531, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": -0.27418097853660583, + "logits/rejected": -0.07549251616001129, + "logps/chosen": -1.4618358612060547, + "logps/rejected": -2.13948130607605, + "loss": 0.6386, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4618358612060547, + "rewards/margins": 0.6776456832885742, + "rewards/rejected": -2.13948130607605, + "sft_loss": 1.5798118114471436, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 2.4850185145803105, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": -0.35881730914115906, + "logits/rejected": -0.30776986479759216, + "logps/chosen": -1.5527098178863525, + "logps/rejected": -2.214914321899414, + "loss": 0.6546, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5527098178863525, + "rewards/margins": 0.6622045636177063, + "rewards/rejected": -2.214914321899414, + "sft_loss": 1.5944491624832153, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 19.21874409478125, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": -0.2559207081794739, + "logits/rejected": -0.17347490787506104, + "logps/chosen": -1.482177495956421, + "logps/rejected": -2.1850438117980957, + "loss": 0.6479, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.482177495956421, + "rewards/margins": 0.7028661966323853, + "rewards/rejected": -2.1850438117980957, + "sft_loss": 1.5517394542694092, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 7.228158952382859, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": -0.16527710855007172, + "logits/rejected": -0.008183039724826813, + "logps/chosen": -1.592286467552185, + "logps/rejected": -2.4563093185424805, + "loss": 0.6491, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.592286467552185, + "rewards/margins": 0.8640230298042297, + "rewards/rejected": -2.4563093185424805, + "sft_loss": 1.557730793952942, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 3.149954021366626, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": -0.19139009714126587, + "logits/rejected": -0.07891600579023361, + "logps/chosen": -1.568982481956482, + "logps/rejected": -2.2609806060791016, + "loss": 0.6486, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.568982481956482, + "rewards/margins": 0.6919980049133301, + "rewards/rejected": -2.2609806060791016, + "sft_loss": 1.5659347772598267, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 6.69841358488078, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": -0.2480025738477707, + "logits/rejected": -0.10911725461483002, + "logps/chosen": -1.4874827861785889, + "logps/rejected": -2.092480182647705, + "loss": 0.6445, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4874827861785889, + "rewards/margins": 0.6049972772598267, + "rewards/rejected": -2.092480182647705, + "sft_loss": 1.551493763923645, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 3.597871974349541, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": -0.2941132187843323, + "logits/rejected": -0.055225301533937454, + "logps/chosen": -1.5422605276107788, + "logps/rejected": -2.309134006500244, + "loss": 0.6447, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5422605276107788, + "rewards/margins": 0.7668734788894653, + "rewards/rejected": -2.309134006500244, + "sft_loss": 1.5496307611465454, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 2.8976921514005363, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": -0.30288466811180115, + "logits/rejected": -0.15188676118850708, + "logps/chosen": -1.5327774286270142, + "logps/rejected": -2.0788652896881104, + "loss": 0.6624, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5327774286270142, + "rewards/margins": 0.5460880398750305, + "rewards/rejected": -2.0788652896881104, + "sft_loss": 1.603165626525879, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 3.0556336510613438, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": -0.3329222500324249, + "logits/rejected": -0.20975284278392792, + "logps/chosen": -1.6097650527954102, + "logps/rejected": -2.1826891899108887, + "loss": 0.652, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6097650527954102, + "rewards/margins": 0.5729240775108337, + "rewards/rejected": -2.1826891899108887, + "sft_loss": 1.6203798055648804, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 3.6312464079223603, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": -0.2509586215019226, + "logits/rejected": -0.17944104969501495, + "logps/chosen": -1.7537086009979248, + "logps/rejected": -2.327963352203369, + "loss": 0.6712, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7537086009979248, + "rewards/margins": 0.5742546319961548, + "rewards/rejected": -2.327963352203369, + "sft_loss": 1.6698287725448608, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 3.6172161673241825, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": -0.3168428838253021, + "logits/rejected": -0.19337889552116394, + "logps/chosen": -1.560003399848938, + "logps/rejected": -2.045964241027832, + "loss": 0.6544, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.560003399848938, + "rewards/margins": 0.48596081137657166, + "rewards/rejected": -2.045964241027832, + "sft_loss": 1.607225775718689, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 2.6556568106703256, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": -0.2559506595134735, + "logits/rejected": -0.11300382763147354, + "logps/chosen": -1.5347226858139038, + "logps/rejected": -2.135934591293335, + "loss": 0.6385, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5347226858139038, + "rewards/margins": 0.6012121438980103, + "rewards/rejected": -2.135934591293335, + "sft_loss": 1.4947535991668701, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 3.3277907557458306, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": -0.21113601326942444, + "logits/rejected": -0.10068633407354355, + "logps/chosen": -1.5916776657104492, + "logps/rejected": -2.350574254989624, + "loss": 0.6509, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5916776657104492, + "rewards/margins": 0.7588964700698853, + "rewards/rejected": -2.350574254989624, + "sft_loss": 1.557253122329712, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 3.050861899566008, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": -0.3704945743083954, + "logits/rejected": -0.17528492212295532, + "logps/chosen": -1.4895609617233276, + "logps/rejected": -2.116403818130493, + "loss": 0.6472, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4895609617233276, + "rewards/margins": 0.6268427968025208, + "rewards/rejected": -2.116403818130493, + "sft_loss": 1.5278401374816895, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 4.411901521813038, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": -0.25644347071647644, + "logits/rejected": -0.20167987048625946, + "logps/chosen": -1.5314282178878784, + "logps/rejected": -2.092921733856201, + "loss": 0.6582, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5314282178878784, + "rewards/margins": 0.5614933967590332, + "rewards/rejected": -2.092921733856201, + "sft_loss": 1.516566276550293, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 2.577157103525146, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": -0.256502628326416, + "logits/rejected": -0.0664641335606575, + "logps/chosen": -1.5892064571380615, + "logps/rejected": -2.223823070526123, + "loss": 0.643, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5892064571380615, + "rewards/margins": 0.6346170902252197, + "rewards/rejected": -2.223823070526123, + "sft_loss": 1.6083158254623413, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 2.958201859488952, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": -0.36429911851882935, + "logits/rejected": -0.24181333184242249, + "logps/chosen": -1.4857892990112305, + "logps/rejected": -2.1328506469726562, + "loss": 0.6482, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4857892990112305, + "rewards/margins": 0.6470614075660706, + "rewards/rejected": -2.1328506469726562, + "sft_loss": 1.5447801351547241, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 4.75676160474035, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": -0.23873957991600037, + "logits/rejected": -0.19696664810180664, + "logps/chosen": -1.6221033334732056, + "logps/rejected": -2.195286273956299, + "loss": 0.6488, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6221033334732056, + "rewards/margins": 0.5731828808784485, + "rewards/rejected": -2.195286273956299, + "sft_loss": 1.5847132205963135, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 10.625975645164717, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": -0.3477099537849426, + "logits/rejected": -0.20050649344921112, + "logps/chosen": -1.6094754934310913, + "logps/rejected": -2.2957520484924316, + "loss": 0.6557, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6094754934310913, + "rewards/margins": 0.6862764954566956, + "rewards/rejected": -2.2957520484924316, + "sft_loss": 1.6209688186645508, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 2.7541151173206897, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": -0.294449120759964, + "logits/rejected": -0.25376224517822266, + "logps/chosen": -1.6765820980072021, + "logps/rejected": -2.2641329765319824, + "loss": 0.6513, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6765820980072021, + "rewards/margins": 0.5875504612922668, + "rewards/rejected": -2.2641329765319824, + "sft_loss": 1.7093673944473267, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 5.381113186903328, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": -0.2696743607521057, + "logits/rejected": -0.16265803575515747, + "logps/chosen": -1.4993641376495361, + "logps/rejected": -2.1580071449279785, + "loss": 0.641, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4993641376495361, + "rewards/margins": 0.6586429476737976, + "rewards/rejected": -2.1580071449279785, + "sft_loss": 1.501667857170105, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 4.363832899390693, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": -0.3282429873943329, + "logits/rejected": -0.2649852931499481, + "logps/chosen": -1.5814213752746582, + "logps/rejected": -2.1092419624328613, + "loss": 0.656, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5814213752746582, + "rewards/margins": 0.527820348739624, + "rewards/rejected": -2.1092419624328613, + "sft_loss": 1.6161930561065674, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 5.399454559214379, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": -0.27031904458999634, + "logits/rejected": -0.14918789267539978, + "logps/chosen": -1.4463913440704346, + "logps/rejected": -2.188873529434204, + "loss": 0.6323, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4463913440704346, + "rewards/margins": 0.7424818277359009, + "rewards/rejected": -2.188873529434204, + "sft_loss": 1.5468406677246094, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 3.542438782265644, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": -0.24759738147258759, + "logits/rejected": -0.2921431064605713, + "logps/chosen": -1.5594487190246582, + "logps/rejected": -2.0642504692077637, + "loss": 0.6454, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5594487190246582, + "rewards/margins": 0.504801869392395, + "rewards/rejected": -2.0642504692077637, + "sft_loss": 1.593975305557251, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 4.184646108234178, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": -0.3225509226322174, + "logits/rejected": -0.180209219455719, + "logps/chosen": -1.5613961219787598, + "logps/rejected": -2.246628522872925, + "loss": 0.6549, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5613961219787598, + "rewards/margins": 0.6852323412895203, + "rewards/rejected": -2.246628522872925, + "sft_loss": 1.5798487663269043, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 3.6110635278658036, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": -0.2740827202796936, + "logits/rejected": -0.22729679942131042, + "logps/chosen": -1.5970016717910767, + "logps/rejected": -2.1766319274902344, + "loss": 0.6458, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5970016717910767, + "rewards/margins": 0.5796300172805786, + "rewards/rejected": -2.1766319274902344, + "sft_loss": 1.6187547445297241, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 3.311222532309807, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": -0.213700532913208, + "logits/rejected": -0.0047538997605443, + "logps/chosen": -1.5098040103912354, + "logps/rejected": -2.4965951442718506, + "loss": 0.6444, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5098040103912354, + "rewards/margins": 0.9867914319038391, + "rewards/rejected": -2.4965951442718506, + "sft_loss": 1.5469787120819092, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 2.110121081515731, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": -0.31291213631629944, + "logits/rejected": -0.1478758156299591, + "logps/chosen": -1.4469785690307617, + "logps/rejected": -2.3485617637634277, + "loss": 0.63, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4469785690307617, + "rewards/margins": 0.9015833139419556, + "rewards/rejected": -2.3485617637634277, + "sft_loss": 1.5024610757827759, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 5.092788021650294, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": -0.3487478792667389, + "logits/rejected": -0.2094249725341797, + "logps/chosen": -1.4646135568618774, + "logps/rejected": -2.1586451530456543, + "loss": 0.6525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4646135568618774, + "rewards/margins": 0.6940317153930664, + "rewards/rejected": -2.1586451530456543, + "sft_loss": 1.5558288097381592, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 3.2332751465828644, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": -0.3419414460659027, + "logits/rejected": -0.15927313268184662, + "logps/chosen": -1.6466041803359985, + "logps/rejected": -2.3158504962921143, + "loss": 0.6687, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6466041803359985, + "rewards/margins": 0.6692460775375366, + "rewards/rejected": -2.3158504962921143, + "sft_loss": 1.6631864309310913, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 3.737944816854884, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": -0.28476816415786743, + "logits/rejected": -0.17611289024353027, + "logps/chosen": -1.5729347467422485, + "logps/rejected": -2.3755788803100586, + "loss": 0.6585, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5729347467422485, + "rewards/margins": 0.8026441335678101, + "rewards/rejected": -2.3755788803100586, + "sft_loss": 1.5690858364105225, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 2.389334041003902, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": -0.17536935210227966, + "logits/rejected": -0.07199620455503464, + "logps/chosen": -1.5184999704360962, + "logps/rejected": -2.2542495727539062, + "loss": 0.6439, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5184999704360962, + "rewards/margins": 0.7357496023178101, + "rewards/rejected": -2.2542495727539062, + "sft_loss": 1.4695793390274048, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 2.9252819329362207, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": -0.26403623819351196, + "logits/rejected": -0.163148432970047, + "logps/chosen": -1.5481630563735962, + "logps/rejected": -2.1773335933685303, + "loss": 0.6527, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5481630563735962, + "rewards/margins": 0.6291707754135132, + "rewards/rejected": -2.1773335933685303, + "sft_loss": 1.5882800817489624, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 5.040500759338175, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": -0.2527233958244324, + "logits/rejected": -0.19683003425598145, + "logps/chosen": -1.5815021991729736, + "logps/rejected": -2.067279815673828, + "loss": 0.668, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5815021991729736, + "rewards/margins": 0.4857775568962097, + "rewards/rejected": -2.067279815673828, + "sft_loss": 1.5577093362808228, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 3.006823313342819, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": -0.34018540382385254, + "logits/rejected": -0.20170053839683533, + "logps/chosen": -1.576545000076294, + "logps/rejected": -2.3532755374908447, + "loss": 0.6513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.576545000076294, + "rewards/margins": 0.776730477809906, + "rewards/rejected": -2.3532755374908447, + "sft_loss": 1.5583680868148804, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 4.808664555754724, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": -0.231277734041214, + "logits/rejected": -0.15128371119499207, + "logps/chosen": -1.6274127960205078, + "logps/rejected": -2.1662192344665527, + "loss": 0.6609, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6274127960205078, + "rewards/margins": 0.5388065576553345, + "rewards/rejected": -2.1662192344665527, + "sft_loss": 1.6247822046279907, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 2.4118704077225996, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": -0.2833016514778137, + "logits/rejected": -0.17131468653678894, + "logps/chosen": -1.4958717823028564, + "logps/rejected": -2.128330707550049, + "loss": 0.6356, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4958717823028564, + "rewards/margins": 0.6324589848518372, + "rewards/rejected": -2.128330707550049, + "sft_loss": 1.573540449142456, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": 0.022585922852158546, + "eval_logits/rejected": 0.11062997579574585, + "eval_logps/chosen": -1.59637451171875, + "eval_logps/rejected": -2.2215960025787354, + "eval_loss": 0.6686033606529236, + "eval_rewards/accuracies": 0.6483679413795471, + "eval_rewards/chosen": -1.59637451171875, + "eval_rewards/margins": 0.6252216696739197, + "eval_rewards/rejected": -2.2215960025787354, + "eval_runtime": 44.1182, + "eval_samples_per_second": 30.486, + "eval_sft_loss": 1.5864040851593018, + "eval_steps_per_second": 7.639, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 4.713763181183661, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": -0.33478087186813354, + "logits/rejected": -0.20814982056617737, + "logps/chosen": -1.5814807415008545, + "logps/rejected": -2.2600340843200684, + "loss": 0.6477, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5814807415008545, + "rewards/margins": 0.6785534620285034, + "rewards/rejected": -2.2600340843200684, + "sft_loss": 1.5960379838943481, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 3.0658788495930707, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": -0.24046310782432556, + "logits/rejected": -0.1067015677690506, + "logps/chosen": -1.618259072303772, + "logps/rejected": -2.225800037384033, + "loss": 0.6609, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.618259072303772, + "rewards/margins": 0.6075407266616821, + "rewards/rejected": -2.225800037384033, + "sft_loss": 1.616916298866272, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 7.482433847006735, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": -0.25230371952056885, + "logits/rejected": -0.123440682888031, + "logps/chosen": -1.4187307357788086, + "logps/rejected": -2.219444751739502, + "loss": 0.6197, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4187307357788086, + "rewards/margins": 0.800714373588562, + "rewards/rejected": -2.219444751739502, + "sft_loss": 1.478771448135376, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 9.582868855705456, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": -0.22999422252178192, + "logits/rejected": -0.15210556983947754, + "logps/chosen": -1.6584303379058838, + "logps/rejected": -2.1190333366394043, + "loss": 0.6575, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6584303379058838, + "rewards/margins": 0.46060290932655334, + "rewards/rejected": -2.1190333366394043, + "sft_loss": 1.6076551675796509, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 8.24557548071507, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": -0.2544562518596649, + "logits/rejected": -0.12751419842243195, + "logps/chosen": -1.5638656616210938, + "logps/rejected": -2.0885910987854004, + "loss": 0.6527, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5638656616210938, + "rewards/margins": 0.524725615978241, + "rewards/rejected": -2.0885910987854004, + "sft_loss": 1.5143884420394897, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 2.765376813488303, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": -0.31105369329452515, + "logits/rejected": -0.1702210158109665, + "logps/chosen": -1.6085840463638306, + "logps/rejected": -2.1430881023406982, + "loss": 0.658, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6085840463638306, + "rewards/margins": 0.5345041751861572, + "rewards/rejected": -2.1430881023406982, + "sft_loss": 1.6004174947738647, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 6.023616839080402, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": -0.39601072669029236, + "logits/rejected": -0.20983512699604034, + "logps/chosen": -1.5357941389083862, + "logps/rejected": -2.263760566711426, + "loss": 0.6555, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5357941389083862, + "rewards/margins": 0.7279661893844604, + "rewards/rejected": -2.263760566711426, + "sft_loss": 1.5607926845550537, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 4.578710871693104, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": -0.34746941924095154, + "logits/rejected": -0.15091314911842346, + "logps/chosen": -1.5270739793777466, + "logps/rejected": -2.2461137771606445, + "loss": 0.6446, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5270739793777466, + "rewards/margins": 0.7190399169921875, + "rewards/rejected": -2.2461137771606445, + "sft_loss": 1.5467076301574707, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 3.077774959202608, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": -0.3103768229484558, + "logits/rejected": -0.20531435310840607, + "logps/chosen": -1.5353167057037354, + "logps/rejected": -2.1882166862487793, + "loss": 0.6514, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5353167057037354, + "rewards/margins": 0.6528998613357544, + "rewards/rejected": -2.1882166862487793, + "sft_loss": 1.6176245212554932, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 5.342418138387475, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": -0.439974844455719, + "logits/rejected": -0.2942073345184326, + "logps/chosen": -1.5611355304718018, + "logps/rejected": -2.311400890350342, + "loss": 0.6555, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5611355304718018, + "rewards/margins": 0.7502651214599609, + "rewards/rejected": -2.311400890350342, + "sft_loss": 1.564321517944336, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 5.070791428130677, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": -0.2757100760936737, + "logits/rejected": -0.12967705726623535, + "logps/chosen": -1.4793365001678467, + "logps/rejected": -2.2874951362609863, + "loss": 0.6413, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4793365001678467, + "rewards/margins": 0.8081587553024292, + "rewards/rejected": -2.2874951362609863, + "sft_loss": 1.540814995765686, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 10.029441019921531, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": -0.2699975371360779, + "logits/rejected": -0.20369283854961395, + "logps/chosen": -1.400427222251892, + "logps/rejected": -2.2184746265411377, + "loss": 0.6167, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.400427222251892, + "rewards/margins": 0.8180474042892456, + "rewards/rejected": -2.2184746265411377, + "sft_loss": 1.4655218124389648, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 2.957964263025511, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": -0.3764112591743469, + "logits/rejected": -0.2846323549747467, + "logps/chosen": -1.508000373840332, + "logps/rejected": -2.2652816772460938, + "loss": 0.6397, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.508000373840332, + "rewards/margins": 0.7572811841964722, + "rewards/rejected": -2.2652816772460938, + "sft_loss": 1.5318291187286377, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 2.8734137531975854, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": -0.2513507008552551, + "logits/rejected": -0.12049970775842667, + "logps/chosen": -1.5877680778503418, + "logps/rejected": -2.249692678451538, + "loss": 0.6478, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5877680778503418, + "rewards/margins": 0.6619247198104858, + "rewards/rejected": -2.249692678451538, + "sft_loss": 1.567791223526001, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 3.5603848715105464, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": -0.11695779860019684, + "logits/rejected": -0.03770308196544647, + "logps/chosen": -1.512600064277649, + "logps/rejected": -2.315598726272583, + "loss": 0.6438, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.512600064277649, + "rewards/margins": 0.8029987215995789, + "rewards/rejected": -2.315598726272583, + "sft_loss": 1.5583285093307495, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 5.285715997291284, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": -0.18301086127758026, + "logits/rejected": -0.06374426931142807, + "logps/chosen": -1.464905023574829, + "logps/rejected": -2.3175206184387207, + "loss": 0.6273, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.464905023574829, + "rewards/margins": 0.8526156544685364, + "rewards/rejected": -2.3175206184387207, + "sft_loss": 1.5341923236846924, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 4.094377413513966, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": -0.18760551512241364, + "logits/rejected": -0.11009053885936737, + "logps/chosen": -1.7325265407562256, + "logps/rejected": -2.3709464073181152, + "loss": 0.6565, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7325265407562256, + "rewards/margins": 0.6384199261665344, + "rewards/rejected": -2.3709464073181152, + "sft_loss": 1.7828750610351562, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 3.609069401041533, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": -0.3653804659843445, + "logits/rejected": -0.2060328722000122, + "logps/chosen": -1.5337392091751099, + "logps/rejected": -2.2383885383605957, + "loss": 0.6428, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5337392091751099, + "rewards/margins": 0.7046495676040649, + "rewards/rejected": -2.2383885383605957, + "sft_loss": 1.5887818336486816, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 2.8786237616867143, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": -0.29676735401153564, + "logits/rejected": -0.205804705619812, + "logps/chosen": -1.6045278310775757, + "logps/rejected": -2.143934965133667, + "loss": 0.6551, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6045278310775757, + "rewards/margins": 0.53940749168396, + "rewards/rejected": -2.143934965133667, + "sft_loss": 1.7019850015640259, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 2.655176451010275, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": -0.2333211600780487, + "logits/rejected": -0.09696978330612183, + "logps/chosen": -1.5167087316513062, + "logps/rejected": -2.274289131164551, + "loss": 0.641, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5167087316513062, + "rewards/margins": 0.7575803995132446, + "rewards/rejected": -2.274289131164551, + "sft_loss": 1.5980942249298096, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 2.8007372584673638, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": -0.2577126622200012, + "logits/rejected": -0.10144130885601044, + "logps/chosen": -1.5728174448013306, + "logps/rejected": -2.333540678024292, + "loss": 0.6518, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5728174448013306, + "rewards/margins": 0.7607229948043823, + "rewards/rejected": -2.333540678024292, + "sft_loss": 1.6706300973892212, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 3.31920416819512, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": -0.30460885167121887, + "logits/rejected": -0.31871408224105835, + "logps/chosen": -1.6616413593292236, + "logps/rejected": -2.6286277770996094, + "loss": 0.6606, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6616413593292236, + "rewards/margins": 0.9669864773750305, + "rewards/rejected": -2.6286277770996094, + "sft_loss": 1.6571128368377686, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 5.451558355471818, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": -0.21932478249073029, + "logits/rejected": -0.10749228298664093, + "logps/chosen": -1.426175594329834, + "logps/rejected": -2.144705295562744, + "loss": 0.6305, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.426175594329834, + "rewards/margins": 0.7185295820236206, + "rewards/rejected": -2.144705295562744, + "sft_loss": 1.535812497138977, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 3.13030607289851, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": -0.23966650664806366, + "logits/rejected": -0.1604257971048355, + "logps/chosen": -1.6174768209457397, + "logps/rejected": -2.1807265281677246, + "loss": 0.6631, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6174768209457397, + "rewards/margins": 0.5632495880126953, + "rewards/rejected": -2.1807265281677246, + "sft_loss": 1.5944290161132812, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 3.3190440778545014, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": -0.40968436002731323, + "logits/rejected": -0.16236819326877594, + "logps/chosen": -1.4841177463531494, + "logps/rejected": -2.236159086227417, + "loss": 0.6345, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4841177463531494, + "rewards/margins": 0.7520411014556885, + "rewards/rejected": -2.236159086227417, + "sft_loss": 1.5622247457504272, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 2.270307742945154, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": -0.22066080570220947, + "logits/rejected": -0.06292378902435303, + "logps/chosen": -1.7009875774383545, + "logps/rejected": -2.5794739723205566, + "loss": 0.6494, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7009875774383545, + "rewards/margins": 0.8784863352775574, + "rewards/rejected": -2.5794739723205566, + "sft_loss": 1.628734827041626, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 3.9659339990265505, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": -0.3291738033294678, + "logits/rejected": -0.17241686582565308, + "logps/chosen": -1.5517101287841797, + "logps/rejected": -2.16611909866333, + "loss": 0.6478, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5517101287841797, + "rewards/margins": 0.6144087910652161, + "rewards/rejected": -2.16611909866333, + "sft_loss": 1.5594476461410522, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 2.5452526134457325, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": -0.2720089554786682, + "logits/rejected": -0.19060388207435608, + "logps/chosen": -1.5684664249420166, + "logps/rejected": -1.973120927810669, + "loss": 0.65, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5684664249420166, + "rewards/margins": 0.40465426445007324, + "rewards/rejected": -1.973120927810669, + "sft_loss": 1.5459164381027222, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 4.884060085397572, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": -0.3061191439628601, + "logits/rejected": -0.1086500734090805, + "logps/chosen": -1.523645043373108, + "logps/rejected": -2.4154305458068848, + "loss": 0.6398, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.523645043373108, + "rewards/margins": 0.8917851448059082, + "rewards/rejected": -2.4154305458068848, + "sft_loss": 1.5471687316894531, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 6.943349502257102, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": -0.3145865797996521, + "logits/rejected": -0.19390031695365906, + "logps/chosen": -1.5767168998718262, + "logps/rejected": -2.212101936340332, + "loss": 0.6458, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5767168998718262, + "rewards/margins": 0.6353851556777954, + "rewards/rejected": -2.212101936340332, + "sft_loss": 1.5280985832214355, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 5.359155241881496, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": -0.2595178484916687, + "logits/rejected": -0.2513166666030884, + "logps/chosen": -1.63980233669281, + "logps/rejected": -2.196504831314087, + "loss": 0.6431, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.63980233669281, + "rewards/margins": 0.5567022562026978, + "rewards/rejected": -2.196504831314087, + "sft_loss": 1.6192525625228882, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 2.2051574072073836, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": -0.18522128462791443, + "logits/rejected": -0.08331882953643799, + "logps/chosen": -1.5592795610427856, + "logps/rejected": -2.1713385581970215, + "loss": 0.6496, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5592795610427856, + "rewards/margins": 0.6120591759681702, + "rewards/rejected": -2.1713385581970215, + "sft_loss": 1.558884859085083, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 6.925666390173548, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": -0.310133159160614, + "logits/rejected": -0.18919309973716736, + "logps/chosen": -1.3932650089263916, + "logps/rejected": -2.144735813140869, + "loss": 0.625, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3932650089263916, + "rewards/margins": 0.7514708638191223, + "rewards/rejected": -2.144735813140869, + "sft_loss": 1.5189520120620728, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 3.1649653830969893, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": -0.1915835440158844, + "logits/rejected": -0.11682520061731339, + "logps/chosen": -1.5142855644226074, + "logps/rejected": -2.183206558227539, + "loss": 0.641, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5142855644226074, + "rewards/margins": 0.6689208149909973, + "rewards/rejected": -2.183206558227539, + "sft_loss": 1.4711930751800537, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 2.4456036606950793, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": -0.2733498215675354, + "logits/rejected": -0.2143729031085968, + "logps/chosen": -1.6624078750610352, + "logps/rejected": -2.2634522914886475, + "loss": 0.67, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6624078750610352, + "rewards/margins": 0.6010446548461914, + "rewards/rejected": -2.2634522914886475, + "sft_loss": 1.6090848445892334, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 9.539019846441738, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": -0.13736586272716522, + "logits/rejected": -0.07004425674676895, + "logps/chosen": -1.7409322261810303, + "logps/rejected": -2.433462142944336, + "loss": 0.673, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7409322261810303, + "rewards/margins": 0.6925297975540161, + "rewards/rejected": -2.433462142944336, + "sft_loss": 1.5974147319793701, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 7.167630686802431, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": -0.23968443274497986, + "logits/rejected": -0.06813397258520126, + "logps/chosen": -1.5134305953979492, + "logps/rejected": -2.390659809112549, + "loss": 0.6366, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5134305953979492, + "rewards/margins": 0.8772293925285339, + "rewards/rejected": -2.390659809112549, + "sft_loss": 1.5120487213134766, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 2.8825014543320644, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": -0.34571391344070435, + "logits/rejected": -0.13643576204776764, + "logps/chosen": -1.5214189291000366, + "logps/rejected": -2.2927017211914062, + "loss": 0.6436, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5214189291000366, + "rewards/margins": 0.7712829113006592, + "rewards/rejected": -2.2927017211914062, + "sft_loss": 1.5628582239151, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 3.6960718669777375, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": -0.3258179724216461, + "logits/rejected": -0.30619877576828003, + "logps/chosen": -1.6004127264022827, + "logps/rejected": -2.1896750926971436, + "loss": 0.6655, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6004127264022827, + "rewards/margins": 0.5892623662948608, + "rewards/rejected": -2.1896750926971436, + "sft_loss": 1.6143624782562256, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 6.243950321723729, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": -0.2789207398891449, + "logits/rejected": -0.25094786286354065, + "logps/chosen": -1.6025774478912354, + "logps/rejected": -2.304051160812378, + "loss": 0.6485, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6025774478912354, + "rewards/margins": 0.7014739513397217, + "rewards/rejected": -2.304051160812378, + "sft_loss": 1.595412015914917, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 4.093118605787219, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": -0.23231148719787598, + "logits/rejected": -0.18588271737098694, + "logps/chosen": -1.567158818244934, + "logps/rejected": -2.132018566131592, + "loss": 0.652, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.567158818244934, + "rewards/margins": 0.5648595094680786, + "rewards/rejected": -2.132018566131592, + "sft_loss": 1.5321518182754517, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 3.6018307312522753, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": -0.2296139895915985, + "logits/rejected": -0.12728151679039001, + "logps/chosen": -1.7475757598876953, + "logps/rejected": -2.402459144592285, + "loss": 0.6692, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7475757598876953, + "rewards/margins": 0.6548832654953003, + "rewards/rejected": -2.402459144592285, + "sft_loss": 1.7332561016082764, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 3.343841512434198, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": -0.32258838415145874, + "logits/rejected": -0.25735121965408325, + "logps/chosen": -1.661834478378296, + "logps/rejected": -2.3188681602478027, + "loss": 0.6459, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.661834478378296, + "rewards/margins": 0.6570338010787964, + "rewards/rejected": -2.3188681602478027, + "sft_loss": 1.5429569482803345, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 2.69101166999373, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": -0.3427257239818573, + "logits/rejected": -0.15235450863838196, + "logps/chosen": -1.5064865350723267, + "logps/rejected": -2.1955904960632324, + "loss": 0.6611, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5064865350723267, + "rewards/margins": 0.6891041398048401, + "rewards/rejected": -2.1955904960632324, + "sft_loss": 1.537213921546936, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 2.8557599983363597, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": -0.31604811549186707, + "logits/rejected": -0.17826204001903534, + "logps/chosen": -1.597011685371399, + "logps/rejected": -2.380256175994873, + "loss": 0.662, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.597011685371399, + "rewards/margins": 0.7832446098327637, + "rewards/rejected": -2.380256175994873, + "sft_loss": 1.5641099214553833, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 3.686247803705324, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": -0.2612699568271637, + "logits/rejected": -0.2027551680803299, + "logps/chosen": -1.5545036792755127, + "logps/rejected": -2.373471975326538, + "loss": 0.6556, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5545036792755127, + "rewards/margins": 0.8189681768417358, + "rewards/rejected": -2.373471975326538, + "sft_loss": 1.5277230739593506, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 4.738638244496703, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": -0.3124659061431885, + "logits/rejected": -0.18985530734062195, + "logps/chosen": -1.681979775428772, + "logps/rejected": -2.28306245803833, + "loss": 0.66, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.681979775428772, + "rewards/margins": 0.6010830402374268, + "rewards/rejected": -2.28306245803833, + "sft_loss": 1.6827268600463867, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 4.123633534097979, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": -0.09936905652284622, + "logits/rejected": -0.13657012581825256, + "logps/chosen": -1.5900905132293701, + "logps/rejected": -2.274933338165283, + "loss": 0.6433, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5900905132293701, + "rewards/margins": 0.6848429441452026, + "rewards/rejected": -2.274933338165283, + "sft_loss": 1.5598783493041992, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 3.126735515936529, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": -0.20340998470783234, + "logits/rejected": -0.17745666205883026, + "logps/chosen": -1.6050293445587158, + "logps/rejected": -2.101975917816162, + "loss": 0.647, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6050293445587158, + "rewards/margins": 0.4969464838504791, + "rewards/rejected": -2.101975917816162, + "sft_loss": 1.584077000617981, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 3.5855929394333246, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": -0.26367872953414917, + "logits/rejected": -0.19293320178985596, + "logps/chosen": -1.3915940523147583, + "logps/rejected": -2.081035852432251, + "loss": 0.6271, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3915940523147583, + "rewards/margins": 0.6894418597221375, + "rewards/rejected": -2.081035852432251, + "sft_loss": 1.4362269639968872, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 5.509628511652226, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": -0.3857537806034088, + "logits/rejected": -0.30278491973876953, + "logps/chosen": -1.4894828796386719, + "logps/rejected": -2.191249370574951, + "loss": 0.6392, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4894828796386719, + "rewards/margins": 0.7017661929130554, + "rewards/rejected": -2.191249370574951, + "sft_loss": 1.4975861310958862, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 4.008088307428305, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": -0.2717474400997162, + "logits/rejected": -0.20423254370689392, + "logps/chosen": -1.5966397523880005, + "logps/rejected": -2.134685754776001, + "loss": 0.6378, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5966397523880005, + "rewards/margins": 0.5380457639694214, + "rewards/rejected": -2.134685754776001, + "sft_loss": 1.6769453287124634, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 5.957312484952148, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": -0.32032904028892517, + "logits/rejected": -0.0922280102968216, + "logps/chosen": -1.5731478929519653, + "logps/rejected": -2.2426846027374268, + "loss": 0.6453, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5731478929519653, + "rewards/margins": 0.6695364713668823, + "rewards/rejected": -2.2426846027374268, + "sft_loss": 1.5867424011230469, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 3.89727463675, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": -0.15399877727031708, + "logits/rejected": 0.004412566777318716, + "logps/chosen": -1.4596203565597534, + "logps/rejected": -1.9406630992889404, + "loss": 0.6556, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4596203565597534, + "rewards/margins": 0.4810427725315094, + "rewards/rejected": -1.9406630992889404, + "sft_loss": 1.4862662553787231, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 2.960012412141032, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": -0.22320058941841125, + "logits/rejected": -0.14689429104328156, + "logps/chosen": -1.5327249765396118, + "logps/rejected": -2.014547824859619, + "loss": 0.6622, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5327249765396118, + "rewards/margins": 0.4818227291107178, + "rewards/rejected": -2.014547824859619, + "sft_loss": 1.5514531135559082, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 3.490855056544133, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": -0.2697138786315918, + "logits/rejected": -0.13029912114143372, + "logps/chosen": -1.5601483583450317, + "logps/rejected": -2.2007036209106445, + "loss": 0.6519, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5601483583450317, + "rewards/margins": 0.6405550241470337, + "rewards/rejected": -2.2007036209106445, + "sft_loss": 1.5582091808319092, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 7.90813154898587, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": -0.30239084362983704, + "logits/rejected": -0.17908033728599548, + "logps/chosen": -1.4057493209838867, + "logps/rejected": -2.244786262512207, + "loss": 0.6209, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4057493209838867, + "rewards/margins": 0.839036762714386, + "rewards/rejected": -2.244786262512207, + "sft_loss": 1.4612065553665161, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 11.903186469798271, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": -0.3835442364215851, + "logits/rejected": -0.2295791357755661, + "logps/chosen": -1.5785081386566162, + "logps/rejected": -2.267824649810791, + "loss": 0.6608, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5785081386566162, + "rewards/margins": 0.68931645154953, + "rewards/rejected": -2.267824649810791, + "sft_loss": 1.6034950017929077, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 3.276626505410632, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": -0.3765779435634613, + "logits/rejected": -0.17551617324352264, + "logps/chosen": -1.5655916929244995, + "logps/rejected": -2.552032470703125, + "loss": 0.6348, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5655916929244995, + "rewards/margins": 0.9864408373832703, + "rewards/rejected": -2.552032470703125, + "sft_loss": 1.5733572244644165, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 3.8762685392197835, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": -0.20498760044574738, + "logits/rejected": -0.05653483793139458, + "logps/chosen": -1.575437307357788, + "logps/rejected": -2.520442008972168, + "loss": 0.6411, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.575437307357788, + "rewards/margins": 0.9450046420097351, + "rewards/rejected": -2.520442008972168, + "sft_loss": 1.5051355361938477, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 6.6542950500767954, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": -0.24930492043495178, + "logits/rejected": -0.08691352605819702, + "logps/chosen": -1.600701093673706, + "logps/rejected": -2.2791550159454346, + "loss": 0.6538, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.600701093673706, + "rewards/margins": 0.678453803062439, + "rewards/rejected": -2.2791550159454346, + "sft_loss": 1.5768768787384033, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 6.431115821488516, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": -0.32295745611190796, + "logits/rejected": -0.18493905663490295, + "logps/chosen": -1.6067241430282593, + "logps/rejected": -2.3083348274230957, + "loss": 0.6458, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6067241430282593, + "rewards/margins": 0.701610803604126, + "rewards/rejected": -2.3083348274230957, + "sft_loss": 1.5453039407730103, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 3.5398598206435405, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": -0.3503361642360687, + "logits/rejected": -0.14796528220176697, + "logps/chosen": -1.5512923002243042, + "logps/rejected": -2.379483461380005, + "loss": 0.6448, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5512923002243042, + "rewards/margins": 0.8281909823417664, + "rewards/rejected": -2.379483461380005, + "sft_loss": 1.6102874279022217, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 4.563942521186379, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": -0.3200659155845642, + "logits/rejected": -0.25785502791404724, + "logps/chosen": -1.613032579421997, + "logps/rejected": -2.366720676422119, + "loss": 0.6446, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.613032579421997, + "rewards/margins": 0.7536881566047668, + "rewards/rejected": -2.366720676422119, + "sft_loss": 1.608844518661499, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 4.0383245647248085, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": -0.35934972763061523, + "logits/rejected": -0.1265702247619629, + "logps/chosen": -1.5786548852920532, + "logps/rejected": -2.128849744796753, + "loss": 0.6373, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5786548852920532, + "rewards/margins": 0.5501947999000549, + "rewards/rejected": -2.128849744796753, + "sft_loss": 1.5914679765701294, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 3.159487481443361, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": -0.41795772314071655, + "logits/rejected": -0.2262589931488037, + "logps/chosen": -1.5015418529510498, + "logps/rejected": -2.3633315563201904, + "loss": 0.642, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5015418529510498, + "rewards/margins": 0.861789882183075, + "rewards/rejected": -2.3633315563201904, + "sft_loss": 1.5934960842132568, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 6.020860195660031, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": -0.1939457505941391, + "logits/rejected": -0.1189827099442482, + "logps/chosen": -1.5646998882293701, + "logps/rejected": -2.241270065307617, + "loss": 0.6504, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5646998882293701, + "rewards/margins": 0.6765701770782471, + "rewards/rejected": -2.241270065307617, + "sft_loss": 1.5589029788970947, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 6.348729922378617, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": -0.23039841651916504, + "logits/rejected": -0.09529824554920197, + "logps/chosen": -1.6141326427459717, + "logps/rejected": -2.1430962085723877, + "loss": 0.6735, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6141326427459717, + "rewards/margins": 0.5289635062217712, + "rewards/rejected": -2.1430962085723877, + "sft_loss": 1.6168859004974365, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 7.055913054164829, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": -0.29964110255241394, + "logits/rejected": -0.1342334747314453, + "logps/chosen": -1.4779428243637085, + "logps/rejected": -2.107789993286133, + "loss": 0.6531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4779428243637085, + "rewards/margins": 0.6298472285270691, + "rewards/rejected": -2.107789993286133, + "sft_loss": 1.533432960510254, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 3.2676365993423557, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": -0.34181922674179077, + "logits/rejected": -0.2150876522064209, + "logps/chosen": -1.4962215423583984, + "logps/rejected": -2.098529100418091, + "loss": 0.6449, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4962215423583984, + "rewards/margins": 0.6023076176643372, + "rewards/rejected": -2.098529100418091, + "sft_loss": 1.5299503803253174, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 4.71361619614656, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": -0.3603561222553253, + "logits/rejected": -0.25522202253341675, + "logps/chosen": -1.6200759410858154, + "logps/rejected": -2.1835179328918457, + "loss": 0.6421, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6200759410858154, + "rewards/margins": 0.563442051410675, + "rewards/rejected": -2.1835179328918457, + "sft_loss": 1.6475378274917603, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 2.5972991715549223, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": -0.2789638042449951, + "logits/rejected": -0.211712047457695, + "logps/chosen": -1.5185253620147705, + "logps/rejected": -2.2027554512023926, + "loss": 0.6123, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5185253620147705, + "rewards/margins": 0.6842299699783325, + "rewards/rejected": -2.2027554512023926, + "sft_loss": 1.5905386209487915, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 2.6839215043247804, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": -0.27316179871559143, + "logits/rejected": -0.0884641632437706, + "logps/chosen": -1.455093264579773, + "logps/rejected": -2.277247905731201, + "loss": 0.6307, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.455093264579773, + "rewards/margins": 0.8221546411514282, + "rewards/rejected": -2.277247905731201, + "sft_loss": 1.4999747276306152, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 3.8919432235391316, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": -0.2556915879249573, + "logits/rejected": -0.11450406163930893, + "logps/chosen": -1.6157286167144775, + "logps/rejected": -2.233842134475708, + "loss": 0.6536, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6157286167144775, + "rewards/margins": 0.6181135177612305, + "rewards/rejected": -2.233842134475708, + "sft_loss": 1.6188828945159912, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 5.318265389016087, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": -0.31146538257598877, + "logits/rejected": -0.20772738754749298, + "logps/chosen": -1.5671066045761108, + "logps/rejected": -2.3081376552581787, + "loss": 0.6414, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5671066045761108, + "rewards/margins": 0.7410310506820679, + "rewards/rejected": -2.3081376552581787, + "sft_loss": 1.5558583736419678, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 3.857002508565315, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": -0.22554603219032288, + "logits/rejected": -0.17537787556648254, + "logps/chosen": -1.6106573343276978, + "logps/rejected": -2.254077196121216, + "loss": 0.6622, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6106573343276978, + "rewards/margins": 0.6434198617935181, + "rewards/rejected": -2.254077196121216, + "sft_loss": 1.6596205234527588, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 5.61133508602783, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": -0.26040753722190857, + "logits/rejected": -0.1419600248336792, + "logps/chosen": -1.4490015506744385, + "logps/rejected": -2.20499849319458, + "loss": 0.6401, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4490015506744385, + "rewards/margins": 0.755996823310852, + "rewards/rejected": -2.20499849319458, + "sft_loss": 1.4934262037277222, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 4.324923819447895, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": -0.4055728316307068, + "logits/rejected": -0.24443522095680237, + "logps/chosen": -1.5296590328216553, + "logps/rejected": -2.17419695854187, + "loss": 0.631, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5296590328216553, + "rewards/margins": 0.6445378661155701, + "rewards/rejected": -2.17419695854187, + "sft_loss": 1.6248109340667725, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 3.325761392550191, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": -0.2833782732486725, + "logits/rejected": -0.15649259090423584, + "logps/chosen": -1.5566611289978027, + "logps/rejected": -2.14329195022583, + "loss": 0.6519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5566611289978027, + "rewards/margins": 0.5866307020187378, + "rewards/rejected": -2.14329195022583, + "sft_loss": 1.547494649887085, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 6.33032465055404, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": -0.3567900061607361, + "logits/rejected": -0.20585401356220245, + "logps/chosen": -1.4549974203109741, + "logps/rejected": -2.2079615592956543, + "loss": 0.6448, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4549974203109741, + "rewards/margins": 0.7529643774032593, + "rewards/rejected": -2.2079615592956543, + "sft_loss": 1.535797357559204, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": 0.01046951673924923, + "eval_logits/rejected": 0.09744537621736526, + "eval_logps/chosen": -1.5994479656219482, + "eval_logps/rejected": -2.2308409214019775, + "eval_loss": 0.6683368682861328, + "eval_rewards/accuracies": 0.6505934596061707, + "eval_rewards/chosen": -1.5994479656219482, + "eval_rewards/margins": 0.6313928961753845, + "eval_rewards/rejected": -2.2308409214019775, + "eval_runtime": 43.2663, + "eval_samples_per_second": 31.087, + "eval_sft_loss": 1.5882415771484375, + "eval_steps_per_second": 7.789, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 2.680162601967427, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": -0.24441878497600555, + "logits/rejected": -0.2090197503566742, + "logps/chosen": -1.5445456504821777, + "logps/rejected": -2.1508004665374756, + "loss": 0.6597, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5445456504821777, + "rewards/margins": 0.6062547564506531, + "rewards/rejected": -2.1508004665374756, + "sft_loss": 1.5353953838348389, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 3.054166573705804, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": -0.3028886914253235, + "logits/rejected": -0.2101995050907135, + "logps/chosen": -1.6469351053237915, + "logps/rejected": -2.328545093536377, + "loss": 0.6422, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6469351053237915, + "rewards/margins": 0.6816102266311646, + "rewards/rejected": -2.328545093536377, + "sft_loss": 1.56937837600708, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 4.641939170377754, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": -0.24724645912647247, + "logits/rejected": -0.1400899440050125, + "logps/chosen": -1.4612640142440796, + "logps/rejected": -2.281329870223999, + "loss": 0.6408, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4612640142440796, + "rewards/margins": 0.820065975189209, + "rewards/rejected": -2.281329870223999, + "sft_loss": 1.5288423299789429, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 6.570214132928371, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": -0.2952974736690521, + "logits/rejected": -0.21536913514137268, + "logps/chosen": -1.4964368343353271, + "logps/rejected": -2.307354688644409, + "loss": 0.6324, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4964368343353271, + "rewards/margins": 0.8109177350997925, + "rewards/rejected": -2.307354688644409, + "sft_loss": 1.5774792432785034, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 6.497025295805081, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": -0.2488592565059662, + "logits/rejected": -0.11200263351202011, + "logps/chosen": -1.6183297634124756, + "logps/rejected": -2.316483974456787, + "loss": 0.6522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6183297634124756, + "rewards/margins": 0.6981542706489563, + "rewards/rejected": -2.316483974456787, + "sft_loss": 1.6047000885009766, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 4.641487498277689, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": -0.39767542481422424, + "logits/rejected": -0.25908923149108887, + "logps/chosen": -1.5914344787597656, + "logps/rejected": -2.2321314811706543, + "loss": 0.6467, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5914344787597656, + "rewards/margins": 0.6406969428062439, + "rewards/rejected": -2.2321314811706543, + "sft_loss": 1.5355947017669678, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 3.0558847707427317, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": -0.3713652491569519, + "logits/rejected": -0.2548134922981262, + "logps/chosen": -1.5258245468139648, + "logps/rejected": -2.2273030281066895, + "loss": 0.6565, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5258245468139648, + "rewards/margins": 0.7014786601066589, + "rewards/rejected": -2.2273030281066895, + "sft_loss": 1.5043542385101318, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 6.559227207256044, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": -0.3284551799297333, + "logits/rejected": -0.12331485748291016, + "logps/chosen": -1.5666601657867432, + "logps/rejected": -2.338242769241333, + "loss": 0.6515, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5666601657867432, + "rewards/margins": 0.7715827226638794, + "rewards/rejected": -2.338242769241333, + "sft_loss": 1.5562536716461182, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 2.4036030650374505, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": -0.24320153892040253, + "logits/rejected": -0.19215801358222961, + "logps/chosen": -1.5605636835098267, + "logps/rejected": -2.0627856254577637, + "loss": 0.6685, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5605636835098267, + "rewards/margins": 0.5022218823432922, + "rewards/rejected": -2.0627856254577637, + "sft_loss": 1.5639190673828125, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 8.49748018591424, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": -0.3559146523475647, + "logits/rejected": -0.218179389834404, + "logps/chosen": -1.5596989393234253, + "logps/rejected": -2.18450665473938, + "loss": 0.6614, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5596989393234253, + "rewards/margins": 0.6248075366020203, + "rewards/rejected": -2.18450665473938, + "sft_loss": 1.591348648071289, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 4.364225513311659, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": -0.3184386193752289, + "logits/rejected": -0.11092986166477203, + "logps/chosen": -1.5821802616119385, + "logps/rejected": -2.2117114067077637, + "loss": 0.6499, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5821802616119385, + "rewards/margins": 0.6295314431190491, + "rewards/rejected": -2.2117114067077637, + "sft_loss": 1.5408474206924438, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 4.4509052230402215, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": -0.2785467803478241, + "logits/rejected": -0.2110917866230011, + "logps/chosen": -1.5520164966583252, + "logps/rejected": -2.3556294441223145, + "loss": 0.6417, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5520164966583252, + "rewards/margins": 0.8036128878593445, + "rewards/rejected": -2.3556294441223145, + "sft_loss": 1.6278345584869385, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 3.909993815706318, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": -0.416201651096344, + "logits/rejected": -0.20776471495628357, + "logps/chosen": -1.7517368793487549, + "logps/rejected": -2.3358230590820312, + "loss": 0.6689, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7517368793487549, + "rewards/margins": 0.5840864181518555, + "rewards/rejected": -2.3358230590820312, + "sft_loss": 1.7384824752807617, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 7.9290690308624425, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": -0.2663486897945404, + "logits/rejected": -0.11282068490982056, + "logps/chosen": -1.5236700773239136, + "logps/rejected": -2.178757905960083, + "loss": 0.6383, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5236700773239136, + "rewards/margins": 0.6550878286361694, + "rewards/rejected": -2.178757905960083, + "sft_loss": 1.5293155908584595, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 4.591601960097695, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": -0.26813387870788574, + "logits/rejected": -0.1137576550245285, + "logps/chosen": -1.5806580781936646, + "logps/rejected": -2.180480480194092, + "loss": 0.6528, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5806580781936646, + "rewards/margins": 0.5998224020004272, + "rewards/rejected": -2.180480480194092, + "sft_loss": 1.561107873916626, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 3.5423652427077172, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": -0.26632776856422424, + "logits/rejected": -0.09625057876110077, + "logps/chosen": -1.7073192596435547, + "logps/rejected": -2.460038423538208, + "loss": 0.6569, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7073192596435547, + "rewards/margins": 0.7527190446853638, + "rewards/rejected": -2.460038423538208, + "sft_loss": 1.6452124118804932, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 2.8404674053871117, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": -0.33919811248779297, + "logits/rejected": -0.23609662055969238, + "logps/chosen": -1.4722113609313965, + "logps/rejected": -2.3265535831451416, + "loss": 0.6373, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4722113609313965, + "rewards/margins": 0.854341983795166, + "rewards/rejected": -2.3265535831451416, + "sft_loss": 1.4935623407363892, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 4.0797872044318915, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": -0.32204926013946533, + "logits/rejected": -0.1875072419643402, + "logps/chosen": -1.5559985637664795, + "logps/rejected": -2.128537654876709, + "loss": 0.6622, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5559985637664795, + "rewards/margins": 0.572539210319519, + "rewards/rejected": -2.128537654876709, + "sft_loss": 1.6035436391830444, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 2.8017841765647, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": -0.36441072821617126, + "logits/rejected": -0.18555672466754913, + "logps/chosen": -1.6992461681365967, + "logps/rejected": -2.3326878547668457, + "loss": 0.6643, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6992461681365967, + "rewards/margins": 0.6334417462348938, + "rewards/rejected": -2.3326878547668457, + "sft_loss": 1.6844333410263062, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 3.571625497941765, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": -0.3157997727394104, + "logits/rejected": -0.2676517367362976, + "logps/chosen": -1.5811196565628052, + "logps/rejected": -2.3758318424224854, + "loss": 0.645, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5811196565628052, + "rewards/margins": 0.7947121858596802, + "rewards/rejected": -2.3758318424224854, + "sft_loss": 1.5407793521881104, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 4.93834901122263, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": -0.2959749102592468, + "logits/rejected": -0.17174866795539856, + "logps/chosen": -1.5706027746200562, + "logps/rejected": -2.3317575454711914, + "loss": 0.6561, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5706027746200562, + "rewards/margins": 0.7611549496650696, + "rewards/rejected": -2.3317575454711914, + "sft_loss": 1.6460769176483154, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 3.762595430783216, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": -0.4393877387046814, + "logits/rejected": -0.2043670117855072, + "logps/chosen": -1.5172808170318604, + "logps/rejected": -2.394310474395752, + "loss": 0.6391, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5172808170318604, + "rewards/margins": 0.8770295977592468, + "rewards/rejected": -2.394310474395752, + "sft_loss": 1.5674129724502563, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 4.443768579457408, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": -0.19146893918514252, + "logits/rejected": -0.07724063098430634, + "logps/chosen": -1.553026795387268, + "logps/rejected": -2.1316425800323486, + "loss": 0.6546, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.553026795387268, + "rewards/margins": 0.5786157846450806, + "rewards/rejected": -2.1316425800323486, + "sft_loss": 1.562693476676941, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 5.722678383489239, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": -0.2044384479522705, + "logits/rejected": -0.11913935840129852, + "logps/chosen": -1.5690629482269287, + "logps/rejected": -2.0477561950683594, + "loss": 0.6522, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5690629482269287, + "rewards/margins": 0.4786931872367859, + "rewards/rejected": -2.0477561950683594, + "sft_loss": 1.4781545400619507, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 2.0470696594380793, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": -0.24658894538879395, + "logits/rejected": -0.10288085043430328, + "logps/chosen": -1.6159712076187134, + "logps/rejected": -2.430453062057495, + "loss": 0.6456, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6159712076187134, + "rewards/margins": 0.8144820332527161, + "rewards/rejected": -2.430453062057495, + "sft_loss": 1.5644042491912842, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 7.680564951023092, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": -0.2537384331226349, + "logits/rejected": -0.186565563082695, + "logps/chosen": -1.5757337808609009, + "logps/rejected": -2.265808582305908, + "loss": 0.6431, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5757337808609009, + "rewards/margins": 0.6900747418403625, + "rewards/rejected": -2.265808582305908, + "sft_loss": 1.5722882747650146, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 8.408067887122359, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": -0.21966715157032013, + "logits/rejected": -0.2432653158903122, + "logps/chosen": -1.5844430923461914, + "logps/rejected": -2.055832624435425, + "loss": 0.6625, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5844430923461914, + "rewards/margins": 0.4713897109031677, + "rewards/rejected": -2.055832624435425, + "sft_loss": 1.5317051410675049, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 4.190014485730588, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": -0.2685621380805969, + "logits/rejected": -0.22150039672851562, + "logps/chosen": -1.5397356748580933, + "logps/rejected": -2.119022846221924, + "loss": 0.6605, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5397356748580933, + "rewards/margins": 0.5792871713638306, + "rewards/rejected": -2.119022846221924, + "sft_loss": 1.502520203590393, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 2.41810357458596, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": -0.4025016725063324, + "logits/rejected": -0.2326141893863678, + "logps/chosen": -1.4536174535751343, + "logps/rejected": -2.1672511100769043, + "loss": 0.6297, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4536174535751343, + "rewards/margins": 0.71363365650177, + "rewards/rejected": -2.1672511100769043, + "sft_loss": 1.5110516548156738, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 4.798281016538979, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": -0.1976149082183838, + "logits/rejected": -0.21479268372058868, + "logps/chosen": -1.6265045404434204, + "logps/rejected": -2.204802989959717, + "loss": 0.6641, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6265045404434204, + "rewards/margins": 0.5782985687255859, + "rewards/rejected": -2.204802989959717, + "sft_loss": 1.5611135959625244, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 3.3184133275680985, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": -0.2918176054954529, + "logits/rejected": -0.20298846065998077, + "logps/chosen": -1.5935795307159424, + "logps/rejected": -2.1652448177337646, + "loss": 0.6706, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5935795307159424, + "rewards/margins": 0.5716655850410461, + "rewards/rejected": -2.1652448177337646, + "sft_loss": 1.602667212486267, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 9.405439850958698, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": -0.2645604908466339, + "logits/rejected": -0.06947880983352661, + "logps/chosen": -1.5236592292785645, + "logps/rejected": -2.236105442047119, + "loss": 0.6433, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5236592292785645, + "rewards/margins": 0.7124462127685547, + "rewards/rejected": -2.236105442047119, + "sft_loss": 1.584325909614563, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 3.9460809975459488, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": -0.33310556411743164, + "logits/rejected": -0.16928938031196594, + "logps/chosen": -1.6103332042694092, + "logps/rejected": -2.4057180881500244, + "loss": 0.6489, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6103332042694092, + "rewards/margins": 0.7953847646713257, + "rewards/rejected": -2.4057180881500244, + "sft_loss": 1.6243871450424194, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 3.3978433077924657, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": -0.18408358097076416, + "logits/rejected": -0.14971300959587097, + "logps/chosen": -1.610769510269165, + "logps/rejected": -1.966650366783142, + "loss": 0.6748, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.610769510269165, + "rewards/margins": 0.355881005525589, + "rewards/rejected": -1.966650366783142, + "sft_loss": 1.5604798793792725, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 4.7975969213514515, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": -0.24301142990589142, + "logits/rejected": -0.14734292030334473, + "logps/chosen": -1.503179907798767, + "logps/rejected": -2.5005712509155273, + "loss": 0.6303, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.503179907798767, + "rewards/margins": 0.9973915219306946, + "rewards/rejected": -2.5005712509155273, + "sft_loss": 1.5697084665298462, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 5.333869239307722, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": -0.33797964453697205, + "logits/rejected": -0.12401552498340607, + "logps/chosen": -1.6644957065582275, + "logps/rejected": -2.3786120414733887, + "loss": 0.6483, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6644957065582275, + "rewards/margins": 0.7141159772872925, + "rewards/rejected": -2.3786120414733887, + "sft_loss": 1.7237141132354736, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 4.832180713170385, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": -0.22082212567329407, + "logits/rejected": -0.1354362666606903, + "logps/chosen": -1.6065336465835571, + "logps/rejected": -2.0319314002990723, + "loss": 0.6486, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6065336465835571, + "rewards/margins": 0.42539745569229126, + "rewards/rejected": -2.0319314002990723, + "sft_loss": 1.5567100048065186, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 4.212411607729048, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": -0.2250661551952362, + "logits/rejected": -0.0926482304930687, + "logps/chosen": -1.6468353271484375, + "logps/rejected": -2.17008638381958, + "loss": 0.648, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6468353271484375, + "rewards/margins": 0.5232512950897217, + "rewards/rejected": -2.17008638381958, + "sft_loss": 1.6684173345565796, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 2.5706071635517014, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": -0.3760332465171814, + "logits/rejected": -0.17019043862819672, + "logps/chosen": -1.448594331741333, + "logps/rejected": -2.1622211933135986, + "loss": 0.6252, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.448594331741333, + "rewards/margins": 0.7136268615722656, + "rewards/rejected": -2.1622211933135986, + "sft_loss": 1.4942363500595093, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 4.723585039176921, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": -0.27923649549484253, + "logits/rejected": -0.02440541796386242, + "logps/chosen": -1.5308659076690674, + "logps/rejected": -2.5761771202087402, + "loss": 0.6419, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5308659076690674, + "rewards/margins": 1.0453112125396729, + "rewards/rejected": -2.5761771202087402, + "sft_loss": 1.5155452489852905, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 2.891451796857847, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": -0.21183153986930847, + "logits/rejected": -0.15966220200061798, + "logps/chosen": -1.8173433542251587, + "logps/rejected": -2.3437368869781494, + "loss": 0.6572, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.8173433542251587, + "rewards/margins": 0.5263934135437012, + "rewards/rejected": -2.3437368869781494, + "sft_loss": 1.6783323287963867, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 3.0268618418878908, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": -0.20779451727867126, + "logits/rejected": -0.08706004917621613, + "logps/chosen": -1.557328701019287, + "logps/rejected": -2.4320130348205566, + "loss": 0.6405, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.557328701019287, + "rewards/margins": 0.8746845126152039, + "rewards/rejected": -2.4320130348205566, + "sft_loss": 1.577407956123352, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 2.970437881892552, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": -0.28720521926879883, + "logits/rejected": -0.10532665252685547, + "logps/chosen": -1.5876381397247314, + "logps/rejected": -2.1869492530822754, + "loss": 0.6588, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5876381397247314, + "rewards/margins": 0.5993111729621887, + "rewards/rejected": -2.1869492530822754, + "sft_loss": 1.6944866180419922, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 5.731162383629648, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": -0.22121545672416687, + "logits/rejected": -0.1469089388847351, + "logps/chosen": -1.5260721445083618, + "logps/rejected": -2.099198579788208, + "loss": 0.6441, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5260721445083618, + "rewards/margins": 0.5731264352798462, + "rewards/rejected": -2.099198579788208, + "sft_loss": 1.5268216133117676, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 5.047704291212816, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": -0.2476317435503006, + "logits/rejected": -0.11390645802021027, + "logps/chosen": -1.580430030822754, + "logps/rejected": -2.254305601119995, + "loss": 0.65, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.580430030822754, + "rewards/margins": 0.6738757491111755, + "rewards/rejected": -2.254305601119995, + "sft_loss": 1.5548816919326782, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 4.722096505210587, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": -0.29212015867233276, + "logits/rejected": -0.12381164729595184, + "logps/chosen": -1.5768067836761475, + "logps/rejected": -2.2491636276245117, + "loss": 0.6373, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5768067836761475, + "rewards/margins": 0.6723569631576538, + "rewards/rejected": -2.2491636276245117, + "sft_loss": 1.6210720539093018, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 3.2095242212092137, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": -0.4141046404838562, + "logits/rejected": -0.3132175803184509, + "logps/chosen": -1.5623668432235718, + "logps/rejected": -2.2315099239349365, + "loss": 0.6424, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5623668432235718, + "rewards/margins": 0.6691430807113647, + "rewards/rejected": -2.2315099239349365, + "sft_loss": 1.5677772760391235, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 3.237316941256485, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": -0.29903578758239746, + "logits/rejected": -0.1807752251625061, + "logps/chosen": -1.5542609691619873, + "logps/rejected": -2.1173808574676514, + "loss": 0.649, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5542609691619873, + "rewards/margins": 0.5631201863288879, + "rewards/rejected": -2.1173808574676514, + "sft_loss": 1.5871899127960205, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 4.449145241684852, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": -0.10690195858478546, + "logits/rejected": -0.09125228226184845, + "logps/chosen": -1.6235891580581665, + "logps/rejected": -2.516434907913208, + "loss": 0.6415, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6235891580581665, + "rewards/margins": 0.892845630645752, + "rewards/rejected": -2.516434907913208, + "sft_loss": 1.5800119638442993, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 4.611714592430611, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": -0.14552733302116394, + "logits/rejected": -0.04473976045846939, + "logps/chosen": -1.5664246082305908, + "logps/rejected": -2.4397082328796387, + "loss": 0.6506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5664246082305908, + "rewards/margins": 0.8732835054397583, + "rewards/rejected": -2.4397082328796387, + "sft_loss": 1.5751209259033203, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 5.547178095937166, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": -0.28409165143966675, + "logits/rejected": -0.10616005957126617, + "logps/chosen": -1.579883098602295, + "logps/rejected": -2.171001672744751, + "loss": 0.666, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.579883098602295, + "rewards/margins": 0.591118574142456, + "rewards/rejected": -2.171001672744751, + "sft_loss": 1.5925939083099365, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 4.930655705283452, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": -0.2638966143131256, + "logits/rejected": -0.16020557284355164, + "logps/chosen": -1.5293127298355103, + "logps/rejected": -2.3441619873046875, + "loss": 0.6344, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5293127298355103, + "rewards/margins": 0.8148494958877563, + "rewards/rejected": -2.3441619873046875, + "sft_loss": 1.5903923511505127, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 5.135646130457434, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": -0.36130863428115845, + "logits/rejected": -0.17520561814308167, + "logps/chosen": -1.6114094257354736, + "logps/rejected": -2.241234064102173, + "loss": 0.6592, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6114094257354736, + "rewards/margins": 0.6298244595527649, + "rewards/rejected": -2.241234064102173, + "sft_loss": 1.5910862684249878, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 4.723087213112723, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": -0.22094795107841492, + "logits/rejected": -0.12475217878818512, + "logps/chosen": -1.5482748746871948, + "logps/rejected": -2.1302969455718994, + "loss": 0.6556, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5482748746871948, + "rewards/margins": 0.582021951675415, + "rewards/rejected": -2.1302969455718994, + "sft_loss": 1.5279250144958496, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 3.8916001177332555, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": -0.20356476306915283, + "logits/rejected": -0.24255618453025818, + "logps/chosen": -1.5652469396591187, + "logps/rejected": -2.1905503273010254, + "loss": 0.6414, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5652469396591187, + "rewards/margins": 0.6253035068511963, + "rewards/rejected": -2.1905503273010254, + "sft_loss": 1.6226780414581299, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 4.672538576812152, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": -0.26059651374816895, + "logits/rejected": -0.15020112693309784, + "logps/chosen": -1.5707706212997437, + "logps/rejected": -2.1416842937469482, + "loss": 0.6543, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5707706212997437, + "rewards/margins": 0.5709136724472046, + "rewards/rejected": -2.1416842937469482, + "sft_loss": 1.595155119895935, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 7.175122015176713, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": -0.2035883218050003, + "logits/rejected": -0.031918738037347794, + "logps/chosen": -1.4992424249649048, + "logps/rejected": -2.2319529056549072, + "loss": 0.642, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4992424249649048, + "rewards/margins": 0.7327104806900024, + "rewards/rejected": -2.2319529056549072, + "sft_loss": 1.5359947681427002, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 4.796408365540654, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": -0.23860719799995422, + "logits/rejected": -0.19544881582260132, + "logps/chosen": -1.5336408615112305, + "logps/rejected": -2.178316593170166, + "loss": 0.6522, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5336408615112305, + "rewards/margins": 0.6446754336357117, + "rewards/rejected": -2.178316593170166, + "sft_loss": 1.5777490139007568, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 4.451082691730898, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": -0.3357131779193878, + "logits/rejected": -0.1376783400774002, + "logps/chosen": -1.5098403692245483, + "logps/rejected": -2.2403979301452637, + "loss": 0.6342, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5098403692245483, + "rewards/margins": 0.7305575609207153, + "rewards/rejected": -2.2403979301452637, + "sft_loss": 1.5593976974487305, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 2.2502258639724393, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": -0.2203347384929657, + "logits/rejected": -0.10753805935382843, + "logps/chosen": -1.7016786336898804, + "logps/rejected": -2.338911771774292, + "loss": 0.6609, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.7016786336898804, + "rewards/margins": 0.6372330188751221, + "rewards/rejected": -2.338911771774292, + "sft_loss": 1.6395244598388672, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 2.1470051189006942, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": -0.19624938070774078, + "logits/rejected": -0.18421564996242523, + "logps/chosen": -1.5481547117233276, + "logps/rejected": -2.177351474761963, + "loss": 0.6356, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5481547117233276, + "rewards/margins": 0.6291965246200562, + "rewards/rejected": -2.177351474761963, + "sft_loss": 1.6014915704727173, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 2.8801187629736114, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": -0.2564873993396759, + "logits/rejected": -0.12604650855064392, + "logps/chosen": -1.596642255783081, + "logps/rejected": -2.6083335876464844, + "loss": 0.635, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.596642255783081, + "rewards/margins": 1.0116910934448242, + "rewards/rejected": -2.6083335876464844, + "sft_loss": 1.5695760250091553, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 5.366481336133112, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": -0.3548617959022522, + "logits/rejected": -0.2135559767484665, + "logps/chosen": -1.679685354232788, + "logps/rejected": -2.35361647605896, + "loss": 0.6378, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.679685354232788, + "rewards/margins": 0.6739312410354614, + "rewards/rejected": -2.35361647605896, + "sft_loss": 1.6176130771636963, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 7.126941759915519, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": -0.24436382949352264, + "logits/rejected": -0.18135519325733185, + "logps/chosen": -1.5110137462615967, + "logps/rejected": -2.1283164024353027, + "loss": 0.6535, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5110137462615967, + "rewards/margins": 0.6173027753829956, + "rewards/rejected": -2.1283164024353027, + "sft_loss": 1.5346163511276245, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 4.35177873431574, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": -0.25509151816368103, + "logits/rejected": -0.12994591891765594, + "logps/chosen": -1.594662070274353, + "logps/rejected": -2.4333889484405518, + "loss": 0.6349, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.594662070274353, + "rewards/margins": 0.8387266993522644, + "rewards/rejected": -2.4333889484405518, + "sft_loss": 1.6135175228118896, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 3.6754757430630676, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": -0.3149816393852234, + "logits/rejected": -0.08832015097141266, + "logps/chosen": -1.7405153512954712, + "logps/rejected": -2.200068473815918, + "loss": 0.6767, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7405153512954712, + "rewards/margins": 0.4595530927181244, + "rewards/rejected": -2.200068473815918, + "sft_loss": 1.6633392572402954, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 6.369769662890393, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": -0.27111494541168213, + "logits/rejected": -0.218735933303833, + "logps/chosen": -1.5510780811309814, + "logps/rejected": -2.116690158843994, + "loss": 0.6484, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5510780811309814, + "rewards/margins": 0.5656118988990784, + "rewards/rejected": -2.116690158843994, + "sft_loss": 1.630822777748108, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 6.366283924334156, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": -0.38514214754104614, + "logits/rejected": -0.23634997010231018, + "logps/chosen": -1.619696855545044, + "logps/rejected": -2.1308228969573975, + "loss": 0.6604, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.619696855545044, + "rewards/margins": 0.5111261606216431, + "rewards/rejected": -2.1308228969573975, + "sft_loss": 1.5566799640655518, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 5.181437814631028, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": -0.2535046637058258, + "logits/rejected": -0.14234676957130432, + "logps/chosen": -1.4892795085906982, + "logps/rejected": -2.0935254096984863, + "loss": 0.6488, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4892795085906982, + "rewards/margins": 0.6042462587356567, + "rewards/rejected": -2.0935254096984863, + "sft_loss": 1.5298943519592285, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 2.7048965748061162, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": -0.14938172698020935, + "logits/rejected": -0.03671771287918091, + "logps/chosen": -1.6389738321304321, + "logps/rejected": -2.2659668922424316, + "loss": 0.6439, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6389738321304321, + "rewards/margins": 0.6269931793212891, + "rewards/rejected": -2.2659668922424316, + "sft_loss": 1.6594352722167969, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 4.614081575961701, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": -0.25345319509506226, + "logits/rejected": -0.04088393598794937, + "logps/chosen": -1.6478992700576782, + "logps/rejected": -2.488649606704712, + "loss": 0.6508, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6478992700576782, + "rewards/margins": 0.8407502174377441, + "rewards/rejected": -2.488649606704712, + "sft_loss": 1.6653321981430054, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 5.103698961424684, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": -0.17153751850128174, + "logits/rejected": -0.15024232864379883, + "logps/chosen": -1.629041314125061, + "logps/rejected": -2.036198377609253, + "loss": 0.6636, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.629041314125061, + "rewards/margins": 0.4071568548679352, + "rewards/rejected": -2.036198377609253, + "sft_loss": 1.6581542491912842, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 7.034656102261009, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": -0.2786411643028259, + "logits/rejected": -0.21582278609275818, + "logps/chosen": -1.4769775867462158, + "logps/rejected": -2.1230950355529785, + "loss": 0.6537, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4769775867462158, + "rewards/margins": 0.646117627620697, + "rewards/rejected": -2.1230950355529785, + "sft_loss": 1.5027004480361938, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 4.850132598841623, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": -0.1930471956729889, + "logits/rejected": -0.05508570745587349, + "logps/chosen": -1.4973084926605225, + "logps/rejected": -2.397449016571045, + "loss": 0.6252, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4973084926605225, + "rewards/margins": 0.9001402854919434, + "rewards/rejected": -2.397449016571045, + "sft_loss": 1.560623049736023, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 7.094611040703844, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": -0.23316261172294617, + "logits/rejected": -0.07678346335887909, + "logps/chosen": -1.532983660697937, + "logps/rejected": -2.2914743423461914, + "loss": 0.6309, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.532983660697937, + "rewards/margins": 0.7584906816482544, + "rewards/rejected": -2.2914743423461914, + "sft_loss": 1.5512092113494873, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 3.6689372276733128, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": -0.32375577092170715, + "logits/rejected": -0.057362549006938934, + "logps/chosen": -1.4568369388580322, + "logps/rejected": -2.2207484245300293, + "loss": 0.6377, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4568369388580322, + "rewards/margins": 0.7639115452766418, + "rewards/rejected": -2.2207484245300293, + "sft_loss": 1.5315742492675781, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 3.901840869174015, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": -0.27934569120407104, + "logits/rejected": -0.11083509773015976, + "logps/chosen": -1.56743586063385, + "logps/rejected": -2.3566184043884277, + "loss": 0.6448, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.56743586063385, + "rewards/margins": 0.7891825437545776, + "rewards/rejected": -2.3566184043884277, + "sft_loss": 1.6707375049591064, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 2.7744122100860062, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": -0.19820816814899445, + "logits/rejected": -0.13724127411842346, + "logps/chosen": -1.6933910846710205, + "logps/rejected": -2.307555913925171, + "loss": 0.6645, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6933910846710205, + "rewards/margins": 0.6141648292541504, + "rewards/rejected": -2.307555913925171, + "sft_loss": 1.6180213689804077, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 4.146374408259338, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": -0.22346720099449158, + "logits/rejected": -0.0670088678598404, + "logps/chosen": -1.53458571434021, + "logps/rejected": -2.1950907707214355, + "loss": 0.6368, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.53458571434021, + "rewards/margins": 0.6605050563812256, + "rewards/rejected": -2.1950907707214355, + "sft_loss": 1.5120246410369873, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 4.666700730222403, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": -0.21801479160785675, + "logits/rejected": -0.1040673479437828, + "logps/chosen": -1.5110822916030884, + "logps/rejected": -2.4874258041381836, + "loss": 0.6368, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5110822916030884, + "rewards/margins": 0.9763437509536743, + "rewards/rejected": -2.4874258041381836, + "sft_loss": 1.6090261936187744, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": 0.11780353635549545, + "eval_logits/rejected": 0.21422991156578064, + "eval_logps/chosen": -1.6016647815704346, + "eval_logps/rejected": -2.2330212593078613, + "eval_loss": 0.6684653759002686, + "eval_rewards/accuracies": 0.6505934596061707, + "eval_rewards/chosen": -1.6016647815704346, + "eval_rewards/margins": 0.6313564777374268, + "eval_rewards/rejected": -2.2330212593078613, + "eval_runtime": 43.1555, + "eval_samples_per_second": 31.166, + "eval_sft_loss": 1.5896871089935303, + "eval_steps_per_second": 7.809, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.6685084665272595, + "train_runtime": 33774.3849, + "train_samples_per_second": 5.311, + "train_steps_per_second": 0.166 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}