{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 11.487560113167183, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.07002891600131989, "logits/rejected": 0.1360647976398468, "logps/chosen": -1.7161403894424438, "logps/rejected": -1.8893934488296509, "loss": 1.9598, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7161403894424438, "rewards/margins": 0.17325332760810852, "rewards/rejected": -1.8893934488296509, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 24.605044963130553, "learning_rate": 1.7825311942959e-08, "logits/chosen": 0.011153340339660645, "logits/rejected": 0.13156278431415558, "logps/chosen": -1.8016386032104492, "logps/rejected": -1.844559907913208, "loss": 2.0517, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8016386032104492, "rewards/margins": 0.0429212786257267, "rewards/rejected": -1.844559907913208, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 22.47805312136356, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.024431750178337097, "logits/rejected": 0.07315035164356232, "logps/chosen": -1.6345176696777344, "logps/rejected": -1.7633349895477295, "loss": 1.9217, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6345176696777344, "rewards/margins": 0.1288173794746399, "rewards/rejected": -1.7633349895477295, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 13.11760535567929, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.03134971112012863, "logits/rejected": 0.05292157083749771, "logps/chosen": -1.7249581813812256, "logps/rejected": -1.8054351806640625, "loss": 2.0033, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7249581813812256, "rewards/margins": 0.08047701418399811, "rewards/rejected": -1.8054351806640625, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 30.312039716334574, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.04933280870318413, "logits/rejected": 0.03452097624540329, "logps/chosen": -1.8688116073608398, "logps/rejected": -1.7789065837860107, "loss": 2.1755, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -1.8688116073608398, "rewards/margins": -0.08990499377250671, "rewards/rejected": -1.7789065837860107, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 27.26040466944694, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.07853099703788757, "logits/rejected": 0.013800591230392456, "logps/chosen": -1.908769965171814, "logps/rejected": -1.8326698541641235, "loss": 2.1563, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.908769965171814, "rewards/margins": -0.0761001706123352, "rewards/rejected": -1.8326698541641235, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 19.586132665797237, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.05052985996007919, "logits/rejected": 0.10728694498538971, "logps/chosen": -1.8463678359985352, "logps/rejected": -1.9973781108856201, "loss": 2.1094, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.8463678359985352, "rewards/margins": 0.15101028978824615, "rewards/rejected": -1.9973781108856201, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 25.702013601620546, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.036237478256225586, "logits/rejected": 0.20881040394306183, "logps/chosen": -1.8786709308624268, "logps/rejected": -1.7416874170303345, "loss": 2.1575, "rewards/accuracies": 0.46875, "rewards/chosen": -1.8786709308624268, "rewards/margins": -0.13698363304138184, "rewards/rejected": -1.7416874170303345, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 23.35974114943005, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.027120601385831833, "logits/rejected": 0.22463078796863556, "logps/chosen": -1.8334786891937256, "logps/rejected": -1.869138479232788, "loss": 2.1041, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.8334786891937256, "rewards/margins": 0.03565989434719086, "rewards/rejected": -1.869138479232788, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 28.029289251958236, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.05109367519617081, "logits/rejected": 0.09518839418888092, "logps/chosen": -1.8920953273773193, "logps/rejected": -1.7738196849822998, "loss": 2.1586, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.8920953273773193, "rewards/margins": -0.11827566474676132, "rewards/rejected": -1.7738196849822998, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 22.434347301056413, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.11258158832788467, "logits/rejected": 0.10594246536493301, "logps/chosen": -1.8228343725204468, "logps/rejected": -1.8576141595840454, "loss": 2.0774, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.8228343725204468, "rewards/margins": 0.034779977053403854, "rewards/rejected": -1.8576141595840454, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 25.1075556418722, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.07487720251083374, "logits/rejected": 0.11652688682079315, "logps/chosen": -1.7783119678497314, "logps/rejected": -1.882073998451233, "loss": 2.026, "rewards/accuracies": 0.53125, "rewards/chosen": -1.7783119678497314, "rewards/margins": 0.1037621945142746, "rewards/rejected": -1.882073998451233, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 24.457172876439813, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.02016451396048069, "logits/rejected": 0.12632247805595398, "logps/chosen": -1.6278730630874634, "logps/rejected": -1.7560151815414429, "loss": 1.8995, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6278730630874634, "rewards/margins": 0.12814214825630188, "rewards/rejected": -1.7560151815414429, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 24.621380365467704, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.06752300262451172, "logits/rejected": 0.08107473701238632, "logps/chosen": -1.7528495788574219, "logps/rejected": -1.7970850467681885, "loss": 2.0302, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7528495788574219, "rewards/margins": 0.04423557221889496, "rewards/rejected": -1.7970850467681885, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 20.855832452699083, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.05488968640565872, "logits/rejected": 0.12391235679388046, "logps/chosen": -1.7454535961151123, "logps/rejected": -2.000891923904419, "loss": 1.9937, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7454535961151123, "rewards/margins": 0.2554382085800171, "rewards/rejected": -2.000891923904419, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 19.142552795246292, "learning_rate": 1.42602495543672e-07, "logits/chosen": -0.015638595446944237, "logits/rejected": 0.08477606624364853, "logps/chosen": -1.6801780462265015, "logps/rejected": -1.7129218578338623, "loss": 1.9599, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6801780462265015, "rewards/margins": 0.03274388611316681, "rewards/rejected": -1.7129218578338623, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 14.356282609461584, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.16840913891792297, "logits/rejected": 0.07064966857433319, "logps/chosen": -1.746219277381897, "logps/rejected": -1.9129278659820557, "loss": 2.0241, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.746219277381897, "rewards/margins": 0.16670863330364227, "rewards/rejected": -1.9129278659820557, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 25.731648485863353, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.08056856691837311, "logits/rejected": 0.04404681921005249, "logps/chosen": -1.6959501504898071, "logps/rejected": -1.7361692190170288, "loss": 1.9836, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.6959501504898071, "rewards/margins": 0.040219251066446304, "rewards/rejected": -1.7361692190170288, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 30.275642149716607, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.0880376473069191, "logits/rejected": 0.05738549306988716, "logps/chosen": -1.7332429885864258, "logps/rejected": -1.8584188222885132, "loss": 1.9958, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7332429885864258, "rewards/margins": 0.1251758486032486, "rewards/rejected": -1.8584188222885132, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 13.92404217311914, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.06531379371881485, "logits/rejected": -0.004916741047054529, "logps/chosen": -1.6134307384490967, "logps/rejected": -1.7194831371307373, "loss": 1.8853, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.6134307384490967, "rewards/margins": 0.10605257749557495, "rewards/rejected": -1.7194831371307373, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 20.83071580061446, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.024060076102614403, "logits/rejected": 0.048125751316547394, "logps/chosen": -1.5307506322860718, "logps/rejected": -1.7004512548446655, "loss": 1.8107, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5307506322860718, "rewards/margins": 0.1697007268667221, "rewards/rejected": -1.7004512548446655, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 18.9086884907858, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.0007185645517893136, "logits/rejected": 0.0946788415312767, "logps/chosen": -1.5208451747894287, "logps/rejected": -1.5815619230270386, "loss": 1.8283, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5208451747894287, "rewards/margins": 0.06071670725941658, "rewards/rejected": -1.5815619230270386, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 20.642901156172965, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.0076905665919184685, "logits/rejected": 0.21130745112895966, "logps/chosen": -1.5200780630111694, "logps/rejected": -1.757401466369629, "loss": 1.7815, "rewards/accuracies": 0.625, "rewards/chosen": -1.5200780630111694, "rewards/margins": 0.23732347786426544, "rewards/rejected": -1.757401466369629, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 22.300246680384884, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.08757462352514267, "logits/rejected": 0.08249307423830032, "logps/chosen": -1.5570647716522217, "logps/rejected": -1.6597316265106201, "loss": 1.8545, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.5570647716522217, "rewards/margins": 0.10266710817813873, "rewards/rejected": -1.6597316265106201, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 7.451369428561403, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.08727750927209854, "logits/rejected": 0.04271925613284111, "logps/chosen": -1.508993148803711, "logps/rejected": -1.4839754104614258, "loss": 1.8361, "rewards/accuracies": 0.46875, "rewards/chosen": -1.508993148803711, "rewards/margins": -0.025017833337187767, "rewards/rejected": -1.4839754104614258, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 26.668725802501854, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.032693587243556976, "logits/rejected": 0.16499973833560944, "logps/chosen": -1.53310227394104, "logps/rejected": -1.6465816497802734, "loss": 1.8239, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.53310227394104, "rewards/margins": 0.11347933113574982, "rewards/rejected": -1.6465816497802734, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 22.517570026833223, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.07050226628780365, "logits/rejected": 0.04343460127711296, "logps/chosen": -1.5483382940292358, "logps/rejected": -1.595626950263977, "loss": 1.8555, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5483382940292358, "rewards/margins": 0.04728863015770912, "rewards/rejected": -1.595626950263977, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 14.420706557361244, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.07175682485103607, "logits/rejected": 0.08573417365550995, "logps/chosen": -1.4752639532089233, "logps/rejected": -1.5440343618392944, "loss": 1.8151, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.4752639532089233, "rewards/margins": 0.06877056509256363, "rewards/rejected": -1.5440343618392944, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 13.975507283277274, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.0945412889122963, "logits/rejected": 0.04638366773724556, "logps/chosen": -1.3633663654327393, "logps/rejected": -1.4590461254119873, "loss": 1.7122, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3633663654327393, "rewards/margins": 0.0956796258687973, "rewards/rejected": -1.4590461254119873, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 14.532988699334254, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.12252895534038544, "logits/rejected": 0.021989500150084496, "logps/chosen": -1.2998199462890625, "logps/rejected": -1.31027352809906, "loss": 1.6962, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2998199462890625, "rewards/margins": 0.010453557595610619, "rewards/rejected": -1.31027352809906, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 10.076529575305797, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.11439421027898788, "logits/rejected": -0.07082248479127884, "logps/chosen": -1.3111894130706787, "logps/rejected": -1.4266737699508667, "loss": 1.6755, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3111894130706787, "rewards/margins": 0.1154845803976059, "rewards/rejected": -1.4266737699508667, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 10.107152904622856, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.20813456177711487, "logits/rejected": -0.07859252393245697, "logps/chosen": -1.4011151790618896, "logps/rejected": -1.3801295757293701, "loss": 1.7873, "rewards/accuracies": 0.46875, "rewards/chosen": -1.4011151790618896, "rewards/margins": -0.020985547453165054, "rewards/rejected": -1.3801295757293701, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 12.806131515610266, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.10335227102041245, "logits/rejected": 0.0551244392991066, "logps/chosen": -1.3156263828277588, "logps/rejected": -1.3915605545043945, "loss": 1.7149, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3156263828277588, "rewards/margins": 0.07593418657779694, "rewards/rejected": -1.3915605545043945, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 12.849296884231892, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.1526593118906021, "logits/rejected": -0.10797767341136932, "logps/chosen": -1.4327340126037598, "logps/rejected": -1.5010325908660889, "loss": 1.7766, "rewards/accuracies": 0.5, "rewards/chosen": -1.4327340126037598, "rewards/margins": 0.06829849630594254, "rewards/rejected": -1.5010325908660889, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 10.642151115867977, "learning_rate": 3.1194295900178254e-07, "logits/chosen": -0.01714947633445263, "logits/rejected": -0.023959076032042503, "logps/chosen": -1.320685863494873, "logps/rejected": -1.4081209897994995, "loss": 1.7025, "rewards/accuracies": 0.5, "rewards/chosen": -1.320685863494873, "rewards/margins": 0.08743523061275482, "rewards/rejected": -1.4081209897994995, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 8.019318830495136, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.03761307895183563, "logits/rejected": -0.038325823843479156, "logps/chosen": -1.3384017944335938, "logps/rejected": -1.539563775062561, "loss": 1.6765, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3384017944335938, "rewards/margins": 0.20116198062896729, "rewards/rejected": -1.539563775062561, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 8.494090911892958, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.19622287154197693, "logits/rejected": -0.11487259715795517, "logps/chosen": -1.331474781036377, "logps/rejected": -1.3787868022918701, "loss": 1.7307, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.331474781036377, "rewards/margins": 0.04731215909123421, "rewards/rejected": -1.3787868022918701, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 10.351738064447387, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.08686023950576782, "logits/rejected": 0.02442360296845436, "logps/chosen": -1.2566090822219849, "logps/rejected": -1.3870621919631958, "loss": 1.6552, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2566090822219849, "rewards/margins": 0.13045313954353333, "rewards/rejected": -1.3870621919631958, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 8.615804581626048, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.026592861860990524, "logits/rejected": 0.11833508312702179, "logps/chosen": -1.2394936084747314, "logps/rejected": -1.401416540145874, "loss": 1.6174, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2394936084747314, "rewards/margins": 0.16192278265953064, "rewards/rejected": -1.401416540145874, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 21.517358062706677, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.09331141412258148, "logits/rejected": 0.03843807801604271, "logps/chosen": -1.3600974082946777, "logps/rejected": -1.4013712406158447, "loss": 1.7361, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3600974082946777, "rewards/margins": 0.04127373918890953, "rewards/rejected": -1.4013712406158447, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 15.348544529088636, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.07399742305278778, "logits/rejected": 0.06137485057115555, "logps/chosen": -1.272519826889038, "logps/rejected": -1.3434226512908936, "loss": 1.6771, "rewards/accuracies": 0.53125, "rewards/chosen": -1.272519826889038, "rewards/margins": 0.07090290635824203, "rewards/rejected": -1.3434226512908936, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 12.32485215494046, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.18402157723903656, "logits/rejected": -0.010704070329666138, "logps/chosen": -1.3493117094039917, "logps/rejected": -1.4580633640289307, "loss": 1.7154, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3493117094039917, "rewards/margins": 0.10875160992145538, "rewards/rejected": -1.4580633640289307, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 10.482989998891275, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.19349366426467896, "logits/rejected": 0.042197782546281815, "logps/chosen": -1.3733896017074585, "logps/rejected": -1.4248539209365845, "loss": 1.7302, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3733896017074585, "rewards/margins": 0.051464296877384186, "rewards/rejected": -1.4248539209365845, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 16.868719953303888, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.03658125922083855, "logits/rejected": 0.13169129192829132, "logps/chosen": -1.2940952777862549, "logps/rejected": -1.436645269393921, "loss": 1.6562, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2940952777862549, "rewards/margins": 0.14254987239837646, "rewards/rejected": -1.436645269393921, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 7.797019386575935, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.10609889030456543, "logits/rejected": 0.0540977343916893, "logps/chosen": -1.301684856414795, "logps/rejected": -1.4272652864456177, "loss": 1.6564, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.301684856414795, "rewards/margins": 0.12558043003082275, "rewards/rejected": -1.4272652864456177, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 8.729087338113398, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.03860308602452278, "logits/rejected": 0.03049338422715664, "logps/chosen": -1.3036974668502808, "logps/rejected": -1.4600293636322021, "loss": 1.6824, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3036974668502808, "rewards/margins": 0.15633180737495422, "rewards/rejected": -1.4600293636322021, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 12.197820443233079, "learning_rate": 4.188948306595365e-07, "logits/chosen": -0.0038608163595199585, "logits/rejected": 0.12154042720794678, "logps/chosen": -1.272853136062622, "logps/rejected": -1.437613844871521, "loss": 1.6418, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.272853136062622, "rewards/margins": 0.16476061940193176, "rewards/rejected": -1.437613844871521, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 6.466615504548621, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.03536298871040344, "logits/rejected": 0.08561581373214722, "logps/chosen": -1.2899919748306274, "logps/rejected": -1.470727801322937, "loss": 1.6874, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2899919748306274, "rewards/margins": 0.18073596060276031, "rewards/rejected": -1.470727801322937, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 8.822284856207192, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.008248868398368359, "logits/rejected": 0.11743185669183731, "logps/chosen": -1.4091993570327759, "logps/rejected": -1.4329038858413696, "loss": 1.7752, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4091993570327759, "rewards/margins": 0.02370438165962696, "rewards/rejected": -1.4329038858413696, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 10.209906693953846, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.08582687377929688, "logits/rejected": 0.06534568220376968, "logps/chosen": -1.291361689567566, "logps/rejected": -1.3486815690994263, "loss": 1.6999, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.291361689567566, "rewards/margins": 0.05731973797082901, "rewards/rejected": -1.3486815690994263, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 9.005677794931367, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.05144655704498291, "logits/rejected": 0.07802295684814453, "logps/chosen": -1.258061408996582, "logps/rejected": -1.3606371879577637, "loss": 1.6505, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.258061408996582, "rewards/margins": 0.10257569700479507, "rewards/rejected": -1.3606371879577637, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 7.861754856216085, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.24945001304149628, "logits/rejected": -0.1495785415172577, "logps/chosen": -1.3438951969146729, "logps/rejected": -1.4988113641738892, "loss": 1.668, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3438951969146729, "rewards/margins": 0.1549161821603775, "rewards/rejected": -1.4988113641738892, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 11.356790191417975, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.11152307689189911, "logits/rejected": -0.035057198256254196, "logps/chosen": -1.3328602313995361, "logps/rejected": -1.4962241649627686, "loss": 1.6802, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3328602313995361, "rewards/margins": 0.16336390376091003, "rewards/rejected": -1.4962241649627686, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 8.00537843083759, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.110984206199646, "logits/rejected": 0.006986084394156933, "logps/chosen": -1.309006929397583, "logps/rejected": -1.4050885438919067, "loss": 1.6973, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.309006929397583, "rewards/margins": 0.0960814505815506, "rewards/rejected": -1.4050885438919067, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 10.137734751765233, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.03913550823926926, "logits/rejected": 0.05400773882865906, "logps/chosen": -1.2604626417160034, "logps/rejected": -1.410111904144287, "loss": 1.6664, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2604626417160034, "rewards/margins": 0.14964918792247772, "rewards/rejected": -1.410111904144287, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 10.412520382560047, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.12299992144107819, "logits/rejected": 0.020667919889092445, "logps/chosen": -1.308929681777954, "logps/rejected": -1.4028128385543823, "loss": 1.668, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.308929681777954, "rewards/margins": 0.09388315677642822, "rewards/rejected": -1.4028128385543823, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 9.309541745069055, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.10612060874700546, "logits/rejected": 0.021303869783878326, "logps/chosen": -1.3394674062728882, "logps/rejected": -1.416865587234497, "loss": 1.7065, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3394674062728882, "rewards/margins": 0.07739803940057755, "rewards/rejected": -1.416865587234497, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 8.410936884911042, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.12678056955337524, "logits/rejected": 0.15584680438041687, "logps/chosen": -1.356748342514038, "logps/rejected": -1.4695770740509033, "loss": 1.6826, "rewards/accuracies": 0.59375, "rewards/chosen": -1.356748342514038, "rewards/margins": 0.112828828394413, "rewards/rejected": -1.4695770740509033, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 12.629236143849868, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.07856544852256775, "logits/rejected": -0.02292429283261299, "logps/chosen": -1.2590734958648682, "logps/rejected": -1.3933165073394775, "loss": 1.6434, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2590734958648682, "rewards/margins": 0.13424314558506012, "rewards/rejected": -1.3933165073394775, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 8.303889732551683, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.08285187184810638, "logits/rejected": 0.07641416043043137, "logps/chosen": -1.295358657836914, "logps/rejected": -1.3709486722946167, "loss": 1.6979, "rewards/accuracies": 0.5625, "rewards/chosen": -1.295358657836914, "rewards/margins": 0.07559005171060562, "rewards/rejected": -1.3709486722946167, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 7.078543768571527, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.0555441789329052, "logits/rejected": 0.010712100192904472, "logps/chosen": -1.3955779075622559, "logps/rejected": -1.4059025049209595, "loss": 1.7534, "rewards/accuracies": 0.5, "rewards/chosen": -1.3955779075622559, "rewards/margins": 0.01032471377402544, "rewards/rejected": -1.4059025049209595, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 8.740043566258406, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.23369836807250977, "logits/rejected": -0.15311121940612793, "logps/chosen": -1.3617355823516846, "logps/rejected": -1.4438841342926025, "loss": 1.7309, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3617355823516846, "rewards/margins": 0.08214850723743439, "rewards/rejected": -1.4438841342926025, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 10.299948854839913, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.04319891706109047, "logits/rejected": 0.1049783006310463, "logps/chosen": -1.352993369102478, "logps/rejected": -1.4952139854431152, "loss": 1.6967, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.352993369102478, "rewards/margins": 0.1422206461429596, "rewards/rejected": -1.4952139854431152, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 6.868177363644339, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.09707468748092651, "logits/rejected": 0.024421801790595055, "logps/chosen": -1.3122655153274536, "logps/rejected": -1.3613016605377197, "loss": 1.7078, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3122655153274536, "rewards/margins": 0.04903603345155716, "rewards/rejected": -1.3613016605377197, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 9.643934962875473, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.1591241955757141, "logits/rejected": -0.05875442177057266, "logps/chosen": -1.3131062984466553, "logps/rejected": -1.5493340492248535, "loss": 1.6641, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3131062984466553, "rewards/margins": 0.2362278401851654, "rewards/rejected": -1.5493340492248535, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 10.01176669398041, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.06524165719747543, "logits/rejected": 0.06379499286413193, "logps/chosen": -1.3261775970458984, "logps/rejected": -1.4934340715408325, "loss": 1.6925, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3261775970458984, "rewards/margins": 0.16725634038448334, "rewards/rejected": -1.4934340715408325, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 9.575794458520054, "learning_rate": 5.971479500891266e-07, "logits/chosen": -0.0011968165636062622, "logits/rejected": 0.0859590619802475, "logps/chosen": -1.3384212255477905, "logps/rejected": -1.369568943977356, "loss": 1.7283, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3384212255477905, "rewards/margins": 0.031147807836532593, "rewards/rejected": -1.369568943977356, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 11.07398297496942, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.045141976326704025, "logits/rejected": 0.091048464179039, "logps/chosen": -1.394019603729248, "logps/rejected": -1.477367639541626, "loss": 1.7579, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.394019603729248, "rewards/margins": 0.0833478718996048, "rewards/rejected": -1.477367639541626, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 9.720083946898317, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.029754549264907837, "logits/rejected": 0.05618869513273239, "logps/chosen": -1.3053172826766968, "logps/rejected": -1.4416307210922241, "loss": 1.6551, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3053172826766968, "rewards/margins": 0.13631334900856018, "rewards/rejected": -1.4416307210922241, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 10.175563681422924, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.0037210776936262846, "logits/rejected": 0.08502546697854996, "logps/chosen": -1.2827966213226318, "logps/rejected": -1.399217128753662, "loss": 1.669, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2827966213226318, "rewards/margins": 0.11642041057348251, "rewards/rejected": -1.399217128753662, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 11.064201007283366, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.09607705473899841, "logits/rejected": 0.11924111843109131, "logps/chosen": -1.386281967163086, "logps/rejected": -1.4111428260803223, "loss": 1.7495, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.386281967163086, "rewards/margins": 0.024860884994268417, "rewards/rejected": -1.4111428260803223, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 9.868246722819997, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.07199651002883911, "logits/rejected": 0.0037381022702902555, "logps/chosen": -1.315307855606079, "logps/rejected": -1.4310729503631592, "loss": 1.7349, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.315307855606079, "rewards/margins": 0.11576519906520844, "rewards/rejected": -1.4310729503631592, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 11.769854537932227, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.0018426328897476196, "logits/rejected": 0.07703053951263428, "logps/chosen": -1.2928941249847412, "logps/rejected": -1.3936841487884521, "loss": 1.6828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2928941249847412, "rewards/margins": 0.10079008340835571, "rewards/rejected": -1.3936841487884521, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 6.711024492277533, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.031435489654541016, "logits/rejected": 0.04781597852706909, "logps/chosen": -1.2883342504501343, "logps/rejected": -1.3285871744155884, "loss": 1.69, "rewards/accuracies": 0.5, "rewards/chosen": -1.2883342504501343, "rewards/margins": 0.040252961218357086, "rewards/rejected": -1.3285871744155884, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 15.976190113657609, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.08474372327327728, "logits/rejected": 0.06233643367886543, "logps/chosen": -1.2696669101715088, "logps/rejected": -1.4098860025405884, "loss": 1.6663, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2696669101715088, "rewards/margins": 0.1402190923690796, "rewards/rejected": -1.4098860025405884, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 9.503917608234607, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.05148506909608841, "logits/rejected": 0.024202097207307816, "logps/chosen": -1.2852833271026611, "logps/rejected": -1.4389188289642334, "loss": 1.6491, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2852833271026611, "rewards/margins": 0.15363556146621704, "rewards/rejected": -1.4389188289642334, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 5.7477820941055615, "learning_rate": 6.862745098039216e-07, "logits/chosen": 0.006725990679115057, "logits/rejected": 0.08241648972034454, "logps/chosen": -1.3825719356536865, "logps/rejected": -1.3719950914382935, "loss": 1.7442, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3825719356536865, "rewards/margins": -0.010577131994068623, "rewards/rejected": -1.3719950914382935, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 10.60627748217004, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.04871059209108353, "logits/rejected": 0.2094159871339798, "logps/chosen": -1.373580813407898, "logps/rejected": -1.433243989944458, "loss": 1.7591, "rewards/accuracies": 0.46875, "rewards/chosen": -1.373580813407898, "rewards/margins": 0.05966333672404289, "rewards/rejected": -1.433243989944458, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 7.903747067272735, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.06710124015808105, "logits/rejected": 0.08114752173423767, "logps/chosen": -1.323225736618042, "logps/rejected": -1.334172010421753, "loss": 1.7033, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.323225736618042, "rewards/margins": 0.010946071706712246, "rewards/rejected": -1.334172010421753, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 7.656216532972495, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.04969733580946922, "logits/rejected": 0.13665138185024261, "logps/chosen": -1.3036470413208008, "logps/rejected": -1.3962353467941284, "loss": 1.6549, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3036470413208008, "rewards/margins": 0.09258836507797241, "rewards/rejected": -1.3962353467941284, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.279936283826828, "eval_logits/rejected": 0.3663715124130249, "eval_logps/chosen": -1.3375245332717896, "eval_logps/rejected": -1.4630897045135498, "eval_loss": 1.6938890218734741, "eval_rewards/accuracies": 0.5563797950744629, "eval_rewards/chosen": -1.3375245332717896, "eval_rewards/margins": 0.12556517124176025, "eval_rewards/rejected": -1.4630897045135498, "eval_runtime": 41.3293, "eval_samples_per_second": 32.543, "eval_steps_per_second": 8.154, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 8.54179230229592, "learning_rate": 7.219251336898395e-07, "logits/chosen": 0.005595216061919928, "logits/rejected": 0.10029877722263336, "logps/chosen": -1.306983470916748, "logps/rejected": -1.371383547782898, "loss": 1.7059, "rewards/accuracies": 0.5, "rewards/chosen": -1.306983470916748, "rewards/margins": 0.06440006196498871, "rewards/rejected": -1.371383547782898, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 10.367031723096446, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.023940464481711388, "logits/rejected": 0.15087561309337616, "logps/chosen": -1.2818481922149658, "logps/rejected": -1.3659837245941162, "loss": 1.6734, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2818481922149658, "rewards/margins": 0.08413554728031158, "rewards/rejected": -1.3659837245941162, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 6.056959954806769, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.012898044660687447, "logits/rejected": 0.018010448664426804, "logps/chosen": -1.2827682495117188, "logps/rejected": -1.4346555471420288, "loss": 1.6427, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2827682495117188, "rewards/margins": 0.15188735723495483, "rewards/rejected": -1.4346555471420288, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 7.653903978061298, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.03181108459830284, "logits/rejected": 0.14745911955833435, "logps/chosen": -1.2670233249664307, "logps/rejected": -1.356186866760254, "loss": 1.6684, "rewards/accuracies": 0.5, "rewards/chosen": -1.2670233249664307, "rewards/margins": 0.08916331827640533, "rewards/rejected": -1.356186866760254, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 6.707108967213688, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.07813435047864914, "logits/rejected": 0.10958679765462875, "logps/chosen": -1.3069932460784912, "logps/rejected": -1.457546353340149, "loss": 1.6623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3069932460784912, "rewards/margins": 0.15055301785469055, "rewards/rejected": -1.457546353340149, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 7.3866104224848215, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.07318639755249023, "logits/rejected": 0.11783840507268906, "logps/chosen": -1.339181661605835, "logps/rejected": -1.4602099657058716, "loss": 1.691, "rewards/accuracies": 0.5625, "rewards/chosen": -1.339181661605835, "rewards/margins": 0.12102824449539185, "rewards/rejected": -1.4602099657058716, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 12.019218413216004, "learning_rate": 7.754010695187165e-07, "logits/chosen": 0.009309527464210987, "logits/rejected": 0.09513361006975174, "logps/chosen": -1.207874059677124, "logps/rejected": -1.3379977941513062, "loss": 1.6147, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.207874059677124, "rewards/margins": 0.13012397289276123, "rewards/rejected": -1.3379977941513062, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 7.553555665151894, "learning_rate": 7.84313725490196e-07, "logits/chosen": 0.006134913768619299, "logits/rejected": 0.09206631034612656, "logps/chosen": -1.2856277227401733, "logps/rejected": -1.3689930438995361, "loss": 1.6755, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2856277227401733, "rewards/margins": 0.08336522430181503, "rewards/rejected": -1.3689930438995361, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 7.648705966898978, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.02347562648355961, "logits/rejected": 0.08584611117839813, "logps/chosen": -1.3102935552597046, "logps/rejected": -1.433739185333252, "loss": 1.6955, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3102935552597046, "rewards/margins": 0.12344559282064438, "rewards/rejected": -1.433739185333252, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 8.882655730190928, "learning_rate": 8.02139037433155e-07, "logits/chosen": -0.0030852502677589655, "logits/rejected": 0.11593957990407944, "logps/chosen": -1.2922875881195068, "logps/rejected": -1.4286361932754517, "loss": 1.6509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2922875881195068, "rewards/margins": 0.1363484412431717, "rewards/rejected": -1.4286361932754517, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 9.557364259695511, "learning_rate": 8.110516934046346e-07, "logits/chosen": -0.013877347111701965, "logits/rejected": 0.07154419273138046, "logps/chosen": -1.2427895069122314, "logps/rejected": -1.419550895690918, "loss": 1.6117, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2427895069122314, "rewards/margins": 0.1767614483833313, "rewards/rejected": -1.419550895690918, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 8.275428432095703, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.14314907789230347, "logits/rejected": -0.029183436185121536, "logps/chosen": -1.3672678470611572, "logps/rejected": -1.4119517803192139, "loss": 1.7353, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3672678470611572, "rewards/margins": 0.04468398913741112, "rewards/rejected": -1.4119517803192139, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 9.505344628861407, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.11223528534173965, "logits/rejected": 0.12632213532924652, "logps/chosen": -1.2721918821334839, "logps/rejected": -1.4321367740631104, "loss": 1.6727, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2721918821334839, "rewards/margins": 0.15994493663311005, "rewards/rejected": -1.4321367740631104, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 9.48905108499897, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.14261920750141144, "logits/rejected": 0.09591357409954071, "logps/chosen": -1.2352259159088135, "logps/rejected": -1.4124305248260498, "loss": 1.6061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2352259159088135, "rewards/margins": 0.17720454931259155, "rewards/rejected": -1.4124305248260498, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 6.899198780305083, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.0794256404042244, "logits/rejected": 0.052232641726732254, "logps/chosen": -1.301062822341919, "logps/rejected": -1.4891085624694824, "loss": 1.6342, "rewards/accuracies": 0.5625, "rewards/chosen": -1.301062822341919, "rewards/margins": 0.1880457103252411, "rewards/rejected": -1.4891085624694824, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 12.486878937681752, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.055589865893125534, "logits/rejected": 0.13701441884040833, "logps/chosen": -1.2720788717269897, "logps/rejected": -1.338479995727539, "loss": 1.7002, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2720788717269897, "rewards/margins": 0.06640110909938812, "rewards/rejected": -1.338479995727539, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 11.29180900188912, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.008862579241394997, "logits/rejected": 0.046294886618852615, "logps/chosen": -1.3658168315887451, "logps/rejected": -1.4353352785110474, "loss": 1.7308, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3658168315887451, "rewards/margins": 0.06951842457056046, "rewards/rejected": -1.4353352785110474, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 8.596970816409131, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.03249656409025192, "logits/rejected": 0.10003700107336044, "logps/chosen": -1.3026869297027588, "logps/rejected": -1.372564673423767, "loss": 1.7021, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3026869297027588, "rewards/margins": 0.06987786293029785, "rewards/rejected": -1.372564673423767, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 9.491755847867198, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.019991319626569748, "logits/rejected": -0.002196407411247492, "logps/chosen": -1.3101972341537476, "logps/rejected": -1.418125867843628, "loss": 1.684, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3101972341537476, "rewards/margins": 0.10792861878871918, "rewards/rejected": -1.418125867843628, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 8.371010893036301, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.057091616094112396, "logits/rejected": 0.039482321590185165, "logps/chosen": -1.2270276546478271, "logps/rejected": -1.3793421983718872, "loss": 1.6195, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2270276546478271, "rewards/margins": 0.1523144692182541, "rewards/rejected": -1.3793421983718872, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 7.881144482280576, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.07503107190132141, "logits/rejected": 0.05724053829908371, "logps/chosen": -1.3419079780578613, "logps/rejected": -1.3775643110275269, "loss": 1.6939, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3419079780578613, "rewards/margins": 0.03565652295947075, "rewards/rejected": -1.3775643110275269, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 8.04083324264338, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.08807464689016342, "logits/rejected": 0.14513877034187317, "logps/chosen": -1.3025153875350952, "logps/rejected": -1.4536783695220947, "loss": 1.6515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3025153875350952, "rewards/margins": 0.15116293728351593, "rewards/rejected": -1.4536783695220947, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 8.680474424479687, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.035877007991075516, "logits/rejected": 0.12638869881629944, "logps/chosen": -1.2459619045257568, "logps/rejected": -1.3836866617202759, "loss": 1.6087, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2459619045257568, "rewards/margins": 0.13772478699684143, "rewards/rejected": -1.3836866617202759, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 6.892237240548103, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.07581953704357147, "logits/rejected": 0.060358207672834396, "logps/chosen": -1.2861921787261963, "logps/rejected": -1.3751986026763916, "loss": 1.6935, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2861921787261963, "rewards/margins": 0.08900648355484009, "rewards/rejected": -1.3751986026763916, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 19.55498923733445, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.11548501253128052, "logits/rejected": 0.18100161850452423, "logps/chosen": -1.2647411823272705, "logps/rejected": -1.4218060970306396, "loss": 1.6418, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2647411823272705, "rewards/margins": 0.15706504881381989, "rewards/rejected": -1.4218060970306396, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 6.294694914877995, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.08505548536777496, "logits/rejected": 0.16642725467681885, "logps/chosen": -1.2363998889923096, "logps/rejected": -1.3269596099853516, "loss": 1.6577, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2363998889923096, "rewards/margins": 0.09055972844362259, "rewards/rejected": -1.3269596099853516, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 7.066635125091955, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.07894889265298843, "logits/rejected": 0.1766219586133957, "logps/chosen": -1.2514185905456543, "logps/rejected": -1.3117074966430664, "loss": 1.6592, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2514185905456543, "rewards/margins": 0.06028900295495987, "rewards/rejected": -1.3117074966430664, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 7.622271735376781, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.04230818152427673, "logits/rejected": 0.11139015853404999, "logps/chosen": -1.3790353536605835, "logps/rejected": -1.427674412727356, "loss": 1.7338, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3790353536605835, "rewards/margins": 0.04863894730806351, "rewards/rejected": -1.427674412727356, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 7.395680847120547, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.08460564911365509, "logits/rejected": 0.11045414209365845, "logps/chosen": -1.282997727394104, "logps/rejected": -1.3860433101654053, "loss": 1.6557, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.282997727394104, "rewards/margins": 0.10304556041955948, "rewards/rejected": -1.3860433101654053, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 8.761993018053895, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.04571625217795372, "logits/rejected": 0.10803280025720596, "logps/chosen": -1.2807788848876953, "logps/rejected": -1.402484655380249, "loss": 1.637, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2807788848876953, "rewards/margins": 0.12170571088790894, "rewards/rejected": -1.402484655380249, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 11.10366030266417, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.06252036243677139, "logits/rejected": 0.05841824412345886, "logps/chosen": -1.3565037250518799, "logps/rejected": -1.4139055013656616, "loss": 1.7212, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3565037250518799, "rewards/margins": 0.057401906698942184, "rewards/rejected": -1.4139055013656616, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 9.864439916293803, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.04508183151483536, "logits/rejected": 0.051397740840911865, "logps/chosen": -1.2115132808685303, "logps/rejected": -1.3322150707244873, "loss": 1.6101, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2115132808685303, "rewards/margins": 0.12070177495479584, "rewards/rejected": -1.3322150707244873, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 6.165225503632382, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.02966439723968506, "logits/rejected": 0.07653049379587173, "logps/chosen": -1.3243474960327148, "logps/rejected": -1.4375768899917603, "loss": 1.6986, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3243474960327148, "rewards/margins": 0.11322925984859467, "rewards/rejected": -1.4375768899917603, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 8.854251236446792, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.04597383365035057, "logits/rejected": 0.16426034271717072, "logps/chosen": -1.3067337274551392, "logps/rejected": -1.373793601989746, "loss": 1.7024, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3067337274551392, "rewards/margins": 0.06705982983112335, "rewards/rejected": -1.373793601989746, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 6.840144493353446, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.013875524513423443, "logits/rejected": 0.04112040624022484, "logps/chosen": -1.2333790063858032, "logps/rejected": -1.3387117385864258, "loss": 1.6608, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2333790063858032, "rewards/margins": 0.10533283650875092, "rewards/rejected": -1.3387117385864258, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 7.4380198998326055, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.03013746812939644, "logits/rejected": 0.11791408061981201, "logps/chosen": -1.2293686866760254, "logps/rejected": -1.387367844581604, "loss": 1.6001, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2293686866760254, "rewards/margins": 0.15799903869628906, "rewards/rejected": -1.387367844581604, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 7.50710540185207, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.06569008529186249, "logits/rejected": 0.03065279684960842, "logps/chosen": -1.3712207078933716, "logps/rejected": -1.4126875400543213, "loss": 1.7422, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3712207078933716, "rewards/margins": 0.041466858237981796, "rewards/rejected": -1.4126875400543213, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 15.466465592760187, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.04614231735467911, "logits/rejected": 0.0724630355834961, "logps/chosen": -1.3442991971969604, "logps/rejected": -1.3700175285339355, "loss": 1.7183, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3442991971969604, "rewards/margins": 0.025718364864587784, "rewards/rejected": -1.3700175285339355, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 8.758215512499829, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.018415410071611404, "logits/rejected": 0.13863402605056763, "logps/chosen": -1.2763307094573975, "logps/rejected": -1.3321726322174072, "loss": 1.6815, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2763307094573975, "rewards/margins": 0.05584187060594559, "rewards/rejected": -1.3321726322174072, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 8.227548047255322, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.03972851112484932, "logits/rejected": 0.097906194627285, "logps/chosen": -1.269298791885376, "logps/rejected": -1.2822902202606201, "loss": 1.6859, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.269298791885376, "rewards/margins": 0.012991341762244701, "rewards/rejected": -1.2822902202606201, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 9.777490952887712, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.07380813360214233, "logits/rejected": 0.008721251972019672, "logps/chosen": -1.3301905393600464, "logps/rejected": -1.4657261371612549, "loss": 1.6796, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3301905393600464, "rewards/margins": 0.13553544878959656, "rewards/rejected": -1.4657261371612549, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 12.299409373491619, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.047155968844890594, "logits/rejected": 0.187996968626976, "logps/chosen": -1.307685136795044, "logps/rejected": -1.3639806509017944, "loss": 1.7023, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.307685136795044, "rewards/margins": 0.056295741349458694, "rewards/rejected": -1.3639806509017944, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 5.6392494655949825, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.0546308234333992, "logits/rejected": 0.04433388262987137, "logps/chosen": -1.3067935705184937, "logps/rejected": -1.43153977394104, "loss": 1.6778, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3067935705184937, "rewards/margins": 0.12474598735570908, "rewards/rejected": -1.43153977394104, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 7.341889878119668, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.06176387146115303, "logits/rejected": 0.12294511497020721, "logps/chosen": -1.3664953708648682, "logps/rejected": -1.4626625776290894, "loss": 1.7016, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3664953708648682, "rewards/margins": 0.09616713225841522, "rewards/rejected": -1.4626625776290894, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 8.995369784784629, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.061516355723142624, "logits/rejected": 0.16859427094459534, "logps/chosen": -1.2846791744232178, "logps/rejected": -1.4000781774520874, "loss": 1.659, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2846791744232178, "rewards/margins": 0.11539904773235321, "rewards/rejected": -1.4000781774520874, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 7.600144746952444, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.007964099757373333, "logits/rejected": 0.09590659290552139, "logps/chosen": -1.3186440467834473, "logps/rejected": -1.4482091665267944, "loss": 1.6537, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3186440467834473, "rewards/margins": 0.12956508994102478, "rewards/rejected": -1.4482091665267944, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 6.593150440126051, "learning_rate": 9.994688118905471e-07, "logits/chosen": 0.02119055762887001, "logits/rejected": 0.25564703345298767, "logps/chosen": -1.3833516836166382, "logps/rejected": -1.409256935119629, "loss": 1.7706, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.3833516836166382, "rewards/margins": 0.02590516209602356, "rewards/rejected": -1.409256935119629, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 8.898308653082006, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.07971400767564774, "logits/rejected": 0.10469271242618561, "logps/chosen": -1.322509527206421, "logps/rejected": -1.4098836183547974, "loss": 1.7398, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.322509527206421, "rewards/margins": 0.08737409114837646, "rewards/rejected": -1.4098836183547974, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 7.389930536963129, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.07686187326908112, "logits/rejected": 0.0053823827765882015, "logps/chosen": -1.2334957122802734, "logps/rejected": -1.385599136352539, "loss": 1.6508, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2334957122802734, "rewards/margins": 0.15210336446762085, "rewards/rejected": -1.385599136352539, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 10.833609564944332, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.028376352041959763, "logits/rejected": 0.10559730231761932, "logps/chosen": -1.3749666213989258, "logps/rejected": -1.5166696310043335, "loss": 1.7152, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3749666213989258, "rewards/margins": 0.14170297980308533, "rewards/rejected": -1.5166696310043335, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 8.272238064740044, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.06738613545894623, "logits/rejected": 0.0676032230257988, "logps/chosen": -1.312971830368042, "logps/rejected": -1.483418583869934, "loss": 1.671, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.312971830368042, "rewards/margins": 0.17044667899608612, "rewards/rejected": -1.483418583869934, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 7.0542822338071005, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.012846325524151325, "logits/rejected": 0.13787353038787842, "logps/chosen": -1.2715208530426025, "logps/rejected": -1.2986509799957275, "loss": 1.681, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2715208530426025, "rewards/margins": 0.02712990716099739, "rewards/rejected": -1.2986509799957275, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 7.7826006640852095, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.0050326017662882805, "logits/rejected": 0.12580212950706482, "logps/chosen": -1.2370613813400269, "logps/rejected": -1.357429027557373, "loss": 1.6309, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2370613813400269, "rewards/margins": 0.12036754935979843, "rewards/rejected": -1.357429027557373, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 8.526708013653826, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.04048728197813034, "logits/rejected": 0.15382294356822968, "logps/chosen": -1.273474931716919, "logps/rejected": -1.3108594417572021, "loss": 1.6689, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.273474931716919, "rewards/margins": 0.037384580820798874, "rewards/rejected": -1.3108594417572021, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 10.337099243083788, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.030770743265748024, "logits/rejected": 0.043833933770656586, "logps/chosen": -1.2658917903900146, "logps/rejected": -1.4739489555358887, "loss": 1.6307, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2658917903900146, "rewards/margins": 0.20805713534355164, "rewards/rejected": -1.4739489555358887, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 9.013546350604763, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.07694297283887863, "logits/rejected": 0.2321816235780716, "logps/chosen": -1.3103058338165283, "logps/rejected": -1.35648775100708, "loss": 1.721, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3103058338165283, "rewards/margins": 0.046181898564100266, "rewards/rejected": -1.35648775100708, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 9.238506803782625, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.01726577617228031, "logits/rejected": 0.1645032912492752, "logps/chosen": -1.291521430015564, "logps/rejected": -1.3422341346740723, "loss": 1.6962, "rewards/accuracies": 0.53125, "rewards/chosen": -1.291521430015564, "rewards/margins": 0.050712697207927704, "rewards/rejected": -1.3422341346740723, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 8.565753547349857, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.03928017243742943, "logits/rejected": 0.06918928772211075, "logps/chosen": -1.2919032573699951, "logps/rejected": -1.4055030345916748, "loss": 1.6564, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2919032573699951, "rewards/margins": 0.1135997399687767, "rewards/rejected": -1.4055030345916748, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 7.189058484659085, "learning_rate": 9.982589180787532e-07, "logits/chosen": -0.00870530866086483, "logits/rejected": 0.07581108063459396, "logps/chosen": -1.1800092458724976, "logps/rejected": -1.3516024351119995, "loss": 1.5623, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1800092458724976, "rewards/margins": 0.1715930700302124, "rewards/rejected": -1.3516024351119995, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 6.2701986940644625, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.11826181411743164, "logits/rejected": 0.009017865173518658, "logps/chosen": -1.3536287546157837, "logps/rejected": -1.4267635345458984, "loss": 1.7099, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3536287546157837, "rewards/margins": 0.07313470542430878, "rewards/rejected": -1.4267635345458984, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 6.504609947969548, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.08614318817853928, "logits/rejected": 0.07920202612876892, "logps/chosen": -1.3100945949554443, "logps/rejected": -1.4507384300231934, "loss": 1.6808, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3100945949554443, "rewards/margins": 0.1406438946723938, "rewards/rejected": -1.4507384300231934, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 7.971345593043526, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.062352318316698074, "logits/rejected": 0.1580919325351715, "logps/chosen": -1.2454324960708618, "logps/rejected": -1.3666789531707764, "loss": 1.6294, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2454324960708618, "rewards/margins": 0.12124643474817276, "rewards/rejected": -1.3666789531707764, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 8.56313993808138, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.025264907628297806, "logits/rejected": 0.13151118159294128, "logps/chosen": -1.323120355606079, "logps/rejected": -1.4409905672073364, "loss": 1.703, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.323120355606079, "rewards/margins": 0.11787022650241852, "rewards/rejected": -1.4409905672073364, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 7.449686788662097, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.03104434348642826, "logits/rejected": 0.20914196968078613, "logps/chosen": -1.3684965372085571, "logps/rejected": -1.4199926853179932, "loss": 1.7275, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3684965372085571, "rewards/margins": 0.05149605870246887, "rewards/rejected": -1.4199926853179932, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 9.052793290445388, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.07327831536531448, "logits/rejected": 0.05761227756738663, "logps/chosen": -1.2101671695709229, "logps/rejected": -1.4562008380889893, "loss": 1.5896, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2101671695709229, "rewards/margins": 0.24603363871574402, "rewards/rejected": -1.4562008380889893, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 9.535694817556166, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.11410193145275116, "logits/rejected": 0.04373040795326233, "logps/chosen": -1.3051345348358154, "logps/rejected": -1.393215298652649, "loss": 1.6759, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3051345348358154, "rewards/margins": 0.08808077871799469, "rewards/rejected": -1.393215298652649, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 6.620606779481893, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.09182188659906387, "logits/rejected": 0.0029895335901528597, "logps/chosen": -1.3273653984069824, "logps/rejected": -1.3844788074493408, "loss": 1.6854, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3273653984069824, "rewards/margins": 0.05711333081126213, "rewards/rejected": -1.3844788074493408, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 10.697652848841608, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.04950173944234848, "logits/rejected": 0.1153801828622818, "logps/chosen": -1.3779503107070923, "logps/rejected": -1.4143047332763672, "loss": 1.748, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3779503107070923, "rewards/margins": 0.03635428473353386, "rewards/rejected": -1.4143047332763672, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 7.083780993102439, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.0645158439874649, "logits/rejected": 0.1322767436504364, "logps/chosen": -1.3595610857009888, "logps/rejected": -1.5478967428207397, "loss": 1.7024, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3595610857009888, "rewards/margins": 0.18833574652671814, "rewards/rejected": -1.5478967428207397, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 7.016282897067713, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.014546563848853111, "logits/rejected": 0.14593543112277985, "logps/chosen": -1.295961618423462, "logps/rejected": -1.4461864233016968, "loss": 1.6495, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.295961618423462, "rewards/margins": 0.15022492408752441, "rewards/rejected": -1.4461864233016968, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 5.697543818514006, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.015908140689134598, "logits/rejected": 0.1339842528104782, "logps/chosen": -1.3269898891448975, "logps/rejected": -1.4011058807373047, "loss": 1.6926, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3269898891448975, "rewards/margins": 0.07411597669124603, "rewards/rejected": -1.4011058807373047, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 5.948048365747885, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.06778191030025482, "logits/rejected": -0.0030374140478670597, "logps/chosen": -1.3420777320861816, "logps/rejected": -1.4758747816085815, "loss": 1.6879, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3420777320861816, "rewards/margins": 0.13379700481891632, "rewards/rejected": -1.4758747816085815, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 14.268347903411364, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.037238337099552155, "logits/rejected": 0.049766041338443756, "logps/chosen": -1.2987943887710571, "logps/rejected": -1.3975147008895874, "loss": 1.6594, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2987943887710571, "rewards/margins": 0.09872031211853027, "rewards/rejected": -1.3975147008895874, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 7.701723969333382, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.058088578283786774, "logits/rejected": 0.17226721346378326, "logps/chosen": -1.2571780681610107, "logps/rejected": -1.4605745077133179, "loss": 1.6128, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2571780681610107, "rewards/margins": 0.20339655876159668, "rewards/rejected": -1.4605745077133179, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 9.191080049332337, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.08166106045246124, "logits/rejected": 0.08174169063568115, "logps/chosen": -1.3375778198242188, "logps/rejected": -1.5057597160339355, "loss": 1.6961, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3375778198242188, "rewards/margins": 0.16818185150623322, "rewards/rejected": -1.5057597160339355, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 7.203825122320527, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.13217060267925262, "logits/rejected": 0.009336207062005997, "logps/chosen": -1.261022925376892, "logps/rejected": -1.4133572578430176, "loss": 1.6253, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.261022925376892, "rewards/margins": 0.1523343175649643, "rewards/rejected": -1.4133572578430176, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 8.499021010654502, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.024778928607702255, "logits/rejected": 0.14496475458145142, "logps/chosen": -1.248429298400879, "logps/rejected": -1.434415578842163, "loss": 1.6252, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.248429298400879, "rewards/margins": 0.18598642945289612, "rewards/rejected": -1.434415578842163, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 6.789172055949601, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.03894413262605667, "logits/rejected": 0.08288483321666718, "logps/chosen": -1.3003264665603638, "logps/rejected": -1.3643810749053955, "loss": 1.6826, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3003264665603638, "rewards/margins": 0.0640547126531601, "rewards/rejected": -1.3643810749053955, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 5.945139150631693, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.05952250957489014, "logits/rejected": 0.12411437183618546, "logps/chosen": -1.2805763483047485, "logps/rejected": -1.4107916355133057, "loss": 1.6567, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2805763483047485, "rewards/margins": 0.13021525740623474, "rewards/rejected": -1.4107916355133057, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 7.442932191018972, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.013432202860713005, "logits/rejected": 0.06116775795817375, "logps/chosen": -1.2843835353851318, "logps/rejected": -1.3335177898406982, "loss": 1.6692, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2843835353851318, "rewards/margins": 0.049134280532598495, "rewards/rejected": -1.3335177898406982, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.288900762796402, "eval_logits/rejected": 0.3707810044288635, "eval_logps/chosen": -1.3150991201400757, "eval_logps/rejected": -1.4532443284988403, "eval_loss": 1.6718252897262573, "eval_rewards/accuracies": 0.5578634738922119, "eval_rewards/chosen": -1.3150991201400757, "eval_rewards/margins": 0.13814544677734375, "eval_rewards/rejected": -1.4532443284988403, "eval_runtime": 40.0611, "eval_samples_per_second": 33.574, "eval_steps_per_second": 8.412, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 7.448454933590522, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.08388860523700714, "logits/rejected": 0.05477358028292656, "logps/chosen": -1.3262841701507568, "logps/rejected": -1.4756443500518799, "loss": 1.6797, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3262841701507568, "rewards/margins": 0.14936020970344543, "rewards/rejected": -1.4756443500518799, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 7.629956912163664, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.03200405091047287, "logits/rejected": 0.10098972171545029, "logps/chosen": -1.2373231649398804, "logps/rejected": -1.3669945001602173, "loss": 1.6151, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2373231649398804, "rewards/margins": 0.1296711266040802, "rewards/rejected": -1.3669945001602173, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 8.098240198326735, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.03276928886771202, "logits/rejected": 0.15491512417793274, "logps/chosen": -1.2305742502212524, "logps/rejected": -1.4300734996795654, "loss": 1.5791, "rewards/accuracies": 0.625, "rewards/chosen": -1.2305742502212524, "rewards/margins": 0.19949916005134583, "rewards/rejected": -1.4300734996795654, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 7.768530788641451, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.040918465703725815, "logits/rejected": 0.08255477994680405, "logps/chosen": -1.27720046043396, "logps/rejected": -1.4338114261627197, "loss": 1.6462, "rewards/accuracies": 0.53125, "rewards/chosen": -1.27720046043396, "rewards/margins": 0.15661077201366425, "rewards/rejected": -1.4338114261627197, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 7.776726581191855, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.03660018369555473, "logits/rejected": 0.0787377804517746, "logps/chosen": -1.2200651168823242, "logps/rejected": -1.3064601421356201, "loss": 1.6296, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2200651168823242, "rewards/margins": 0.08639508485794067, "rewards/rejected": -1.3064601421356201, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 7.024572269311969, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.05554322525858879, "logits/rejected": 0.20837612450122833, "logps/chosen": -1.3340895175933838, "logps/rejected": -1.3898991346359253, "loss": 1.7005, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3340895175933838, "rewards/margins": 0.05580978840589523, "rewards/rejected": -1.3898991346359253, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 10.69377124180876, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.009267118759453297, "logits/rejected": 0.13208474218845367, "logps/chosen": -1.2435743808746338, "logps/rejected": -1.3662970066070557, "loss": 1.6503, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2435743808746338, "rewards/margins": 0.12272258847951889, "rewards/rejected": -1.3662970066070557, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 8.144607331637909, "learning_rate": 9.924668491496474e-07, "logits/chosen": -0.006433224771171808, "logits/rejected": 0.13204315304756165, "logps/chosen": -1.2838013172149658, "logps/rejected": -1.4483354091644287, "loss": 1.6652, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2838013172149658, "rewards/margins": 0.16453418135643005, "rewards/rejected": -1.4483354091644287, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 5.55810909000904, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.047361455857753754, "logits/rejected": 0.10631246864795685, "logps/chosen": -1.2872891426086426, "logps/rejected": -1.3429710865020752, "loss": 1.6884, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2872891426086426, "rewards/margins": 0.05568184331059456, "rewards/rejected": -1.3429710865020752, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 9.11402763497159, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.01465144194662571, "logits/rejected": 0.06983944028615952, "logps/chosen": -1.2490416765213013, "logps/rejected": -1.3807716369628906, "loss": 1.612, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2490416765213013, "rewards/margins": 0.13172993063926697, "rewards/rejected": -1.3807716369628906, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 6.438125866962163, "learning_rate": 9.9163738435372e-07, "logits/chosen": -0.03615623712539673, "logits/rejected": 0.09094985574483871, "logps/chosen": -1.3045870065689087, "logps/rejected": -1.4764373302459717, "loss": 1.6946, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3045870065689087, "rewards/margins": 0.17185047268867493, "rewards/rejected": -1.4764373302459717, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 5.863387820990229, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.06574797630310059, "logits/rejected": 0.07671912014484406, "logps/chosen": -1.3422025442123413, "logps/rejected": -1.524712324142456, "loss": 1.6772, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3422025442123413, "rewards/margins": 0.18250969052314758, "rewards/rejected": -1.524712324142456, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 10.92852653610282, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.004780948162078857, "logits/rejected": 0.09096747636795044, "logps/chosen": -1.2448174953460693, "logps/rejected": -1.4256184101104736, "loss": 1.6361, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2448174953460693, "rewards/margins": 0.18080079555511475, "rewards/rejected": -1.4256184101104736, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 7.087121964770353, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.08116715401411057, "logits/rejected": 0.15623739361763, "logps/chosen": -1.3013743162155151, "logps/rejected": -1.3912999629974365, "loss": 1.6759, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3013743162155151, "rewards/margins": 0.08992559462785721, "rewards/rejected": -1.3912999629974365, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 7.544115225961909, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.01658749394118786, "logits/rejected": 0.08199294656515121, "logps/chosen": -1.3255009651184082, "logps/rejected": -1.4433691501617432, "loss": 1.7238, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3255009651184082, "rewards/margins": 0.1178680881857872, "rewards/rejected": -1.4433691501617432, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 6.029370913132447, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.019355863332748413, "logits/rejected": 0.16283582150936127, "logps/chosen": -1.3766533136367798, "logps/rejected": -1.4846994876861572, "loss": 1.7245, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3766533136367798, "rewards/margins": 0.10804629325866699, "rewards/rejected": -1.4846994876861572, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 8.629139935459667, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.060194194316864014, "logits/rejected": 0.013257995247840881, "logps/chosen": -1.269682765007019, "logps/rejected": -1.453184723854065, "loss": 1.6313, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.269682765007019, "rewards/margins": 0.18350182473659515, "rewards/rejected": -1.453184723854065, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 7.777444725456523, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.14868099987506866, "logits/rejected": -0.03934749215841293, "logps/chosen": -1.3191107511520386, "logps/rejected": -1.4460171461105347, "loss": 1.6891, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3191107511520386, "rewards/margins": 0.1269063651561737, "rewards/rejected": -1.4460171461105347, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 7.654458430179175, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.049044668674468994, "logits/rejected": 0.15491130948066711, "logps/chosen": -1.2157189846038818, "logps/rejected": -1.3172571659088135, "loss": 1.6145, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2157189846038818, "rewards/margins": 0.10153820365667343, "rewards/rejected": -1.3172571659088135, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 7.237952882977966, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.07838686555624008, "logits/rejected": -0.023043682798743248, "logps/chosen": -1.208280086517334, "logps/rejected": -1.4431705474853516, "loss": 1.5745, "rewards/accuracies": 0.59375, "rewards/chosen": -1.208280086517334, "rewards/margins": 0.23489037156105042, "rewards/rejected": -1.4431705474853516, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 7.124123089263012, "learning_rate": 9.885628971850641e-07, "logits/chosen": 0.04647911712527275, "logits/rejected": 0.21987970173358917, "logps/chosen": -1.2849841117858887, "logps/rejected": -1.466862678527832, "loss": 1.6517, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2849841117858887, "rewards/margins": 0.18187864124774933, "rewards/rejected": -1.466862678527832, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 5.099811653065707, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.02049119397997856, "logits/rejected": 0.08140434324741364, "logps/chosen": -1.3364083766937256, "logps/rejected": -1.409537434577942, "loss": 1.7328, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3364083766937256, "rewards/margins": 0.07312921434640884, "rewards/rejected": -1.409537434577942, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 6.450546306377393, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.01597698964178562, "logits/rejected": 0.1421457976102829, "logps/chosen": -1.2553143501281738, "logps/rejected": -1.3936388492584229, "loss": 1.6273, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2553143501281738, "rewards/margins": 0.13832440972328186, "rewards/rejected": -1.3936388492584229, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 9.921746807110368, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.09873722493648529, "logits/rejected": 0.2159980982542038, "logps/chosen": -1.2219921350479126, "logps/rejected": -1.366690993309021, "loss": 1.6169, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2219921350479126, "rewards/margins": 0.144698828458786, "rewards/rejected": -1.366690993309021, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 7.07195948209997, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.04622996598482132, "logits/rejected": 0.09983499348163605, "logps/chosen": -1.3058781623840332, "logps/rejected": -1.3538609743118286, "loss": 1.6725, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3058781623840332, "rewards/margins": 0.04798286780714989, "rewards/rejected": -1.3538609743118286, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 11.693390251271618, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.03761199861764908, "logits/rejected": 0.06936880201101303, "logps/chosen": -1.2870805263519287, "logps/rejected": -1.4754420518875122, "loss": 1.6781, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2870805263519287, "rewards/margins": 0.1883614957332611, "rewards/rejected": -1.4754420518875122, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 8.89968298486334, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.03245195001363754, "logits/rejected": 0.06252043694257736, "logps/chosen": -1.2151482105255127, "logps/rejected": -1.4019325971603394, "loss": 1.6228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2151482105255127, "rewards/margins": 0.18678443133831024, "rewards/rejected": -1.4019325971603394, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 8.443843103780504, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.024701697751879692, "logits/rejected": 0.07754331827163696, "logps/chosen": -1.2799854278564453, "logps/rejected": -1.3430149555206299, "loss": 1.6659, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2799854278564453, "rewards/margins": 0.06302952766418457, "rewards/rejected": -1.3430149555206299, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 7.616171348420136, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.036475539207458496, "logits/rejected": 0.07666192948818207, "logps/chosen": -1.2560824155807495, "logps/rejected": -1.4120190143585205, "loss": 1.6173, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2560824155807495, "rewards/margins": 0.15593667328357697, "rewards/rejected": -1.4120190143585205, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 6.162783467034935, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.02106516622006893, "logits/rejected": 0.12846365571022034, "logps/chosen": -1.3557555675506592, "logps/rejected": -1.384013295173645, "loss": 1.749, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3557555675506592, "rewards/margins": 0.028257649391889572, "rewards/rejected": -1.384013295173645, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 5.876016724260786, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.01717526838183403, "logits/rejected": 0.21609918773174286, "logps/chosen": -1.3414275646209717, "logps/rejected": -1.393541932106018, "loss": 1.7376, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3414275646209717, "rewards/margins": 0.05211412161588669, "rewards/rejected": -1.393541932106018, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 7.413105057921761, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.042240239679813385, "logits/rejected": 0.08787455409765244, "logps/chosen": -1.2983901500701904, "logps/rejected": -1.37651526927948, "loss": 1.67, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2983901500701904, "rewards/margins": 0.0781250074505806, "rewards/rejected": -1.37651526927948, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 7.210572778613, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.024538477882742882, "logits/rejected": 0.007992692291736603, "logps/chosen": -1.3409980535507202, "logps/rejected": -1.4976381063461304, "loss": 1.6831, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3409980535507202, "rewards/margins": 0.15664002299308777, "rewards/rejected": -1.4976381063461304, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 14.130211133969373, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.05639838054776192, "logits/rejected": 0.05497770383954048, "logps/chosen": -1.2233648300170898, "logps/rejected": -1.3103077411651611, "loss": 1.661, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2233648300170898, "rewards/margins": 0.08694292604923248, "rewards/rejected": -1.3103077411651611, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 6.923884257166216, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.105961874127388, "logits/rejected": 0.04743831977248192, "logps/chosen": -1.293992042541504, "logps/rejected": -1.3859124183654785, "loss": 1.6687, "rewards/accuracies": 0.53125, "rewards/chosen": -1.293992042541504, "rewards/margins": 0.0919203907251358, "rewards/rejected": -1.3859124183654785, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 12.316637061932141, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.1073160171508789, "logits/rejected": 0.06393261253833771, "logps/chosen": -1.3275487422943115, "logps/rejected": -1.4250518083572388, "loss": 1.7113, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3275487422943115, "rewards/margins": 0.09750307351350784, "rewards/rejected": -1.4250518083572388, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 9.038208075808697, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.007983547635376453, "logits/rejected": 0.17008168995380402, "logps/chosen": -1.2993818521499634, "logps/rejected": -1.4190666675567627, "loss": 1.6644, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2993818521499634, "rewards/margins": 0.11968479305505753, "rewards/rejected": -1.4190666675567627, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 7.199171931454207, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.02937675081193447, "logits/rejected": 0.06694046407938004, "logps/chosen": -1.1760294437408447, "logps/rejected": -1.3847202062606812, "loss": 1.5567, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1760294437408447, "rewards/margins": 0.20869088172912598, "rewards/rejected": -1.3847202062606812, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 7.8135289022921, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.055761635303497314, "logits/rejected": 0.11374112218618393, "logps/chosen": -1.2928580045700073, "logps/rejected": -1.311802864074707, "loss": 1.7083, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2928580045700073, "rewards/margins": 0.01894478313624859, "rewards/rejected": -1.311802864074707, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 7.405239214870988, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.05504922941327095, "logits/rejected": 0.14741148054599762, "logps/chosen": -1.3170385360717773, "logps/rejected": -1.4345099925994873, "loss": 1.6722, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3170385360717773, "rewards/margins": 0.11747147142887115, "rewards/rejected": -1.4345099925994873, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 9.441656068563447, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.028905656188726425, "logits/rejected": 0.17964079976081848, "logps/chosen": -1.317063808441162, "logps/rejected": -1.397226333618164, "loss": 1.7064, "rewards/accuracies": 0.53125, "rewards/chosen": -1.317063808441162, "rewards/margins": 0.08016278594732285, "rewards/rejected": -1.397226333618164, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 6.208741723671985, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.009346907958388329, "logits/rejected": 0.17316535115242004, "logps/chosen": -1.179496169090271, "logps/rejected": -1.378589153289795, "loss": 1.5714, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.179496169090271, "rewards/margins": 0.1990930140018463, "rewards/rejected": -1.378589153289795, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 8.64120907951514, "learning_rate": 9.801355442251625e-07, "logits/chosen": -0.02329869009554386, "logits/rejected": 0.12833422422409058, "logps/chosen": -1.2491451501846313, "logps/rejected": -1.4052848815917969, "loss": 1.6333, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2491451501846313, "rewards/margins": 0.15613953769207, "rewards/rejected": -1.4052848815917969, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 8.633383591041156, "learning_rate": 9.796985931808949e-07, "logits/chosen": -0.0002554789243731648, "logits/rejected": 0.12322355806827545, "logps/chosen": -1.3016084432601929, "logps/rejected": -1.4578951597213745, "loss": 1.6696, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3016084432601929, "rewards/margins": 0.1562865972518921, "rewards/rejected": -1.4578951597213745, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 7.691560710071344, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.05090395361185074, "logits/rejected": 0.048211853951215744, "logps/chosen": -1.2026124000549316, "logps/rejected": -1.439203143119812, "loss": 1.5598, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2026124000549316, "rewards/margins": 0.2365906983613968, "rewards/rejected": -1.439203143119812, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 8.010467384551692, "learning_rate": 9.788107332632493e-07, "logits/chosen": -0.017558079212903976, "logits/rejected": 0.06259426474571228, "logps/chosen": -1.280426025390625, "logps/rejected": -1.343672513961792, "loss": 1.6636, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.280426025390625, "rewards/margins": 0.06324651092290878, "rewards/rejected": -1.343672513961792, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 7.046839044633882, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.03272116556763649, "logits/rejected": 0.06766636669635773, "logps/chosen": -1.3677171468734741, "logps/rejected": -1.4003586769104004, "loss": 1.7535, "rewards/accuracies": 0.5, "rewards/chosen": -1.3677171468734741, "rewards/margins": 0.03264139965176582, "rewards/rejected": -1.4003586769104004, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 7.201284326111254, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.0016980856889858842, "logits/rejected": 0.1325521320104599, "logps/chosen": -1.2892191410064697, "logps/rejected": -1.4091163873672485, "loss": 1.6637, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2892191410064697, "rewards/margins": 0.11989720165729523, "rewards/rejected": -1.4091163873672485, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 6.310168511316686, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.04989878088235855, "logits/rejected": 0.07154484838247299, "logps/chosen": -1.2653822898864746, "logps/rejected": -1.4335815906524658, "loss": 1.6333, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2653822898864746, "rewards/margins": 0.16819944977760315, "rewards/rejected": -1.4335815906524658, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 12.552512510195202, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.09465078264474869, "logits/rejected": -0.012076696380972862, "logps/chosen": -1.3088659048080444, "logps/rejected": -1.4423149824142456, "loss": 1.678, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3088659048080444, "rewards/margins": 0.13344910740852356, "rewards/rejected": -1.4423149824142456, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 11.281832576670151, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.008454844355583191, "logits/rejected": 0.06004105880856514, "logps/chosen": -1.2986464500427246, "logps/rejected": -1.4412143230438232, "loss": 1.6622, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2986464500427246, "rewards/margins": 0.14256784319877625, "rewards/rejected": -1.4412143230438232, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 8.998794142616624, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.02145402505993843, "logits/rejected": 0.17861071228981018, "logps/chosen": -1.3666810989379883, "logps/rejected": -1.471959114074707, "loss": 1.7192, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3666810989379883, "rewards/margins": 0.10527794063091278, "rewards/rejected": -1.471959114074707, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 5.910688973070728, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.061039119958877563, "logits/rejected": 0.17979125678539276, "logps/chosen": -1.2307828664779663, "logps/rejected": -1.4314491748809814, "loss": 1.6004, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2307828664779663, "rewards/margins": 0.20066621899604797, "rewards/rejected": -1.4314491748809814, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 7.079875986443437, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.1030394583940506, "logits/rejected": 0.11257269233465195, "logps/chosen": -1.2871735095977783, "logps/rejected": -1.462566614151001, "loss": 1.6165, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2871735095977783, "rewards/margins": 0.17539313435554504, "rewards/rejected": -1.462566614151001, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 6.974332495983628, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.025203552097082138, "logits/rejected": 0.0852215439081192, "logps/chosen": -1.2349945306777954, "logps/rejected": -1.438617467880249, "loss": 1.6265, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2349945306777954, "rewards/margins": 0.20362301170825958, "rewards/rejected": -1.438617467880249, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 7.500831024647689, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.11578289419412613, "logits/rejected": -0.024845128878951073, "logps/chosen": -1.3552777767181396, "logps/rejected": -1.3890711069107056, "loss": 1.7297, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3552777767181396, "rewards/margins": 0.03379334136843681, "rewards/rejected": -1.3890711069107056, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 6.994719249130065, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.01449662446975708, "logits/rejected": 0.1305081993341446, "logps/chosen": -1.3126853704452515, "logps/rejected": -1.344916582107544, "loss": 1.7248, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3126853704452515, "rewards/margins": 0.03223109990358353, "rewards/rejected": -1.344916582107544, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 5.89657380761943, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.006966861430555582, "logits/rejected": 0.08126161247491837, "logps/chosen": -1.2898008823394775, "logps/rejected": -1.3175550699234009, "loss": 1.6988, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2898008823394775, "rewards/margins": 0.02775425836443901, "rewards/rejected": -1.3175550699234009, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 9.83209660948282, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.14097405970096588, "logits/rejected": -0.023608043789863586, "logps/chosen": -1.242818832397461, "logps/rejected": -1.3774590492248535, "loss": 1.6302, "rewards/accuracies": 0.53125, "rewards/chosen": -1.242818832397461, "rewards/margins": 0.1346401870250702, "rewards/rejected": -1.3774590492248535, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 8.709679661390545, "learning_rate": 9.720774478544218e-07, "logits/chosen": -0.002015142235904932, "logits/rejected": 0.08891472965478897, "logps/chosen": -1.1542670726776123, "logps/rejected": -1.448899745941162, "loss": 1.5282, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1542670726776123, "rewards/margins": 0.29463261365890503, "rewards/rejected": -1.448899745941162, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 7.005717163731625, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.04424266517162323, "logits/rejected": 0.01915797032415867, "logps/chosen": -1.2812672853469849, "logps/rejected": -1.3807398080825806, "loss": 1.6659, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2812672853469849, "rewards/margins": 0.09947264194488525, "rewards/rejected": -1.3807398080825806, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 8.61150436242116, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.007998655550181866, "logits/rejected": 0.10622884333133698, "logps/chosen": -1.2612874507904053, "logps/rejected": -1.2952603101730347, "loss": 1.6641, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2612874507904053, "rewards/margins": 0.03397286683320999, "rewards/rejected": -1.2952603101730347, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 12.059532727951146, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.0511791817843914, "logits/rejected": 0.15223221480846405, "logps/chosen": -1.1839970350265503, "logps/rejected": -1.3715298175811768, "loss": 1.5661, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1839970350265503, "rewards/margins": 0.18753281235694885, "rewards/rejected": -1.3715298175811768, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 8.460088417331862, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.13747264444828033, "logits/rejected": -0.044413208961486816, "logps/chosen": -1.2539507150650024, "logps/rejected": -1.3953502178192139, "loss": 1.6098, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2539507150650024, "rewards/margins": 0.14139962196350098, "rewards/rejected": -1.3953502178192139, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 9.32199269477602, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.1262091100215912, "logits/rejected": 0.03779655322432518, "logps/chosen": -1.2518672943115234, "logps/rejected": -1.3900775909423828, "loss": 1.6304, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2518672943115234, "rewards/margins": 0.13821040093898773, "rewards/rejected": -1.3900775909423828, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 7.3873109008422615, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.04846873879432678, "logits/rejected": 0.09871874004602432, "logps/chosen": -1.2924776077270508, "logps/rejected": -1.4353249073028564, "loss": 1.6502, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2924776077270508, "rewards/margins": 0.1428474634885788, "rewards/rejected": -1.4353249073028564, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 10.565256810319221, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.029242968186736107, "logits/rejected": 0.11600425094366074, "logps/chosen": -1.3042986392974854, "logps/rejected": -1.4900563955307007, "loss": 1.6411, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3042986392974854, "rewards/margins": 0.18575787544250488, "rewards/rejected": -1.4900563955307007, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 7.840932704918959, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.021750206127762794, "logits/rejected": 0.005784572567790747, "logps/chosen": -1.2818795442581177, "logps/rejected": -1.3069055080413818, "loss": 1.6906, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2818795442581177, "rewards/margins": 0.025025952607393265, "rewards/rejected": -1.3069055080413818, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 7.417020729970113, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.1478433907032013, "logits/rejected": -0.04975400120019913, "logps/chosen": -1.3073512315750122, "logps/rejected": -1.417533040046692, "loss": 1.6762, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3073512315750122, "rewards/margins": 0.1101817637681961, "rewards/rejected": -1.417533040046692, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 8.074345294334199, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.13061223924160004, "logits/rejected": -0.005057701375335455, "logps/chosen": -1.1960101127624512, "logps/rejected": -1.3409080505371094, "loss": 1.5788, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1960101127624512, "rewards/margins": 0.144897922873497, "rewards/rejected": -1.3409080505371094, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 6.857872596234929, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.008821931667625904, "logits/rejected": 0.064828060567379, "logps/chosen": -1.2959681749343872, "logps/rejected": -1.3864507675170898, "loss": 1.6786, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2959681749343872, "rewards/margins": 0.09048257768154144, "rewards/rejected": -1.3864507675170898, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 7.673039625152903, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.021009016782045364, "logits/rejected": 0.08989116549491882, "logps/chosen": -1.2484314441680908, "logps/rejected": -1.3629471063613892, "loss": 1.6193, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2484314441680908, "rewards/margins": 0.11451568454504013, "rewards/rejected": -1.3629471063613892, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 5.934848176767732, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.02263835072517395, "logits/rejected": 0.0471782460808754, "logps/chosen": -1.187469244003296, "logps/rejected": -1.3563673496246338, "loss": 1.5709, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.187469244003296, "rewards/margins": 0.16889812052249908, "rewards/rejected": -1.3563673496246338, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 5.886224416634032, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.059956144541502, "logits/rejected": 0.08554370701313019, "logps/chosen": -1.4021120071411133, "logps/rejected": -1.421523094177246, "loss": 1.7618, "rewards/accuracies": 0.5, "rewards/chosen": -1.4021120071411133, "rewards/margins": 0.019411057233810425, "rewards/rejected": -1.421523094177246, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 6.185455085126803, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.04533197358250618, "logits/rejected": 0.10845015197992325, "logps/chosen": -1.2356188297271729, "logps/rejected": -1.3273508548736572, "loss": 1.6465, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2356188297271729, "rewards/margins": 0.09173201024532318, "rewards/rejected": -1.3273508548736572, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 5.862756658253257, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.046646635979413986, "logits/rejected": 0.06485871970653534, "logps/chosen": -1.3361549377441406, "logps/rejected": -1.5056731700897217, "loss": 1.6712, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3361549377441406, "rewards/margins": 0.16951832175254822, "rewards/rejected": -1.5056731700897217, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 9.160651550896507, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.009948519058525562, "logits/rejected": 0.1008104532957077, "logps/chosen": -1.3717749118804932, "logps/rejected": -1.4444175958633423, "loss": 1.7385, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3717749118804932, "rewards/margins": 0.07264275848865509, "rewards/rejected": -1.4444175958633423, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 8.485100054672055, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.06173746660351753, "logits/rejected": 0.060471467673778534, "logps/chosen": -1.328833818435669, "logps/rejected": -1.476729393005371, "loss": 1.6721, "rewards/accuracies": 0.53125, "rewards/chosen": -1.328833818435669, "rewards/margins": 0.1478956639766693, "rewards/rejected": -1.476729393005371, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 9.67704832986553, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.012681236490607262, "logits/rejected": 0.0875493735074997, "logps/chosen": -1.2063080072402954, "logps/rejected": -1.4012012481689453, "loss": 1.5614, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2063080072402954, "rewards/margins": 0.19489315152168274, "rewards/rejected": -1.4012012481689453, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 7.578613594936711, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.06550656259059906, "logits/rejected": -0.013155996799468994, "logps/chosen": -1.2424876689910889, "logps/rejected": -1.388684868812561, "loss": 1.6206, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2424876689910889, "rewards/margins": 0.1461973935365677, "rewards/rejected": -1.388684868812561, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.2713717818260193, "eval_logits/rejected": 0.3522588908672333, "eval_logps/chosen": -1.308337688446045, "eval_logps/rejected": -1.4521799087524414, "eval_loss": 1.6640232801437378, "eval_rewards/accuracies": 0.5563797950744629, "eval_rewards/chosen": -1.308337688446045, "eval_rewards/margins": 0.14384222030639648, "eval_rewards/rejected": -1.4521799087524414, "eval_runtime": 40.2855, "eval_samples_per_second": 33.387, "eval_steps_per_second": 8.365, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 9.902098500941502, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.13126561045646667, "logits/rejected": 0.023626195266842842, "logps/chosen": -1.2611668109893799, "logps/rejected": -1.3815600872039795, "loss": 1.6263, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2611668109893799, "rewards/margins": 0.12039327621459961, "rewards/rejected": -1.3815600872039795, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 9.735377329344027, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.05382559448480606, "logits/rejected": 0.12032803148031235, "logps/chosen": -1.2983372211456299, "logps/rejected": -1.4043587446212769, "loss": 1.6978, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2983372211456299, "rewards/margins": 0.1060214415192604, "rewards/rejected": -1.4043587446212769, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 6.387580672860148, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.0932522639632225, "logits/rejected": 0.019932487979531288, "logps/chosen": -1.2985928058624268, "logps/rejected": -1.3247510194778442, "loss": 1.6794, "rewards/accuracies": 0.5, "rewards/chosen": -1.2985928058624268, "rewards/margins": 0.02615833841264248, "rewards/rejected": -1.3247510194778442, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 8.286558491291323, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.06500460207462311, "logits/rejected": 0.160376638174057, "logps/chosen": -1.2172410488128662, "logps/rejected": -1.3483210802078247, "loss": 1.6241, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2172410488128662, "rewards/margins": 0.13107988238334656, "rewards/rejected": -1.3483210802078247, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 5.9781859984561345, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.13773970305919647, "logits/rejected": 0.055517297238111496, "logps/chosen": -1.3097951412200928, "logps/rejected": -1.4055107831954956, "loss": 1.6675, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3097951412200928, "rewards/margins": 0.09571562707424164, "rewards/rejected": -1.4055107831954956, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 8.031756035684708, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.06108025461435318, "logits/rejected": 0.04098379611968994, "logps/chosen": -1.2422094345092773, "logps/rejected": -1.4047267436981201, "loss": 1.6224, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2422094345092773, "rewards/margins": 0.16251742839813232, "rewards/rejected": -1.4047267436981201, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 6.818886107239191, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.030785422772169113, "logits/rejected": 0.08796034008264542, "logps/chosen": -1.2995566129684448, "logps/rejected": -1.3784334659576416, "loss": 1.6868, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2995566129684448, "rewards/margins": 0.07887676358222961, "rewards/rejected": -1.3784334659576416, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 8.55024697740653, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.156296044588089, "logits/rejected": -0.04075910523533821, "logps/chosen": -1.2702831029891968, "logps/rejected": -1.3911727666854858, "loss": 1.6497, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2702831029891968, "rewards/margins": 0.12088973820209503, "rewards/rejected": -1.3911727666854858, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 7.909529033039376, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.05753855034708977, "logits/rejected": 0.09787220507860184, "logps/chosen": -1.3558070659637451, "logps/rejected": -1.3945585489273071, "loss": 1.7134, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3558070659637451, "rewards/margins": 0.03875157609581947, "rewards/rejected": -1.3945585489273071, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 11.207775248078786, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.059803556650877, "logits/rejected": 0.10910670459270477, "logps/chosen": -1.3280284404754639, "logps/rejected": -1.3814871311187744, "loss": 1.7036, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.3280284404754639, "rewards/margins": 0.053458768874406815, "rewards/rejected": -1.3814871311187744, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 9.396884172972708, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.06721973419189453, "logits/rejected": 0.04472237452864647, "logps/chosen": -1.2888884544372559, "logps/rejected": -1.3872915506362915, "loss": 1.6554, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2888884544372559, "rewards/margins": 0.0984029546380043, "rewards/rejected": -1.3872915506362915, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 7.7532438287878245, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.10559892654418945, "logits/rejected": 0.011072332970798016, "logps/chosen": -1.2306454181671143, "logps/rejected": -1.4112110137939453, "loss": 1.6035, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2306454181671143, "rewards/margins": 0.18056556582450867, "rewards/rejected": -1.4112110137939453, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 6.734623913255203, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.03286564350128174, "logits/rejected": 0.08090507984161377, "logps/chosen": -1.2756887674331665, "logps/rejected": -1.387162446975708, "loss": 1.6341, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2756887674331665, "rewards/margins": 0.11147379875183105, "rewards/rejected": -1.387162446975708, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 8.00249171934378, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.07302910834550858, "logits/rejected": 0.02558300271630287, "logps/chosen": -1.2368438243865967, "logps/rejected": -1.3913614749908447, "loss": 1.6175, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2368438243865967, "rewards/margins": 0.15451772511005402, "rewards/rejected": -1.3913614749908447, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 5.610957995849928, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.06308968365192413, "logits/rejected": 0.03248829022049904, "logps/chosen": -1.295707106590271, "logps/rejected": -1.3742659091949463, "loss": 1.6766, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.295707106590271, "rewards/margins": 0.07855904847383499, "rewards/rejected": -1.3742659091949463, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 8.34191349008034, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.07875625044107437, "logits/rejected": 0.04786789044737816, "logps/chosen": -1.3658252954483032, "logps/rejected": -1.4279983043670654, "loss": 1.7333, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3658252954483032, "rewards/margins": 0.062173031270504, "rewards/rejected": -1.4279983043670654, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 7.628569943813764, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.04173643887042999, "logits/rejected": 0.0719747543334961, "logps/chosen": -1.2409158945083618, "logps/rejected": -1.3405625820159912, "loss": 1.6127, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2409158945083618, "rewards/margins": 0.09964674711227417, "rewards/rejected": -1.3405625820159912, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 7.976144585160272, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.17471951246261597, "logits/rejected": -0.04663420841097832, "logps/chosen": -1.3102699518203735, "logps/rejected": -1.3529913425445557, "loss": 1.7179, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3102699518203735, "rewards/margins": 0.042721252888441086, "rewards/rejected": -1.3529913425445557, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 6.085502180077309, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.07361548393964767, "logits/rejected": -0.06373373419046402, "logps/chosen": -1.2772166728973389, "logps/rejected": -1.4289699792861938, "loss": 1.6439, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2772166728973389, "rewards/margins": 0.15175335109233856, "rewards/rejected": -1.4289699792861938, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 7.4475181215635375, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.0670512318611145, "logits/rejected": 0.14799444377422333, "logps/chosen": -1.27315354347229, "logps/rejected": -1.4144771099090576, "loss": 1.6302, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.27315354347229, "rewards/margins": 0.14132359623908997, "rewards/rejected": -1.4144771099090576, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 7.023481889009539, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.011247454211115837, "logits/rejected": 0.05740055441856384, "logps/chosen": -1.2823988199234009, "logps/rejected": -1.4183152914047241, "loss": 1.6301, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2823988199234009, "rewards/margins": 0.13591650128364563, "rewards/rejected": -1.4183152914047241, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 7.044884087209908, "learning_rate": 9.465519589364099e-07, "logits/chosen": 0.00906024593859911, "logits/rejected": 0.07833532989025116, "logps/chosen": -1.2953308820724487, "logps/rejected": -1.419379472732544, "loss": 1.6834, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2953308820724487, "rewards/margins": 0.1240486279129982, "rewards/rejected": -1.419379472732544, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 6.3756912656805484, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.073188416659832, "logits/rejected": 0.08738872408866882, "logps/chosen": -1.237274408340454, "logps/rejected": -1.440292239189148, "loss": 1.6012, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.237274408340454, "rewards/margins": 0.20301775634288788, "rewards/rejected": -1.440292239189148, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 5.800572640904087, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.16412648558616638, "logits/rejected": 0.015692204236984253, "logps/chosen": -1.2207753658294678, "logps/rejected": -1.3832472562789917, "loss": 1.5895, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2207753658294678, "rewards/margins": 0.1624719202518463, "rewards/rejected": -1.3832472562789917, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 11.079064346768625, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.03358171135187149, "logits/rejected": 0.056316912174224854, "logps/chosen": -1.2931864261627197, "logps/rejected": -1.380968689918518, "loss": 1.6817, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2931864261627197, "rewards/margins": 0.08778227865695953, "rewards/rejected": -1.380968689918518, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 5.522926070818182, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.07296119630336761, "logits/rejected": 0.03256779909133911, "logps/chosen": -1.2829877138137817, "logps/rejected": -1.357912302017212, "loss": 1.6631, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2829877138137817, "rewards/margins": 0.07492466270923615, "rewards/rejected": -1.357912302017212, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 6.39437906575835, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.022577999159693718, "logits/rejected": 0.04554181545972824, "logps/chosen": -1.3532822132110596, "logps/rejected": -1.4808286428451538, "loss": 1.6798, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3532822132110596, "rewards/margins": 0.12754635512828827, "rewards/rejected": -1.4808286428451538, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 7.854902159500145, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.03494944050908089, "logits/rejected": -0.010035835206508636, "logps/chosen": -1.3224153518676758, "logps/rejected": -1.4352055788040161, "loss": 1.6933, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3224153518676758, "rewards/margins": 0.11279022693634033, "rewards/rejected": -1.4352055788040161, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 5.1287235604030545, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.011534917168319225, "logits/rejected": 0.16566434502601624, "logps/chosen": -1.3075754642486572, "logps/rejected": -1.392835259437561, "loss": 1.6681, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3075754642486572, "rewards/margins": 0.08525966107845306, "rewards/rejected": -1.392835259437561, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 7.552550459316424, "learning_rate": 9.408091218166002e-07, "logits/chosen": -0.008674606680870056, "logits/rejected": 0.03636609762907028, "logps/chosen": -1.292672872543335, "logps/rejected": -1.3186228275299072, "loss": 1.7023, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.292672872543335, "rewards/margins": 0.025949766859412193, "rewards/rejected": -1.3186228275299072, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 7.132339621039502, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.049724649637937546, "logits/rejected": 0.19820116460323334, "logps/chosen": -1.345320701599121, "logps/rejected": -1.4055614471435547, "loss": 1.7265, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.345320701599121, "rewards/margins": 0.060240667313337326, "rewards/rejected": -1.4055614471435547, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 8.479499284806872, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.13432948291301727, "logits/rejected": -0.0047289221547544, "logps/chosen": -1.3591015338897705, "logps/rejected": -1.484825849533081, "loss": 1.7062, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3591015338897705, "rewards/margins": 0.12572428584098816, "rewards/rejected": -1.484825849533081, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 5.378054207471065, "learning_rate": 9.38584798451817e-07, "logits/chosen": 9.254701581085101e-05, "logits/rejected": 0.12194955348968506, "logps/chosen": -1.2895641326904297, "logps/rejected": -1.4120676517486572, "loss": 1.6517, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2895641326904297, "rewards/margins": 0.12250330299139023, "rewards/rejected": -1.4120676517486572, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 8.339099044480037, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.03549307957291603, "logits/rejected": 0.06424375623464584, "logps/chosen": -1.344766616821289, "logps/rejected": -1.5251480340957642, "loss": 1.6706, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.344766616821289, "rewards/margins": 0.18038137257099152, "rewards/rejected": -1.5251480340957642, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 6.481678501748973, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.090664342045784, "logits/rejected": 0.026456937193870544, "logps/chosen": -1.264943242073059, "logps/rejected": -1.4818670749664307, "loss": 1.6211, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.264943242073059, "rewards/margins": 0.21692386269569397, "rewards/rejected": -1.4818670749664307, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 9.402893063974487, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.10575218498706818, "logits/rejected": -0.005930374842137098, "logps/chosen": -1.2926733493804932, "logps/rejected": -1.3471362590789795, "loss": 1.6957, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2926733493804932, "rewards/margins": 0.05446278303861618, "rewards/rejected": -1.3471362590789795, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 7.4118705202897255, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.1039794534444809, "logits/rejected": -4.317015554988757e-05, "logps/chosen": -1.2439075708389282, "logps/rejected": -1.3433864116668701, "loss": 1.6463, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2439075708389282, "rewards/margins": 0.09947877377271652, "rewards/rejected": -1.3433864116668701, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 9.343791160069676, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.012524081394076347, "logits/rejected": 0.08128078281879425, "logps/chosen": -1.2931407690048218, "logps/rejected": -1.4175056219100952, "loss": 1.6637, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2931407690048218, "rewards/margins": 0.12436497211456299, "rewards/rejected": -1.4175056219100952, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 6.2264638302612125, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.00481851352378726, "logits/rejected": 0.08721502125263214, "logps/chosen": -1.2536060810089111, "logps/rejected": -1.3298468589782715, "loss": 1.6526, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2536060810089111, "rewards/margins": 0.0762406438589096, "rewards/rejected": -1.3298468589782715, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 5.1453513483583695, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.19782808423042297, "logits/rejected": -0.09469230473041534, "logps/chosen": -1.3321107625961304, "logps/rejected": -1.3970056772232056, "loss": 1.6925, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3321107625961304, "rewards/margins": 0.06489496678113937, "rewards/rejected": -1.3970056772232056, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 5.777001331063325, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.05654817074537277, "logits/rejected": 0.10098656266927719, "logps/chosen": -1.1896693706512451, "logps/rejected": -1.40790855884552, "loss": 1.5696, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1896693706512451, "rewards/margins": 0.21823914349079132, "rewards/rejected": -1.40790855884552, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 5.61242400636659, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.10295641422271729, "logits/rejected": 0.05629279464483261, "logps/chosen": -1.2972595691680908, "logps/rejected": -1.4456944465637207, "loss": 1.6474, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2972595691680908, "rewards/margins": 0.14843474328517914, "rewards/rejected": -1.4456944465637207, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 10.179841050216837, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.007915811613202095, "logits/rejected": 0.11887457221746445, "logps/chosen": -1.3415172100067139, "logps/rejected": -1.3081309795379639, "loss": 1.7415, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3415172100067139, "rewards/margins": -0.033386241644620895, "rewards/rejected": -1.3081309795379639, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 5.916410428318361, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.02033943310379982, "logits/rejected": 0.08537691831588745, "logps/chosen": -1.2250733375549316, "logps/rejected": -1.4753800630569458, "loss": 1.5933, "rewards/accuracies": 0.5, "rewards/chosen": -1.2250733375549316, "rewards/margins": 0.2503066658973694, "rewards/rejected": -1.4753800630569458, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 6.290863299930604, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.011968791484832764, "logits/rejected": 0.040803950279951096, "logps/chosen": -1.2538902759552002, "logps/rejected": -1.514215111732483, "loss": 1.6073, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2538902759552002, "rewards/margins": 0.26032498478889465, "rewards/rejected": -1.514215111732483, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 6.750555821696019, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.10954097658395767, "logits/rejected": 0.0022288993932306767, "logps/chosen": -1.317549467086792, "logps/rejected": -1.3856089115142822, "loss": 1.6836, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.317549467086792, "rewards/margins": 0.06805931776762009, "rewards/rejected": -1.3856089115142822, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 7.482354045279545, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.003342367708683014, "logits/rejected": 0.04428841918706894, "logps/chosen": -1.254521131515503, "logps/rejected": -1.4428226947784424, "loss": 1.6476, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.254521131515503, "rewards/margins": 0.18830154836177826, "rewards/rejected": -1.4428226947784424, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 5.8446392499081625, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.08462724834680557, "logits/rejected": -0.008226044476032257, "logps/chosen": -1.3154375553131104, "logps/rejected": -1.3610209226608276, "loss": 1.6737, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3154375553131104, "rewards/margins": 0.04558344930410385, "rewards/rejected": -1.3610209226608276, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 8.253023221142572, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.14412081241607666, "logits/rejected": 0.03400403633713722, "logps/chosen": -1.3450403213500977, "logps/rejected": -1.4334951639175415, "loss": 1.7044, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3450403213500977, "rewards/margins": 0.08845490962266922, "rewards/rejected": -1.4334951639175415, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 16.894600835872414, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.0017465263372287154, "logits/rejected": 0.07787595689296722, "logps/chosen": -1.2358486652374268, "logps/rejected": -1.349094271659851, "loss": 1.6353, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2358486652374268, "rewards/margins": 0.11324580013751984, "rewards/rejected": -1.349094271659851, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 9.338591447617187, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.038293130695819855, "logits/rejected": 0.09338172525167465, "logps/chosen": -1.2623484134674072, "logps/rejected": -1.4510595798492432, "loss": 1.6238, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2623484134674072, "rewards/margins": 0.1887110322713852, "rewards/rejected": -1.4510595798492432, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 5.6439465725538955, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.042051661759614944, "logits/rejected": 0.08378352224826813, "logps/chosen": -1.2224422693252563, "logps/rejected": -1.3968486785888672, "loss": 1.5918, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2224422693252563, "rewards/margins": 0.17440657317638397, "rewards/rejected": -1.3968486785888672, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 6.489533578561058, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.021588444709777832, "logits/rejected": 0.06242724508047104, "logps/chosen": -1.3425042629241943, "logps/rejected": -1.4560575485229492, "loss": 1.7156, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3425042629241943, "rewards/margins": 0.113553486764431, "rewards/rejected": -1.4560575485229492, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 8.645614434625807, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.10292228311300278, "logits/rejected": -0.07323513925075531, "logps/chosen": -1.3078405857086182, "logps/rejected": -1.382354497909546, "loss": 1.6892, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3078405857086182, "rewards/margins": 0.07451382279396057, "rewards/rejected": -1.382354497909546, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 5.772767333891252, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.09033913165330887, "logits/rejected": 0.052205026149749756, "logps/chosen": -1.2833424806594849, "logps/rejected": -1.4271941184997559, "loss": 1.6229, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2833424806594849, "rewards/margins": 0.14385172724723816, "rewards/rejected": -1.4271941184997559, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 7.526256119076799, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.027691710740327835, "logits/rejected": 0.06575385481119156, "logps/chosen": -1.2705562114715576, "logps/rejected": -1.4572803974151611, "loss": 1.642, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2705562114715576, "rewards/margins": 0.18672427535057068, "rewards/rejected": -1.4572803974151611, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 7.680511985762157, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.007962608709931374, "logits/rejected": 0.12913981080055237, "logps/chosen": -1.3152211904525757, "logps/rejected": -1.4535510540008545, "loss": 1.6639, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3152211904525757, "rewards/margins": 0.13832971453666687, "rewards/rejected": -1.4535510540008545, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 10.528838663469754, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.06781430542469025, "logits/rejected": 0.02455865405499935, "logps/chosen": -1.2909950017929077, "logps/rejected": -1.3279635906219482, "loss": 1.6853, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2909950017929077, "rewards/margins": 0.036968667060136795, "rewards/rejected": -1.3279635906219482, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 9.310501631330935, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.05040599778294563, "logits/rejected": 0.04599171131849289, "logps/chosen": -1.3820037841796875, "logps/rejected": -1.4067412614822388, "loss": 1.7502, "rewards/accuracies": 0.5, "rewards/chosen": -1.3820037841796875, "rewards/margins": 0.024737408384680748, "rewards/rejected": -1.4067412614822388, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 9.32817945324196, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.07473565638065338, "logits/rejected": 0.014921599999070168, "logps/chosen": -1.2716801166534424, "logps/rejected": -1.4177004098892212, "loss": 1.6462, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2716801166534424, "rewards/margins": 0.14602023363113403, "rewards/rejected": -1.4177004098892212, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 8.872541653326993, "learning_rate": 9.16004998712373e-07, "logits/chosen": 0.016116593033075333, "logits/rejected": 0.06647348403930664, "logps/chosen": -1.2263656854629517, "logps/rejected": -1.446277379989624, "loss": 1.5796, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2263656854629517, "rewards/margins": 0.21991190314292908, "rewards/rejected": -1.446277379989624, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 6.140760178845314, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.07571863383054733, "logits/rejected": 0.15782687067985535, "logps/chosen": -1.3236324787139893, "logps/rejected": -1.4447602033615112, "loss": 1.6687, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3236324787139893, "rewards/margins": 0.12112772464752197, "rewards/rejected": -1.4447602033615112, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 6.274424281796091, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.0122830243781209, "logits/rejected": 0.04894590005278587, "logps/chosen": -1.2596652507781982, "logps/rejected": -1.3989272117614746, "loss": 1.6466, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2596652507781982, "rewards/margins": 0.13926205039024353, "rewards/rejected": -1.3989272117614746, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 8.012759631154838, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.020154908299446106, "logits/rejected": 0.04682103544473648, "logps/chosen": -1.2988132238388062, "logps/rejected": -1.4243242740631104, "loss": 1.6557, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2988132238388062, "rewards/margins": 0.12551096081733704, "rewards/rejected": -1.4243242740631104, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 7.330468460048957, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.044300626963377, "logits/rejected": 0.10973195731639862, "logps/chosen": -1.2899428606033325, "logps/rejected": -1.3735395669937134, "loss": 1.689, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2899428606033325, "rewards/margins": 0.08359669148921967, "rewards/rejected": -1.3735395669937134, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 9.830201816802699, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.06985237449407578, "logits/rejected": 0.011676972731947899, "logps/chosen": -1.3224313259124756, "logps/rejected": -1.4183366298675537, "loss": 1.671, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3224313259124756, "rewards/margins": 0.09590514004230499, "rewards/rejected": -1.4183366298675537, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 9.685933820455805, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.08251698315143585, "logits/rejected": 0.06552572548389435, "logps/chosen": -1.245185136795044, "logps/rejected": -1.4116127490997314, "loss": 1.6071, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.245185136795044, "rewards/margins": 0.1664276123046875, "rewards/rejected": -1.4116127490997314, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 8.791302067711484, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.032307274639606476, "logits/rejected": 0.1064511314034462, "logps/chosen": -1.273343801498413, "logps/rejected": -1.4894983768463135, "loss": 1.6085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.273343801498413, "rewards/margins": 0.21615469455718994, "rewards/rejected": -1.4894983768463135, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 6.908997857266348, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.013830190524458885, "logits/rejected": 0.12852010130882263, "logps/chosen": -1.2819262742996216, "logps/rejected": -1.3675658702850342, "loss": 1.6723, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2819262742996216, "rewards/margins": 0.08563955128192902, "rewards/rejected": -1.3675658702850342, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 8.030389353007761, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.14801613986492157, "logits/rejected": 0.08842134475708008, "logps/chosen": -1.3376647233963013, "logps/rejected": -1.4502404928207397, "loss": 1.7009, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3376647233963013, "rewards/margins": 0.11257576942443848, "rewards/rejected": -1.4502404928207397, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 4.847521377767916, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.11283471435308456, "logits/rejected": 0.10331887006759644, "logps/chosen": -1.3693912029266357, "logps/rejected": -1.5027391910552979, "loss": 1.7083, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3693912029266357, "rewards/margins": 0.13334789872169495, "rewards/rejected": -1.5027391910552979, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 7.5086666418473715, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.018067115917801857, "logits/rejected": 0.037450969219207764, "logps/chosen": -1.32191002368927, "logps/rejected": -1.46347975730896, "loss": 1.66, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.32191002368927, "rewards/margins": 0.14156992733478546, "rewards/rejected": -1.46347975730896, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 6.464434757263405, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.01471424289047718, "logits/rejected": 0.06980814039707184, "logps/chosen": -1.3211361169815063, "logps/rejected": -1.4248106479644775, "loss": 1.6868, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3211361169815063, "rewards/margins": 0.10367457568645477, "rewards/rejected": -1.4248106479644775, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 7.922400117981282, "learning_rate": 9.044352511642661e-07, "logits/chosen": 0.017235392704606056, "logits/rejected": 0.025248417630791664, "logps/chosen": -1.2190972566604614, "logps/rejected": -1.36015784740448, "loss": 1.6115, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2190972566604614, "rewards/margins": 0.1410606950521469, "rewards/rejected": -1.36015784740448, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 8.04143104023758, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.03161366656422615, "logits/rejected": 0.05319889262318611, "logps/chosen": -1.2939759492874146, "logps/rejected": -1.35179603099823, "loss": 1.6676, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2939759492874146, "rewards/margins": 0.05782013386487961, "rewards/rejected": -1.35179603099823, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 6.655102165682414, "learning_rate": 9.025959508580436e-07, "logits/chosen": 0.026246452704072, "logits/rejected": 0.23437242209911346, "logps/chosen": -1.2864961624145508, "logps/rejected": -1.4211158752441406, "loss": 1.6252, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2864961624145508, "rewards/margins": 0.13461963832378387, "rewards/rejected": -1.4211158752441406, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 7.037479275050492, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.013667059130966663, "logits/rejected": 0.12505964934825897, "logps/chosen": -1.2189918756484985, "logps/rejected": -1.4661953449249268, "loss": 1.5914, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2189918756484985, "rewards/margins": 0.24720346927642822, "rewards/rejected": -1.4661953449249268, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 7.432156146892551, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.07939986884593964, "logits/rejected": 0.00025191306485794485, "logps/chosen": -1.2421469688415527, "logps/rejected": -1.3436261415481567, "loss": 1.6291, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2421469688415527, "rewards/margins": 0.10147915035486221, "rewards/rejected": -1.3436261415481567, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 9.242470163562162, "learning_rate": 8.998077280340981e-07, "logits/chosen": 0.012076860293745995, "logits/rejected": 0.08173094689846039, "logps/chosen": -1.3606398105621338, "logps/rejected": -1.3600795269012451, "loss": 1.7323, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3606398105621338, "rewards/margins": -0.0005601089214906096, "rewards/rejected": -1.3600795269012451, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 6.88825620139616, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.1084945797920227, "logits/rejected": 0.05396001413464546, "logps/chosen": -1.326180338859558, "logps/rejected": -1.500441551208496, "loss": 1.6566, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.326180338859558, "rewards/margins": 0.17426112294197083, "rewards/rejected": -1.500441551208496, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.27639397978782654, "eval_logits/rejected": 0.35783126950263977, "eval_logps/chosen": -1.3096314668655396, "eval_logps/rejected": -1.4584813117980957, "eval_loss": 1.659972906112671, "eval_rewards/accuracies": 0.5593471527099609, "eval_rewards/chosen": -1.3096314668655396, "eval_rewards/margins": 0.14884990453720093, "eval_rewards/rejected": -1.4584813117980957, "eval_runtime": 40.3391, "eval_samples_per_second": 33.342, "eval_steps_per_second": 8.354, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 5.1888977883125165, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.09929139912128448, "logits/rejected": 0.09505654871463776, "logps/chosen": -1.2664982080459595, "logps/rejected": -1.482974886894226, "loss": 1.5998, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2664982080459595, "rewards/margins": 0.21647676825523376, "rewards/rejected": -1.482974886894226, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 5.823354557557946, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.10225830227136612, "logits/rejected": 0.05890607833862305, "logps/chosen": -1.2548028230667114, "logps/rejected": -1.4133626222610474, "loss": 1.6205, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2548028230667114, "rewards/margins": 0.158559650182724, "rewards/rejected": -1.4133626222610474, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 7.225956675808371, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.06003696471452713, "logits/rejected": 0.027661174535751343, "logps/chosen": -1.3222529888153076, "logps/rejected": -1.5003571510314941, "loss": 1.6707, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3222529888153076, "rewards/margins": 0.1781042069196701, "rewards/rejected": -1.5003571510314941, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 6.830752926640301, "learning_rate": 8.950832333773184e-07, "logits/chosen": 0.0059167868457734585, "logits/rejected": 0.12878912687301636, "logps/chosen": -1.190798044204712, "logps/rejected": -1.4466768503189087, "loss": 1.5546, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.190798044204712, "rewards/margins": 0.25587883591651917, "rewards/rejected": -1.4466768503189087, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 8.251986204131775, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.07970758527517319, "logits/rejected": 0.12433300167322159, "logps/chosen": -1.3536022901535034, "logps/rejected": -1.475412130355835, "loss": 1.7004, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3536022901535034, "rewards/margins": 0.12180988490581512, "rewards/rejected": -1.475412130355835, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 7.521930965978341, "learning_rate": 8.931665393857983e-07, "logits/chosen": 0.007930627092719078, "logits/rejected": 0.14260335266590118, "logps/chosen": -1.312232255935669, "logps/rejected": -1.3909637928009033, "loss": 1.6768, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.312232255935669, "rewards/margins": 0.07873149961233139, "rewards/rejected": -1.3909637928009033, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 10.993310009459227, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.07959005236625671, "logits/rejected": 0.022915149107575417, "logps/chosen": -1.2250585556030273, "logps/rejected": -1.3788336515426636, "loss": 1.6106, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2250585556030273, "rewards/margins": 0.15377524495124817, "rewards/rejected": -1.3788336515426636, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 6.924128529614549, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.0716363936662674, "logits/rejected": 0.020317738875746727, "logps/chosen": -1.2232820987701416, "logps/rejected": -1.3890739679336548, "loss": 1.6274, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2232820987701416, "rewards/margins": 0.1657918244600296, "rewards/rejected": -1.3890739679336548, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 10.01123514535389, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.0073282113298773766, "logits/rejected": 0.03696933388710022, "logps/chosen": -1.2226999998092651, "logps/rejected": -1.3663519620895386, "loss": 1.6106, "rewards/accuracies": 0.625, "rewards/chosen": -1.2226999998092651, "rewards/margins": 0.1436520665884018, "rewards/rejected": -1.3663519620895386, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 11.142210987143999, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.06746591627597809, "logits/rejected": 0.1274791657924652, "logps/chosen": -1.2502325773239136, "logps/rejected": -1.4318780899047852, "loss": 1.6284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2502325773239136, "rewards/margins": 0.18164536356925964, "rewards/rejected": -1.4318780899047852, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 10.041962734713945, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.07328319549560547, "logits/rejected": 0.01814514584839344, "logps/chosen": -1.3024415969848633, "logps/rejected": -1.3887003660202026, "loss": 1.7004, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3024415969848633, "rewards/margins": 0.0862586721777916, "rewards/rejected": -1.3887003660202026, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 6.492097536398011, "learning_rate": 8.873252100389377e-07, "logits/chosen": 0.004718318581581116, "logits/rejected": 0.0009179293992929161, "logps/chosen": -1.2343924045562744, "logps/rejected": -1.4021685123443604, "loss": 1.6224, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2343924045562744, "rewards/margins": 0.16777613759040833, "rewards/rejected": -1.4021685123443604, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 6.711455617048511, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.04459734261035919, "logits/rejected": 0.012989720329642296, "logps/chosen": -1.3119922876358032, "logps/rejected": -1.4353927373886108, "loss": 1.6688, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3119922876358032, "rewards/margins": 0.12340055406093597, "rewards/rejected": -1.4353927373886108, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 6.329821330332764, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.0069259097799658775, "logits/rejected": 0.13729408383369446, "logps/chosen": -1.3276126384735107, "logps/rejected": -1.3940479755401611, "loss": 1.7322, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3276126384735107, "rewards/margins": 0.06643550097942352, "rewards/rejected": -1.3940479755401611, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 5.243545096090104, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.015339165925979614, "logits/rejected": 0.16871802508831024, "logps/chosen": -1.3177587985992432, "logps/rejected": -1.5341602563858032, "loss": 1.6806, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3177587985992432, "rewards/margins": 0.2164015769958496, "rewards/rejected": -1.5341602563858032, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 7.139112205969165, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.10702624171972275, "logits/rejected": -0.009430733509361744, "logps/chosen": -1.346219778060913, "logps/rejected": -1.455933928489685, "loss": 1.6872, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.346219778060913, "rewards/margins": 0.10971428453922272, "rewards/rejected": -1.455933928489685, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 8.788895829020646, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.09646473824977875, "logits/rejected": 0.10258965194225311, "logps/chosen": -1.2965043783187866, "logps/rejected": -1.4185608625411987, "loss": 1.6591, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2965043783187866, "rewards/margins": 0.1220565065741539, "rewards/rejected": -1.4185608625411987, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 7.074812329175778, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.08094370365142822, "logits/rejected": 0.08374644815921783, "logps/chosen": -1.2347418069839478, "logps/rejected": -1.4043574333190918, "loss": 1.5944, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2347418069839478, "rewards/margins": 0.16961567103862762, "rewards/rejected": -1.4043574333190918, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 11.325862698178144, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.2112002819776535, "logits/rejected": -0.09278073161840439, "logps/chosen": -1.3451100587844849, "logps/rejected": -1.4331865310668945, "loss": 1.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3451100587844849, "rewards/margins": 0.08807633072137833, "rewards/rejected": -1.4331865310668945, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 8.409412894002354, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.054956771433353424, "logits/rejected": -0.09063141793012619, "logps/chosen": -1.3343979120254517, "logps/rejected": -1.3663153648376465, "loss": 1.7033, "rewards/accuracies": 0.5, "rewards/chosen": -1.3343979120254517, "rewards/margins": 0.03191746026277542, "rewards/rejected": -1.3663153648376465, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 8.440378996313939, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.007442918606102467, "logits/rejected": 0.11806105077266693, "logps/chosen": -1.3040060997009277, "logps/rejected": -1.3856302499771118, "loss": 1.6761, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3040060997009277, "rewards/margins": 0.08162431418895721, "rewards/rejected": -1.3856302499771118, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 7.30589729352165, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.07810314744710922, "logits/rejected": -0.009394729509949684, "logps/chosen": -1.243417501449585, "logps/rejected": -1.3742151260375977, "loss": 1.6096, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.243417501449585, "rewards/margins": 0.13079769909381866, "rewards/rejected": -1.3742151260375977, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 6.627437386798884, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.03734375163912773, "logits/rejected": 0.15281075239181519, "logps/chosen": -1.3413758277893066, "logps/rejected": -1.4052072763442993, "loss": 1.7142, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3413758277893066, "rewards/margins": 0.0638314038515091, "rewards/rejected": -1.4052072763442993, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 6.446085230868156, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.07835554331541061, "logits/rejected": -0.030281897634267807, "logps/chosen": -1.1814202070236206, "logps/rejected": -1.4106985330581665, "loss": 1.573, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1814202070236206, "rewards/margins": 0.22927825152873993, "rewards/rejected": -1.4106985330581665, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 6.826260743987488, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.08342117816209793, "logits/rejected": 0.0009416237589903176, "logps/chosen": -1.3154761791229248, "logps/rejected": -1.509495735168457, "loss": 1.6444, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3154761791229248, "rewards/margins": 0.1940193474292755, "rewards/rejected": -1.509495735168457, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 9.427502099175387, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.021935302764177322, "logits/rejected": 0.061028145253658295, "logps/chosen": -1.2851629257202148, "logps/rejected": -1.3662439584732056, "loss": 1.6694, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2851629257202148, "rewards/margins": 0.08108110725879669, "rewards/rejected": -1.3662439584732056, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 7.23708158585448, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.13660050928592682, "logits/rejected": -0.009003483690321445, "logps/chosen": -1.2197661399841309, "logps/rejected": -1.5537761449813843, "loss": 1.5576, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2197661399841309, "rewards/margins": 0.33400994539260864, "rewards/rejected": -1.5537761449813843, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 6.690179189666134, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.11041511595249176, "logits/rejected": -0.007809894625097513, "logps/chosen": -1.2693771123886108, "logps/rejected": -1.3839272260665894, "loss": 1.6346, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2693771123886108, "rewards/margins": 0.11454999446868896, "rewards/rejected": -1.3839272260665894, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 6.875548426964406, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.03753243386745453, "logits/rejected": -0.023988056927919388, "logps/chosen": -1.2683788537979126, "logps/rejected": -1.3910294771194458, "loss": 1.6375, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2683788537979126, "rewards/margins": 0.12265037000179291, "rewards/rejected": -1.3910294771194458, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 10.913820939420868, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.11481869220733643, "logits/rejected": -0.06308847665786743, "logps/chosen": -1.266616702079773, "logps/rejected": -1.4824692010879517, "loss": 1.6015, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.266616702079773, "rewards/margins": 0.2158524990081787, "rewards/rejected": -1.4824692010879517, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 5.804834151418999, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.10651911795139313, "logits/rejected": -0.03721824288368225, "logps/chosen": -1.2545320987701416, "logps/rejected": -1.3591195344924927, "loss": 1.6401, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2545320987701416, "rewards/margins": 0.10458721220493317, "rewards/rejected": -1.3591195344924927, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 6.638379460376084, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.0438620001077652, "logits/rejected": 0.054749589413404465, "logps/chosen": -1.2415411472320557, "logps/rejected": -1.392091989517212, "loss": 1.6126, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2415411472320557, "rewards/margins": 0.1505509614944458, "rewards/rejected": -1.392091989517212, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 6.540934099687025, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.03682807832956314, "logits/rejected": -0.020194586366415024, "logps/chosen": -1.2712775468826294, "logps/rejected": -1.428292989730835, "loss": 1.6514, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2712775468826294, "rewards/margins": 0.15701545774936676, "rewards/rejected": -1.428292989730835, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 5.755972920864021, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.03591788560152054, "logits/rejected": 0.09994689375162125, "logps/chosen": -1.296185851097107, "logps/rejected": -1.3564786911010742, "loss": 1.6628, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.296185851097107, "rewards/margins": 0.06029283255338669, "rewards/rejected": -1.3564786911010742, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 8.516675145333354, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.10436888039112091, "logits/rejected": -0.04029594361782074, "logps/chosen": -1.2394301891326904, "logps/rejected": -1.3689510822296143, "loss": 1.6142, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2394301891326904, "rewards/margins": 0.12952089309692383, "rewards/rejected": -1.3689510822296143, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 6.289999309014636, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.09868212044239044, "logits/rejected": 0.0011174462269991636, "logps/chosen": -1.2685974836349487, "logps/rejected": -1.431061029434204, "loss": 1.6255, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2685974836349487, "rewards/margins": 0.16246357560157776, "rewards/rejected": -1.431061029434204, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 7.183464870616547, "learning_rate": 8.615542215511389e-07, "logits/chosen": 0.006868282798677683, "logits/rejected": 0.06984652578830719, "logps/chosen": -1.2149683237075806, "logps/rejected": -1.309650182723999, "loss": 1.6388, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2149683237075806, "rewards/margins": 0.09468193352222443, "rewards/rejected": -1.309650182723999, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 7.560385247647893, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.004362165927886963, "logits/rejected": 0.0364392027258873, "logps/chosen": -1.2910511493682861, "logps/rejected": -1.4012508392333984, "loss": 1.653, "rewards/accuracies": 0.5, "rewards/chosen": -1.2910511493682861, "rewards/margins": 0.11019964516162872, "rewards/rejected": -1.4012508392333984, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 7.882374537885048, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.09401813894510269, "logits/rejected": 0.024964818730950356, "logps/chosen": -1.2600610256195068, "logps/rejected": -1.4081159830093384, "loss": 1.6276, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2600610256195068, "rewards/margins": 0.1480550318956375, "rewards/rejected": -1.4081159830093384, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 5.2752952279039445, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.0389956533908844, "logits/rejected": 0.017822042107582092, "logps/chosen": -1.2581193447113037, "logps/rejected": -1.3478018045425415, "loss": 1.6489, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2581193447113037, "rewards/margins": 0.08968259394168854, "rewards/rejected": -1.3478018045425415, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 5.94588261820782, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.16185346245765686, "logits/rejected": -0.05402858182787895, "logps/chosen": -1.218846082687378, "logps/rejected": -1.4322948455810547, "loss": 1.5672, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.218846082687378, "rewards/margins": 0.21344876289367676, "rewards/rejected": -1.4322948455810547, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 7.631863260969375, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.038693930953741074, "logits/rejected": 0.08398672938346863, "logps/chosen": -1.2442553043365479, "logps/rejected": -1.3523961305618286, "loss": 1.6342, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2442553043365479, "rewards/margins": 0.10814078152179718, "rewards/rejected": -1.3523961305618286, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 6.160883134645266, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.08054449409246445, "logits/rejected": 0.023082170635461807, "logps/chosen": -1.2673180103302002, "logps/rejected": -1.3657923936843872, "loss": 1.6303, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2673180103302002, "rewards/margins": 0.09847430884838104, "rewards/rejected": -1.3657923936843872, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 7.3685895699380515, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.057333867996931076, "logits/rejected": 0.0019186340505257249, "logps/chosen": -1.2531261444091797, "logps/rejected": -1.3625218868255615, "loss": 1.6317, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2531261444091797, "rewards/margins": 0.10939564555883408, "rewards/rejected": -1.3625218868255615, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 7.955820131517521, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.11731233447790146, "logits/rejected": 0.002155174035578966, "logps/chosen": -1.2458884716033936, "logps/rejected": -1.417532205581665, "loss": 1.6204, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2458884716033936, "rewards/margins": 0.17164357006549835, "rewards/rejected": -1.417532205581665, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 6.122737863351037, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.03416532278060913, "logits/rejected": 0.053548503667116165, "logps/chosen": -1.3238328695297241, "logps/rejected": -1.3591514825820923, "loss": 1.7031, "rewards/accuracies": 0.5, "rewards/chosen": -1.3238328695297241, "rewards/margins": 0.035318635404109955, "rewards/rejected": -1.3591514825820923, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 7.675621169957029, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.06183997914195061, "logits/rejected": -0.016029536724090576, "logps/chosen": -1.2829948663711548, "logps/rejected": -1.3081411123275757, "loss": 1.6731, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2829948663711548, "rewards/margins": 0.02514636516571045, "rewards/rejected": -1.3081411123275757, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 6.871647817378736, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.02273593842983246, "logits/rejected": 0.05914074927568436, "logps/chosen": -1.279006838798523, "logps/rejected": -1.465643286705017, "loss": 1.6271, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.279006838798523, "rewards/margins": 0.18663650751113892, "rewards/rejected": -1.465643286705017, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 9.202293879728787, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.09814430773258209, "logits/rejected": -0.019922833889722824, "logps/chosen": -1.301499366760254, "logps/rejected": -1.3693764209747314, "loss": 1.676, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.301499366760254, "rewards/margins": 0.06787705421447754, "rewards/rejected": -1.3693764209747314, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 6.264497155893721, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.18302765488624573, "logits/rejected": -0.07154009491205215, "logps/chosen": -1.3125717639923096, "logps/rejected": -1.3157421350479126, "loss": 1.6926, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3125717639923096, "rewards/margins": 0.0031705095898360014, "rewards/rejected": -1.3157421350479126, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 6.488223481774972, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.08081166446208954, "logits/rejected": 0.0005839228397235274, "logps/chosen": -1.2326545715332031, "logps/rejected": -1.369754433631897, "loss": 1.6284, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2326545715332031, "rewards/margins": 0.13709980249404907, "rewards/rejected": -1.369754433631897, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 7.904758128328132, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.04447736218571663, "logits/rejected": 0.01741199567914009, "logps/chosen": -1.3151956796646118, "logps/rejected": -1.4005998373031616, "loss": 1.6629, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3151956796646118, "rewards/margins": 0.08540423959493637, "rewards/rejected": -1.4005998373031616, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 6.880587886858402, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.07799827307462692, "logits/rejected": -0.016821760684251785, "logps/chosen": -1.185073733329773, "logps/rejected": -1.4089863300323486, "loss": 1.5735, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.185073733329773, "rewards/margins": 0.22391244769096375, "rewards/rejected": -1.4089863300323486, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 7.095370270812544, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.13477222621440887, "logits/rejected": -0.005845165345817804, "logps/chosen": -1.3289659023284912, "logps/rejected": -1.377912163734436, "loss": 1.6938, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3289659023284912, "rewards/margins": 0.048946212977170944, "rewards/rejected": -1.377912163734436, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 5.750529158786094, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.1507563441991806, "logits/rejected": -0.0380830354988575, "logps/chosen": -1.3004181385040283, "logps/rejected": -1.3883024454116821, "loss": 1.6578, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3004181385040283, "rewards/margins": 0.08788414299488068, "rewards/rejected": -1.3883024454116821, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 7.777439085414502, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.03777514398097992, "logits/rejected": 0.11495566368103027, "logps/chosen": -1.254887342453003, "logps/rejected": -1.5188519954681396, "loss": 1.616, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.254887342453003, "rewards/margins": 0.26396480202674866, "rewards/rejected": -1.5188519954681396, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 7.538747422203276, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.08718445152044296, "logits/rejected": -0.020811621099710464, "logps/chosen": -1.2131198644638062, "logps/rejected": -1.3882570266723633, "loss": 1.5975, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2131198644638062, "rewards/margins": 0.17513707280158997, "rewards/rejected": -1.3882570266723633, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 6.472314903623082, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.09305468946695328, "logits/rejected": 0.07835519313812256, "logps/chosen": -1.268439531326294, "logps/rejected": -1.4149713516235352, "loss": 1.6652, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.268439531326294, "rewards/margins": 0.1465318351984024, "rewards/rejected": -1.4149713516235352, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 7.459073670556534, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.03299138322472572, "logits/rejected": 0.019478967413306236, "logps/chosen": -1.2478959560394287, "logps/rejected": -1.388780951499939, "loss": 1.61, "rewards/accuracies": 0.625, "rewards/chosen": -1.2478959560394287, "rewards/margins": 0.14088508486747742, "rewards/rejected": -1.388780951499939, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 7.2190581936132885, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.1465390920639038, "logits/rejected": 0.07248485833406448, "logps/chosen": -1.2892343997955322, "logps/rejected": -1.3898210525512695, "loss": 1.6613, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2892343997955322, "rewards/margins": 0.10058672726154327, "rewards/rejected": -1.3898210525512695, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 6.660162026732205, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.07367981225252151, "logits/rejected": 0.04928433522582054, "logps/chosen": -1.3099291324615479, "logps/rejected": -1.4067552089691162, "loss": 1.6726, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3099291324615479, "rewards/margins": 0.0968259871006012, "rewards/rejected": -1.4067552089691162, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 7.458385903072796, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.16138581931591034, "logits/rejected": -0.05747319385409355, "logps/chosen": -1.2732274532318115, "logps/rejected": -1.4249045848846436, "loss": 1.6463, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2732274532318115, "rewards/margins": 0.15167725086212158, "rewards/rejected": -1.4249045848846436, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 5.044798526395231, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.12010698020458221, "logits/rejected": -0.06151549890637398, "logps/chosen": -1.23702073097229, "logps/rejected": -1.3350869417190552, "loss": 1.6219, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.23702073097229, "rewards/margins": 0.0980663076043129, "rewards/rejected": -1.3350869417190552, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 6.461444915897312, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.13163354992866516, "logits/rejected": -0.070621058344841, "logps/chosen": -1.1589525938034058, "logps/rejected": -1.3587532043457031, "loss": 1.5428, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1589525938034058, "rewards/margins": 0.19980056583881378, "rewards/rejected": -1.3587532043457031, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 7.789203310879303, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.2395746260881424, "logits/rejected": -0.06477358937263489, "logps/chosen": -1.3387569189071655, "logps/rejected": -1.4158661365509033, "loss": 1.6874, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3387569189071655, "rewards/margins": 0.07710927724838257, "rewards/rejected": -1.4158661365509033, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 6.64609691322332, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.14642703533172607, "logits/rejected": -0.025876719504594803, "logps/chosen": -1.2055902481079102, "logps/rejected": -1.3968555927276611, "loss": 1.5728, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2055902481079102, "rewards/margins": 0.19126519560813904, "rewards/rejected": -1.3968555927276611, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 6.594534161726122, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.17431317269802094, "logits/rejected": -0.02574203535914421, "logps/chosen": -1.272735834121704, "logps/rejected": -1.4293947219848633, "loss": 1.6216, "rewards/accuracies": 0.59375, "rewards/chosen": -1.272735834121704, "rewards/margins": 0.1566590517759323, "rewards/rejected": -1.4293947219848633, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 7.331159146339648, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.09168613702058792, "logits/rejected": 0.0004261195717845112, "logps/chosen": -1.2771458625793457, "logps/rejected": -1.4360650777816772, "loss": 1.642, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2771458625793457, "rewards/margins": 0.1589193344116211, "rewards/rejected": -1.4360650777816772, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 7.080897027099071, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.18290378153324127, "logits/rejected": -0.03943333774805069, "logps/chosen": -1.2643144130706787, "logps/rejected": -1.4040201902389526, "loss": 1.6022, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2643144130706787, "rewards/margins": 0.13970568776130676, "rewards/rejected": -1.4040201902389526, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 6.1550230568679645, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.11934999376535416, "logits/rejected": 0.055243026465177536, "logps/chosen": -1.310400128364563, "logps/rejected": -1.4350045919418335, "loss": 1.6469, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.310400128364563, "rewards/margins": 0.12460446357727051, "rewards/rejected": -1.4350045919418335, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 7.148429628311151, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.08163203299045563, "logits/rejected": -0.026378247886896133, "logps/chosen": -1.2024458646774292, "logps/rejected": -1.4101860523223877, "loss": 1.555, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2024458646774292, "rewards/margins": 0.2077401578426361, "rewards/rejected": -1.4101860523223877, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 6.742860934814458, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.1711881458759308, "logits/rejected": -0.14690272510051727, "logps/chosen": -1.2601053714752197, "logps/rejected": -1.3863235712051392, "loss": 1.6365, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2601053714752197, "rewards/margins": 0.1262180358171463, "rewards/rejected": -1.3863235712051392, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 7.520939323766149, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.06285201013088226, "logits/rejected": -0.008772213943302631, "logps/chosen": -1.2074944972991943, "logps/rejected": -1.4803524017333984, "loss": 1.5389, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2074944972991943, "rewards/margins": 0.27285805344581604, "rewards/rejected": -1.4803524017333984, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 8.444048221481284, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.12451770156621933, "logits/rejected": -0.010346454568207264, "logps/chosen": -1.2716000080108643, "logps/rejected": -1.442080020904541, "loss": 1.6334, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2716000080108643, "rewards/margins": 0.1704799234867096, "rewards/rejected": -1.442080020904541, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 7.439698493505291, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.06397426128387451, "logits/rejected": 0.01084409561008215, "logps/chosen": -1.2019623517990112, "logps/rejected": -1.3936353921890259, "loss": 1.5667, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2019623517990112, "rewards/margins": 0.19167283177375793, "rewards/rejected": -1.3936353921890259, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 7.66250379267884, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.014584923163056374, "logits/rejected": 0.0913209542632103, "logps/chosen": -1.276272177696228, "logps/rejected": -1.4327203035354614, "loss": 1.6424, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.276272177696228, "rewards/margins": 0.15644793212413788, "rewards/rejected": -1.4327203035354614, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 7.6835509995952345, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.10186652094125748, "logits/rejected": 0.030913090333342552, "logps/chosen": -1.2977344989776611, "logps/rejected": -1.3779385089874268, "loss": 1.6738, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2977344989776611, "rewards/margins": 0.08020380884408951, "rewards/rejected": -1.3779385089874268, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 7.38233771315589, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.127910777926445, "logits/rejected": -0.053719568997621536, "logps/chosen": -1.301751732826233, "logps/rejected": -1.4438529014587402, "loss": 1.6436, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.301751732826233, "rewards/margins": 0.1421012133359909, "rewards/rejected": -1.4438529014587402, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 7.261162820145197, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.023806992918252945, "logits/rejected": 0.10135696083307266, "logps/chosen": -1.2175931930541992, "logps/rejected": -1.4716691970825195, "loss": 1.576, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2175931930541992, "rewards/margins": 0.25407615303993225, "rewards/rejected": -1.4716691970825195, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 11.282402041281472, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.10262684524059296, "logits/rejected": 0.02830558642745018, "logps/chosen": -1.3009599447250366, "logps/rejected": -1.3559610843658447, "loss": 1.7104, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3009599447250366, "rewards/margins": 0.05500108003616333, "rewards/rejected": -1.3559610843658447, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.17809128761291504, "eval_logits/rejected": 0.2528059482574463, "eval_logps/chosen": -1.3006004095077515, "eval_logps/rejected": -1.4569226503372192, "eval_loss": 1.6553415060043335, "eval_rewards/accuracies": 0.5660237669944763, "eval_rewards/chosen": -1.3006004095077515, "eval_rewards/margins": 0.15632230043411255, "eval_rewards/rejected": -1.4569226503372192, "eval_runtime": 40.4167, "eval_samples_per_second": 33.278, "eval_steps_per_second": 8.338, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 8.336898237104945, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.08942779153585434, "logits/rejected": -0.022323116660118103, "logps/chosen": -1.3037192821502686, "logps/rejected": -1.3781172037124634, "loss": 1.6777, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3037192821502686, "rewards/margins": 0.07439792156219482, "rewards/rejected": -1.3781172037124634, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 9.214908562514987, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.02387317083775997, "logits/rejected": 0.06140471249818802, "logps/chosen": -1.2356733083724976, "logps/rejected": -1.3744370937347412, "loss": 1.6234, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2356733083724976, "rewards/margins": 0.13876380026340485, "rewards/rejected": -1.3744370937347412, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 6.725378431673041, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.09659741073846817, "logits/rejected": 0.035605426877737045, "logps/chosen": -1.2066147327423096, "logps/rejected": -1.3792082071304321, "loss": 1.5918, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2066147327423096, "rewards/margins": 0.17259351909160614, "rewards/rejected": -1.3792082071304321, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 7.312873076103676, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.1383553296327591, "logits/rejected": -0.005746991373598576, "logps/chosen": -1.3074740171432495, "logps/rejected": -1.4675743579864502, "loss": 1.6512, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3074740171432495, "rewards/margins": 0.16010025143623352, "rewards/rejected": -1.4675743579864502, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 7.354426390150471, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.03547487407922745, "logits/rejected": 0.06639555096626282, "logps/chosen": -1.279608964920044, "logps/rejected": -1.4450252056121826, "loss": 1.6361, "rewards/accuracies": 0.59375, "rewards/chosen": -1.279608964920044, "rewards/margins": 0.1654161959886551, "rewards/rejected": -1.4450252056121826, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 6.417914554300857, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.12844283878803253, "logits/rejected": -0.11059192568063736, "logps/chosen": -1.2247509956359863, "logps/rejected": -1.3813955783843994, "loss": 1.6081, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2247509956359863, "rewards/margins": 0.15664449334144592, "rewards/rejected": -1.3813955783843994, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 6.961956058534192, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.13266727328300476, "logits/rejected": 0.008277666755020618, "logps/chosen": -1.2267061471939087, "logps/rejected": -1.3689435720443726, "loss": 1.5991, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2267061471939087, "rewards/margins": 0.142237588763237, "rewards/rejected": -1.3689435720443726, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 8.991338068218205, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.19082000851631165, "logits/rejected": -0.1373734176158905, "logps/chosen": -1.2447383403778076, "logps/rejected": -1.355181097984314, "loss": 1.6218, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2447383403778076, "rewards/margins": 0.11044273525476456, "rewards/rejected": -1.355181097984314, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 10.371267959736507, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.0713386982679367, "logits/rejected": -0.03666887432336807, "logps/chosen": -1.242963433265686, "logps/rejected": -1.4130642414093018, "loss": 1.6154, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.242963433265686, "rewards/margins": 0.17010077834129333, "rewards/rejected": -1.4130642414093018, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 8.380066198572631, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.15726640820503235, "logits/rejected": -0.06450683623552322, "logps/chosen": -1.2815487384796143, "logps/rejected": -1.3583753108978271, "loss": 1.6766, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2815487384796143, "rewards/margins": 0.07682657241821289, "rewards/rejected": -1.3583753108978271, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 8.343216936885899, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.06073587015271187, "logits/rejected": 0.04076801612973213, "logps/chosen": -1.2568342685699463, "logps/rejected": -1.366917610168457, "loss": 1.6255, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2568342685699463, "rewards/margins": 0.1100834459066391, "rewards/rejected": -1.366917610168457, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 8.089157458893371, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.03909459710121155, "logits/rejected": -0.0011225551133975387, "logps/chosen": -1.2822999954223633, "logps/rejected": -1.475353717803955, "loss": 1.6247, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2822999954223633, "rewards/margins": 0.193053737282753, "rewards/rejected": -1.475353717803955, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 7.708362290838383, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.07062169164419174, "logits/rejected": 0.03906797990202904, "logps/chosen": -1.2624900341033936, "logps/rejected": -1.367241382598877, "loss": 1.6437, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2624900341033936, "rewards/margins": 0.10475137084722519, "rewards/rejected": -1.367241382598877, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 8.132968939547782, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.10409660637378693, "logits/rejected": 0.02609199844300747, "logps/chosen": -1.2147948741912842, "logps/rejected": -1.4452593326568604, "loss": 1.5803, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2147948741912842, "rewards/margins": 0.2304643839597702, "rewards/rejected": -1.4452593326568604, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 7.762202574182165, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.14026805758476257, "logits/rejected": -0.05574485659599304, "logps/chosen": -1.257171630859375, "logps/rejected": -1.452803373336792, "loss": 1.6068, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.257171630859375, "rewards/margins": 0.19563186168670654, "rewards/rejected": -1.452803373336792, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 5.722052395771005, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.1834503412246704, "logits/rejected": -0.05271712690591812, "logps/chosen": -1.1889479160308838, "logps/rejected": -1.4604111909866333, "loss": 1.5221, "rewards/accuracies": 0.625, "rewards/chosen": -1.1889479160308838, "rewards/margins": 0.27146315574645996, "rewards/rejected": -1.4604111909866333, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 11.535482204762738, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.15596617758274078, "logits/rejected": -0.06275282800197601, "logps/chosen": -1.237905740737915, "logps/rejected": -1.3934085369110107, "loss": 1.6117, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.237905740737915, "rewards/margins": 0.15550284087657928, "rewards/rejected": -1.3934085369110107, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 8.74537384643787, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.21057133376598358, "logits/rejected": -0.2055818736553192, "logps/chosen": -1.2205783128738403, "logps/rejected": -1.3647421598434448, "loss": 1.6084, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2205783128738403, "rewards/margins": 0.14416398108005524, "rewards/rejected": -1.3647421598434448, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 4.88317103034377, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.11465215682983398, "logits/rejected": 0.0019001305336132646, "logps/chosen": -1.3683791160583496, "logps/rejected": -1.4810022115707397, "loss": 1.7229, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3683791160583496, "rewards/margins": 0.11262323707342148, "rewards/rejected": -1.4810022115707397, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 8.94810065146586, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.07218264043331146, "logits/rejected": -0.04129823297262192, "logps/chosen": -1.3749353885650635, "logps/rejected": -1.4446145296096802, "loss": 1.7206, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3749353885650635, "rewards/margins": 0.06967911124229431, "rewards/rejected": -1.4446145296096802, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 7.16251528042221, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.052335310727357864, "logits/rejected": 0.04018264636397362, "logps/chosen": -1.300136923789978, "logps/rejected": -1.4270994663238525, "loss": 1.6535, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.300136923789978, "rewards/margins": 0.12696246802806854, "rewards/rejected": -1.4270994663238525, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 8.329444300535107, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.13611355423927307, "logits/rejected": 0.005605706479400396, "logps/chosen": -1.243820309638977, "logps/rejected": -1.5600693225860596, "loss": 1.5738, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.243820309638977, "rewards/margins": 0.3162487745285034, "rewards/rejected": -1.5600693225860596, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 7.9381413036119435, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.03070439025759697, "logits/rejected": 0.010776013135910034, "logps/chosen": -1.2483736276626587, "logps/rejected": -1.3772048950195312, "loss": 1.6129, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2483736276626587, "rewards/margins": 0.12883132696151733, "rewards/rejected": -1.3772048950195312, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 8.610968983850155, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.15365591645240784, "logits/rejected": -0.1279035359621048, "logps/chosen": -1.2922980785369873, "logps/rejected": -1.4172145128250122, "loss": 1.6552, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2922980785369873, "rewards/margins": 0.12491631507873535, "rewards/rejected": -1.4172145128250122, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 6.9824498893646885, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.10298570245504379, "logits/rejected": -0.05632113292813301, "logps/chosen": -1.2003002166748047, "logps/rejected": -1.4210450649261475, "loss": 1.5523, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2003002166748047, "rewards/margins": 0.2207448035478592, "rewards/rejected": -1.4210450649261475, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 8.891391222995903, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.08387793600559235, "logits/rejected": 0.10497613251209259, "logps/chosen": -1.2898766994476318, "logps/rejected": -1.453222632408142, "loss": 1.6373, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2898766994476318, "rewards/margins": 0.1633458435535431, "rewards/rejected": -1.453222632408142, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 6.476456909040568, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.08066694438457489, "logits/rejected": 0.04436764121055603, "logps/chosen": -1.2959989309310913, "logps/rejected": -1.446028470993042, "loss": 1.6487, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2959989309310913, "rewards/margins": 0.15002937614917755, "rewards/rejected": -1.446028470993042, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 7.992391927922851, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.1058705598115921, "logits/rejected": 0.00794590637087822, "logps/chosen": -1.2681523561477661, "logps/rejected": -1.3907512426376343, "loss": 1.6402, "rewards/accuracies": 0.5, "rewards/chosen": -1.2681523561477661, "rewards/margins": 0.12259910255670547, "rewards/rejected": -1.3907512426376343, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 7.766977663702184, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.07947366684675217, "logits/rejected": -0.01362493634223938, "logps/chosen": -1.1541688442230225, "logps/rejected": -1.3922502994537354, "loss": 1.5354, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1541688442230225, "rewards/margins": 0.23808152973651886, "rewards/rejected": -1.3922502994537354, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 5.489764217394201, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.09513584524393082, "logits/rejected": -0.053471989929676056, "logps/chosen": -1.3207443952560425, "logps/rejected": -1.4349052906036377, "loss": 1.6716, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3207443952560425, "rewards/margins": 0.1141607016324997, "rewards/rejected": -1.4349052906036377, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 6.564842400730169, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.04338967055082321, "logits/rejected": 0.007275203708559275, "logps/chosen": -1.3063982725143433, "logps/rejected": -1.4485085010528564, "loss": 1.6369, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3063982725143433, "rewards/margins": 0.1421101987361908, "rewards/rejected": -1.4485085010528564, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 8.656188129297469, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.14563269913196564, "logits/rejected": -0.050481997430324554, "logps/chosen": -1.225486397743225, "logps/rejected": -1.425831913948059, "loss": 1.5927, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.225486397743225, "rewards/margins": 0.20034563541412354, "rewards/rejected": -1.425831913948059, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 6.986966699998474, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.11772508919239044, "logits/rejected": 0.03461534529924393, "logps/chosen": -1.2676708698272705, "logps/rejected": -1.477513074874878, "loss": 1.6074, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2676708698272705, "rewards/margins": 0.20984220504760742, "rewards/rejected": -1.477513074874878, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 5.522124273534204, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.10463432222604752, "logits/rejected": 0.03196742385625839, "logps/chosen": -1.2275731563568115, "logps/rejected": -1.4633052349090576, "loss": 1.5896, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2275731563568115, "rewards/margins": 0.23573215305805206, "rewards/rejected": -1.4633052349090576, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 6.115225759248435, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.11995580047369003, "logits/rejected": 0.019036246463656425, "logps/chosen": -1.300389051437378, "logps/rejected": -1.3865238428115845, "loss": 1.6644, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.300389051437378, "rewards/margins": 0.08613482862710953, "rewards/rejected": -1.3865238428115845, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 6.201062151895898, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.11636760085821152, "logits/rejected": -0.05718974396586418, "logps/chosen": -1.234830617904663, "logps/rejected": -1.407841444015503, "loss": 1.5975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.234830617904663, "rewards/margins": 0.17301085591316223, "rewards/rejected": -1.407841444015503, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 8.382011760189442, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.04539335146546364, "logits/rejected": 0.0673140436410904, "logps/chosen": -1.265032172203064, "logps/rejected": -1.3793548345565796, "loss": 1.6196, "rewards/accuracies": 0.625, "rewards/chosen": -1.265032172203064, "rewards/margins": 0.11432279646396637, "rewards/rejected": -1.3793548345565796, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 6.889993205894781, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.018428370356559753, "logits/rejected": 0.05882607027888298, "logps/chosen": -1.25342857837677, "logps/rejected": -1.3916927576065063, "loss": 1.6259, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.25342857837677, "rewards/margins": 0.13826411962509155, "rewards/rejected": -1.3916927576065063, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 9.041271982810615, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.04829593747854233, "logits/rejected": 0.029843684285879135, "logps/chosen": -1.2311931848526, "logps/rejected": -1.4910755157470703, "loss": 1.5839, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2311931848526, "rewards/margins": 0.2598823010921478, "rewards/rejected": -1.4910755157470703, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 7.524567421829956, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.12873823940753937, "logits/rejected": -0.1393408328294754, "logps/chosen": -1.2622649669647217, "logps/rejected": -1.4139279127120972, "loss": 1.6225, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2622649669647217, "rewards/margins": 0.15166299045085907, "rewards/rejected": -1.4139279127120972, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 7.098693824228783, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.1603354513645172, "logits/rejected": -0.010439865291118622, "logps/chosen": -1.1993751525878906, "logps/rejected": -1.374045729637146, "loss": 1.5809, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1993751525878906, "rewards/margins": 0.17467060685157776, "rewards/rejected": -1.374045729637146, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 8.549821414081535, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.21130235493183136, "logits/rejected": -0.11883778870105743, "logps/chosen": -1.1435682773590088, "logps/rejected": -1.3947386741638184, "loss": 1.5242, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1435682773590088, "rewards/margins": 0.25117045640945435, "rewards/rejected": -1.3947386741638184, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 7.835406150352905, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.1915084421634674, "logits/rejected": -0.1323508620262146, "logps/chosen": -1.364108681678772, "logps/rejected": -1.531037449836731, "loss": 1.6856, "rewards/accuracies": 0.59375, "rewards/chosen": -1.364108681678772, "rewards/margins": 0.16692869365215302, "rewards/rejected": -1.531037449836731, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 6.2139292317312265, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.2246476113796234, "logits/rejected": -0.120921291410923, "logps/chosen": -1.2061656713485718, "logps/rejected": -1.3511766195297241, "loss": 1.5949, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2061656713485718, "rewards/margins": 0.14501085877418518, "rewards/rejected": -1.3511766195297241, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 8.07090751462236, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.15275296568870544, "logits/rejected": 0.017976239323616028, "logps/chosen": -1.2264677286148071, "logps/rejected": -1.4621691703796387, "loss": 1.5633, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2264677286148071, "rewards/margins": 0.23570141196250916, "rewards/rejected": -1.4621691703796387, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 12.851901330750579, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.2065228968858719, "logits/rejected": -0.09520624577999115, "logps/chosen": -1.2863290309906006, "logps/rejected": -1.5747162103652954, "loss": 1.635, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2863290309906006, "rewards/margins": 0.2883870601654053, "rewards/rejected": -1.5747162103652954, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 9.135773391171075, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.176313579082489, "logits/rejected": -0.02505047246813774, "logps/chosen": -1.2322592735290527, "logps/rejected": -1.4805638790130615, "loss": 1.5927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2322592735290527, "rewards/margins": 0.24830465018749237, "rewards/rejected": -1.4805638790130615, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 8.979940076666123, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.1995842158794403, "logits/rejected": -0.08602572232484818, "logps/chosen": -1.2478491067886353, "logps/rejected": -1.4512939453125, "loss": 1.5858, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2478491067886353, "rewards/margins": 0.20344488322734833, "rewards/rejected": -1.4512939453125, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 9.39158424428714, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.15345005691051483, "logits/rejected": -0.11476775258779526, "logps/chosen": -1.2204474210739136, "logps/rejected": -1.4159668684005737, "loss": 1.6053, "rewards/accuracies": 0.625, "rewards/chosen": -1.2204474210739136, "rewards/margins": 0.1955193430185318, "rewards/rejected": -1.4159668684005737, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 7.494636699449011, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.1034126877784729, "logits/rejected": -0.09635962545871735, "logps/chosen": -1.1871516704559326, "logps/rejected": -1.4469674825668335, "loss": 1.5351, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1871516704559326, "rewards/margins": 0.25981590151786804, "rewards/rejected": -1.4469674825668335, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 6.954594962262396, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.17379239201545715, "logits/rejected": -0.08324754983186722, "logps/chosen": -1.3096932172775269, "logps/rejected": -1.3674468994140625, "loss": 1.684, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3096932172775269, "rewards/margins": 0.05775368958711624, "rewards/rejected": -1.3674468994140625, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 7.200104189312471, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.10099692642688751, "logits/rejected": -0.11718853563070297, "logps/chosen": -1.2691152095794678, "logps/rejected": -1.392392873764038, "loss": 1.6195, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2691152095794678, "rewards/margins": 0.1232776790857315, "rewards/rejected": -1.392392873764038, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 8.410740178588743, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.18823234736919403, "logits/rejected": -0.10828492790460587, "logps/chosen": -1.300615668296814, "logps/rejected": -1.4223439693450928, "loss": 1.6481, "rewards/accuracies": 0.53125, "rewards/chosen": -1.300615668296814, "rewards/margins": 0.12172845751047134, "rewards/rejected": -1.4223439693450928, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 8.488143673141622, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.2915375530719757, "logits/rejected": -0.16539961099624634, "logps/chosen": -1.239471673965454, "logps/rejected": -1.4392166137695312, "loss": 1.5973, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.239471673965454, "rewards/margins": 0.1997450292110443, "rewards/rejected": -1.4392166137695312, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 8.296572864657373, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.1600394994020462, "logits/rejected": -0.07500159740447998, "logps/chosen": -1.2279202938079834, "logps/rejected": -1.467357873916626, "loss": 1.5723, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2279202938079834, "rewards/margins": 0.2394375503063202, "rewards/rejected": -1.467357873916626, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 10.477747918745342, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.19462502002716064, "logits/rejected": -0.07571206986904144, "logps/chosen": -1.1689397096633911, "logps/rejected": -1.347392201423645, "loss": 1.5665, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1689397096633911, "rewards/margins": 0.17845246195793152, "rewards/rejected": -1.347392201423645, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 7.736285254184248, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.13334044814109802, "logits/rejected": 0.028622111305594444, "logps/chosen": -1.2916946411132812, "logps/rejected": -1.444271445274353, "loss": 1.6559, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2916946411132812, "rewards/margins": 0.15257684886455536, "rewards/rejected": -1.444271445274353, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 9.641761863507444, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.22626297175884247, "logits/rejected": -0.10098767280578613, "logps/chosen": -1.1568272113800049, "logps/rejected": -1.3377034664154053, "loss": 1.5632, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1568272113800049, "rewards/margins": 0.180876225233078, "rewards/rejected": -1.3377034664154053, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 7.916149713153111, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.20858672261238098, "logits/rejected": -0.1384880095720291, "logps/chosen": -1.2436957359313965, "logps/rejected": -1.4017614126205444, "loss": 1.6109, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2436957359313965, "rewards/margins": 0.15806587040424347, "rewards/rejected": -1.4017614126205444, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 8.863796337651957, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.14621509611606598, "logits/rejected": -0.04588788002729416, "logps/chosen": -1.2604069709777832, "logps/rejected": -1.3629649877548218, "loss": 1.6573, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2604069709777832, "rewards/margins": 0.10255799442529678, "rewards/rejected": -1.3629649877548218, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 7.524654604515273, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.19042053818702698, "logits/rejected": -0.07803567498922348, "logps/chosen": -1.3758207559585571, "logps/rejected": -1.4862630367279053, "loss": 1.7169, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3758207559585571, "rewards/margins": 0.11044234037399292, "rewards/rejected": -1.4862630367279053, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 11.70027581573188, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.13621467351913452, "logits/rejected": 0.08509569615125656, "logps/chosen": -1.2939293384552002, "logps/rejected": -1.4757057428359985, "loss": 1.627, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2939293384552002, "rewards/margins": 0.18177632987499237, "rewards/rejected": -1.4757057428359985, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 9.092757887640735, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.12994270026683807, "logits/rejected": 0.010222077369689941, "logps/chosen": -1.271327018737793, "logps/rejected": -1.4542458057403564, "loss": 1.6104, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.271327018737793, "rewards/margins": 0.18291863799095154, "rewards/rejected": -1.4542458057403564, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 7.451524709977769, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.04890859127044678, "logits/rejected": -0.03615367412567139, "logps/chosen": -1.278269648551941, "logps/rejected": -1.4786007404327393, "loss": 1.629, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.278269648551941, "rewards/margins": 0.20033085346221924, "rewards/rejected": -1.4786007404327393, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 10.219394443903381, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.008174806833267212, "logits/rejected": 0.049251943826675415, "logps/chosen": -1.3031771183013916, "logps/rejected": -1.532463788986206, "loss": 1.6425, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3031771183013916, "rewards/margins": 0.22928662598133087, "rewards/rejected": -1.532463788986206, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 7.739260291455012, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.21783366799354553, "logits/rejected": -0.13086874783039093, "logps/chosen": -1.2585595846176147, "logps/rejected": -1.446861982345581, "loss": 1.6193, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2585595846176147, "rewards/margins": 0.18830236792564392, "rewards/rejected": -1.446861982345581, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 8.686763534911524, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.0693262368440628, "logits/rejected": -0.02944047376513481, "logps/chosen": -1.2556920051574707, "logps/rejected": -1.493545651435852, "loss": 1.5915, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2556920051574707, "rewards/margins": 0.23785361647605896, "rewards/rejected": -1.493545651435852, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 5.993060290872711, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.23667342960834503, "logits/rejected": -0.09508855640888214, "logps/chosen": -1.270794153213501, "logps/rejected": -1.4260084629058838, "loss": 1.6538, "rewards/accuracies": 0.5625, "rewards/chosen": -1.270794153213501, "rewards/margins": 0.155214324593544, "rewards/rejected": -1.4260084629058838, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 7.783007918313454, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.15866756439208984, "logits/rejected": -0.02681003138422966, "logps/chosen": -1.2622493505477905, "logps/rejected": -1.4406099319458008, "loss": 1.6396, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2622493505477905, "rewards/margins": 0.1783604919910431, "rewards/rejected": -1.4406099319458008, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 6.8802055144839045, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.1418730765581131, "logits/rejected": -0.03416634723544121, "logps/chosen": -1.2981500625610352, "logps/rejected": -1.4062778949737549, "loss": 1.6544, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2981500625610352, "rewards/margins": 0.10812785476446152, "rewards/rejected": -1.4062778949737549, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 8.495832340611686, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.08535777032375336, "logits/rejected": 0.027291741222143173, "logps/chosen": -1.1947400569915771, "logps/rejected": -1.429766058921814, "loss": 1.5805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1947400569915771, "rewards/margins": 0.23502619564533234, "rewards/rejected": -1.429766058921814, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 7.007217118241574, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.0938306525349617, "logits/rejected": 0.003519988153129816, "logps/chosen": -1.2325066328048706, "logps/rejected": -1.562382698059082, "loss": 1.5775, "rewards/accuracies": 0.625, "rewards/chosen": -1.2325066328048706, "rewards/margins": 0.32987624406814575, "rewards/rejected": -1.562382698059082, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 5.8155125097634786, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.09181183576583862, "logits/rejected": 0.010623258538544178, "logps/chosen": -1.2614425420761108, "logps/rejected": -1.4549132585525513, "loss": 1.5924, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2614425420761108, "rewards/margins": 0.1934705525636673, "rewards/rejected": -1.4549132585525513, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 8.061942671596224, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.06603260338306427, "logits/rejected": 0.021892230957746506, "logps/chosen": -1.222551941871643, "logps/rejected": -1.429960012435913, "loss": 1.5705, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.222551941871643, "rewards/margins": 0.20740799605846405, "rewards/rejected": -1.429960012435913, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 8.431180545335108, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.2085665911436081, "logits/rejected": -0.06773025542497635, "logps/chosen": -1.1748874187469482, "logps/rejected": -1.2752532958984375, "loss": 1.5719, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1748874187469482, "rewards/margins": 0.1003657728433609, "rewards/rejected": -1.2752532958984375, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 8.787806946795875, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.20111064612865448, "logits/rejected": -0.016483021900057793, "logps/chosen": -1.2925128936767578, "logps/rejected": -1.4721014499664307, "loss": 1.646, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2925128936767578, "rewards/margins": 0.17958858609199524, "rewards/rejected": -1.4721014499664307, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 8.15008584285262, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.08273342996835709, "logits/rejected": -0.0012397721875458956, "logps/chosen": -1.259852409362793, "logps/rejected": -1.4208247661590576, "loss": 1.6395, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.259852409362793, "rewards/margins": 0.16097232699394226, "rewards/rejected": -1.4208247661590576, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 6.717432726032404, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.08920027315616608, "logits/rejected": -0.04481660574674606, "logps/chosen": -1.2388746738433838, "logps/rejected": -1.4416940212249756, "loss": 1.5898, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2388746738433838, "rewards/margins": 0.20281946659088135, "rewards/rejected": -1.4416940212249756, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 10.284664281846965, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.25946319103240967, "logits/rejected": -0.1676512062549591, "logps/chosen": -1.1732776165008545, "logps/rejected": -1.3788777589797974, "loss": 1.5704, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1732776165008545, "rewards/margins": 0.20560026168823242, "rewards/rejected": -1.3788777589797974, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 9.357298837621496, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.12120018154382706, "logits/rejected": -0.031773291528224945, "logps/chosen": -1.232176661491394, "logps/rejected": -1.4104655981063843, "loss": 1.6123, "rewards/accuracies": 0.59375, "rewards/chosen": -1.232176661491394, "rewards/margins": 0.1782890111207962, "rewards/rejected": -1.4104655981063843, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.09563587605953217, "eval_logits/rejected": 0.16495996713638306, "eval_logps/chosen": -1.3029249906539917, "eval_logps/rejected": -1.4742597341537476, "eval_loss": 1.652089238166809, "eval_rewards/accuracies": 0.5667656064033508, "eval_rewards/chosen": -1.3029249906539917, "eval_rewards/margins": 0.17133480310440063, "eval_rewards/rejected": -1.4742597341537476, "eval_runtime": 40.4637, "eval_samples_per_second": 33.24, "eval_steps_per_second": 8.328, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 7.866399941057585, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.13615265488624573, "logits/rejected": -0.005280242767184973, "logps/chosen": -1.266261100769043, "logps/rejected": -1.4741570949554443, "loss": 1.6092, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.266261100769043, "rewards/margins": 0.20789583027362823, "rewards/rejected": -1.4741570949554443, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 5.782974933673425, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.26628702878952026, "logits/rejected": -0.09593107551336288, "logps/chosen": -1.2359964847564697, "logps/rejected": -1.3293280601501465, "loss": 1.6401, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2359964847564697, "rewards/margins": 0.09333159029483795, "rewards/rejected": -1.3293280601501465, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 6.547931969592147, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.10351938009262085, "logits/rejected": -0.055590152740478516, "logps/chosen": -1.1684571504592896, "logps/rejected": -1.4438257217407227, "loss": 1.5218, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1684571504592896, "rewards/margins": 0.2753686010837555, "rewards/rejected": -1.4438257217407227, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 8.18176580142601, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.24395263195037842, "logits/rejected": -0.1428661048412323, "logps/chosen": -1.278551697731018, "logps/rejected": -1.5387744903564453, "loss": 1.5966, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.278551697731018, "rewards/margins": 0.26022282242774963, "rewards/rejected": -1.5387744903564453, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 8.163975822722538, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.11933907121419907, "logits/rejected": -0.05643367022275925, "logps/chosen": -1.2212616205215454, "logps/rejected": -1.398843765258789, "loss": 1.5925, "rewards/accuracies": 0.625, "rewards/chosen": -1.2212616205215454, "rewards/margins": 0.1775822937488556, "rewards/rejected": -1.398843765258789, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 8.50468106485477, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.1156187430024147, "logits/rejected": -0.08730246126651764, "logps/chosen": -1.3546292781829834, "logps/rejected": -1.5157809257507324, "loss": 1.6826, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3546292781829834, "rewards/margins": 0.16115155816078186, "rewards/rejected": -1.5157809257507324, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 4.972232516102699, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.08850985765457153, "logits/rejected": 0.003945094998925924, "logps/chosen": -1.2295047044754028, "logps/rejected": -1.3775475025177002, "loss": 1.6045, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2295047044754028, "rewards/margins": 0.1480426788330078, "rewards/rejected": -1.3775475025177002, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 9.197101107744537, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.23168103396892548, "logits/rejected": -0.14563602209091187, "logps/chosen": -1.2089626789093018, "logps/rejected": -1.4014352560043335, "loss": 1.5829, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2089626789093018, "rewards/margins": 0.19247262179851532, "rewards/rejected": -1.4014352560043335, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 8.679503255748509, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.1771184206008911, "logits/rejected": -0.0606854073703289, "logps/chosen": -1.2561490535736084, "logps/rejected": -1.3943471908569336, "loss": 1.6229, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2561490535736084, "rewards/margins": 0.13819798827171326, "rewards/rejected": -1.3943471908569336, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 6.545905205306638, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.17317122220993042, "logits/rejected": -0.04910287261009216, "logps/chosen": -1.217040777206421, "logps/rejected": -1.4837815761566162, "loss": 1.5456, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.217040777206421, "rewards/margins": 0.2667407989501953, "rewards/rejected": -1.4837815761566162, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 5.185633419013426, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.17966656386852264, "logits/rejected": -0.0012095480924472213, "logps/chosen": -1.3041753768920898, "logps/rejected": -1.4204387664794922, "loss": 1.6485, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3041753768920898, "rewards/margins": 0.11626337468624115, "rewards/rejected": -1.4204387664794922, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 8.749038985182217, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.150712251663208, "logits/rejected": -0.07498790323734283, "logps/chosen": -1.2950248718261719, "logps/rejected": -1.3897525072097778, "loss": 1.6523, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2950248718261719, "rewards/margins": 0.09472791105508804, "rewards/rejected": -1.3897525072097778, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 6.837164978630255, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.12322545051574707, "logits/rejected": -0.07410383224487305, "logps/chosen": -1.2550203800201416, "logps/rejected": -1.4570932388305664, "loss": 1.6043, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2550203800201416, "rewards/margins": 0.2020729035139084, "rewards/rejected": -1.4570932388305664, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 10.434805459545437, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.24980542063713074, "logits/rejected": -0.14923252165317535, "logps/chosen": -1.2360928058624268, "logps/rejected": -1.3302528858184814, "loss": 1.6215, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2360928058624268, "rewards/margins": 0.0941600427031517, "rewards/rejected": -1.3302528858184814, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 6.649978989330003, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.16329626739025116, "logits/rejected": -0.10834117233753204, "logps/chosen": -1.2520506381988525, "logps/rejected": -1.458328366279602, "loss": 1.6092, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2520506381988525, "rewards/margins": 0.20627769827842712, "rewards/rejected": -1.458328366279602, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 6.3096691733673715, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.28249967098236084, "logits/rejected": -0.14977525174617767, "logps/chosen": -1.2829852104187012, "logps/rejected": -1.4930446147918701, "loss": 1.6114, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2829852104187012, "rewards/margins": 0.21005935966968536, "rewards/rejected": -1.4930446147918701, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 8.077535905450087, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.10574954748153687, "logits/rejected": 0.047827187925577164, "logps/chosen": -1.283026099205017, "logps/rejected": -1.4941781759262085, "loss": 1.646, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.283026099205017, "rewards/margins": 0.21115219593048096, "rewards/rejected": -1.4941781759262085, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 7.356301919669881, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.22362974286079407, "logits/rejected": -0.1858159899711609, "logps/chosen": -1.2722280025482178, "logps/rejected": -1.5098580121994019, "loss": 1.6113, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2722280025482178, "rewards/margins": 0.2376299649477005, "rewards/rejected": -1.5098580121994019, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 7.104468059666298, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.10563309490680695, "logits/rejected": -0.1071210652589798, "logps/chosen": -1.2554916143417358, "logps/rejected": -1.4397131204605103, "loss": 1.61, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2554916143417358, "rewards/margins": 0.18422135710716248, "rewards/rejected": -1.4397131204605103, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 7.991481488996529, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.11559624969959259, "logits/rejected": 0.007312471512705088, "logps/chosen": -1.2053483724594116, "logps/rejected": -1.389601469039917, "loss": 1.569, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2053483724594116, "rewards/margins": 0.18425318598747253, "rewards/rejected": -1.389601469039917, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 7.161788696656108, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.12719660997390747, "logits/rejected": 0.011344531551003456, "logps/chosen": -1.214221715927124, "logps/rejected": -1.3893407583236694, "loss": 1.5752, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.214221715927124, "rewards/margins": 0.17511887848377228, "rewards/rejected": -1.3893407583236694, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 10.254805100878995, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.20160675048828125, "logits/rejected": -0.0989512950181961, "logps/chosen": -1.2082228660583496, "logps/rejected": -1.4115922451019287, "loss": 1.5709, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2082228660583496, "rewards/margins": 0.20336949825286865, "rewards/rejected": -1.4115922451019287, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 6.201294311291015, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.11442514508962631, "logits/rejected": -0.02388397417962551, "logps/chosen": -1.2926146984100342, "logps/rejected": -1.401564359664917, "loss": 1.668, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2926146984100342, "rewards/margins": 0.1089496836066246, "rewards/rejected": -1.401564359664917, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 5.807785907494708, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.2186061143875122, "logits/rejected": -0.15785038471221924, "logps/chosen": -1.2709650993347168, "logps/rejected": -1.4271172285079956, "loss": 1.6172, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2709650993347168, "rewards/margins": 0.1561521738767624, "rewards/rejected": -1.4271172285079956, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 10.614893624777126, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.1689203381538391, "logits/rejected": -0.1105555072426796, "logps/chosen": -1.2965948581695557, "logps/rejected": -1.4499021768569946, "loss": 1.6348, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2965948581695557, "rewards/margins": 0.15330716967582703, "rewards/rejected": -1.4499021768569946, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 8.62582624798431, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.20537002384662628, "logits/rejected": -0.17075181007385254, "logps/chosen": -1.2418001890182495, "logps/rejected": -1.410526990890503, "loss": 1.6153, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2418001890182495, "rewards/margins": 0.16872690618038177, "rewards/rejected": -1.410526990890503, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 6.14248446253142, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.24151122570037842, "logits/rejected": -0.13936847448349, "logps/chosen": -1.26100754737854, "logps/rejected": -1.454439401626587, "loss": 1.607, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.26100754737854, "rewards/margins": 0.1934318244457245, "rewards/rejected": -1.454439401626587, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 9.504606958292078, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.14703956246376038, "logits/rejected": -0.01648893393576145, "logps/chosen": -1.3492311239242554, "logps/rejected": -1.4347513914108276, "loss": 1.6997, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3492311239242554, "rewards/margins": 0.08552038669586182, "rewards/rejected": -1.4347513914108276, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 6.612189818543011, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.13871613144874573, "logits/rejected": -0.015263721346855164, "logps/chosen": -1.256090760231018, "logps/rejected": -1.455414056777954, "loss": 1.6162, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.256090760231018, "rewards/margins": 0.19932350516319275, "rewards/rejected": -1.455414056777954, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 7.058132572253126, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.15726891160011292, "logits/rejected": -0.03172614425420761, "logps/chosen": -1.2419919967651367, "logps/rejected": -1.326724648475647, "loss": 1.6355, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2419919967651367, "rewards/margins": 0.08473268151283264, "rewards/rejected": -1.326724648475647, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 9.209693166256022, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.10544419288635254, "logits/rejected": -0.04484523460268974, "logps/chosen": -1.1819980144500732, "logps/rejected": -1.4745718240737915, "loss": 1.5391, "rewards/accuracies": 0.625, "rewards/chosen": -1.1819980144500732, "rewards/margins": 0.2925736606121063, "rewards/rejected": -1.4745718240737915, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 7.432396154450075, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.0917210653424263, "logits/rejected": 0.01057431660592556, "logps/chosen": -1.229265570640564, "logps/rejected": -1.4219061136245728, "loss": 1.5893, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.229265570640564, "rewards/margins": 0.19264045357704163, "rewards/rejected": -1.4219061136245728, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 8.895934890344009, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.12629587948322296, "logits/rejected": -0.03258613497018814, "logps/chosen": -1.2157530784606934, "logps/rejected": -1.4207779169082642, "loss": 1.5777, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2157530784606934, "rewards/margins": 0.2050248682498932, "rewards/rejected": -1.4207779169082642, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 5.284246111498499, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.06085330992937088, "logits/rejected": -0.015537110157310963, "logps/chosen": -1.2201460599899292, "logps/rejected": -1.4362637996673584, "loss": 1.5669, "rewards/accuracies": 0.625, "rewards/chosen": -1.2201460599899292, "rewards/margins": 0.21611778438091278, "rewards/rejected": -1.4362637996673584, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 9.453518891765544, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.21074800193309784, "logits/rejected": -0.0934746041893959, "logps/chosen": -1.2131701707839966, "logps/rejected": -1.4448621273040771, "loss": 1.5703, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2131701707839966, "rewards/margins": 0.23169195652008057, "rewards/rejected": -1.4448621273040771, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 10.55445943981361, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.12727180123329163, "logits/rejected": -0.07430537790060043, "logps/chosen": -1.3147636651992798, "logps/rejected": -1.4463951587677002, "loss": 1.6785, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3147636651992798, "rewards/margins": 0.13163141906261444, "rewards/rejected": -1.4463951587677002, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 7.260128046427542, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.05735338479280472, "logits/rejected": 0.07477830350399017, "logps/chosen": -1.1611049175262451, "logps/rejected": -1.4185049533843994, "loss": 1.5439, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1611049175262451, "rewards/margins": 0.2574000358581543, "rewards/rejected": -1.4185049533843994, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 8.563324012040841, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.1607932448387146, "logits/rejected": -0.03989001363515854, "logps/chosen": -1.208744764328003, "logps/rejected": -1.4180536270141602, "loss": 1.5911, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.208744764328003, "rewards/margins": 0.2093089520931244, "rewards/rejected": -1.4180536270141602, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 8.132441147071921, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.10679218918085098, "logits/rejected": -0.08082714676856995, "logps/chosen": -1.2230453491210938, "logps/rejected": -1.407715082168579, "loss": 1.5794, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2230453491210938, "rewards/margins": 0.18466970324516296, "rewards/rejected": -1.407715082168579, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 9.724335849310258, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.05054439231753349, "logits/rejected": 0.049748364835977554, "logps/chosen": -1.249579668045044, "logps/rejected": -1.3751084804534912, "loss": 1.6245, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.249579668045044, "rewards/margins": 0.1255287230014801, "rewards/rejected": -1.3751084804534912, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 8.232190498117061, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.07768498361110687, "logits/rejected": -0.050233401358127594, "logps/chosen": -1.238800048828125, "logps/rejected": -1.3910675048828125, "loss": 1.6121, "rewards/accuracies": 0.53125, "rewards/chosen": -1.238800048828125, "rewards/margins": 0.1522674858570099, "rewards/rejected": -1.3910675048828125, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 9.200695719936968, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.03262275084853172, "logits/rejected": 0.03302247077226639, "logps/chosen": -1.2927391529083252, "logps/rejected": -1.4022797346115112, "loss": 1.684, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2927391529083252, "rewards/margins": 0.1095406785607338, "rewards/rejected": -1.4022797346115112, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 11.089373494205253, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.15007483959197998, "logits/rejected": -0.035728149116039276, "logps/chosen": -1.2085886001586914, "logps/rejected": -1.47746741771698, "loss": 1.5697, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2085886001586914, "rewards/margins": 0.2688790261745453, "rewards/rejected": -1.47746741771698, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 9.176711063517878, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.14987041056156158, "logits/rejected": -0.15985018014907837, "logps/chosen": -1.3277456760406494, "logps/rejected": -1.4078071117401123, "loss": 1.7256, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.3277456760406494, "rewards/margins": 0.08006126433610916, "rewards/rejected": -1.4078071117401123, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 8.617162785119755, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.0892399325966835, "logits/rejected": 0.03166574984788895, "logps/chosen": -1.248976230621338, "logps/rejected": -1.438613772392273, "loss": 1.5903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.248976230621338, "rewards/margins": 0.189637690782547, "rewards/rejected": -1.438613772392273, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 8.36874910160355, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.25129103660583496, "logits/rejected": -0.08070940524339676, "logps/chosen": -1.3176677227020264, "logps/rejected": -1.404175043106079, "loss": 1.6778, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3176677227020264, "rewards/margins": 0.08650705963373184, "rewards/rejected": -1.404175043106079, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 10.09644986033379, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.16776664555072784, "logits/rejected": -0.10792151838541031, "logps/chosen": -1.255562424659729, "logps/rejected": -1.420966386795044, "loss": 1.5925, "rewards/accuracies": 0.625, "rewards/chosen": -1.255562424659729, "rewards/margins": 0.1654038280248642, "rewards/rejected": -1.420966386795044, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 6.624218741585395, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.1129215806722641, "logits/rejected": -0.06237906217575073, "logps/chosen": -1.3118427991867065, "logps/rejected": -1.47782301902771, "loss": 1.6551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3118427991867065, "rewards/margins": 0.1659802496433258, "rewards/rejected": -1.47782301902771, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 8.11624284407403, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.13897685706615448, "logits/rejected": -0.07714378088712692, "logps/chosen": -1.2451449632644653, "logps/rejected": -1.4978704452514648, "loss": 1.5768, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2451449632644653, "rewards/margins": 0.2527254819869995, "rewards/rejected": -1.4978704452514648, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 6.767373925947111, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.20330551266670227, "logits/rejected": -0.08086180686950684, "logps/chosen": -1.253254771232605, "logps/rejected": -1.6146825551986694, "loss": 1.6038, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.253254771232605, "rewards/margins": 0.36142784357070923, "rewards/rejected": -1.6146825551986694, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 8.782776045832264, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.12377236038446426, "logits/rejected": -0.0035462796222418547, "logps/chosen": -1.157189130783081, "logps/rejected": -1.339942216873169, "loss": 1.5445, "rewards/accuracies": 0.5625, "rewards/chosen": -1.157189130783081, "rewards/margins": 0.18275293707847595, "rewards/rejected": -1.339942216873169, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 8.528001606755288, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.054256655275821686, "logits/rejected": -0.02772495709359646, "logps/chosen": -1.1307682991027832, "logps/rejected": -1.3774828910827637, "loss": 1.5133, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1307682991027832, "rewards/margins": 0.24671444296836853, "rewards/rejected": -1.3774828910827637, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 7.568061035346011, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.20463016629219055, "logits/rejected": -0.11927781254053116, "logps/chosen": -1.2784268856048584, "logps/rejected": -1.4733545780181885, "loss": 1.616, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2784268856048584, "rewards/margins": 0.1949276179075241, "rewards/rejected": -1.4733545780181885, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 8.754933977946239, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.08934472501277924, "logits/rejected": -0.02049086056649685, "logps/chosen": -1.3698203563690186, "logps/rejected": -1.4895521402359009, "loss": 1.7197, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3698203563690186, "rewards/margins": 0.11973158270120621, "rewards/rejected": -1.4895521402359009, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 10.10100058158381, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.1388912945985794, "logits/rejected": -0.010842189192771912, "logps/chosen": -1.2867250442504883, "logps/rejected": -1.4099212884902954, "loss": 1.6438, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2867250442504883, "rewards/margins": 0.12319610267877579, "rewards/rejected": -1.4099212884902954, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 9.231846695551004, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.18062396347522736, "logits/rejected": -0.0430278442800045, "logps/chosen": -1.2710715532302856, "logps/rejected": -1.4060547351837158, "loss": 1.6416, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2710715532302856, "rewards/margins": 0.13498328626155853, "rewards/rejected": -1.4060547351837158, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 9.237884271223397, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.1564517766237259, "logits/rejected": 0.003753349184989929, "logps/chosen": -1.2695815563201904, "logps/rejected": -1.3745739459991455, "loss": 1.6364, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2695815563201904, "rewards/margins": 0.10499223321676254, "rewards/rejected": -1.3745739459991455, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 10.139955829157355, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.2812516987323761, "logits/rejected": -0.09271320700645447, "logps/chosen": -1.2651293277740479, "logps/rejected": -1.5354411602020264, "loss": 1.6048, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2651293277740479, "rewards/margins": 0.2703118920326233, "rewards/rejected": -1.5354411602020264, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 6.426845661488, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.13508456945419312, "logits/rejected": -0.11751226335763931, "logps/chosen": -1.2416387796401978, "logps/rejected": -1.4536396265029907, "loss": 1.5797, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2416387796401978, "rewards/margins": 0.21200060844421387, "rewards/rejected": -1.4536396265029907, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 7.714598930066917, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.17228879034519196, "logits/rejected": -0.01403956301510334, "logps/chosen": -1.290063738822937, "logps/rejected": -1.4569785594940186, "loss": 1.6395, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.290063738822937, "rewards/margins": 0.16691474616527557, "rewards/rejected": -1.4569785594940186, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 5.708198226308769, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.19277127087116241, "logits/rejected": -0.028677979484200478, "logps/chosen": -1.1680980920791626, "logps/rejected": -1.4506738185882568, "loss": 1.5045, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1680980920791626, "rewards/margins": 0.2825758755207062, "rewards/rejected": -1.4506738185882568, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 7.689429641279572, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.10672245919704437, "logits/rejected": -0.08293677121400833, "logps/chosen": -1.2571884393692017, "logps/rejected": -1.4730494022369385, "loss": 1.5866, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2571884393692017, "rewards/margins": 0.21586088836193085, "rewards/rejected": -1.4730494022369385, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 7.730130806525485, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.23001877963542938, "logits/rejected": -0.15539811551570892, "logps/chosen": -1.198460340499878, "logps/rejected": -1.3851935863494873, "loss": 1.5722, "rewards/accuracies": 0.53125, "rewards/chosen": -1.198460340499878, "rewards/margins": 0.186733216047287, "rewards/rejected": -1.3851935863494873, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 7.45875436423318, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.1821044236421585, "logits/rejected": -0.08938132971525192, "logps/chosen": -1.2271735668182373, "logps/rejected": -1.3389554023742676, "loss": 1.6007, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2271735668182373, "rewards/margins": 0.11178169399499893, "rewards/rejected": -1.3389554023742676, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 9.274979791690361, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.14727376401424408, "logits/rejected": -0.05907534435391426, "logps/chosen": -1.23906672000885, "logps/rejected": -1.4582659006118774, "loss": 1.6037, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.23906672000885, "rewards/margins": 0.21919913589954376, "rewards/rejected": -1.4582659006118774, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 8.308768679842965, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.19906631112098694, "logits/rejected": -0.06506966054439545, "logps/chosen": -1.2406883239746094, "logps/rejected": -1.4651925563812256, "loss": 1.6225, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2406883239746094, "rewards/margins": 0.22450414299964905, "rewards/rejected": -1.4651925563812256, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 7.296154052157939, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.1607706993818283, "logits/rejected": 0.003165569854900241, "logps/chosen": -1.1542766094207764, "logps/rejected": -1.4799726009368896, "loss": 1.5236, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1542766094207764, "rewards/margins": 0.32569605112075806, "rewards/rejected": -1.4799726009368896, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 9.399177789493598, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.19197958707809448, "logits/rejected": -0.10159718990325928, "logps/chosen": -1.19269597530365, "logps/rejected": -1.4547951221466064, "loss": 1.5679, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.19269597530365, "rewards/margins": 0.2620992064476013, "rewards/rejected": -1.4547951221466064, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 7.426364054634275, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.17664211988449097, "logits/rejected": -0.08954844623804092, "logps/chosen": -1.2093679904937744, "logps/rejected": -1.3380529880523682, "loss": 1.5911, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2093679904937744, "rewards/margins": 0.12868481874465942, "rewards/rejected": -1.3380529880523682, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 8.03926829429509, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.15437455475330353, "logits/rejected": -0.08024605363607407, "logps/chosen": -1.2849972248077393, "logps/rejected": -1.5242011547088623, "loss": 1.6053, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2849972248077393, "rewards/margins": 0.23920373618602753, "rewards/rejected": -1.5242011547088623, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 9.826266540452712, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.10797605663537979, "logits/rejected": 0.014654259197413921, "logps/chosen": -1.2318819761276245, "logps/rejected": -1.3689765930175781, "loss": 1.5971, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2318819761276245, "rewards/margins": 0.13709449768066406, "rewards/rejected": -1.3689765930175781, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 7.879073302523249, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.22682805359363556, "logits/rejected": -0.08655048161745071, "logps/chosen": -1.267865538597107, "logps/rejected": -1.5178544521331787, "loss": 1.6205, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.267865538597107, "rewards/margins": 0.24998879432678223, "rewards/rejected": -1.5178544521331787, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 8.292600778846104, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.11081697791814804, "logits/rejected": 0.00440225237980485, "logps/chosen": -1.1883069276809692, "logps/rejected": -1.3887240886688232, "loss": 1.5564, "rewards/accuracies": 0.625, "rewards/chosen": -1.1883069276809692, "rewards/margins": 0.200417160987854, "rewards/rejected": -1.3887240886688232, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 7.9152983573075115, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.17233093082904816, "logits/rejected": -0.1128607988357544, "logps/chosen": -1.2181812524795532, "logps/rejected": -1.2844047546386719, "loss": 1.6288, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2181812524795532, "rewards/margins": 0.06622340530157089, "rewards/rejected": -1.2844047546386719, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 5.556966087112487, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.13642926514148712, "logits/rejected": -0.12106633186340332, "logps/chosen": -1.1797887086868286, "logps/rejected": -1.3879332542419434, "loss": 1.5525, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1797887086868286, "rewards/margins": 0.20814457535743713, "rewards/rejected": -1.3879332542419434, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 5.392301224487717, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.14420035481452942, "logits/rejected": -0.061324309557676315, "logps/chosen": -1.3488701581954956, "logps/rejected": -1.4870027303695679, "loss": 1.6864, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3488701581954956, "rewards/margins": 0.13813255727291107, "rewards/rejected": -1.4870027303695679, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 7.535224374184795, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.18038392066955566, "logits/rejected": -0.07577495276927948, "logps/chosen": -1.3093490600585938, "logps/rejected": -1.4596061706542969, "loss": 1.6714, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3093490600585938, "rewards/margins": 0.15025700628757477, "rewards/rejected": -1.4596061706542969, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 11.729284249643793, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.132780522108078, "logits/rejected": -0.14752009510993958, "logps/chosen": -1.2028001546859741, "logps/rejected": -1.418414831161499, "loss": 1.5775, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2028001546859741, "rewards/margins": 0.21561458706855774, "rewards/rejected": -1.418414831161499, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 7.13278055336521, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.021932054311037064, "logits/rejected": -0.00037776678800582886, "logps/chosen": -1.1753807067871094, "logps/rejected": -1.311991810798645, "loss": 1.6075, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1753807067871094, "rewards/margins": 0.13661102950572968, "rewards/rejected": -1.311991810798645, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 7.341481609459083, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.20246192812919617, "logits/rejected": -0.13079217076301575, "logps/chosen": -1.31998610496521, "logps/rejected": -1.535980463027954, "loss": 1.6688, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.31998610496521, "rewards/margins": 0.21599426865577698, "rewards/rejected": -1.535980463027954, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.10498671978712082, "eval_logits/rejected": 0.17505963146686554, "eval_logps/chosen": -1.2999731302261353, "eval_logps/rejected": -1.4728928804397583, "eval_loss": 1.6485788822174072, "eval_rewards/accuracies": 0.5689911246299744, "eval_rewards/chosen": -1.2999731302261353, "eval_rewards/margins": 0.17291992902755737, "eval_rewards/rejected": -1.4728928804397583, "eval_runtime": 40.3876, "eval_samples_per_second": 33.302, "eval_steps_per_second": 8.344, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 4.849294698705585, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.2729756236076355, "logits/rejected": -0.15374641120433807, "logps/chosen": -1.1556975841522217, "logps/rejected": -1.393615961074829, "loss": 1.5309, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1556975841522217, "rewards/margins": 0.23791857063770294, "rewards/rejected": -1.393615961074829, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 11.891979762914524, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.1642301380634308, "logits/rejected": -0.14914503693580627, "logps/chosen": -1.286602258682251, "logps/rejected": -1.4862585067749023, "loss": 1.6405, "rewards/accuracies": 0.59375, "rewards/chosen": -1.286602258682251, "rewards/margins": 0.19965621829032898, "rewards/rejected": -1.4862585067749023, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 7.780853040126767, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.10971619933843613, "logits/rejected": -0.003022894263267517, "logps/chosen": -1.2809616327285767, "logps/rejected": -1.463622808456421, "loss": 1.6122, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2809616327285767, "rewards/margins": 0.18266119062900543, "rewards/rejected": -1.463622808456421, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 6.591814679920951, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.12527456879615784, "logits/rejected": 0.0023119777906686068, "logps/chosen": -1.2496436834335327, "logps/rejected": -1.3035128116607666, "loss": 1.6462, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2496436834335327, "rewards/margins": 0.05386912822723389, "rewards/rejected": -1.3035128116607666, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 7.044574604780478, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.19462482631206512, "logits/rejected": -0.05798368528485298, "logps/chosen": -1.309741497039795, "logps/rejected": -1.5621037483215332, "loss": 1.6324, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.309741497039795, "rewards/margins": 0.25236231088638306, "rewards/rejected": -1.5621037483215332, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 7.108608702314739, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.16074320673942566, "logits/rejected": -0.033432383090257645, "logps/chosen": -1.3094022274017334, "logps/rejected": -1.3287417888641357, "loss": 1.7062, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3094022274017334, "rewards/margins": 0.019339632242918015, "rewards/rejected": -1.3287417888641357, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 9.514791959704727, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.1489587277173996, "logits/rejected": -0.008739927783608437, "logps/chosen": -1.2328494787216187, "logps/rejected": -1.3843698501586914, "loss": 1.6031, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2328494787216187, "rewards/margins": 0.1515202671289444, "rewards/rejected": -1.3843698501586914, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 8.805937871805284, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.1267159879207611, "logits/rejected": -0.03693726286292076, "logps/chosen": -1.262799620628357, "logps/rejected": -1.4053882360458374, "loss": 1.6117, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.262799620628357, "rewards/margins": 0.14258867502212524, "rewards/rejected": -1.4053882360458374, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 7.9077575887848655, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.1251341700553894, "logits/rejected": -0.06009829789400101, "logps/chosen": -1.1997259855270386, "logps/rejected": -1.4353101253509521, "loss": 1.5689, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1997259855270386, "rewards/margins": 0.2355840504169464, "rewards/rejected": -1.4353101253509521, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 9.469293451184276, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.17621280252933502, "logits/rejected": -0.1186450719833374, "logps/chosen": -1.2835023403167725, "logps/rejected": -1.4212285280227661, "loss": 1.6484, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2835023403167725, "rewards/margins": 0.13772614300251007, "rewards/rejected": -1.4212285280227661, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 6.5897812059697385, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.18296119570732117, "logits/rejected": -0.05216040462255478, "logps/chosen": -1.3307311534881592, "logps/rejected": -1.527421236038208, "loss": 1.656, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3307311534881592, "rewards/margins": 0.19669008255004883, "rewards/rejected": -1.527421236038208, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 8.831262505099328, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.23127059638500214, "logits/rejected": -0.08917646110057831, "logps/chosen": -1.2180769443511963, "logps/rejected": -1.3507416248321533, "loss": 1.5876, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2180769443511963, "rewards/margins": 0.132664754986763, "rewards/rejected": -1.3507416248321533, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 7.992509411678571, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.3154450058937073, "logits/rejected": -0.2124137133359909, "logps/chosen": -1.2226747274398804, "logps/rejected": -1.422944188117981, "loss": 1.604, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2226747274398804, "rewards/margins": 0.20026938617229462, "rewards/rejected": -1.422944188117981, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 9.366797403404275, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.20468780398368835, "logits/rejected": -0.11390724033117294, "logps/chosen": -1.2798616886138916, "logps/rejected": -1.3370516300201416, "loss": 1.6791, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2798616886138916, "rewards/margins": 0.05719008296728134, "rewards/rejected": -1.3370516300201416, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 8.178835145850876, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.20676104724407196, "logits/rejected": -0.14471128582954407, "logps/chosen": -1.278641939163208, "logps/rejected": -1.393040657043457, "loss": 1.6344, "rewards/accuracies": 0.5625, "rewards/chosen": -1.278641939163208, "rewards/margins": 0.11439867317676544, "rewards/rejected": -1.393040657043457, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 9.263124693885866, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.0644320473074913, "logits/rejected": 0.02961653098464012, "logps/chosen": -1.2251948118209839, "logps/rejected": -1.4403178691864014, "loss": 1.6105, "rewards/accuracies": 0.625, "rewards/chosen": -1.2251948118209839, "rewards/margins": 0.21512313187122345, "rewards/rejected": -1.4403178691864014, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 8.159770572119198, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.14082811772823334, "logits/rejected": -0.039442818611860275, "logps/chosen": -1.241360068321228, "logps/rejected": -1.4022153615951538, "loss": 1.6075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.241360068321228, "rewards/margins": 0.1608552634716034, "rewards/rejected": -1.4022153615951538, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 7.228196201868465, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.24351325631141663, "logits/rejected": -0.08822022378444672, "logps/chosen": -1.2423522472381592, "logps/rejected": -1.4015181064605713, "loss": 1.6044, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2423522472381592, "rewards/margins": 0.15916575491428375, "rewards/rejected": -1.4015181064605713, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 7.735085289730952, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.21803371608257294, "logits/rejected": -0.11068868637084961, "logps/chosen": -1.1797256469726562, "logps/rejected": -1.351781964302063, "loss": 1.5622, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1797256469726562, "rewards/margins": 0.17205628752708435, "rewards/rejected": -1.351781964302063, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 7.755538928282204, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.11511901766061783, "logits/rejected": -0.031004998832941055, "logps/chosen": -1.263131022453308, "logps/rejected": -1.4148228168487549, "loss": 1.6173, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.263131022453308, "rewards/margins": 0.15169157087802887, "rewards/rejected": -1.4148228168487549, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 6.08725319260436, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.09188179671764374, "logits/rejected": -0.066791832447052, "logps/chosen": -1.2208307981491089, "logps/rejected": -1.412955641746521, "loss": 1.5818, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2208307981491089, "rewards/margins": 0.19212493300437927, "rewards/rejected": -1.412955641746521, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 7.969419534273052, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.13259169459342957, "logits/rejected": -0.07302357256412506, "logps/chosen": -1.204056739807129, "logps/rejected": -1.3601787090301514, "loss": 1.5882, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.204056739807129, "rewards/margins": 0.1561218798160553, "rewards/rejected": -1.3601787090301514, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 9.477438173704936, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.10372304916381836, "logits/rejected": 0.026906440034508705, "logps/chosen": -1.2964290380477905, "logps/rejected": -1.4188811779022217, "loss": 1.6533, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2964290380477905, "rewards/margins": 0.12245219945907593, "rewards/rejected": -1.4188811779022217, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 5.790667370836652, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.09391071647405624, "logits/rejected": 0.0030964971520006657, "logps/chosen": -1.2686607837677002, "logps/rejected": -1.4566603899002075, "loss": 1.6123, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2686607837677002, "rewards/margins": 0.18799959123134613, "rewards/rejected": -1.4566603899002075, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 8.774622196913327, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.09754832088947296, "logits/rejected": -0.06331709772348404, "logps/chosen": -1.1873222589492798, "logps/rejected": -1.4646153450012207, "loss": 1.5576, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1873222589492798, "rewards/margins": 0.2772930860519409, "rewards/rejected": -1.4646153450012207, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 10.30320939433974, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.2510526478290558, "logits/rejected": -0.12636008858680725, "logps/chosen": -1.2209627628326416, "logps/rejected": -1.4968620538711548, "loss": 1.5691, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2209627628326416, "rewards/margins": 0.27589935064315796, "rewards/rejected": -1.4968620538711548, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 7.653860879146269, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.1960235983133316, "logits/rejected": -0.07919275760650635, "logps/chosen": -1.2599356174468994, "logps/rejected": -1.4395039081573486, "loss": 1.6216, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2599356174468994, "rewards/margins": 0.1795683354139328, "rewards/rejected": -1.4395039081573486, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 6.058531376640787, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.19647802412509918, "logits/rejected": -0.060090743005275726, "logps/chosen": -1.2536137104034424, "logps/rejected": -1.4841291904449463, "loss": 1.6131, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2536137104034424, "rewards/margins": 0.2305155098438263, "rewards/rejected": -1.4841291904449463, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 10.050325184853422, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.18975773453712463, "logits/rejected": -0.07312886416912079, "logps/chosen": -1.2278273105621338, "logps/rejected": -1.348670244216919, "loss": 1.6151, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2278273105621338, "rewards/margins": 0.12084273993968964, "rewards/rejected": -1.348670244216919, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 10.247961150436891, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.08409741520881653, "logits/rejected": 0.10858605802059174, "logps/chosen": -1.1982523202896118, "logps/rejected": -1.4111332893371582, "loss": 1.5483, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1982523202896118, "rewards/margins": 0.21288099884986877, "rewards/rejected": -1.4111332893371582, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 7.392703100739131, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.10309597104787827, "logits/rejected": -0.02585100196301937, "logps/chosen": -1.2173881530761719, "logps/rejected": -1.434526801109314, "loss": 1.5753, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2173881530761719, "rewards/margins": 0.21713873744010925, "rewards/rejected": -1.434526801109314, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 8.731910409734624, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.20448771119117737, "logits/rejected": -0.10863231122493744, "logps/chosen": -1.2373604774475098, "logps/rejected": -1.401271104812622, "loss": 1.5899, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2373604774475098, "rewards/margins": 0.1639106571674347, "rewards/rejected": -1.401271104812622, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 11.54676399891047, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.1474340260028839, "logits/rejected": -0.07390202581882477, "logps/chosen": -1.202203392982483, "logps/rejected": -1.3635931015014648, "loss": 1.5809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.202203392982483, "rewards/margins": 0.16138988733291626, "rewards/rejected": -1.3635931015014648, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 11.368510077058957, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.15914659202098846, "logits/rejected": -0.021402398124337196, "logps/chosen": -1.2980326414108276, "logps/rejected": -1.4061661958694458, "loss": 1.6791, "rewards/accuracies": 0.5, "rewards/chosen": -1.2980326414108276, "rewards/margins": 0.10813357681035995, "rewards/rejected": -1.4061661958694458, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 8.928421716896088, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.1347757875919342, "logits/rejected": -0.026164641603827477, "logps/chosen": -1.2644984722137451, "logps/rejected": -1.5294973850250244, "loss": 1.5981, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2644984722137451, "rewards/margins": 0.2649989724159241, "rewards/rejected": -1.5294973850250244, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 10.035546717884044, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.224522203207016, "logits/rejected": -0.1060357317328453, "logps/chosen": -1.299433708190918, "logps/rejected": -1.4766066074371338, "loss": 1.6595, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.299433708190918, "rewards/margins": 0.17717306315898895, "rewards/rejected": -1.4766066074371338, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 5.906593506571086, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.18225395679473877, "logits/rejected": -0.036761581897735596, "logps/chosen": -1.3251721858978271, "logps/rejected": -1.4256501197814941, "loss": 1.6713, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3251721858978271, "rewards/margins": 0.10047806799411774, "rewards/rejected": -1.4256501197814941, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 8.425429840500003, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.16185665130615234, "logits/rejected": -0.023658882826566696, "logps/chosen": -1.2019795179367065, "logps/rejected": -1.4564263820648193, "loss": 1.5652, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2019795179367065, "rewards/margins": 0.25444674491882324, "rewards/rejected": -1.4564263820648193, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 8.605090024836523, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.14876993000507355, "logits/rejected": -0.0722462460398674, "logps/chosen": -1.310435175895691, "logps/rejected": -1.545121192932129, "loss": 1.635, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.310435175895691, "rewards/margins": 0.23468592762947083, "rewards/rejected": -1.545121192932129, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 7.129441612993683, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.173610657453537, "logits/rejected": -0.06454495340585709, "logps/chosen": -1.3077783584594727, "logps/rejected": -1.4751222133636475, "loss": 1.6835, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3077783584594727, "rewards/margins": 0.16734392940998077, "rewards/rejected": -1.4751222133636475, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 7.845254283018713, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.22596955299377441, "logits/rejected": -0.10917635262012482, "logps/chosen": -1.2063977718353271, "logps/rejected": -1.3889039754867554, "loss": 1.5891, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2063977718353271, "rewards/margins": 0.18250641226768494, "rewards/rejected": -1.3889039754867554, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 8.734234001949082, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.19577258825302124, "logits/rejected": -0.08821289241313934, "logps/chosen": -1.2466051578521729, "logps/rejected": -1.4116572141647339, "loss": 1.6361, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2466051578521729, "rewards/margins": 0.16505205631256104, "rewards/rejected": -1.4116572141647339, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 8.585114257305527, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.055124759674072266, "logits/rejected": 0.012016276828944683, "logps/chosen": -1.240079641342163, "logps/rejected": -1.4648487567901611, "loss": 1.5826, "rewards/accuracies": 0.625, "rewards/chosen": -1.240079641342163, "rewards/margins": 0.22476892173290253, "rewards/rejected": -1.4648487567901611, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 7.978827577989167, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.20655544102191925, "logits/rejected": -0.07737629115581512, "logps/chosen": -1.2412638664245605, "logps/rejected": -1.3628274202346802, "loss": 1.6176, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2412638664245605, "rewards/margins": 0.12156347930431366, "rewards/rejected": -1.3628274202346802, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 8.170321883840396, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.16517595946788788, "logits/rejected": -0.052291691303253174, "logps/chosen": -1.2107970714569092, "logps/rejected": -1.359795331954956, "loss": 1.5975, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2107970714569092, "rewards/margins": 0.1489982157945633, "rewards/rejected": -1.359795331954956, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 5.630556685338138, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.1525728702545166, "logits/rejected": -0.04186255484819412, "logps/chosen": -1.2931994199752808, "logps/rejected": -1.3988301753997803, "loss": 1.6377, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2931994199752808, "rewards/margins": 0.10563075542449951, "rewards/rejected": -1.3988301753997803, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 8.157753203282361, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.0260242260992527, "logits/rejected": 0.060523390769958496, "logps/chosen": -1.2663099765777588, "logps/rejected": -1.3480726480484009, "loss": 1.6493, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2663099765777588, "rewards/margins": 0.08176268637180328, "rewards/rejected": -1.3480726480484009, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 11.972062026400824, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.1755906492471695, "logits/rejected": -0.13223165273666382, "logps/chosen": -1.1845462322235107, "logps/rejected": -1.4218249320983887, "loss": 1.5402, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1845462322235107, "rewards/margins": 0.23727861046791077, "rewards/rejected": -1.4218249320983887, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 13.119864768265927, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.14955314993858337, "logits/rejected": -0.06605812162160873, "logps/chosen": -1.2177717685699463, "logps/rejected": -1.3286962509155273, "loss": 1.6158, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2177717685699463, "rewards/margins": 0.11092434823513031, "rewards/rejected": -1.3286962509155273, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 9.106121612831492, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.1662093698978424, "logits/rejected": -0.02916570007801056, "logps/chosen": -1.2815699577331543, "logps/rejected": -1.5031042098999023, "loss": 1.6074, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2815699577331543, "rewards/margins": 0.22153429687023163, "rewards/rejected": -1.5031042098999023, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 6.146930557344138, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.17067714035511017, "logits/rejected": -0.057190440595149994, "logps/chosen": -1.2174100875854492, "logps/rejected": -1.4384522438049316, "loss": 1.5877, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2174100875854492, "rewards/margins": 0.22104212641716003, "rewards/rejected": -1.4384522438049316, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 6.794366427686016, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.19617754220962524, "logits/rejected": -0.05101104825735092, "logps/chosen": -1.2647570371627808, "logps/rejected": -1.4480626583099365, "loss": 1.6192, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2647570371627808, "rewards/margins": 0.18330557644367218, "rewards/rejected": -1.4480626583099365, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 7.890439130584057, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.0842519998550415, "logits/rejected": -0.06073132902383804, "logps/chosen": -1.2535126209259033, "logps/rejected": -1.5172220468521118, "loss": 1.5936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2535126209259033, "rewards/margins": 0.2637094557285309, "rewards/rejected": -1.5172220468521118, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 6.932068669261237, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.22326147556304932, "logits/rejected": -0.1301649957895279, "logps/chosen": -1.2490341663360596, "logps/rejected": -1.4320869445800781, "loss": 1.6044, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2490341663360596, "rewards/margins": 0.18305273354053497, "rewards/rejected": -1.4320869445800781, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 6.534333873273087, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.23500683903694153, "logits/rejected": -0.18605561554431915, "logps/chosen": -1.1966211795806885, "logps/rejected": -1.4017795324325562, "loss": 1.5608, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1966211795806885, "rewards/margins": 0.2051583081483841, "rewards/rejected": -1.4017795324325562, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 8.222345073859744, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.28006216883659363, "logits/rejected": -0.12380047887563705, "logps/chosen": -1.1686115264892578, "logps/rejected": -1.3616727590560913, "loss": 1.5561, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1686115264892578, "rewards/margins": 0.19306130707263947, "rewards/rejected": -1.3616727590560913, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 7.848292425252936, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.08855435997247696, "logits/rejected": -0.08716576546430588, "logps/chosen": -1.1851686239242554, "logps/rejected": -1.3858120441436768, "loss": 1.5523, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1851686239242554, "rewards/margins": 0.20064334571361542, "rewards/rejected": -1.3858120441436768, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 7.992472739060857, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.07760701328516006, "logits/rejected": -0.022303396835923195, "logps/chosen": -1.2271113395690918, "logps/rejected": -1.363673210144043, "loss": 1.6226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2271113395690918, "rewards/margins": 0.13656170666217804, "rewards/rejected": -1.363673210144043, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 9.041074439845941, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.12331873178482056, "logits/rejected": -0.003659465117380023, "logps/chosen": -1.2456037998199463, "logps/rejected": -1.4136433601379395, "loss": 1.6064, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2456037998199463, "rewards/margins": 0.16803942620754242, "rewards/rejected": -1.4136433601379395, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 9.30614609280546, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.28149527311325073, "logits/rejected": -0.16837576031684875, "logps/chosen": -1.289886474609375, "logps/rejected": -1.4208576679229736, "loss": 1.6559, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.289886474609375, "rewards/margins": 0.13097111880779266, "rewards/rejected": -1.4208576679229736, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 9.840047778991377, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.20624008774757385, "logits/rejected": -0.058651864528656006, "logps/chosen": -1.2796785831451416, "logps/rejected": -1.4359012842178345, "loss": 1.6297, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2796785831451416, "rewards/margins": 0.15622270107269287, "rewards/rejected": -1.4359012842178345, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 7.1401470055451535, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.17281001806259155, "logits/rejected": -0.08889992535114288, "logps/chosen": -1.254786729812622, "logps/rejected": -1.459333896636963, "loss": 1.605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.254786729812622, "rewards/margins": 0.2045472413301468, "rewards/rejected": -1.459333896636963, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 5.658621347401571, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.09652390331029892, "logits/rejected": -0.0367254912853241, "logps/chosen": -1.2596657276153564, "logps/rejected": -1.4990681409835815, "loss": 1.5933, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2596657276153564, "rewards/margins": 0.23940233886241913, "rewards/rejected": -1.4990681409835815, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 5.6994139371594486, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.19167150557041168, "logits/rejected": -0.07216285914182663, "logps/chosen": -1.3158442974090576, "logps/rejected": -1.4828596115112305, "loss": 1.6714, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3158442974090576, "rewards/margins": 0.16701537370681763, "rewards/rejected": -1.4828596115112305, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 7.29893228737393, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.11124508082866669, "logits/rejected": -0.028446123003959656, "logps/chosen": -1.3150824308395386, "logps/rejected": -1.5301481485366821, "loss": 1.6543, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3150824308395386, "rewards/margins": 0.2150656282901764, "rewards/rejected": -1.5301481485366821, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 7.02963922142275, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.1324525773525238, "logits/rejected": -0.03445501625537872, "logps/chosen": -1.3168666362762451, "logps/rejected": -1.4846765995025635, "loss": 1.6482, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3168666362762451, "rewards/margins": 0.16780975461006165, "rewards/rejected": -1.4846765995025635, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 7.430759587558647, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.0802285447716713, "logits/rejected": -0.044229067862033844, "logps/chosen": -1.3205969333648682, "logps/rejected": -1.460759162902832, "loss": 1.6794, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3205969333648682, "rewards/margins": 0.14016228914260864, "rewards/rejected": -1.460759162902832, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 5.776051063063486, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.33011573553085327, "logits/rejected": -0.193569153547287, "logps/chosen": -1.1992324590682983, "logps/rejected": -1.3035260438919067, "loss": 1.5946, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1992324590682983, "rewards/margins": 0.10429352521896362, "rewards/rejected": -1.3035260438919067, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 8.391908438600513, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.15375883877277374, "logits/rejected": -0.05089033395051956, "logps/chosen": -1.243142008781433, "logps/rejected": -1.3542028665542603, "loss": 1.6207, "rewards/accuracies": 0.53125, "rewards/chosen": -1.243142008781433, "rewards/margins": 0.1110607385635376, "rewards/rejected": -1.3542028665542603, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 7.90373749299904, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.28623443841934204, "logits/rejected": -0.1853335201740265, "logps/chosen": -1.3113467693328857, "logps/rejected": -1.4720947742462158, "loss": 1.6392, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3113467693328857, "rewards/margins": 0.16074810922145844, "rewards/rejected": -1.4720947742462158, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 11.50635801645452, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.27749574184417725, "logits/rejected": -0.1822170913219452, "logps/chosen": -1.1950091123580933, "logps/rejected": -1.4036287069320679, "loss": 1.5649, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1950091123580933, "rewards/margins": 0.2086195945739746, "rewards/rejected": -1.4036287069320679, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 8.313965644208702, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.23794877529144287, "logits/rejected": -0.22094373404979706, "logps/chosen": -1.2401232719421387, "logps/rejected": -1.4726377725601196, "loss": 1.5962, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2401232719421387, "rewards/margins": 0.23251450061798096, "rewards/rejected": -1.4726377725601196, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 6.279641958228806, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.2521572411060333, "logits/rejected": -0.18462175130844116, "logps/chosen": -1.2821708917617798, "logps/rejected": -1.4105151891708374, "loss": 1.6424, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2821708917617798, "rewards/margins": 0.12834443151950836, "rewards/rejected": -1.4105151891708374, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 6.145885295128102, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.23114600777626038, "logits/rejected": -0.15453112125396729, "logps/chosen": -1.2547972202301025, "logps/rejected": -1.4210622310638428, "loss": 1.6316, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2547972202301025, "rewards/margins": 0.16626504063606262, "rewards/rejected": -1.4210622310638428, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 9.283537736527547, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.2676324248313904, "logits/rejected": -0.19834928214550018, "logps/chosen": -1.2339831590652466, "logps/rejected": -1.3864284753799438, "loss": 1.6215, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2339831590652466, "rewards/margins": 0.1524452269077301, "rewards/rejected": -1.3864284753799438, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 5.54712738975142, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.12336407601833344, "logits/rejected": -0.08209587633609772, "logps/chosen": -1.2824407815933228, "logps/rejected": -1.4989856481552124, "loss": 1.613, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2824407815933228, "rewards/margins": 0.21654491126537323, "rewards/rejected": -1.4989856481552124, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 8.07909200577735, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.17828579246997833, "logits/rejected": -0.17043253779411316, "logps/chosen": -1.200535535812378, "logps/rejected": -1.3539785146713257, "loss": 1.5748, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.200535535812378, "rewards/margins": 0.15344305336475372, "rewards/rejected": -1.3539785146713257, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 8.134623664630519, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.23891082406044006, "logits/rejected": -0.12342718988656998, "logps/chosen": -1.220595359802246, "logps/rejected": -1.4715100526809692, "loss": 1.5438, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.220595359802246, "rewards/margins": 0.25091463327407837, "rewards/rejected": -1.4715100526809692, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 5.636674650063026, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.24777567386627197, "logits/rejected": -0.16002780199050903, "logps/chosen": -1.227882981300354, "logps/rejected": -1.4638681411743164, "loss": 1.5698, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.227882981300354, "rewards/margins": 0.2359851896762848, "rewards/rejected": -1.4638681411743164, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 8.26481230362609, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.14667882025241852, "logits/rejected": -0.01680718921124935, "logps/chosen": -1.275852918624878, "logps/rejected": -1.575873613357544, "loss": 1.6012, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.275852918624878, "rewards/margins": 0.30002063512802124, "rewards/rejected": -1.575873613357544, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.14010000228881836, "eval_logits/rejected": 0.2138935625553131, "eval_logps/chosen": -1.300903558731079, "eval_logps/rejected": -1.4722107648849487, "eval_loss": 1.6495161056518555, "eval_rewards/accuracies": 0.5667656064033508, "eval_rewards/chosen": -1.300903558731079, "eval_rewards/margins": 0.1713072508573532, "eval_rewards/rejected": -1.4722107648849487, "eval_runtime": 40.5718, "eval_samples_per_second": 33.151, "eval_steps_per_second": 8.306, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 5.5146507889561525, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.2771919369697571, "logits/rejected": -0.18472592532634735, "logps/chosen": -1.2956626415252686, "logps/rejected": -1.4821714162826538, "loss": 1.6298, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2956626415252686, "rewards/margins": 0.18650877475738525, "rewards/rejected": -1.4821714162826538, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 9.231050864247887, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.2625105381011963, "logits/rejected": -0.18162298202514648, "logps/chosen": -1.2624567747116089, "logps/rejected": -1.4393675327301025, "loss": 1.6263, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2624567747116089, "rewards/margins": 0.17691072821617126, "rewards/rejected": -1.4393675327301025, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 9.688812251430225, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.22895050048828125, "logits/rejected": -0.12594011425971985, "logps/chosen": -1.2300055027008057, "logps/rejected": -1.3816862106323242, "loss": 1.6023, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2300055027008057, "rewards/margins": 0.15168072283267975, "rewards/rejected": -1.3816862106323242, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 6.722526720784602, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.23030118644237518, "logits/rejected": -0.07641883939504623, "logps/chosen": -1.3148319721221924, "logps/rejected": -1.4781345129013062, "loss": 1.653, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3148319721221924, "rewards/margins": 0.16330257058143616, "rewards/rejected": -1.4781345129013062, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 9.475403044592714, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.23579053580760956, "logits/rejected": -0.12721948325634003, "logps/chosen": -1.1815276145935059, "logps/rejected": -1.3895444869995117, "loss": 1.5467, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1815276145935059, "rewards/margins": 0.20801691710948944, "rewards/rejected": -1.3895444869995117, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 6.696982142265946, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.2501373291015625, "logits/rejected": -0.13391202688217163, "logps/chosen": -1.2293736934661865, "logps/rejected": -1.4656680822372437, "loss": 1.5685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2293736934661865, "rewards/margins": 0.23629438877105713, "rewards/rejected": -1.4656680822372437, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 8.490345110087015, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.204094797372818, "logits/rejected": -0.16735957562923431, "logps/chosen": -1.2532507181167603, "logps/rejected": -1.437369704246521, "loss": 1.602, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2532507181167603, "rewards/margins": 0.18411897122859955, "rewards/rejected": -1.437369704246521, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 6.271553325835849, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.11776401847600937, "logits/rejected": -0.04645920917391777, "logps/chosen": -1.199644923210144, "logps/rejected": -1.4558780193328857, "loss": 1.5254, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.199644923210144, "rewards/margins": 0.2562331557273865, "rewards/rejected": -1.4558780193328857, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 8.823179932392053, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.3170090317726135, "logits/rejected": -0.20921523869037628, "logps/chosen": -1.2052419185638428, "logps/rejected": -1.4000091552734375, "loss": 1.585, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2052419185638428, "rewards/margins": 0.19476726651191711, "rewards/rejected": -1.4000091552734375, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 7.087829064434588, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.09932031482458115, "logits/rejected": -0.006480866577476263, "logps/chosen": -1.2517788410186768, "logps/rejected": -1.4760605096817017, "loss": 1.5947, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2517788410186768, "rewards/margins": 0.2242816686630249, "rewards/rejected": -1.4760605096817017, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 7.305535342674533, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.12802617251873016, "logits/rejected": -0.06714338064193726, "logps/chosen": -1.3415563106536865, "logps/rejected": -1.4062931537628174, "loss": 1.6824, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3415563106536865, "rewards/margins": 0.06473670899868011, "rewards/rejected": -1.4062931537628174, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 8.118621526135135, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.27072957158088684, "logits/rejected": -0.15636329352855682, "logps/chosen": -1.2973716259002686, "logps/rejected": -1.3826886415481567, "loss": 1.6414, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2973716259002686, "rewards/margins": 0.08531701564788818, "rewards/rejected": -1.3826886415481567, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 11.27222860909617, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.15749691426753998, "logits/rejected": -0.06950239837169647, "logps/chosen": -1.1970851421356201, "logps/rejected": -1.4039987325668335, "loss": 1.5624, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1970851421356201, "rewards/margins": 0.20691350102424622, "rewards/rejected": -1.4039987325668335, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 8.279971215164508, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.1439046859741211, "logits/rejected": -0.010311020538210869, "logps/chosen": -1.1577627658843994, "logps/rejected": -1.4001916646957397, "loss": 1.5366, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1577627658843994, "rewards/margins": 0.2424289882183075, "rewards/rejected": -1.4001916646957397, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 8.997782212793032, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.2029833048582077, "logits/rejected": 0.04279404133558273, "logps/chosen": -1.2708823680877686, "logps/rejected": -1.3666971921920776, "loss": 1.6451, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2708823680877686, "rewards/margins": 0.0958150252699852, "rewards/rejected": -1.3666971921920776, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 8.28610441032188, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.17824730277061462, "logits/rejected": -0.10273455083370209, "logps/chosen": -1.2131166458129883, "logps/rejected": -1.3982055187225342, "loss": 1.5873, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2131166458129883, "rewards/margins": 0.18508893251419067, "rewards/rejected": -1.3982055187225342, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 7.547073763786053, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.20342381298542023, "logits/rejected": -0.030245179310441017, "logps/chosen": -1.2119803428649902, "logps/rejected": -1.4356105327606201, "loss": 1.6003, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2119803428649902, "rewards/margins": 0.22363007068634033, "rewards/rejected": -1.4356105327606201, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 8.180883361802714, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.14093777537345886, "logits/rejected": -0.05257188156247139, "logps/chosen": -1.2363379001617432, "logps/rejected": -1.3310226202011108, "loss": 1.6366, "rewards/accuracies": 0.5, "rewards/chosen": -1.2363379001617432, "rewards/margins": 0.09468485414981842, "rewards/rejected": -1.3310226202011108, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 9.046488593280937, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.17947307229042053, "logits/rejected": -0.1344245821237564, "logps/chosen": -1.3134772777557373, "logps/rejected": -1.4176721572875977, "loss": 1.7044, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3134772777557373, "rewards/margins": 0.10419495403766632, "rewards/rejected": -1.4176721572875977, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 7.928396648025896, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.27724844217300415, "logits/rejected": -0.1727016419172287, "logps/chosen": -1.313591718673706, "logps/rejected": -1.463928461074829, "loss": 1.6647, "rewards/accuracies": 0.5625, "rewards/chosen": -1.313591718673706, "rewards/margins": 0.1503368318080902, "rewards/rejected": -1.463928461074829, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 9.244841943663591, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.2549338936805725, "logits/rejected": -0.15062884986400604, "logps/chosen": -1.2812715768814087, "logps/rejected": -1.3981187343597412, "loss": 1.6565, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2812715768814087, "rewards/margins": 0.11684717237949371, "rewards/rejected": -1.3981187343597412, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 7.687183836493162, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.1065024733543396, "logits/rejected": -0.06856070458889008, "logps/chosen": -1.2517259120941162, "logps/rejected": -1.4108588695526123, "loss": 1.6143, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2517259120941162, "rewards/margins": 0.1591329425573349, "rewards/rejected": -1.4108588695526123, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 8.894348041746323, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.27567869424819946, "logits/rejected": -0.16353575885295868, "logps/chosen": -1.2159628868103027, "logps/rejected": -1.397537112236023, "loss": 1.5918, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2159628868103027, "rewards/margins": 0.18157419562339783, "rewards/rejected": -1.397537112236023, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 7.090683543079151, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.17891091108322144, "logits/rejected": -0.028943505138158798, "logps/chosen": -1.2455543279647827, "logps/rejected": -1.4666962623596191, "loss": 1.5921, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2455543279647827, "rewards/margins": 0.22114193439483643, "rewards/rejected": -1.4666962623596191, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 8.038774574999868, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.2750813364982605, "logits/rejected": -0.12457992881536484, "logps/chosen": -1.2607461214065552, "logps/rejected": -1.451475739479065, "loss": 1.6106, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2607461214065552, "rewards/margins": 0.19072948396205902, "rewards/rejected": -1.451475739479065, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 9.664541781553764, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.1424180418252945, "logits/rejected": -0.05181293562054634, "logps/chosen": -1.1679651737213135, "logps/rejected": -1.4122867584228516, "loss": 1.5243, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1679651737213135, "rewards/margins": 0.2443215399980545, "rewards/rejected": -1.4122867584228516, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 11.392142095404724, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.21398165822029114, "logits/rejected": -0.0784340351819992, "logps/chosen": -1.283003807067871, "logps/rejected": -1.4440919160842896, "loss": 1.6162, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.283003807067871, "rewards/margins": 0.16108819842338562, "rewards/rejected": -1.4440919160842896, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 6.20814263774446, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.13394159078598022, "logits/rejected": -0.01691107079386711, "logps/chosen": -1.2252600193023682, "logps/rejected": -1.4495265483856201, "loss": 1.5877, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2252600193023682, "rewards/margins": 0.22426645457744598, "rewards/rejected": -1.4495265483856201, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 8.96712169638507, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.15664049983024597, "logits/rejected": -0.018710583448410034, "logps/chosen": -1.156507134437561, "logps/rejected": -1.364248275756836, "loss": 1.5416, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.156507134437561, "rewards/margins": 0.2077411711215973, "rewards/rejected": -1.364248275756836, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 8.42976352514539, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.2600031793117523, "logits/rejected": -0.13997478783130646, "logps/chosen": -1.3221666812896729, "logps/rejected": -1.4479409456253052, "loss": 1.6767, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3221666812896729, "rewards/margins": 0.12577416002750397, "rewards/rejected": -1.4479409456253052, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 10.628946026491922, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.022637512534856796, "logits/rejected": 0.009353891015052795, "logps/chosen": -1.3117666244506836, "logps/rejected": -1.397631287574768, "loss": 1.7081, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3117666244506836, "rewards/margins": 0.0858646109700203, "rewards/rejected": -1.397631287574768, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 8.161709625334764, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.19507454335689545, "logits/rejected": -0.01754070445895195, "logps/chosen": -1.2055964469909668, "logps/rejected": -1.3833786249160767, "loss": 1.5788, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2055964469909668, "rewards/margins": 0.17778228223323822, "rewards/rejected": -1.3833786249160767, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 5.789284712069662, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.2251238375902176, "logits/rejected": -0.10471514612436295, "logps/chosen": -1.242795705795288, "logps/rejected": -1.439692735671997, "loss": 1.5858, "rewards/accuracies": 0.59375, "rewards/chosen": -1.242795705795288, "rewards/margins": 0.19689705967903137, "rewards/rejected": -1.439692735671997, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 6.4576897334290715, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.18072129786014557, "logits/rejected": -0.05849956348538399, "logps/chosen": -1.2341316938400269, "logps/rejected": -1.3826676607131958, "loss": 1.6024, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2341316938400269, "rewards/margins": 0.14853589236736298, "rewards/rejected": -1.3826676607131958, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 7.8585008339633164, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.060932356864213943, "logits/rejected": -0.03307708352804184, "logps/chosen": -1.2687528133392334, "logps/rejected": -1.3632913827896118, "loss": 1.6541, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2687528133392334, "rewards/margins": 0.0945383757352829, "rewards/rejected": -1.3632913827896118, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 5.779868997107154, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.24139253795146942, "logits/rejected": -0.1293010115623474, "logps/chosen": -1.2347960472106934, "logps/rejected": -1.4479663372039795, "loss": 1.5791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2347960472106934, "rewards/margins": 0.2131703644990921, "rewards/rejected": -1.4479663372039795, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 7.1835051661861895, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.20090754330158234, "logits/rejected": -0.17768360674381256, "logps/chosen": -1.295978307723999, "logps/rejected": -1.444831371307373, "loss": 1.6513, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.295978307723999, "rewards/margins": 0.14885297417640686, "rewards/rejected": -1.444831371307373, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 9.757139822711501, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.15369240939617157, "logits/rejected": -0.059753142297267914, "logps/chosen": -1.3139399290084839, "logps/rejected": -1.4136239290237427, "loss": 1.6832, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3139399290084839, "rewards/margins": 0.09968401491641998, "rewards/rejected": -1.4136239290237427, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 8.8837194965421, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.19835281372070312, "logits/rejected": -0.12649798393249512, "logps/chosen": -1.191025972366333, "logps/rejected": -1.4131062030792236, "loss": 1.5628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.191025972366333, "rewards/margins": 0.22208015620708466, "rewards/rejected": -1.4131062030792236, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 8.13866966024635, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.19858194887638092, "logits/rejected": -0.10243697464466095, "logps/chosen": -1.258873462677002, "logps/rejected": -1.40668785572052, "loss": 1.6142, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.258873462677002, "rewards/margins": 0.1478143036365509, "rewards/rejected": -1.40668785572052, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 11.355187720081398, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.17992979288101196, "logits/rejected": -0.05091270059347153, "logps/chosen": -1.2505369186401367, "logps/rejected": -1.427901029586792, "loss": 1.6086, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2505369186401367, "rewards/margins": 0.1773640364408493, "rewards/rejected": -1.427901029586792, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 8.244314426249161, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.18372592329978943, "logits/rejected": -0.01030859723687172, "logps/chosen": -1.3831340074539185, "logps/rejected": -1.489128589630127, "loss": 1.7304, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3831340074539185, "rewards/margins": 0.10599465668201447, "rewards/rejected": -1.489128589630127, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 7.525238246945747, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.18597820401191711, "logits/rejected": -0.10546676069498062, "logps/chosen": -1.2357432842254639, "logps/rejected": -1.4461013078689575, "loss": 1.5916, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2357432842254639, "rewards/margins": 0.21035806834697723, "rewards/rejected": -1.4461013078689575, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 9.698197774122443, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.1279938519001007, "logits/rejected": -0.02889108657836914, "logps/chosen": -1.2791216373443604, "logps/rejected": -1.517954707145691, "loss": 1.6296, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2791216373443604, "rewards/margins": 0.2388329952955246, "rewards/rejected": -1.517954707145691, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 6.172841283742514, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.2117902934551239, "logits/rejected": -0.17474015057086945, "logps/chosen": -1.2423498630523682, "logps/rejected": -1.4180864095687866, "loss": 1.6009, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2423498630523682, "rewards/margins": 0.1757366955280304, "rewards/rejected": -1.4180864095687866, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 10.448068708499129, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.2236020565032959, "logits/rejected": -0.12878096103668213, "logps/chosen": -1.3056252002716064, "logps/rejected": -1.495781421661377, "loss": 1.6369, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3056252002716064, "rewards/margins": 0.19015632569789886, "rewards/rejected": -1.495781421661377, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 10.046608805526557, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.20522567629814148, "logits/rejected": -0.13971085846424103, "logps/chosen": -1.2589514255523682, "logps/rejected": -1.4753795862197876, "loss": 1.5882, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2589514255523682, "rewards/margins": 0.21642813086509705, "rewards/rejected": -1.4753795862197876, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 10.07150425461416, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.24803462624549866, "logits/rejected": -0.19548973441123962, "logps/chosen": -1.1807209253311157, "logps/rejected": -1.4144724607467651, "loss": 1.5658, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1807209253311157, "rewards/margins": 0.23375146090984344, "rewards/rejected": -1.4144724607467651, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 7.890741243298936, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.22131972014904022, "logits/rejected": -0.01649406924843788, "logps/chosen": -1.22940194606781, "logps/rejected": -1.4079666137695312, "loss": 1.5728, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.22940194606781, "rewards/margins": 0.17856450378894806, "rewards/rejected": -1.4079666137695312, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 10.333119132736323, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.31308794021606445, "logits/rejected": -0.15774603188037872, "logps/chosen": -1.2503737211227417, "logps/rejected": -1.5004576444625854, "loss": 1.5846, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2503737211227417, "rewards/margins": 0.2500839829444885, "rewards/rejected": -1.5004576444625854, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 6.919591848086438, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.16612792015075684, "logits/rejected": -0.036644019186496735, "logps/chosen": -1.411642074584961, "logps/rejected": -1.6058638095855713, "loss": 1.7367, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.411642074584961, "rewards/margins": 0.19422176480293274, "rewards/rejected": -1.6058638095855713, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 10.712034376207075, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.08675368130207062, "logits/rejected": 0.01952032931149006, "logps/chosen": -1.235244631767273, "logps/rejected": -1.426180124282837, "loss": 1.6395, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.235244631767273, "rewards/margins": 0.19093546271324158, "rewards/rejected": -1.426180124282837, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 6.924033747890647, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.244086354970932, "logits/rejected": -0.13570265471935272, "logps/chosen": -1.203270673751831, "logps/rejected": -1.367319107055664, "loss": 1.5766, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.203270673751831, "rewards/margins": 0.16404837369918823, "rewards/rejected": -1.367319107055664, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 7.464170816007541, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.22056007385253906, "logits/rejected": -0.1388140469789505, "logps/chosen": -1.3312097787857056, "logps/rejected": -1.472596287727356, "loss": 1.6641, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3312097787857056, "rewards/margins": 0.1413867175579071, "rewards/rejected": -1.472596287727356, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 6.761094195749699, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.29377657175064087, "logits/rejected": -0.23835745453834534, "logps/chosen": -1.256420373916626, "logps/rejected": -1.452223777770996, "loss": 1.6167, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.256420373916626, "rewards/margins": 0.19580325484275818, "rewards/rejected": -1.452223777770996, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 7.9660825575787255, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.15394994616508484, "logits/rejected": -0.12967711687088013, "logps/chosen": -1.2819709777832031, "logps/rejected": -1.4983596801757812, "loss": 1.6208, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2819709777832031, "rewards/margins": 0.2163887768983841, "rewards/rejected": -1.4983596801757812, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 10.274470546661414, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.2137545645236969, "logits/rejected": -0.12820082902908325, "logps/chosen": -1.2431753873825073, "logps/rejected": -1.4213874340057373, "loss": 1.6093, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2431753873825073, "rewards/margins": 0.17821213603019714, "rewards/rejected": -1.4213874340057373, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 6.8007809735857245, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.15245218575000763, "logits/rejected": -0.05015261098742485, "logps/chosen": -1.301128625869751, "logps/rejected": -1.3995161056518555, "loss": 1.6607, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.301128625869751, "rewards/margins": 0.09838749468326569, "rewards/rejected": -1.3995161056518555, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 8.146538116029161, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.23573660850524902, "logits/rejected": -0.0928931012749672, "logps/chosen": -1.2535717487335205, "logps/rejected": -1.4325979948043823, "loss": 1.5981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2535717487335205, "rewards/margins": 0.17902621626853943, "rewards/rejected": -1.4325979948043823, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 8.154477081361472, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.18610471487045288, "logits/rejected": -0.05407028645277023, "logps/chosen": -1.2972004413604736, "logps/rejected": -1.529928207397461, "loss": 1.6143, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2972004413604736, "rewards/margins": 0.23272785544395447, "rewards/rejected": -1.529928207397461, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 9.159993057152748, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.17671632766723633, "logits/rejected": -0.08549154549837112, "logps/chosen": -1.296130657196045, "logps/rejected": -1.4943058490753174, "loss": 1.6465, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.296130657196045, "rewards/margins": 0.19817538559436798, "rewards/rejected": -1.4943058490753174, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 9.571784503316689, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.20955529808998108, "logits/rejected": -0.14858964085578918, "logps/chosen": -1.242735743522644, "logps/rejected": -1.3935405015945435, "loss": 1.5927, "rewards/accuracies": 0.59375, "rewards/chosen": -1.242735743522644, "rewards/margins": 0.15080469846725464, "rewards/rejected": -1.3935405015945435, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 9.319053879866445, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.19252166152000427, "logits/rejected": -0.030325695872306824, "logps/chosen": -1.2827403545379639, "logps/rejected": -1.4876335859298706, "loss": 1.6236, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2827403545379639, "rewards/margins": 0.20489318668842316, "rewards/rejected": -1.4876335859298706, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 9.101855369858386, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.23145878314971924, "logits/rejected": -0.12374116480350494, "logps/chosen": -1.2911999225616455, "logps/rejected": -1.4790095090866089, "loss": 1.6347, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2911999225616455, "rewards/margins": 0.1878097951412201, "rewards/rejected": -1.4790095090866089, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 7.144259581381725, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.18061567842960358, "logits/rejected": -0.018749738112092018, "logps/chosen": -1.2568756341934204, "logps/rejected": -1.4504077434539795, "loss": 1.6138, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2568756341934204, "rewards/margins": 0.19353221356868744, "rewards/rejected": -1.4504077434539795, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 11.728557151173204, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.18043796718120575, "logits/rejected": -0.12324018776416779, "logps/chosen": -1.2382079362869263, "logps/rejected": -1.44778311252594, "loss": 1.595, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2382079362869263, "rewards/margins": 0.20957525074481964, "rewards/rejected": -1.44778311252594, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 6.5571486374089805, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.26944318413734436, "logits/rejected": -0.16007402539253235, "logps/chosen": -1.2057348489761353, "logps/rejected": -1.3465030193328857, "loss": 1.6035, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2057348489761353, "rewards/margins": 0.14076808094978333, "rewards/rejected": -1.3465030193328857, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 11.186457917160363, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.20591190457344055, "logits/rejected": -0.006433853413909674, "logps/chosen": -1.2343441247940063, "logps/rejected": -1.433020830154419, "loss": 1.5926, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2343441247940063, "rewards/margins": 0.1986767053604126, "rewards/rejected": -1.433020830154419, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 9.125103890239405, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.3186398446559906, "logits/rejected": -0.0966661125421524, "logps/chosen": -1.2360858917236328, "logps/rejected": -1.352618932723999, "loss": 1.6067, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2360858917236328, "rewards/margins": 0.11653308570384979, "rewards/rejected": -1.352618932723999, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 7.048961384226068, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.14295661449432373, "logits/rejected": -0.06613682955503464, "logps/chosen": -1.2295020818710327, "logps/rejected": -1.374596357345581, "loss": 1.5876, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2295020818710327, "rewards/margins": 0.14509446918964386, "rewards/rejected": -1.374596357345581, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 6.114270164065146, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.26475760340690613, "logits/rejected": -0.14583542943000793, "logps/chosen": -1.1900657415390015, "logps/rejected": -1.3886867761611938, "loss": 1.5951, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1900657415390015, "rewards/margins": 0.19862084090709686, "rewards/rejected": -1.3886867761611938, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 11.502442498112247, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.1803629845380783, "logits/rejected": -0.12026476860046387, "logps/chosen": -1.281933069229126, "logps/rejected": -1.466369867324829, "loss": 1.6331, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.281933069229126, "rewards/margins": 0.18443647027015686, "rewards/rejected": -1.466369867324829, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 6.002821132575185, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.18821127712726593, "logits/rejected": -0.14258238673210144, "logps/chosen": -1.2131202220916748, "logps/rejected": -1.4328038692474365, "loss": 1.5913, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2131202220916748, "rewards/margins": 0.2196836918592453, "rewards/rejected": -1.4328038692474365, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 7.635269179529435, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.2189391553401947, "logits/rejected": -0.10928545892238617, "logps/chosen": -1.2418806552886963, "logps/rejected": -1.4406083822250366, "loss": 1.6055, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2418806552886963, "rewards/margins": 0.1987278163433075, "rewards/rejected": -1.4406083822250366, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 7.443797664782801, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.12070286273956299, "logits/rejected": -0.016779515892267227, "logps/chosen": -1.2063409090042114, "logps/rejected": -1.4125832319259644, "loss": 1.5852, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2063409090042114, "rewards/margins": 0.2062424123287201, "rewards/rejected": -1.4125832319259644, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 8.504628729714607, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.19800689816474915, "logits/rejected": -0.16064441204071045, "logps/chosen": -1.2234773635864258, "logps/rejected": -1.464333176612854, "loss": 1.5766, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2234773635864258, "rewards/margins": 0.24085572361946106, "rewards/rejected": -1.464333176612854, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 7.9677329475988605, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.1262514889240265, "logits/rejected": -0.08946479856967926, "logps/chosen": -1.3143281936645508, "logps/rejected": -1.4545438289642334, "loss": 1.6649, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3143281936645508, "rewards/margins": 0.14021556079387665, "rewards/rejected": -1.4545438289642334, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 7.952744211308689, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.13433387875556946, "logits/rejected": -0.07007332146167755, "logps/chosen": -1.277406930923462, "logps/rejected": -1.4121787548065186, "loss": 1.6319, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.277406930923462, "rewards/margins": 0.13477174937725067, "rewards/rejected": -1.4121787548065186, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 6.781740184269981, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.2822973132133484, "logits/rejected": -0.10233817994594574, "logps/chosen": -1.2295691967010498, "logps/rejected": -1.4294407367706299, "loss": 1.5833, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2295691967010498, "rewards/margins": 0.19987143576145172, "rewards/rejected": -1.4294407367706299, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 9.943679947386732, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.14189042150974274, "logits/rejected": 0.013472884893417358, "logps/chosen": -1.209618091583252, "logps/rejected": -1.4569435119628906, "loss": 1.5646, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.209618091583252, "rewards/margins": 0.24732527136802673, "rewards/rejected": -1.4569435119628906, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.10516196489334106, "eval_logits/rejected": 0.17712825536727905, "eval_logps/chosen": -1.2986959218978882, "eval_logps/rejected": -1.477814793586731, "eval_loss": 1.647760033607483, "eval_rewards/accuracies": 0.5704748034477234, "eval_rewards/chosen": -1.2986959218978882, "eval_rewards/margins": 0.179118812084198, "eval_rewards/rejected": -1.477814793586731, "eval_runtime": 40.3628, "eval_samples_per_second": 33.323, "eval_steps_per_second": 8.349, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 7.808770644621557, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.24109283089637756, "logits/rejected": -0.2096950262784958, "logps/chosen": -1.2588623762130737, "logps/rejected": -1.3880422115325928, "loss": 1.641, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2588623762130737, "rewards/margins": 0.1291799396276474, "rewards/rejected": -1.3880422115325928, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 6.415918600900964, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.14331869781017303, "logits/rejected": -0.04207003861665726, "logps/chosen": -1.2295633554458618, "logps/rejected": -1.4343435764312744, "loss": 1.5756, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2295633554458618, "rewards/margins": 0.2047802209854126, "rewards/rejected": -1.4343435764312744, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 6.2842172727300865, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.0876409187912941, "logits/rejected": 0.008129620924592018, "logps/chosen": -1.3283607959747314, "logps/rejected": -1.436718463897705, "loss": 1.6673, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3283607959747314, "rewards/margins": 0.10835757106542587, "rewards/rejected": -1.436718463897705, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 10.35555577271964, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.15332219004631042, "logits/rejected": -0.09550925344228745, "logps/chosen": -1.2686882019042969, "logps/rejected": -1.3833425045013428, "loss": 1.6595, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2686882019042969, "rewards/margins": 0.11465413868427277, "rewards/rejected": -1.3833425045013428, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 8.709612576533985, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.09398411959409714, "logits/rejected": -0.07650090754032135, "logps/chosen": -1.2532272338867188, "logps/rejected": -1.4717903137207031, "loss": 1.6012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2532272338867188, "rewards/margins": 0.21856316924095154, "rewards/rejected": -1.4717903137207031, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 8.75858655836486, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.18788668513298035, "logits/rejected": -0.14185675978660583, "logps/chosen": -1.2734358310699463, "logps/rejected": -1.43949556350708, "loss": 1.6406, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2734358310699463, "rewards/margins": 0.16605976223945618, "rewards/rejected": -1.43949556350708, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 10.659669108581703, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.18720242381095886, "logits/rejected": -0.11303949356079102, "logps/chosen": -1.290158987045288, "logps/rejected": -1.448259949684143, "loss": 1.651, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.290158987045288, "rewards/margins": 0.15810075402259827, "rewards/rejected": -1.448259949684143, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 7.846067267898016, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.15360137820243835, "logits/rejected": -0.04836711287498474, "logps/chosen": -1.3664968013763428, "logps/rejected": -1.4371174573898315, "loss": 1.713, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3664968013763428, "rewards/margins": 0.07062048465013504, "rewards/rejected": -1.4371174573898315, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 7.473076523289346, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.18360236287117004, "logits/rejected": -0.07247422635555267, "logps/chosen": -1.3604953289031982, "logps/rejected": -1.4847160577774048, "loss": 1.6987, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3604953289031982, "rewards/margins": 0.12422071397304535, "rewards/rejected": -1.4847160577774048, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 7.444897497210662, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.20776736736297607, "logits/rejected": -0.11900806427001953, "logps/chosen": -1.2479541301727295, "logps/rejected": -1.385831356048584, "loss": 1.6118, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2479541301727295, "rewards/margins": 0.13787731528282166, "rewards/rejected": -1.385831356048584, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 11.393299359575368, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.1976751834154129, "logits/rejected": -0.07935307919979095, "logps/chosen": -1.3020820617675781, "logps/rejected": -1.455503225326538, "loss": 1.6683, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3020820617675781, "rewards/margins": 0.1534210443496704, "rewards/rejected": -1.455503225326538, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 11.232389591855133, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.20845703780651093, "logits/rejected": -0.09822802245616913, "logps/chosen": -1.2032948732376099, "logps/rejected": -1.3660537004470825, "loss": 1.5965, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2032948732376099, "rewards/margins": 0.16275881230831146, "rewards/rejected": -1.3660537004470825, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 8.85819556697752, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.1434224545955658, "logits/rejected": -0.039260972291231155, "logps/chosen": -1.2225271463394165, "logps/rejected": -1.3938648700714111, "loss": 1.6086, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2225271463394165, "rewards/margins": 0.17133775353431702, "rewards/rejected": -1.3938648700714111, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 6.489617402223498, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.21803636848926544, "logits/rejected": -0.09411749243736267, "logps/chosen": -1.2584469318389893, "logps/rejected": -1.4679569005966187, "loss": 1.6013, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2584469318389893, "rewards/margins": 0.20950999855995178, "rewards/rejected": -1.4679569005966187, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 7.405848196687554, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.17621895670890808, "logits/rejected": -0.05477041006088257, "logps/chosen": -1.288193702697754, "logps/rejected": -1.578798532485962, "loss": 1.606, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.288193702697754, "rewards/margins": 0.29060500860214233, "rewards/rejected": -1.578798532485962, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 8.007223927502881, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.19255180656909943, "logits/rejected": -0.0977533608675003, "logps/chosen": -1.138106346130371, "logps/rejected": -1.3500877618789673, "loss": 1.5194, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.138106346130371, "rewards/margins": 0.21198129653930664, "rewards/rejected": -1.3500877618789673, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 8.567693499874197, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.23554396629333496, "logits/rejected": -0.1732751429080963, "logps/chosen": -1.2374318838119507, "logps/rejected": -1.468052864074707, "loss": 1.5719, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2374318838119507, "rewards/margins": 0.23062118887901306, "rewards/rejected": -1.468052864074707, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 7.500530526726536, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.15899136662483215, "logits/rejected": -0.027197346091270447, "logps/chosen": -1.2746217250823975, "logps/rejected": -1.4639803171157837, "loss": 1.6315, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2746217250823975, "rewards/margins": 0.18935871124267578, "rewards/rejected": -1.4639803171157837, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 7.35502099895214, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.11527419090270996, "logits/rejected": -0.08784718811511993, "logps/chosen": -1.3176900148391724, "logps/rejected": -1.550249457359314, "loss": 1.6318, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3176900148391724, "rewards/margins": 0.23255948722362518, "rewards/rejected": -1.550249457359314, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 8.661549654110603, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.12260335683822632, "logits/rejected": -0.030400067567825317, "logps/chosen": -1.2405729293823242, "logps/rejected": -1.4361417293548584, "loss": 1.593, "rewards/accuracies": 0.625, "rewards/chosen": -1.2405729293823242, "rewards/margins": 0.19556888937950134, "rewards/rejected": -1.4361417293548584, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 8.768064878804184, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.16908925771713257, "logits/rejected": -0.10685942322015762, "logps/chosen": -1.2900021076202393, "logps/rejected": -1.4903085231781006, "loss": 1.6398, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2900021076202393, "rewards/margins": 0.20030629634857178, "rewards/rejected": -1.4903085231781006, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 7.926221873843065, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.07368534058332443, "logits/rejected": -0.03026638552546501, "logps/chosen": -1.29747474193573, "logps/rejected": -1.458830714225769, "loss": 1.6498, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.29747474193573, "rewards/margins": 0.16135601699352264, "rewards/rejected": -1.458830714225769, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 5.747132852592568, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.13629598915576935, "logits/rejected": -0.06977993249893188, "logps/chosen": -1.2730337381362915, "logps/rejected": -1.475895881652832, "loss": 1.6178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2730337381362915, "rewards/margins": 0.20286211371421814, "rewards/rejected": -1.475895881652832, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 6.970404484087072, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.1679581254720688, "logits/rejected": -0.10403795540332794, "logps/chosen": -1.2511913776397705, "logps/rejected": -1.4179065227508545, "loss": 1.6271, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2511913776397705, "rewards/margins": 0.16671538352966309, "rewards/rejected": -1.4179065227508545, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 6.81701280169232, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.2270546406507492, "logits/rejected": -0.14077839255332947, "logps/chosen": -1.2551872730255127, "logps/rejected": -1.422701120376587, "loss": 1.6151, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2551872730255127, "rewards/margins": 0.16751374304294586, "rewards/rejected": -1.422701120376587, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 7.559032307430267, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.11934938281774521, "logits/rejected": -0.01248091645538807, "logps/chosen": -1.218796968460083, "logps/rejected": -1.4315433502197266, "loss": 1.5745, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.218796968460083, "rewards/margins": 0.2127462923526764, "rewards/rejected": -1.4315433502197266, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 7.357540622090184, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.09708665311336517, "logits/rejected": -0.031124413013458252, "logps/chosen": -1.3405152559280396, "logps/rejected": -1.4379889965057373, "loss": 1.6962, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3405152559280396, "rewards/margins": 0.09747375547885895, "rewards/rejected": -1.4379889965057373, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 7.029834417254843, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.10880253463983536, "logits/rejected": -0.07140463590621948, "logps/chosen": -1.327852725982666, "logps/rejected": -1.4347646236419678, "loss": 1.6715, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.327852725982666, "rewards/margins": 0.10691193491220474, "rewards/rejected": -1.4347646236419678, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 6.6231405405746395, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.16665935516357422, "logits/rejected": -0.08508746325969696, "logps/chosen": -1.2745071649551392, "logps/rejected": -1.4427918195724487, "loss": 1.6197, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2745071649551392, "rewards/margins": 0.16828462481498718, "rewards/rejected": -1.4427918195724487, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 6.744849193834629, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.20406830310821533, "logits/rejected": -0.13290412724018097, "logps/chosen": -1.1823909282684326, "logps/rejected": -1.4527281522750854, "loss": 1.5256, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1823909282684326, "rewards/margins": 0.27033716440200806, "rewards/rejected": -1.4527281522750854, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 7.446708091453874, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.1945587396621704, "logits/rejected": -0.1164223775267601, "logps/chosen": -1.2044284343719482, "logps/rejected": -1.525867223739624, "loss": 1.5394, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2044284343719482, "rewards/margins": 0.32143890857696533, "rewards/rejected": -1.525867223739624, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 7.394113324521219, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.25701871514320374, "logits/rejected": -0.24626454710960388, "logps/chosen": -1.243540644645691, "logps/rejected": -1.4760215282440186, "loss": 1.573, "rewards/accuracies": 0.625, "rewards/chosen": -1.243540644645691, "rewards/margins": 0.23248091340065002, "rewards/rejected": -1.4760215282440186, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 8.634215422478055, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.126735121011734, "logits/rejected": 0.01685112714767456, "logps/chosen": -1.2230195999145508, "logps/rejected": -1.500044822692871, "loss": 1.5565, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2230195999145508, "rewards/margins": 0.27702516317367554, "rewards/rejected": -1.500044822692871, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 8.739521731135147, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.26435479521751404, "logits/rejected": -0.13318143784999847, "logps/chosen": -1.174684762954712, "logps/rejected": -1.5840399265289307, "loss": 1.5061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.174684762954712, "rewards/margins": 0.4093553125858307, "rewards/rejected": -1.5840399265289307, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 7.887435915063563, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.23848462104797363, "logits/rejected": -0.19505062699317932, "logps/chosen": -1.230447769165039, "logps/rejected": -1.439680814743042, "loss": 1.5973, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.230447769165039, "rewards/margins": 0.20923320949077606, "rewards/rejected": -1.439680814743042, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 7.20935429995073, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.11859314143657684, "logits/rejected": -0.09189174324274063, "logps/chosen": -1.2536604404449463, "logps/rejected": -1.4843485355377197, "loss": 1.6231, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2536604404449463, "rewards/margins": 0.2306881844997406, "rewards/rejected": -1.4843485355377197, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 6.542434525353967, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.17586956918239594, "logits/rejected": -0.07927899062633514, "logps/chosen": -1.271405577659607, "logps/rejected": -1.5665825605392456, "loss": 1.5756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.271405577659607, "rewards/margins": 0.29517701268196106, "rewards/rejected": -1.5665825605392456, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 6.2583841886826175, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.24330580234527588, "logits/rejected": -0.0978088229894638, "logps/chosen": -1.2277387380599976, "logps/rejected": -1.4597686529159546, "loss": 1.5563, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2277387380599976, "rewards/margins": 0.23202982544898987, "rewards/rejected": -1.4597686529159546, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 6.445583706030623, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.17014442384243011, "logits/rejected": -0.15202614665031433, "logps/chosen": -1.2047111988067627, "logps/rejected": -1.3954555988311768, "loss": 1.5703, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2047111988067627, "rewards/margins": 0.1907445192337036, "rewards/rejected": -1.3954555988311768, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 8.38716639275352, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.16343332827091217, "logits/rejected": -0.06480084359645844, "logps/chosen": -1.286795973777771, "logps/rejected": -1.5521700382232666, "loss": 1.6305, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.286795973777771, "rewards/margins": 0.2653741240501404, "rewards/rejected": -1.5521700382232666, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 10.866345322456871, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.14301295578479767, "logits/rejected": -0.05214305594563484, "logps/chosen": -1.190725564956665, "logps/rejected": -1.431903600692749, "loss": 1.534, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.190725564956665, "rewards/margins": 0.24117796123027802, "rewards/rejected": -1.431903600692749, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 7.7262644544689145, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.07260935008525848, "logits/rejected": 0.004268960561603308, "logps/chosen": -1.2204227447509766, "logps/rejected": -1.4730734825134277, "loss": 1.5652, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2204227447509766, "rewards/margins": 0.2526509761810303, "rewards/rejected": -1.4730734825134277, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 7.418071761914062, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.2659146189689636, "logits/rejected": -0.15749691426753998, "logps/chosen": -1.2592390775680542, "logps/rejected": -1.4130852222442627, "loss": 1.6185, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2592390775680542, "rewards/margins": 0.1538461297750473, "rewards/rejected": -1.4130852222442627, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 9.265432079063835, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.23816660046577454, "logits/rejected": -0.06556002795696259, "logps/chosen": -1.3202743530273438, "logps/rejected": -1.4127553701400757, "loss": 1.6821, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3202743530273438, "rewards/margins": 0.09248095750808716, "rewards/rejected": -1.4127553701400757, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 8.43824883139852, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.19324809312820435, "logits/rejected": -0.09612785279750824, "logps/chosen": -1.2284154891967773, "logps/rejected": -1.3744738101959229, "loss": 1.5943, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2284154891967773, "rewards/margins": 0.1460580974817276, "rewards/rejected": -1.3744738101959229, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 8.457060541067369, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.25151169300079346, "logits/rejected": -0.13039612770080566, "logps/chosen": -1.2907540798187256, "logps/rejected": -1.5371133089065552, "loss": 1.6136, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2907540798187256, "rewards/margins": 0.24635927379131317, "rewards/rejected": -1.5371133089065552, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 7.365987381138891, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.17692366242408752, "logits/rejected": -0.0669875293970108, "logps/chosen": -1.2781174182891846, "logps/rejected": -1.4772151708602905, "loss": 1.6226, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2781174182891846, "rewards/margins": 0.1990976482629776, "rewards/rejected": -1.4772151708602905, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 8.056718284287626, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.12126553058624268, "logits/rejected": -0.04391341656446457, "logps/chosen": -1.1803662776947021, "logps/rejected": -1.5080560445785522, "loss": 1.5237, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1803662776947021, "rewards/margins": 0.3276898264884949, "rewards/rejected": -1.5080560445785522, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 7.159079293202307, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.30405330657958984, "logits/rejected": -0.1049649715423584, "logps/chosen": -1.300267219543457, "logps/rejected": -1.5288124084472656, "loss": 1.634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.300267219543457, "rewards/margins": 0.22854533791542053, "rewards/rejected": -1.5288124084472656, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 7.744160416002481, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.18721036612987518, "logits/rejected": -0.05016489699482918, "logps/chosen": -1.2924706935882568, "logps/rejected": -1.4859955310821533, "loss": 1.6305, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2924706935882568, "rewards/margins": 0.19352486729621887, "rewards/rejected": -1.4859955310821533, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 7.293770790685207, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.15263861417770386, "logits/rejected": -9.13009062060155e-05, "logps/chosen": -1.2374870777130127, "logps/rejected": -1.5334017276763916, "loss": 1.579, "rewards/accuracies": 0.625, "rewards/chosen": -1.2374870777130127, "rewards/margins": 0.29591476917266846, "rewards/rejected": -1.5334017276763916, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 6.212222071720489, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.18352673947811127, "logits/rejected": -0.05127815529704094, "logps/chosen": -1.2543615102767944, "logps/rejected": -1.4569965600967407, "loss": 1.6082, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2543615102767944, "rewards/margins": 0.2026350051164627, "rewards/rejected": -1.4569965600967407, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 10.837827872231681, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.2902659475803375, "logits/rejected": -0.13155676424503326, "logps/chosen": -1.291379690170288, "logps/rejected": -1.418097734451294, "loss": 1.6395, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.291379690170288, "rewards/margins": 0.12671804428100586, "rewards/rejected": -1.418097734451294, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 8.955970485936408, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.2045224905014038, "logits/rejected": -0.18195459246635437, "logps/chosen": -1.1399279832839966, "logps/rejected": -1.3327436447143555, "loss": 1.5193, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1399279832839966, "rewards/margins": 0.19281557202339172, "rewards/rejected": -1.3327436447143555, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 6.872285912032201, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.23911531269550323, "logits/rejected": -0.13421085476875305, "logps/chosen": -1.2303617000579834, "logps/rejected": -1.4737814664840698, "loss": 1.569, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2303617000579834, "rewards/margins": 0.24341964721679688, "rewards/rejected": -1.4737814664840698, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 6.797546616721448, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.23981383442878723, "logits/rejected": -0.11860020458698273, "logps/chosen": -1.1498663425445557, "logps/rejected": -1.397800087928772, "loss": 1.4988, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1498663425445557, "rewards/margins": 0.24793359637260437, "rewards/rejected": -1.397800087928772, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 7.350912931287538, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.24004165828227997, "logits/rejected": -0.11813943088054657, "logps/chosen": -1.1958129405975342, "logps/rejected": -1.4676146507263184, "loss": 1.5325, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1958129405975342, "rewards/margins": 0.271801620721817, "rewards/rejected": -1.4676146507263184, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 10.665938862076626, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.16734269261360168, "logits/rejected": -0.09549416601657867, "logps/chosen": -1.236961841583252, "logps/rejected": -1.4197896718978882, "loss": 1.6023, "rewards/accuracies": 0.5625, "rewards/chosen": -1.236961841583252, "rewards/margins": 0.1828276813030243, "rewards/rejected": -1.4197896718978882, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 9.847712661459095, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.2093161791563034, "logits/rejected": -0.008092102594673634, "logps/chosen": -1.2099231481552124, "logps/rejected": -1.436281681060791, "loss": 1.5682, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2099231481552124, "rewards/margins": 0.2263585776090622, "rewards/rejected": -1.436281681060791, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 8.457076838996455, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.08073611557483673, "logits/rejected": -0.08362691849470139, "logps/chosen": -1.2163944244384766, "logps/rejected": -1.431199073791504, "loss": 1.5725, "rewards/accuracies": 0.625, "rewards/chosen": -1.2163944244384766, "rewards/margins": 0.21480469405651093, "rewards/rejected": -1.431199073791504, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 7.488957696280318, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.16372177004814148, "logits/rejected": -0.07514110952615738, "logps/chosen": -1.2316033840179443, "logps/rejected": -1.400773525238037, "loss": 1.5816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2316033840179443, "rewards/margins": 0.16917003691196442, "rewards/rejected": -1.400773525238037, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 6.794047364268938, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.1059347540140152, "logits/rejected": -0.006711071822792292, "logps/chosen": -1.1519787311553955, "logps/rejected": -1.3930784463882446, "loss": 1.525, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1519787311553955, "rewards/margins": 0.24109964072704315, "rewards/rejected": -1.3930784463882446, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 5.697122413335748, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.2032475471496582, "logits/rejected": -0.06592769175767899, "logps/chosen": -1.2571234703063965, "logps/rejected": -1.537047266960144, "loss": 1.5779, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2571234703063965, "rewards/margins": 0.2799237072467804, "rewards/rejected": -1.537047266960144, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 6.725268181910343, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.2132866084575653, "logits/rejected": -0.07607977092266083, "logps/chosen": -1.2650877237319946, "logps/rejected": -1.5556974411010742, "loss": 1.5755, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2650877237319946, "rewards/margins": 0.2906096875667572, "rewards/rejected": -1.5556974411010742, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 7.726090458805358, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.20984609425067902, "logits/rejected": -0.11176743358373642, "logps/chosen": -1.1926872730255127, "logps/rejected": -1.454128623008728, "loss": 1.5672, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1926872730255127, "rewards/margins": 0.2614414095878601, "rewards/rejected": -1.454128623008728, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 6.319357466196435, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.287931889295578, "logits/rejected": -0.1848205327987671, "logps/chosen": -1.278119444847107, "logps/rejected": -1.4347612857818604, "loss": 1.6255, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.278119444847107, "rewards/margins": 0.15664172172546387, "rewards/rejected": -1.4347612857818604, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 7.167238920674248, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.12809686362743378, "logits/rejected": -0.0829991027712822, "logps/chosen": -1.2316675186157227, "logps/rejected": -1.500192403793335, "loss": 1.5574, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2316675186157227, "rewards/margins": 0.26852482557296753, "rewards/rejected": -1.500192403793335, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 9.086017638738277, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.16330906748771667, "logits/rejected": -0.1318950206041336, "logps/chosen": -1.1459821462631226, "logps/rejected": -1.5246264934539795, "loss": 1.4668, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1459821462631226, "rewards/margins": 0.3786444365978241, "rewards/rejected": -1.5246264934539795, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 7.908057122641015, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.17196539044380188, "logits/rejected": -0.023234616965055466, "logps/chosen": -1.2766989469528198, "logps/rejected": -1.4319767951965332, "loss": 1.6312, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2766989469528198, "rewards/margins": 0.15527768433094025, "rewards/rejected": -1.4319767951965332, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 9.871413993500036, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.11495999246835709, "logits/rejected": 0.016498660668730736, "logps/chosen": -1.2260228395462036, "logps/rejected": -1.3812894821166992, "loss": 1.6032, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2260228395462036, "rewards/margins": 0.155266672372818, "rewards/rejected": -1.3812894821166992, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 7.83555448376319, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.23881487548351288, "logits/rejected": -0.16763314604759216, "logps/chosen": -1.190349817276001, "logps/rejected": -1.4223787784576416, "loss": 1.5393, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.190349817276001, "rewards/margins": 0.23202869296073914, "rewards/rejected": -1.4223787784576416, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 12.022333269502365, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.16634608805179596, "logits/rejected": -0.06972671300172806, "logps/chosen": -1.260103702545166, "logps/rejected": -1.561883807182312, "loss": 1.5905, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.260103702545166, "rewards/margins": 0.3017801344394684, "rewards/rejected": -1.561883807182312, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 7.107186437787189, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.11475582420825958, "logits/rejected": -0.02629752829670906, "logps/chosen": -1.1744776964187622, "logps/rejected": -1.4737120866775513, "loss": 1.5135, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1744776964187622, "rewards/margins": 0.29923444986343384, "rewards/rejected": -1.4737120866775513, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 7.131379407199912, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.1745968759059906, "logits/rejected": -0.09987455606460571, "logps/chosen": -1.2226725816726685, "logps/rejected": -1.507332444190979, "loss": 1.5716, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2226725816726685, "rewards/margins": 0.28465989232063293, "rewards/rejected": -1.507332444190979, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 7.021912184076167, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.11084862053394318, "logits/rejected": -0.042972978204488754, "logps/chosen": -1.1444134712219238, "logps/rejected": -1.4090802669525146, "loss": 1.5116, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1444134712219238, "rewards/margins": 0.26466673612594604, "rewards/rejected": -1.4090802669525146, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 8.622696824811587, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.2279435098171234, "logits/rejected": -0.17337724566459656, "logps/chosen": -1.1432182788848877, "logps/rejected": -1.488876223564148, "loss": 1.4881, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1432182788848877, "rewards/margins": 0.3456578254699707, "rewards/rejected": -1.488876223564148, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 11.351056337593837, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.26267361640930176, "logits/rejected": -0.18066450953483582, "logps/chosen": -1.2176510095596313, "logps/rejected": -1.4880212545394897, "loss": 1.5592, "rewards/accuracies": 0.625, "rewards/chosen": -1.2176510095596313, "rewards/margins": 0.27037036418914795, "rewards/rejected": -1.4880212545394897, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 7.191875819198297, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.2620295584201813, "logits/rejected": -0.1543290913105011, "logps/chosen": -1.1899316310882568, "logps/rejected": -1.4223182201385498, "loss": 1.5356, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1899316310882568, "rewards/margins": 0.23238661885261536, "rewards/rejected": -1.4223182201385498, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 7.5889949335374345, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.30554115772247314, "logits/rejected": -0.09454164654016495, "logps/chosen": -1.2171602249145508, "logps/rejected": -1.532593011856079, "loss": 1.5557, "rewards/accuracies": 0.625, "rewards/chosen": -1.2171602249145508, "rewards/margins": 0.3154327869415283, "rewards/rejected": -1.532593011856079, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 9.09363561535733, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.2022174596786499, "logits/rejected": -0.10738588869571686, "logps/chosen": -1.196028709411621, "logps/rejected": -1.4597480297088623, "loss": 1.5351, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.196028709411621, "rewards/margins": 0.2637191414833069, "rewards/rejected": -1.4597480297088623, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.054726891219615936, "eval_logits/rejected": 0.12384755164384842, "eval_logps/chosen": -1.3020442724227905, "eval_logps/rejected": -1.495224118232727, "eval_loss": 1.646976113319397, "eval_rewards/accuracies": 0.5712166428565979, "eval_rewards/chosen": -1.3020442724227905, "eval_rewards/margins": 0.19317977130413055, "eval_rewards/rejected": -1.495224118232727, "eval_runtime": 40.3493, "eval_samples_per_second": 33.334, "eval_steps_per_second": 8.352, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 6.706146378331485, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.2207103967666626, "logits/rejected": -0.04271166771650314, "logps/chosen": -1.3544065952301025, "logps/rejected": -1.512176752090454, "loss": 1.6739, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3544065952301025, "rewards/margins": 0.1577700972557068, "rewards/rejected": -1.512176752090454, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 6.468273724297947, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.20493578910827637, "logits/rejected": -0.12602047622203827, "logps/chosen": -1.2647501230239868, "logps/rejected": -1.4369029998779297, "loss": 1.6101, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2647501230239868, "rewards/margins": 0.1721528172492981, "rewards/rejected": -1.4369029998779297, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 7.548959517776111, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.23293106257915497, "logits/rejected": -0.026049653068184853, "logps/chosen": -1.2496126890182495, "logps/rejected": -1.4440643787384033, "loss": 1.6015, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2496126890182495, "rewards/margins": 0.19445185363292694, "rewards/rejected": -1.4440643787384033, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 6.715284489428122, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.28804710507392883, "logits/rejected": -0.17533376812934875, "logps/chosen": -1.2737540006637573, "logps/rejected": -1.4372507333755493, "loss": 1.6193, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2737540006637573, "rewards/margins": 0.1634966880083084, "rewards/rejected": -1.4372507333755493, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 8.172689510884089, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.21455779671669006, "logits/rejected": -0.13440287113189697, "logps/chosen": -1.175466775894165, "logps/rejected": -1.4104111194610596, "loss": 1.5519, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.175466775894165, "rewards/margins": 0.2349444329738617, "rewards/rejected": -1.4104111194610596, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 11.649277739380334, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.2572100758552551, "logits/rejected": -0.19653275609016418, "logps/chosen": -1.193954348564148, "logps/rejected": -1.463118553161621, "loss": 1.54, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.193954348564148, "rewards/margins": 0.26916417479515076, "rewards/rejected": -1.463118553161621, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 13.066068577813956, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.2382601499557495, "logits/rejected": -0.20679374039173126, "logps/chosen": -1.2078959941864014, "logps/rejected": -1.3131279945373535, "loss": 1.6089, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2078959941864014, "rewards/margins": 0.10523198544979095, "rewards/rejected": -1.3131279945373535, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 6.799592401932894, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.2680966258049011, "logits/rejected": -0.24163532257080078, "logps/chosen": -1.177412509918213, "logps/rejected": -1.3566945791244507, "loss": 1.5494, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.177412509918213, "rewards/margins": 0.17928209900856018, "rewards/rejected": -1.3566945791244507, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 9.205016811310943, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.1484549641609192, "logits/rejected": -0.04238913580775261, "logps/chosen": -1.2709609270095825, "logps/rejected": -1.4526519775390625, "loss": 1.6185, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2709609270095825, "rewards/margins": 0.1816910207271576, "rewards/rejected": -1.4526519775390625, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 4.990038669563438, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.27551451325416565, "logits/rejected": -0.19810011982917786, "logps/chosen": -1.2552398443222046, "logps/rejected": -1.4007903337478638, "loss": 1.6253, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2552398443222046, "rewards/margins": 0.14555031061172485, "rewards/rejected": -1.4007903337478638, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 6.9657305893367125, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.19223785400390625, "logits/rejected": -0.14967408776283264, "logps/chosen": -1.2315384149551392, "logps/rejected": -1.4193180799484253, "loss": 1.5764, "rewards/accuracies": 0.625, "rewards/chosen": -1.2315384149551392, "rewards/margins": 0.18777960538864136, "rewards/rejected": -1.4193180799484253, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 8.656762165165723, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.07751107960939407, "logits/rejected": -0.005736204795539379, "logps/chosen": -1.1673355102539062, "logps/rejected": -1.341541051864624, "loss": 1.5375, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1673355102539062, "rewards/margins": 0.17420557141304016, "rewards/rejected": -1.341541051864624, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 8.216756052327828, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.28622791171073914, "logits/rejected": -0.21848344802856445, "logps/chosen": -1.234943151473999, "logps/rejected": -1.4843318462371826, "loss": 1.5576, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.234943151473999, "rewards/margins": 0.2493886649608612, "rewards/rejected": -1.4843318462371826, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 11.15955657755358, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.17828765511512756, "logits/rejected": -0.005265363492071629, "logps/chosen": -1.1935495138168335, "logps/rejected": -1.4459011554718018, "loss": 1.5417, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1935495138168335, "rewards/margins": 0.25235164165496826, "rewards/rejected": -1.4459011554718018, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 7.989961925106653, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.24111512303352356, "logits/rejected": -0.09941961616277695, "logps/chosen": -1.2860196828842163, "logps/rejected": -1.4652435779571533, "loss": 1.62, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2860196828842163, "rewards/margins": 0.17922398447990417, "rewards/rejected": -1.4652435779571533, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 5.6274107718592115, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.2330934703350067, "logits/rejected": -0.13141149282455444, "logps/chosen": -1.26702880859375, "logps/rejected": -1.4907028675079346, "loss": 1.6039, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.26702880859375, "rewards/margins": 0.22367417812347412, "rewards/rejected": -1.4907028675079346, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 6.8913373247329535, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.21193423867225647, "logits/rejected": -0.1271488070487976, "logps/chosen": -1.2517024278640747, "logps/rejected": -1.4006750583648682, "loss": 1.6414, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2517024278640747, "rewards/margins": 0.14897270500659943, "rewards/rejected": -1.4006750583648682, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 6.812174740485534, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.26948246359825134, "logits/rejected": -0.12425342947244644, "logps/chosen": -1.224528431892395, "logps/rejected": -1.4087930917739868, "loss": 1.5841, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.224528431892395, "rewards/margins": 0.18426451086997986, "rewards/rejected": -1.4087930917739868, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 7.8071610053875045, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.23521646857261658, "logits/rejected": -0.17201289534568787, "logps/chosen": -1.3097665309906006, "logps/rejected": -1.4396437406539917, "loss": 1.6505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3097665309906006, "rewards/margins": 0.12987719476222992, "rewards/rejected": -1.4396437406539917, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 9.759600570549564, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.1733095496892929, "logits/rejected": -0.06978510320186615, "logps/chosen": -1.159136414527893, "logps/rejected": -1.4183191061019897, "loss": 1.5377, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.159136414527893, "rewards/margins": 0.25918275117874146, "rewards/rejected": -1.4183191061019897, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 9.654762879413962, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.23652561008930206, "logits/rejected": -0.07516833394765854, "logps/chosen": -1.2808523178100586, "logps/rejected": -1.449417233467102, "loss": 1.6213, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2808523178100586, "rewards/margins": 0.16856491565704346, "rewards/rejected": -1.449417233467102, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 9.366010754246267, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.16323630511760712, "logits/rejected": -0.02795899473130703, "logps/chosen": -1.2069051265716553, "logps/rejected": -1.3920739889144897, "loss": 1.5824, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2069051265716553, "rewards/margins": 0.18516886234283447, "rewards/rejected": -1.3920739889144897, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 7.544247300712321, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.15321019291877747, "logits/rejected": -0.12837058305740356, "logps/chosen": -1.2154773473739624, "logps/rejected": -1.4970439672470093, "loss": 1.5524, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2154773473739624, "rewards/margins": 0.28156667947769165, "rewards/rejected": -1.4970439672470093, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 5.510570855361945, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.2098352015018463, "logits/rejected": -0.1655178815126419, "logps/chosen": -1.1754869222640991, "logps/rejected": -1.4288277626037598, "loss": 1.5211, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1754869222640991, "rewards/margins": 0.25334078073501587, "rewards/rejected": -1.4288277626037598, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 6.781154016442021, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.24368247389793396, "logits/rejected": -0.08154761046171188, "logps/chosen": -1.1996369361877441, "logps/rejected": -1.4158093929290771, "loss": 1.5593, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1996369361877441, "rewards/margins": 0.216172456741333, "rewards/rejected": -1.4158093929290771, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 7.839360609152996, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.19044779241085052, "logits/rejected": -0.12619850039482117, "logps/chosen": -1.2049930095672607, "logps/rejected": -1.464005470275879, "loss": 1.5452, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2049930095672607, "rewards/margins": 0.2590124011039734, "rewards/rejected": -1.464005470275879, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 14.741267901523582, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.22283372282981873, "logits/rejected": -0.2287733107805252, "logps/chosen": -1.2565338611602783, "logps/rejected": -1.4918019771575928, "loss": 1.6078, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2565338611602783, "rewards/margins": 0.23526807129383087, "rewards/rejected": -1.4918019771575928, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 8.455044723284791, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.3040529787540436, "logits/rejected": -0.23242278397083282, "logps/chosen": -1.1781367063522339, "logps/rejected": -1.4392648935317993, "loss": 1.5309, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1781367063522339, "rewards/margins": 0.26112794876098633, "rewards/rejected": -1.4392648935317993, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 8.127198255996031, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.15838944911956787, "logits/rejected": -0.04505031183362007, "logps/chosen": -1.276071310043335, "logps/rejected": -1.5455598831176758, "loss": 1.6046, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.276071310043335, "rewards/margins": 0.2694885730743408, "rewards/rejected": -1.5455598831176758, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 8.103668914229091, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.16998286545276642, "logits/rejected": -0.10186684131622314, "logps/chosen": -1.1686853170394897, "logps/rejected": -1.4343395233154297, "loss": 1.5359, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1686853170394897, "rewards/margins": 0.26565423607826233, "rewards/rejected": -1.4343395233154297, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 7.996804928577198, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.1501077264547348, "logits/rejected": -0.17684204876422882, "logps/chosen": -1.2057361602783203, "logps/rejected": -1.426270604133606, "loss": 1.5515, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2057361602783203, "rewards/margins": 0.2205342799425125, "rewards/rejected": -1.426270604133606, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 9.335124344977482, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.21425195038318634, "logits/rejected": -0.09165969491004944, "logps/chosen": -1.2601537704467773, "logps/rejected": -1.4697338342666626, "loss": 1.6069, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2601537704467773, "rewards/margins": 0.20958003401756287, "rewards/rejected": -1.4697338342666626, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 7.097111005955174, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.21242213249206543, "logits/rejected": -0.1380898356437683, "logps/chosen": -1.2612653970718384, "logps/rejected": -1.4888466596603394, "loss": 1.608, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2612653970718384, "rewards/margins": 0.2275814265012741, "rewards/rejected": -1.4888466596603394, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 8.531276498829367, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.12863633036613464, "logits/rejected": -0.08679869771003723, "logps/chosen": -1.239980936050415, "logps/rejected": -1.4835368394851685, "loss": 1.5748, "rewards/accuracies": 0.65625, "rewards/chosen": -1.239980936050415, "rewards/margins": 0.2435559332370758, "rewards/rejected": -1.4835368394851685, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 7.0822481308862395, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.2195165604352951, "logits/rejected": -0.1345038115978241, "logps/chosen": -1.2192291021347046, "logps/rejected": -1.3896890878677368, "loss": 1.5897, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2192291021347046, "rewards/margins": 0.17045992612838745, "rewards/rejected": -1.3896890878677368, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 10.335725680656012, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.23852384090423584, "logits/rejected": -0.05886412411928177, "logps/chosen": -1.1707592010498047, "logps/rejected": -1.449096918106079, "loss": 1.5156, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1707592010498047, "rewards/margins": 0.2783377468585968, "rewards/rejected": -1.449096918106079, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 8.753214305093897, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.26844969391822815, "logits/rejected": -0.269203782081604, "logps/chosen": -1.144785761833191, "logps/rejected": -1.4154605865478516, "loss": 1.5112, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.144785761833191, "rewards/margins": 0.27067479491233826, "rewards/rejected": -1.4154605865478516, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 8.49101681170719, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.13564220070838928, "logits/rejected": -0.04429326206445694, "logps/chosen": -1.223623514175415, "logps/rejected": -1.435447096824646, "loss": 1.579, "rewards/accuracies": 0.625, "rewards/chosen": -1.223623514175415, "rewards/margins": 0.21182358264923096, "rewards/rejected": -1.435447096824646, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 5.32479960697816, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.29307204484939575, "logits/rejected": -0.1578172743320465, "logps/chosen": -1.2666765451431274, "logps/rejected": -1.4149402379989624, "loss": 1.6249, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2666765451431274, "rewards/margins": 0.14826364815235138, "rewards/rejected": -1.4149402379989624, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 8.502020695065282, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.0959613025188446, "logits/rejected": -0.11763976514339447, "logps/chosen": -1.271305799484253, "logps/rejected": -1.502671480178833, "loss": 1.5997, "rewards/accuracies": 0.625, "rewards/chosen": -1.271305799484253, "rewards/margins": 0.23136551678180695, "rewards/rejected": -1.502671480178833, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 8.408928564528509, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.26660507917404175, "logits/rejected": -0.20372562110424042, "logps/chosen": -1.2004365921020508, "logps/rejected": -1.4662361145019531, "loss": 1.5209, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2004365921020508, "rewards/margins": 0.26579946279525757, "rewards/rejected": -1.4662361145019531, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 8.460265723960784, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.2261292189359665, "logits/rejected": -0.1469893455505371, "logps/chosen": -1.2134878635406494, "logps/rejected": -1.5519218444824219, "loss": 1.5517, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2134878635406494, "rewards/margins": 0.33843404054641724, "rewards/rejected": -1.5519218444824219, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 6.811278118003559, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.24250388145446777, "logits/rejected": -0.1430424600839615, "logps/chosen": -1.2435731887817383, "logps/rejected": -1.452593445777893, "loss": 1.5939, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2435731887817383, "rewards/margins": 0.20902028679847717, "rewards/rejected": -1.452593445777893, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 7.76141628727321, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.2618526816368103, "logits/rejected": -0.09927698969841003, "logps/chosen": -1.2269976139068604, "logps/rejected": -1.4207863807678223, "loss": 1.5909, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2269976139068604, "rewards/margins": 0.19378860294818878, "rewards/rejected": -1.4207863807678223, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 10.727996484376538, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.2516542673110962, "logits/rejected": -0.11354745924472809, "logps/chosen": -1.2602417469024658, "logps/rejected": -1.5199130773544312, "loss": 1.5865, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2602417469024658, "rewards/margins": 0.2596711814403534, "rewards/rejected": -1.5199130773544312, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 7.809550665640668, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.2582088112831116, "logits/rejected": -0.20305652916431427, "logps/chosen": -1.2808090448379517, "logps/rejected": -1.5809587240219116, "loss": 1.6193, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2808090448379517, "rewards/margins": 0.30014973878860474, "rewards/rejected": -1.5809587240219116, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 7.570062948249775, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.20256300270557404, "logits/rejected": -0.10187848657369614, "logps/chosen": -1.2940313816070557, "logps/rejected": -1.3799540996551514, "loss": 1.6734, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2940313816070557, "rewards/margins": 0.08592281490564346, "rewards/rejected": -1.3799540996551514, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 8.623862268653017, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.18112261593341827, "logits/rejected": -0.08644866943359375, "logps/chosen": -1.2399652004241943, "logps/rejected": -1.3614873886108398, "loss": 1.6195, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2399652004241943, "rewards/margins": 0.12152229249477386, "rewards/rejected": -1.3614873886108398, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 7.106966197584022, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.27653664350509644, "logits/rejected": -0.1859886646270752, "logps/chosen": -1.2457199096679688, "logps/rejected": -1.371375322341919, "loss": 1.6025, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2457199096679688, "rewards/margins": 0.12565529346466064, "rewards/rejected": -1.371375322341919, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 5.110825815541059, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.18636533617973328, "logits/rejected": -0.04834170266985893, "logps/chosen": -1.2373005151748657, "logps/rejected": -1.4027382135391235, "loss": 1.6122, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2373005151748657, "rewards/margins": 0.16543760895729065, "rewards/rejected": -1.4027382135391235, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 10.93040123034802, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.2494935244321823, "logits/rejected": -0.05917506664991379, "logps/chosen": -1.2552902698516846, "logps/rejected": -1.499660611152649, "loss": 1.5887, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2552902698516846, "rewards/margins": 0.2443702518939972, "rewards/rejected": -1.499660611152649, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 8.524387736147663, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.20921842753887177, "logits/rejected": -0.10822540521621704, "logps/chosen": -1.172518014907837, "logps/rejected": -1.4659488201141357, "loss": 1.5203, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.172518014907837, "rewards/margins": 0.29343074560165405, "rewards/rejected": -1.4659488201141357, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 6.648049567413863, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.2961587607860565, "logits/rejected": -0.15879955887794495, "logps/chosen": -1.222677230834961, "logps/rejected": -1.4485093355178833, "loss": 1.5618, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.222677230834961, "rewards/margins": 0.22583195567131042, "rewards/rejected": -1.4485093355178833, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 5.8745856311887374, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.1480863094329834, "logits/rejected": 0.050052572041749954, "logps/chosen": -1.243753433227539, "logps/rejected": -1.594008207321167, "loss": 1.5783, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.243753433227539, "rewards/margins": 0.3502548336982727, "rewards/rejected": -1.594008207321167, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 8.907531457349748, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.19933749735355377, "logits/rejected": -0.13979306817054749, "logps/chosen": -1.297931432723999, "logps/rejected": -1.497729778289795, "loss": 1.6489, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.297931432723999, "rewards/margins": 0.19979847967624664, "rewards/rejected": -1.497729778289795, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 6.864862563404964, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.21919536590576172, "logits/rejected": -0.1593218892812729, "logps/chosen": -1.1974241733551025, "logps/rejected": -1.3887784481048584, "loss": 1.5465, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1974241733551025, "rewards/margins": 0.19135454297065735, "rewards/rejected": -1.3887784481048584, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 10.299591143212481, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.1530156284570694, "logits/rejected": -0.15994104743003845, "logps/chosen": -1.1667227745056152, "logps/rejected": -1.4124925136566162, "loss": 1.5057, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1667227745056152, "rewards/margins": 0.24576973915100098, "rewards/rejected": -1.4124925136566162, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 10.554924882674959, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.2538110017776489, "logits/rejected": -0.10423139482736588, "logps/chosen": -1.2699533700942993, "logps/rejected": -1.4364902973175049, "loss": 1.6115, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2699533700942993, "rewards/margins": 0.16653692722320557, "rewards/rejected": -1.4364902973175049, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 8.924876754305254, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.2252012938261032, "logits/rejected": -0.1906716227531433, "logps/chosen": -1.2317861318588257, "logps/rejected": -1.4582042694091797, "loss": 1.5814, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2317861318588257, "rewards/margins": 0.226418137550354, "rewards/rejected": -1.4582042694091797, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 10.591288986335153, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.21802549064159393, "logits/rejected": -0.1272871047258377, "logps/chosen": -1.2291576862335205, "logps/rejected": -1.3705837726593018, "loss": 1.5957, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2291576862335205, "rewards/margins": 0.14142607152462006, "rewards/rejected": -1.3705837726593018, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 9.327629730160833, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.2663048207759857, "logits/rejected": -0.16262254118919373, "logps/chosen": -1.2088749408721924, "logps/rejected": -1.4837599992752075, "loss": 1.5425, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2088749408721924, "rewards/margins": 0.27488499879837036, "rewards/rejected": -1.4837599992752075, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 8.388252875949682, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.2186436653137207, "logits/rejected": -0.053496263921260834, "logps/chosen": -1.2235801219940186, "logps/rejected": -1.4754079580307007, "loss": 1.5735, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2235801219940186, "rewards/margins": 0.25182777643203735, "rewards/rejected": -1.4754079580307007, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 7.222944816268537, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.1922692358493805, "logits/rejected": -0.07148855179548264, "logps/chosen": -1.222347617149353, "logps/rejected": -1.4011558294296265, "loss": 1.5932, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.222347617149353, "rewards/margins": 0.17880816757678986, "rewards/rejected": -1.4011558294296265, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 8.36618947670397, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.22373394668102264, "logits/rejected": -0.12639454007148743, "logps/chosen": -1.2792359590530396, "logps/rejected": -1.401538610458374, "loss": 1.6406, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2792359590530396, "rewards/margins": 0.12230267375707626, "rewards/rejected": -1.401538610458374, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 11.456603790802449, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.23932285606861115, "logits/rejected": -0.10981453955173492, "logps/chosen": -1.2031185626983643, "logps/rejected": -1.4842259883880615, "loss": 1.5514, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2031185626983643, "rewards/margins": 0.28110748529434204, "rewards/rejected": -1.4842259883880615, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 6.560458780045564, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.20181819796562195, "logits/rejected": -0.12014786154031754, "logps/chosen": -1.304861307144165, "logps/rejected": -1.436704397201538, "loss": 1.6624, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.304861307144165, "rewards/margins": 0.13184314966201782, "rewards/rejected": -1.436704397201538, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 10.365098773976653, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.18095768988132477, "logits/rejected": -0.1348801553249359, "logps/chosen": -1.2820155620574951, "logps/rejected": -1.4945478439331055, "loss": 1.6453, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2820155620574951, "rewards/margins": 0.21253237128257751, "rewards/rejected": -1.4945478439331055, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 8.709818303845928, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.2264154851436615, "logits/rejected": -0.09404493868350983, "logps/chosen": -1.1498174667358398, "logps/rejected": -1.5352723598480225, "loss": 1.492, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1498174667358398, "rewards/margins": 0.3854547142982483, "rewards/rejected": -1.5352723598480225, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 8.88602868081274, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.2641579508781433, "logits/rejected": -0.19507022202014923, "logps/chosen": -1.3444536924362183, "logps/rejected": -1.3795907497406006, "loss": 1.7046, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3444536924362183, "rewards/margins": 0.03513716533780098, "rewards/rejected": -1.3795907497406006, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 7.721172501362392, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.16946734488010406, "logits/rejected": -0.11795884370803833, "logps/chosen": -1.2383652925491333, "logps/rejected": -1.499885082244873, "loss": 1.572, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2383652925491333, "rewards/margins": 0.2615198493003845, "rewards/rejected": -1.499885082244873, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 6.637011020428752, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.25504356622695923, "logits/rejected": -0.1839907467365265, "logps/chosen": -1.274584174156189, "logps/rejected": -1.4763041734695435, "loss": 1.5997, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.274584174156189, "rewards/margins": 0.20172002911567688, "rewards/rejected": -1.4763041734695435, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 7.410794123821557, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.31483370065689087, "logits/rejected": -0.1500522792339325, "logps/chosen": -1.319117546081543, "logps/rejected": -1.4547805786132812, "loss": 1.6529, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.319117546081543, "rewards/margins": 0.13566307723522186, "rewards/rejected": -1.4547805786132812, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 7.819488623849646, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.13828317821025848, "logits/rejected": -0.14891335368156433, "logps/chosen": -1.1876940727233887, "logps/rejected": -1.4420838356018066, "loss": 1.5432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1876940727233887, "rewards/margins": 0.25438982248306274, "rewards/rejected": -1.4420838356018066, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 9.917352217947807, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.18322905898094177, "logits/rejected": -0.07606031000614166, "logps/chosen": -1.1553547382354736, "logps/rejected": -1.5348331928253174, "loss": 1.4964, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1553547382354736, "rewards/margins": 0.3794783651828766, "rewards/rejected": -1.5348331928253174, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 10.565204282699165, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.32471174001693726, "logits/rejected": -0.17733509838581085, "logps/chosen": -1.2622826099395752, "logps/rejected": -1.4821867942810059, "loss": 1.6017, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2622826099395752, "rewards/margins": 0.21990422904491425, "rewards/rejected": -1.4821867942810059, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 7.235371461274833, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.24023239314556122, "logits/rejected": -0.0995052307844162, "logps/chosen": -1.288403034210205, "logps/rejected": -1.4486147165298462, "loss": 1.6281, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.288403034210205, "rewards/margins": 0.1602117270231247, "rewards/rejected": -1.4486147165298462, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 9.729259628403861, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.26643213629722595, "logits/rejected": -0.1396816521883011, "logps/chosen": -1.2085447311401367, "logps/rejected": -1.5265811681747437, "loss": 1.5236, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2085447311401367, "rewards/margins": 0.31803658604621887, "rewards/rejected": -1.5265811681747437, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 6.5834768933781245, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.22660765051841736, "logits/rejected": -0.12067173421382904, "logps/chosen": -1.2112690210342407, "logps/rejected": -1.3970801830291748, "loss": 1.5741, "rewards/accuracies": 0.625, "rewards/chosen": -1.2112690210342407, "rewards/margins": 0.1858113557100296, "rewards/rejected": -1.3970801830291748, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 6.820345187978874, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.19299550354480743, "logits/rejected": -0.07580564171075821, "logps/chosen": -1.2356846332550049, "logps/rejected": -1.5556024312973022, "loss": 1.565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2356846332550049, "rewards/margins": 0.319917768239975, "rewards/rejected": -1.5556024312973022, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 6.141813056666793, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.28624340891838074, "logits/rejected": -0.1574813276529312, "logps/chosen": -1.198730230331421, "logps/rejected": -1.4840584993362427, "loss": 1.5307, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.198730230331421, "rewards/margins": 0.28532809019088745, "rewards/rejected": -1.4840584993362427, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": -0.006359766703099012, "eval_logits/rejected": 0.05871352180838585, "eval_logps/chosen": -1.3054404258728027, "eval_logps/rejected": -1.5042866468429565, "eval_loss": 1.6468573808670044, "eval_rewards/accuracies": 0.5712166428565979, "eval_rewards/chosen": -1.3054404258728027, "eval_rewards/margins": 0.1988460123538971, "eval_rewards/rejected": -1.5042866468429565, "eval_runtime": 40.3603, "eval_samples_per_second": 33.325, "eval_steps_per_second": 8.35, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 7.648024547279085, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.2932616174221039, "logits/rejected": -0.2517506182193756, "logps/chosen": -1.2127629518508911, "logps/rejected": -1.3618614673614502, "loss": 1.5902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2127629518508911, "rewards/margins": 0.1490984857082367, "rewards/rejected": -1.3618614673614502, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 6.568924993201521, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.26170510053634644, "logits/rejected": -0.23782913386821747, "logps/chosen": -1.3037235736846924, "logps/rejected": -1.5700328350067139, "loss": 1.6319, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3037235736846924, "rewards/margins": 0.26630908250808716, "rewards/rejected": -1.5700328350067139, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 7.537474141208627, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.1868119090795517, "logits/rejected": -0.05980759859085083, "logps/chosen": -1.2489304542541504, "logps/rejected": -1.447834849357605, "loss": 1.5896, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2489304542541504, "rewards/margins": 0.19890446960926056, "rewards/rejected": -1.447834849357605, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 8.586632763586499, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.2403072863817215, "logits/rejected": -0.17797724902629852, "logps/chosen": -1.279308795928955, "logps/rejected": -1.402352213859558, "loss": 1.6493, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.279308795928955, "rewards/margins": 0.12304346263408661, "rewards/rejected": -1.402352213859558, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 8.382363884634668, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.21785800158977509, "logits/rejected": -0.12086410820484161, "logps/chosen": -1.191899061203003, "logps/rejected": -1.4459514617919922, "loss": 1.5283, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.191899061203003, "rewards/margins": 0.2540523111820221, "rewards/rejected": -1.4459514617919922, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 5.937326547288543, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.22209754586219788, "logits/rejected": -0.09047858417034149, "logps/chosen": -1.2848058938980103, "logps/rejected": -1.501165747642517, "loss": 1.6148, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2848058938980103, "rewards/margins": 0.21635976433753967, "rewards/rejected": -1.501165747642517, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 6.600559161032941, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.3405889868736267, "logits/rejected": -0.16396290063858032, "logps/chosen": -1.1788181066513062, "logps/rejected": -1.4323985576629639, "loss": 1.5256, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1788181066513062, "rewards/margins": 0.2535802125930786, "rewards/rejected": -1.4323985576629639, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 9.478762843203286, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.29350167512893677, "logits/rejected": -0.129825621843338, "logps/chosen": -1.2769209146499634, "logps/rejected": -1.4102004766464233, "loss": 1.6316, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2769209146499634, "rewards/margins": 0.1332794427871704, "rewards/rejected": -1.4102004766464233, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 8.174733232105966, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.26454395055770874, "logits/rejected": -0.11497775465250015, "logps/chosen": -1.229234218597412, "logps/rejected": -1.4280798435211182, "loss": 1.6064, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.229234218597412, "rewards/margins": 0.1988455057144165, "rewards/rejected": -1.4280798435211182, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 9.864670169770434, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.20456938445568085, "logits/rejected": -0.09650013595819473, "logps/chosen": -1.286020040512085, "logps/rejected": -1.4996373653411865, "loss": 1.6172, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.286020040512085, "rewards/margins": 0.21361732482910156, "rewards/rejected": -1.4996373653411865, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 8.026644075668933, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.2971116900444031, "logits/rejected": -0.22733533382415771, "logps/chosen": -1.2687150239944458, "logps/rejected": -1.4610542058944702, "loss": 1.6074, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2687150239944458, "rewards/margins": 0.1923392117023468, "rewards/rejected": -1.4610542058944702, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 7.4530620599680395, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.29837164282798767, "logits/rejected": -0.10467962175607681, "logps/chosen": -1.1872581243515015, "logps/rejected": -1.5156866312026978, "loss": 1.5082, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1872581243515015, "rewards/margins": 0.3284284472465515, "rewards/rejected": -1.5156866312026978, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 6.948344264565798, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.29605644941329956, "logits/rejected": -0.14458665251731873, "logps/chosen": -1.2869765758514404, "logps/rejected": -1.433555245399475, "loss": 1.6447, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2869765758514404, "rewards/margins": 0.1465785950422287, "rewards/rejected": -1.433555245399475, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 8.034435333341417, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.16029222309589386, "logits/rejected": -0.09777076542377472, "logps/chosen": -1.1405402421951294, "logps/rejected": -1.3531912565231323, "loss": 1.526, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1405402421951294, "rewards/margins": 0.21265116333961487, "rewards/rejected": -1.3531912565231323, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 8.062583397766922, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.2398650348186493, "logits/rejected": -0.048020146787166595, "logps/chosen": -1.2048254013061523, "logps/rejected": -1.4288790225982666, "loss": 1.5512, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2048254013061523, "rewards/margins": 0.22405338287353516, "rewards/rejected": -1.4288790225982666, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 8.999501778068137, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.19769065082073212, "logits/rejected": -0.13914498686790466, "logps/chosen": -1.1431434154510498, "logps/rejected": -1.3402740955352783, "loss": 1.5346, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1431434154510498, "rewards/margins": 0.19713075459003448, "rewards/rejected": -1.3402740955352783, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 11.91169582300977, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.25468355417251587, "logits/rejected": -0.16460008919239044, "logps/chosen": -1.2626378536224365, "logps/rejected": -1.5172367095947266, "loss": 1.6013, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2626378536224365, "rewards/margins": 0.25459879636764526, "rewards/rejected": -1.5172367095947266, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 7.198831095781075, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.2648671269416809, "logits/rejected": -0.08476381003856659, "logps/chosen": -1.2206342220306396, "logps/rejected": -1.51417875289917, "loss": 1.5395, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2206342220306396, "rewards/margins": 0.2935445010662079, "rewards/rejected": -1.51417875289917, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 9.336637921006375, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.31433624029159546, "logits/rejected": -0.13823679089546204, "logps/chosen": -1.2059760093688965, "logps/rejected": -1.4413530826568604, "loss": 1.5535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2059760093688965, "rewards/margins": 0.23537695407867432, "rewards/rejected": -1.4413530826568604, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 6.755312518786944, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.12695498764514923, "logits/rejected": -0.13664838671684265, "logps/chosen": -1.2372138500213623, "logps/rejected": -1.433397650718689, "loss": 1.5933, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2372138500213623, "rewards/margins": 0.19618380069732666, "rewards/rejected": -1.433397650718689, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 10.439826793339416, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.273817241191864, "logits/rejected": -0.15390698611736298, "logps/chosen": -1.2211644649505615, "logps/rejected": -1.5253816843032837, "loss": 1.5589, "rewards/accuracies": 0.625, "rewards/chosen": -1.2211644649505615, "rewards/margins": 0.30421727895736694, "rewards/rejected": -1.5253816843032837, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 11.234889573389628, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.3066954016685486, "logits/rejected": -0.20369398593902588, "logps/chosen": -1.1758203506469727, "logps/rejected": -1.3948097229003906, "loss": 1.5495, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1758203506469727, "rewards/margins": 0.21898940205574036, "rewards/rejected": -1.3948097229003906, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 6.189244157909716, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.2799038290977478, "logits/rejected": -0.22861607372760773, "logps/chosen": -1.2325727939605713, "logps/rejected": -1.4817126989364624, "loss": 1.5636, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2325727939605713, "rewards/margins": 0.24913974106311798, "rewards/rejected": -1.4817126989364624, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 7.701897641900147, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.24316546320915222, "logits/rejected": -0.18140380084514618, "logps/chosen": -1.2099978923797607, "logps/rejected": -1.4995787143707275, "loss": 1.5331, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2099978923797607, "rewards/margins": 0.28958070278167725, "rewards/rejected": -1.4995787143707275, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 10.315218953869728, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.3166322112083435, "logits/rejected": -0.16723382472991943, "logps/chosen": -1.3230161666870117, "logps/rejected": -1.5725752115249634, "loss": 1.6338, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3230161666870117, "rewards/margins": 0.2495591938495636, "rewards/rejected": -1.5725752115249634, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 6.37458432411914, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.28852128982543945, "logits/rejected": -0.2155466377735138, "logps/chosen": -1.2530457973480225, "logps/rejected": -1.438676118850708, "loss": 1.6031, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2530457973480225, "rewards/margins": 0.18563035130500793, "rewards/rejected": -1.438676118850708, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 6.944524704067793, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.15872615575790405, "logits/rejected": -0.08008311688899994, "logps/chosen": -1.2024890184402466, "logps/rejected": -1.4294945001602173, "loss": 1.5812, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2024890184402466, "rewards/margins": 0.22700531780719757, "rewards/rejected": -1.4294945001602173, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 7.221047565580255, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.1756131798028946, "logits/rejected": -0.12765046954154968, "logps/chosen": -1.1767234802246094, "logps/rejected": -1.3959977626800537, "loss": 1.5461, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1767234802246094, "rewards/margins": 0.21927419304847717, "rewards/rejected": -1.3959977626800537, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 7.195056037934172, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.22789350152015686, "logits/rejected": -0.11055190861225128, "logps/chosen": -1.1836001873016357, "logps/rejected": -1.4245049953460693, "loss": 1.5593, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1836001873016357, "rewards/margins": 0.24090464413166046, "rewards/rejected": -1.4245049953460693, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 10.498404475913253, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.27692264318466187, "logits/rejected": -0.18639354407787323, "logps/chosen": -1.1665010452270508, "logps/rejected": -1.4428874254226685, "loss": 1.5453, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1665010452270508, "rewards/margins": 0.2763864994049072, "rewards/rejected": -1.4428874254226685, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 7.8526590966783125, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.2460877150297165, "logits/rejected": -0.19337491691112518, "logps/chosen": -1.281903624534607, "logps/rejected": -1.4304178953170776, "loss": 1.627, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.281903624534607, "rewards/margins": 0.14851422607898712, "rewards/rejected": -1.4304178953170776, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 9.928785492272352, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.2748548090457916, "logits/rejected": -0.13906711339950562, "logps/chosen": -1.1843833923339844, "logps/rejected": -1.3841426372528076, "loss": 1.5605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1843833923339844, "rewards/margins": 0.19975917041301727, "rewards/rejected": -1.3841426372528076, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 9.3550788719883, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.21005089581012726, "logits/rejected": -0.154231995344162, "logps/chosen": -1.285735011100769, "logps/rejected": -1.4750357866287231, "loss": 1.6221, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.285735011100769, "rewards/margins": 0.1893007755279541, "rewards/rejected": -1.4750357866287231, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 6.711557098770309, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.17600271105766296, "logits/rejected": -0.05018197372555733, "logps/chosen": -1.2361449003219604, "logps/rejected": -1.4916973114013672, "loss": 1.5731, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2361449003219604, "rewards/margins": 0.2555524706840515, "rewards/rejected": -1.4916973114013672, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 7.170275009538907, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.1657809019088745, "logits/rejected": -0.12726573646068573, "logps/chosen": -1.2311041355133057, "logps/rejected": -1.4348998069763184, "loss": 1.5876, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2311041355133057, "rewards/margins": 0.20379574596881866, "rewards/rejected": -1.4348998069763184, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 8.412859806198218, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.25646305084228516, "logits/rejected": -0.22181513905525208, "logps/chosen": -1.1789541244506836, "logps/rejected": -1.4659755229949951, "loss": 1.5227, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1789541244506836, "rewards/margins": 0.2870217263698578, "rewards/rejected": -1.4659755229949951, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 6.681635468763091, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.20302769541740417, "logits/rejected": -0.09072286635637283, "logps/chosen": -1.2906471490859985, "logps/rejected": -1.4585412740707397, "loss": 1.6575, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2906471490859985, "rewards/margins": 0.1678941547870636, "rewards/rejected": -1.4585412740707397, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 6.894788190860359, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.3316098749637604, "logits/rejected": -0.1765943318605423, "logps/chosen": -1.181456446647644, "logps/rejected": -1.4261738061904907, "loss": 1.5514, "rewards/accuracies": 0.59375, "rewards/chosen": -1.181456446647644, "rewards/margins": 0.2447173297405243, "rewards/rejected": -1.4261738061904907, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 7.58771593401615, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.2860944867134094, "logits/rejected": -0.19084931910037994, "logps/chosen": -1.2366198301315308, "logps/rejected": -1.508881688117981, "loss": 1.5823, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2366198301315308, "rewards/margins": 0.2722617983818054, "rewards/rejected": -1.508881688117981, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 8.280607864909799, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.27023738622665405, "logits/rejected": -0.049452733248472214, "logps/chosen": -1.28057861328125, "logps/rejected": -1.5539510250091553, "loss": 1.6078, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.28057861328125, "rewards/margins": 0.2733725607395172, "rewards/rejected": -1.5539510250091553, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 8.548402118132431, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.2092166692018509, "logits/rejected": -0.1112133041024208, "logps/chosen": -1.2589666843414307, "logps/rejected": -1.5114405155181885, "loss": 1.6017, "rewards/accuracies": 0.625, "rewards/chosen": -1.2589666843414307, "rewards/margins": 0.2524738907814026, "rewards/rejected": -1.5114405155181885, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 8.033466740124519, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.24130284786224365, "logits/rejected": -0.11197362095117569, "logps/chosen": -1.2412917613983154, "logps/rejected": -1.5120117664337158, "loss": 1.5583, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2412917613983154, "rewards/margins": 0.2707201838493347, "rewards/rejected": -1.5120117664337158, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 7.046513210480663, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.18820837140083313, "logits/rejected": -0.07801246643066406, "logps/chosen": -1.2589250802993774, "logps/rejected": -1.547240972518921, "loss": 1.577, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2589250802993774, "rewards/margins": 0.28831595182418823, "rewards/rejected": -1.547240972518921, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 7.508135876465592, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.20133718848228455, "logits/rejected": -0.03770292550325394, "logps/chosen": -1.1520895957946777, "logps/rejected": -1.4470627307891846, "loss": 1.499, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1520895957946777, "rewards/margins": 0.29497310519218445, "rewards/rejected": -1.4470627307891846, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 6.466550652115422, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.32645565271377563, "logits/rejected": -0.28997671604156494, "logps/chosen": -1.2724857330322266, "logps/rejected": -1.4714481830596924, "loss": 1.5952, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2724857330322266, "rewards/margins": 0.1989624947309494, "rewards/rejected": -1.4714481830596924, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 11.735900395678097, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.19582371413707733, "logits/rejected": -0.12573130428791046, "logps/chosen": -1.198366403579712, "logps/rejected": -1.4611613750457764, "loss": 1.5604, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.198366403579712, "rewards/margins": 0.2627950608730316, "rewards/rejected": -1.4611613750457764, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 9.902997434035893, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.13688859343528748, "logits/rejected": -0.017780795693397522, "logps/chosen": -1.2002695798873901, "logps/rejected": -1.4998315572738647, "loss": 1.564, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2002695798873901, "rewards/margins": 0.2995621860027313, "rewards/rejected": -1.4998315572738647, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 7.351790490600173, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.1672445833683014, "logits/rejected": -0.07436300069093704, "logps/chosen": -1.2341772317886353, "logps/rejected": -1.4727962017059326, "loss": 1.5704, "rewards/accuracies": 0.625, "rewards/chosen": -1.2341772317886353, "rewards/margins": 0.2386188954114914, "rewards/rejected": -1.4727962017059326, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 9.39778011100013, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.2063862830400467, "logits/rejected": -0.0940953716635704, "logps/chosen": -1.2115062475204468, "logps/rejected": -1.47126305103302, "loss": 1.5636, "rewards/accuracies": 0.625, "rewards/chosen": -1.2115062475204468, "rewards/margins": 0.25975683331489563, "rewards/rejected": -1.47126305103302, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 8.479287513353329, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.24118804931640625, "logits/rejected": -0.03551122546195984, "logps/chosen": -1.242187738418579, "logps/rejected": -1.541870355606079, "loss": 1.5544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.242187738418579, "rewards/margins": 0.2996828556060791, "rewards/rejected": -1.541870355606079, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 7.894086829697386, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.25328195095062256, "logits/rejected": -0.1307801753282547, "logps/chosen": -1.2145090103149414, "logps/rejected": -1.3983103036880493, "loss": 1.5689, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2145090103149414, "rewards/margins": 0.18380117416381836, "rewards/rejected": -1.3983103036880493, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 6.646646491542126, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.23050686717033386, "logits/rejected": -0.12764891982078552, "logps/chosen": -1.2896080017089844, "logps/rejected": -1.4467109441757202, "loss": 1.6302, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2896080017089844, "rewards/margins": 0.1571030169725418, "rewards/rejected": -1.4467109441757202, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 7.353091005940125, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.2040676325559616, "logits/rejected": -0.14961931109428406, "logps/chosen": -1.3131300210952759, "logps/rejected": -1.5262869596481323, "loss": 1.6443, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3131300210952759, "rewards/margins": 0.21315693855285645, "rewards/rejected": -1.5262869596481323, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 8.29195579521603, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.21293942630290985, "logits/rejected": -0.10854591429233551, "logps/chosen": -1.2803435325622559, "logps/rejected": -1.4527544975280762, "loss": 1.6204, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2803435325622559, "rewards/margins": 0.17241086065769196, "rewards/rejected": -1.4527544975280762, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 6.8924834068001415, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.16460387408733368, "logits/rejected": -0.043806709349155426, "logps/chosen": -1.186786413192749, "logps/rejected": -1.441338300704956, "loss": 1.5477, "rewards/accuracies": 0.5625, "rewards/chosen": -1.186786413192749, "rewards/margins": 0.2545519173145294, "rewards/rejected": -1.441338300704956, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 8.946938575965241, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.1750279814004898, "logits/rejected": -0.08176834881305695, "logps/chosen": -1.2342102527618408, "logps/rejected": -1.4532325267791748, "loss": 1.5877, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2342102527618408, "rewards/margins": 0.21902227401733398, "rewards/rejected": -1.4532325267791748, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 6.154934974488931, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.2888779044151306, "logits/rejected": -0.13938526809215546, "logps/chosen": -1.244964838027954, "logps/rejected": -1.4479520320892334, "loss": 1.5898, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.244964838027954, "rewards/margins": 0.20298714935779572, "rewards/rejected": -1.4479520320892334, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 9.230252365717345, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.20475709438323975, "logits/rejected": -0.17029550671577454, "logps/chosen": -1.263304352760315, "logps/rejected": -1.4428774118423462, "loss": 1.6101, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.263304352760315, "rewards/margins": 0.17957308888435364, "rewards/rejected": -1.4428774118423462, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 8.54427486444681, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.19619551301002502, "logits/rejected": -0.035579193383455276, "logps/chosen": -1.2540932893753052, "logps/rejected": -1.4471417665481567, "loss": 1.5896, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2540932893753052, "rewards/margins": 0.19304853677749634, "rewards/rejected": -1.4471417665481567, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 7.084263280815062, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.28747788071632385, "logits/rejected": -0.19399094581604004, "logps/chosen": -1.2100915908813477, "logps/rejected": -1.4136308431625366, "loss": 1.5647, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2100915908813477, "rewards/margins": 0.20353949069976807, "rewards/rejected": -1.4136308431625366, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 8.845836063800892, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.14692391455173492, "logits/rejected": -0.13063344359397888, "logps/chosen": -1.2915900945663452, "logps/rejected": -1.4874224662780762, "loss": 1.6382, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2915900945663452, "rewards/margins": 0.19583231210708618, "rewards/rejected": -1.4874224662780762, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 11.014357071728812, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.2220824509859085, "logits/rejected": -0.10035743564367294, "logps/chosen": -1.2579433917999268, "logps/rejected": -1.4760496616363525, "loss": 1.5911, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2579433917999268, "rewards/margins": 0.2181061953306198, "rewards/rejected": -1.4760496616363525, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 7.105325272287114, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.22952616214752197, "logits/rejected": -0.21391217410564423, "logps/chosen": -1.3182785511016846, "logps/rejected": -1.5258429050445557, "loss": 1.649, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3182785511016846, "rewards/margins": 0.20756426453590393, "rewards/rejected": -1.5258429050445557, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 9.793604028079923, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.20207378268241882, "logits/rejected": -0.13237614929676056, "logps/chosen": -1.1758029460906982, "logps/rejected": -1.4300979375839233, "loss": 1.5208, "rewards/accuracies": 0.625, "rewards/chosen": -1.1758029460906982, "rewards/margins": 0.2542952001094818, "rewards/rejected": -1.4300979375839233, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 8.129358537041927, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.24870195984840393, "logits/rejected": -0.19785283505916595, "logps/chosen": -1.2585840225219727, "logps/rejected": -1.4193689823150635, "loss": 1.6106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2585840225219727, "rewards/margins": 0.16078488528728485, "rewards/rejected": -1.4193689823150635, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 8.531145146991735, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.21363727748394012, "logits/rejected": -0.12141523510217667, "logps/chosen": -1.1580511331558228, "logps/rejected": -1.3999485969543457, "loss": 1.5276, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1580511331558228, "rewards/margins": 0.2418973743915558, "rewards/rejected": -1.3999485969543457, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 7.250451028832405, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.2147807627916336, "logits/rejected": -0.27469998598098755, "logps/chosen": -1.277307152748108, "logps/rejected": -1.4533443450927734, "loss": 1.6183, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.277307152748108, "rewards/margins": 0.17603713274002075, "rewards/rejected": -1.4533443450927734, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 8.974364799626747, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.26902931928634644, "logits/rejected": -0.15223777294158936, "logps/chosen": -1.2735518217086792, "logps/rejected": -1.530316948890686, "loss": 1.6067, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2735518217086792, "rewards/margins": 0.25676512718200684, "rewards/rejected": -1.530316948890686, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 7.666730812815336, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.20124347507953644, "logits/rejected": -0.16961103677749634, "logps/chosen": -1.2784777879714966, "logps/rejected": -1.5050462484359741, "loss": 1.6, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2784777879714966, "rewards/margins": 0.2265685349702835, "rewards/rejected": -1.5050462484359741, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 6.24962077094031, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.17841601371765137, "logits/rejected": -0.024863218888640404, "logps/chosen": -1.218346357345581, "logps/rejected": -1.6240530014038086, "loss": 1.5231, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.218346357345581, "rewards/margins": 0.4057064950466156, "rewards/rejected": -1.6240530014038086, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 6.504494487290139, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.23551194369792938, "logits/rejected": -0.0994628295302391, "logps/chosen": -1.1515239477157593, "logps/rejected": -1.4859282970428467, "loss": 1.498, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1515239477157593, "rewards/margins": 0.334404319524765, "rewards/rejected": -1.4859282970428467, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 7.107319168779414, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.26813262701034546, "logits/rejected": -0.14674730598926544, "logps/chosen": -1.166414499282837, "logps/rejected": -1.3925293684005737, "loss": 1.5349, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.166414499282837, "rewards/margins": 0.22611498832702637, "rewards/rejected": -1.3925293684005737, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 8.378977507372232, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.28723758459091187, "logits/rejected": -0.13835971057415009, "logps/chosen": -1.2562470436096191, "logps/rejected": -1.4372190237045288, "loss": 1.6221, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2562470436096191, "rewards/margins": 0.1809719353914261, "rewards/rejected": -1.4372190237045288, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 8.238179195381722, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.22701935470104218, "logits/rejected": -0.14575409889221191, "logps/chosen": -1.2587096691131592, "logps/rejected": -1.5133172273635864, "loss": 1.5878, "rewards/accuracies": 0.625, "rewards/chosen": -1.2587096691131592, "rewards/margins": 0.25460749864578247, "rewards/rejected": -1.5133172273635864, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 5.54586496974905, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.09544014185667038, "logits/rejected": -0.0030840635299682617, "logps/chosen": -1.1870863437652588, "logps/rejected": -1.4353337287902832, "loss": 1.5336, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1870863437652588, "rewards/margins": 0.24824753403663635, "rewards/rejected": -1.4353337287902832, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 5.120314060587667, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.18808437883853912, "logits/rejected": -0.12491951137781143, "logps/chosen": -1.2355691194534302, "logps/rejected": -1.4354352951049805, "loss": 1.585, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2355691194534302, "rewards/margins": 0.1998661756515503, "rewards/rejected": -1.4354352951049805, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 10.532994020488944, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.18425069749355316, "logits/rejected": -0.14060840010643005, "logps/chosen": -1.2594051361083984, "logps/rejected": -1.4246543645858765, "loss": 1.6132, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2594051361083984, "rewards/margins": 0.16524925827980042, "rewards/rejected": -1.4246543645858765, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 6.432786636668071, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.27169930934906006, "logits/rejected": -0.15257199108600616, "logps/chosen": -1.2576349973678589, "logps/rejected": -1.5024917125701904, "loss": 1.5822, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2576349973678589, "rewards/margins": 0.2448567897081375, "rewards/rejected": -1.5024917125701904, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 10.938717134054878, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.17738035321235657, "logits/rejected": -0.12643186748027802, "logps/chosen": -1.2729908227920532, "logps/rejected": -1.4721641540527344, "loss": 1.6237, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2729908227920532, "rewards/margins": 0.19917339086532593, "rewards/rejected": -1.4721641540527344, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 8.268168215582811, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.21264667809009552, "logits/rejected": -0.1251252144575119, "logps/chosen": -1.1901899576187134, "logps/rejected": -1.4447002410888672, "loss": 1.5433, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1901899576187134, "rewards/margins": 0.2545102834701538, "rewards/rejected": -1.4447002410888672, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.08799929171800613, "eval_logits/rejected": 0.1608801633119583, "eval_logps/chosen": -1.3037397861480713, "eval_logps/rejected": -1.5017025470733643, "eval_loss": 1.647184133529663, "eval_rewards/accuracies": 0.5727003216743469, "eval_rewards/chosen": -1.3037397861480713, "eval_rewards/margins": 0.19796282052993774, "eval_rewards/rejected": -1.5017025470733643, "eval_runtime": 40.3397, "eval_samples_per_second": 33.342, "eval_steps_per_second": 8.354, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 10.004139345742539, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.27661585807800293, "logits/rejected": -0.17738543450832367, "logps/chosen": -1.298222541809082, "logps/rejected": -1.5642954111099243, "loss": 1.6178, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.298222541809082, "rewards/margins": 0.2660728991031647, "rewards/rejected": -1.5642954111099243, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 6.853794272791884, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.18757258355617523, "logits/rejected": -0.07676851749420166, "logps/chosen": -1.3012917041778564, "logps/rejected": -1.524106740951538, "loss": 1.6321, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3012917041778564, "rewards/margins": 0.22281484305858612, "rewards/rejected": -1.524106740951538, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 8.424319277147559, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.18077567219734192, "logits/rejected": -0.09020836651325226, "logps/chosen": -1.1457854509353638, "logps/rejected": -1.5183522701263428, "loss": 1.4771, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1457854509353638, "rewards/margins": 0.37256690859794617, "rewards/rejected": -1.5183522701263428, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 8.710967305387417, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.19583284854888916, "logits/rejected": -0.1298530250787735, "logps/chosen": -1.3064498901367188, "logps/rejected": -1.4972361326217651, "loss": 1.6298, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3064498901367188, "rewards/margins": 0.19078609347343445, "rewards/rejected": -1.4972361326217651, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 9.934704564510238, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.1697060763835907, "logits/rejected": -0.0799294114112854, "logps/chosen": -1.2031030654907227, "logps/rejected": -1.3971145153045654, "loss": 1.571, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2031030654907227, "rewards/margins": 0.19401133060455322, "rewards/rejected": -1.3971145153045654, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 5.558747942426494, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.2419288605451584, "logits/rejected": -0.12492205202579498, "logps/chosen": -1.2873618602752686, "logps/rejected": -1.44034743309021, "loss": 1.6322, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2873618602752686, "rewards/margins": 0.15298554301261902, "rewards/rejected": -1.44034743309021, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 10.588648455241415, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.34422606229782104, "logits/rejected": -0.20205549895763397, "logps/chosen": -1.2288485765457153, "logps/rejected": -1.431166172027588, "loss": 1.5837, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2288485765457153, "rewards/margins": 0.20231764018535614, "rewards/rejected": -1.431166172027588, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 17.41639008267266, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.27671462297439575, "logits/rejected": -0.12988321483135223, "logps/chosen": -1.2465171813964844, "logps/rejected": -1.4433683156967163, "loss": 1.5895, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2465171813964844, "rewards/margins": 0.19685105979442596, "rewards/rejected": -1.4433683156967163, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 6.963596714901478, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.27144211530685425, "logits/rejected": -0.16773834824562073, "logps/chosen": -1.2030506134033203, "logps/rejected": -1.4597923755645752, "loss": 1.532, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2030506134033203, "rewards/margins": 0.2567417025566101, "rewards/rejected": -1.4597923755645752, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 10.818305557457998, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.3696734309196472, "logits/rejected": -0.24278569221496582, "logps/chosen": -1.1997416019439697, "logps/rejected": -1.4444429874420166, "loss": 1.5504, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1997416019439697, "rewards/margins": 0.24470162391662598, "rewards/rejected": -1.4444429874420166, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 11.424581546227913, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.2225084751844406, "logits/rejected": -0.10350972414016724, "logps/chosen": -1.1904308795928955, "logps/rejected": -1.3949419260025024, "loss": 1.5781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1904308795928955, "rewards/margins": 0.20451101660728455, "rewards/rejected": -1.3949419260025024, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 10.680690720703248, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.2484455406665802, "logits/rejected": -0.2033940851688385, "logps/chosen": -1.1515166759490967, "logps/rejected": -1.5101279020309448, "loss": 1.4904, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1515166759490967, "rewards/margins": 0.35861116647720337, "rewards/rejected": -1.5101279020309448, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 7.766892508530793, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.33021020889282227, "logits/rejected": -0.2669282555580139, "logps/chosen": -1.2164661884307861, "logps/rejected": -1.5209804773330688, "loss": 1.5522, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2164661884307861, "rewards/margins": 0.3045143187046051, "rewards/rejected": -1.5209804773330688, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 10.5121533070436, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.20856007933616638, "logits/rejected": -0.1037999615073204, "logps/chosen": -1.2410916090011597, "logps/rejected": -1.4437034130096436, "loss": 1.5985, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2410916090011597, "rewards/margins": 0.20261195302009583, "rewards/rejected": -1.4437034130096436, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 8.287978704385738, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.09526379406452179, "logits/rejected": -0.04287983849644661, "logps/chosen": -1.2036092281341553, "logps/rejected": -1.5484676361083984, "loss": 1.5442, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2036092281341553, "rewards/margins": 0.34485840797424316, "rewards/rejected": -1.5484676361083984, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 8.942252036294636, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.15273435413837433, "logits/rejected": -0.06922443211078644, "logps/chosen": -1.1939882040023804, "logps/rejected": -1.490462064743042, "loss": 1.5264, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1939882040023804, "rewards/margins": 0.29647380113601685, "rewards/rejected": -1.490462064743042, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 8.136911208657267, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.17341378331184387, "logits/rejected": -0.11079660803079605, "logps/chosen": -1.3053932189941406, "logps/rejected": -1.567337155342102, "loss": 1.6352, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3053932189941406, "rewards/margins": 0.26194387674331665, "rewards/rejected": -1.567337155342102, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 9.058068642708994, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.2817673981189728, "logits/rejected": -0.14578992128372192, "logps/chosen": -1.2361321449279785, "logps/rejected": -1.505322813987732, "loss": 1.5715, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2361321449279785, "rewards/margins": 0.2691906988620758, "rewards/rejected": -1.505322813987732, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 6.941136983346939, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.2306617945432663, "logits/rejected": -0.15258941054344177, "logps/chosen": -1.2696197032928467, "logps/rejected": -1.424823522567749, "loss": 1.6234, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2696197032928467, "rewards/margins": 0.15520384907722473, "rewards/rejected": -1.424823522567749, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 6.028115156070567, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.17112164199352264, "logits/rejected": -0.056985218077898026, "logps/chosen": -1.2183990478515625, "logps/rejected": -1.4961133003234863, "loss": 1.5523, "rewards/accuracies": 0.625, "rewards/chosen": -1.2183990478515625, "rewards/margins": 0.27771419286727905, "rewards/rejected": -1.4961133003234863, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 6.413105815703295, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.21311946213245392, "logits/rejected": -0.09097801893949509, "logps/chosen": -1.2271919250488281, "logps/rejected": -1.490767002105713, "loss": 1.5694, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2271919250488281, "rewards/margins": 0.26357507705688477, "rewards/rejected": -1.490767002105713, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 7.187362492638208, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.2537928521633148, "logits/rejected": -0.28167393803596497, "logps/chosen": -1.2362048625946045, "logps/rejected": -1.567087173461914, "loss": 1.5652, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2362048625946045, "rewards/margins": 0.33088216185569763, "rewards/rejected": -1.567087173461914, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 8.332445387426878, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.1759515255689621, "logits/rejected": -0.10240741074085236, "logps/chosen": -1.1517741680145264, "logps/rejected": -1.447211742401123, "loss": 1.4964, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1517741680145264, "rewards/margins": 0.2954375743865967, "rewards/rejected": -1.447211742401123, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 10.163277147162006, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.1583867073059082, "logits/rejected": -0.10426501929759979, "logps/chosen": -1.2298393249511719, "logps/rejected": -1.414184808731079, "loss": 1.5992, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2298393249511719, "rewards/margins": 0.18434542417526245, "rewards/rejected": -1.414184808731079, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 8.515773110713946, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.3868665397167206, "logits/rejected": -0.18404386937618256, "logps/chosen": -1.1990516185760498, "logps/rejected": -1.4621487855911255, "loss": 1.5456, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1990516185760498, "rewards/margins": 0.26309722661972046, "rewards/rejected": -1.4621487855911255, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 6.398725501498622, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.14923153817653656, "logits/rejected": -0.03317415714263916, "logps/chosen": -1.2743452787399292, "logps/rejected": -1.5962750911712646, "loss": 1.5927, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2743452787399292, "rewards/margins": 0.32192978262901306, "rewards/rejected": -1.5962750911712646, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 11.077780630597667, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.23723450303077698, "logits/rejected": -0.1026090532541275, "logps/chosen": -1.2323095798492432, "logps/rejected": -1.4321863651275635, "loss": 1.5876, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2323095798492432, "rewards/margins": 0.19987669587135315, "rewards/rejected": -1.4321863651275635, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 6.697254262643319, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.2031714916229248, "logits/rejected": -0.14960214495658875, "logps/chosen": -1.2334299087524414, "logps/rejected": -1.3718469142913818, "loss": 1.5928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2334299087524414, "rewards/margins": 0.13841703534126282, "rewards/rejected": -1.3718469142913818, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 7.426199038991483, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.2201036959886551, "logits/rejected": -0.04712003469467163, "logps/chosen": -1.2281110286712646, "logps/rejected": -1.5359797477722168, "loss": 1.549, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2281110286712646, "rewards/margins": 0.3078688085079193, "rewards/rejected": -1.5359797477722168, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 12.306575673857681, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.24709157645702362, "logits/rejected": -0.1545828878879547, "logps/chosen": -1.191224455833435, "logps/rejected": -1.4444421529769897, "loss": 1.5518, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.191224455833435, "rewards/margins": 0.2532176971435547, "rewards/rejected": -1.4444421529769897, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 8.610634145118038, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.22103190422058105, "logits/rejected": -0.2326325923204422, "logps/chosen": -1.2279322147369385, "logps/rejected": -1.5125610828399658, "loss": 1.5511, "rewards/accuracies": 0.625, "rewards/chosen": -1.2279322147369385, "rewards/margins": 0.28462880849838257, "rewards/rejected": -1.5125610828399658, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 7.307535491883655, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.13648240268230438, "logits/rejected": -0.060978494584560394, "logps/chosen": -1.2389017343521118, "logps/rejected": -1.4436099529266357, "loss": 1.5848, "rewards/accuracies": 0.625, "rewards/chosen": -1.2389017343521118, "rewards/margins": 0.2047082930803299, "rewards/rejected": -1.4436099529266357, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 8.703302267157248, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.24506616592407227, "logits/rejected": -0.14053188264369965, "logps/chosen": -1.1402724981307983, "logps/rejected": -1.392337441444397, "loss": 1.4885, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1402724981307983, "rewards/margins": 0.25206464529037476, "rewards/rejected": -1.392337441444397, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 10.685508236641486, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.15796777606010437, "logits/rejected": -0.10449770838022232, "logps/chosen": -1.1950023174285889, "logps/rejected": -1.4939477443695068, "loss": 1.5286, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1950023174285889, "rewards/margins": 0.2989455759525299, "rewards/rejected": -1.4939477443695068, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 9.097216461292135, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.22597603499889374, "logits/rejected": -0.1907559335231781, "logps/chosen": -1.2609063386917114, "logps/rejected": -1.4642679691314697, "loss": 1.6028, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2609063386917114, "rewards/margins": 0.20336155593395233, "rewards/rejected": -1.4642679691314697, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 9.941018126511944, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.10990214347839355, "logits/rejected": -0.05437646433711052, "logps/chosen": -1.343835473060608, "logps/rejected": -1.556043267250061, "loss": 1.6562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.343835473060608, "rewards/margins": 0.21220776438713074, "rewards/rejected": -1.556043267250061, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 11.559828017336898, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.17771419882774353, "logits/rejected": -0.04531203210353851, "logps/chosen": -1.1872304677963257, "logps/rejected": -1.4659850597381592, "loss": 1.5465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1872304677963257, "rewards/margins": 0.27875441312789917, "rewards/rejected": -1.4659850597381592, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 9.282636949865253, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.27501699328422546, "logits/rejected": -0.09257388114929199, "logps/chosen": -1.1860332489013672, "logps/rejected": -1.4876549243927002, "loss": 1.5319, "rewards/accuracies": 0.625, "rewards/chosen": -1.1860332489013672, "rewards/margins": 0.3016217052936554, "rewards/rejected": -1.4876549243927002, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 9.805693856644686, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.26975321769714355, "logits/rejected": -0.26140162348747253, "logps/chosen": -1.237966537475586, "logps/rejected": -1.4422686100006104, "loss": 1.5988, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.237966537475586, "rewards/margins": 0.20430207252502441, "rewards/rejected": -1.4422686100006104, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 8.328654351915757, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.2382686585187912, "logits/rejected": -0.23010282218456268, "logps/chosen": -1.2322697639465332, "logps/rejected": -1.5280249118804932, "loss": 1.5755, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2322697639465332, "rewards/margins": 0.2957552969455719, "rewards/rejected": -1.5280249118804932, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 9.857601718828215, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.17979669570922852, "logits/rejected": -0.15013888478279114, "logps/chosen": -1.2193511724472046, "logps/rejected": -1.4303195476531982, "loss": 1.5614, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2193511724472046, "rewards/margins": 0.21096833050251007, "rewards/rejected": -1.4303195476531982, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 9.385374558005283, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.1488681137561798, "logits/rejected": -0.0743316262960434, "logps/chosen": -1.350601315498352, "logps/rejected": -1.5454646348953247, "loss": 1.6932, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.350601315498352, "rewards/margins": 0.19486349821090698, "rewards/rejected": -1.5454646348953247, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 10.082983742251782, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.2765267491340637, "logits/rejected": -0.2307772934436798, "logps/chosen": -1.2331907749176025, "logps/rejected": -1.50911545753479, "loss": 1.5738, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2331907749176025, "rewards/margins": 0.2759249806404114, "rewards/rejected": -1.50911545753479, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 8.575174459256017, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.28711315989494324, "logits/rejected": -0.14060531556606293, "logps/chosen": -1.2072677612304688, "logps/rejected": -1.3850557804107666, "loss": 1.582, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2072677612304688, "rewards/margins": 0.17778778076171875, "rewards/rejected": -1.3850557804107666, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 7.781545018454678, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.255064457654953, "logits/rejected": -0.1399548202753067, "logps/chosen": -1.2635090351104736, "logps/rejected": -1.4571034908294678, "loss": 1.6057, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2635090351104736, "rewards/margins": 0.19359445571899414, "rewards/rejected": -1.4571034908294678, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 8.012243534412995, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.2057885229587555, "logits/rejected": -0.16035917401313782, "logps/chosen": -1.229182481765747, "logps/rejected": -1.4483156204223633, "loss": 1.5837, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.229182481765747, "rewards/margins": 0.21913310885429382, "rewards/rejected": -1.4483156204223633, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 14.593694769561147, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.2511407434940338, "logits/rejected": -0.141941636800766, "logps/chosen": -1.2716915607452393, "logps/rejected": -1.4837024211883545, "loss": 1.6227, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2716915607452393, "rewards/margins": 0.21201090514659882, "rewards/rejected": -1.4837024211883545, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 11.589102849621897, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.0357593409717083, "logits/rejected": -0.08599834144115448, "logps/chosen": -1.1936821937561035, "logps/rejected": -1.5037119388580322, "loss": 1.524, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1936821937561035, "rewards/margins": 0.31002962589263916, "rewards/rejected": -1.5037119388580322, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 8.900457362007634, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.19243542850017548, "logits/rejected": -0.16841921210289001, "logps/chosen": -1.1981301307678223, "logps/rejected": -1.4053146839141846, "loss": 1.5488, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1981301307678223, "rewards/margins": 0.20718452334403992, "rewards/rejected": -1.4053146839141846, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 8.919580335367286, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.2229524850845337, "logits/rejected": -0.18007297813892365, "logps/chosen": -1.1097791194915771, "logps/rejected": -1.4422390460968018, "loss": 1.4453, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1097791194915771, "rewards/margins": 0.332459956407547, "rewards/rejected": -1.4422390460968018, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 8.349626905258832, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.3001171350479126, "logits/rejected": -0.23147349059581757, "logps/chosen": -1.1641672849655151, "logps/rejected": -1.4396671056747437, "loss": 1.5375, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1641672849655151, "rewards/margins": 0.2754998803138733, "rewards/rejected": -1.4396671056747437, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 11.847683979223595, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.19615155458450317, "logits/rejected": -0.14950647950172424, "logps/chosen": -1.222930669784546, "logps/rejected": -1.4548113346099854, "loss": 1.5727, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.222930669784546, "rewards/margins": 0.23188063502311707, "rewards/rejected": -1.4548113346099854, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 9.364344351079968, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.24645750224590302, "logits/rejected": -0.05692873150110245, "logps/chosen": -1.286911964416504, "logps/rejected": -1.5462675094604492, "loss": 1.6129, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.286911964416504, "rewards/margins": 0.2593555748462677, "rewards/rejected": -1.5462675094604492, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 7.546798581760435, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.12102420628070831, "logits/rejected": 0.015786726027727127, "logps/chosen": -1.1583021879196167, "logps/rejected": -1.3769385814666748, "loss": 1.5371, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1583021879196167, "rewards/margins": 0.21863646805286407, "rewards/rejected": -1.3769385814666748, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 11.375516386492313, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.20476698875427246, "logits/rejected": -0.14468708634376526, "logps/chosen": -1.2095907926559448, "logps/rejected": -1.340696096420288, "loss": 1.603, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2095907926559448, "rewards/margins": 0.1311051994562149, "rewards/rejected": -1.340696096420288, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 7.390452082521005, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.23655863106250763, "logits/rejected": -0.12614436447620392, "logps/chosen": -1.2767188549041748, "logps/rejected": -1.4665131568908691, "loss": 1.6172, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2767188549041748, "rewards/margins": 0.18979422748088837, "rewards/rejected": -1.4665131568908691, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 10.44433841417844, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.3213575482368469, "logits/rejected": -0.2255897969007492, "logps/chosen": -1.1311696767807007, "logps/rejected": -1.556115746498108, "loss": 1.4573, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1311696767807007, "rewards/margins": 0.42494598031044006, "rewards/rejected": -1.556115746498108, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 11.491098345334695, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.3183497190475464, "logits/rejected": -0.2026023417711258, "logps/chosen": -1.2869782447814941, "logps/rejected": -1.531435251235962, "loss": 1.6113, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2869782447814941, "rewards/margins": 0.2444569617509842, "rewards/rejected": -1.531435251235962, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 6.65963325181154, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.34064042568206787, "logits/rejected": -0.1863982379436493, "logps/chosen": -1.2175891399383545, "logps/rejected": -1.5243316888809204, "loss": 1.5504, "rewards/accuracies": 0.625, "rewards/chosen": -1.2175891399383545, "rewards/margins": 0.3067426383495331, "rewards/rejected": -1.5243316888809204, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 9.51269645374707, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.14729897677898407, "logits/rejected": -0.039565883576869965, "logps/chosen": -1.178842544555664, "logps/rejected": -1.5293989181518555, "loss": 1.5147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.178842544555664, "rewards/margins": 0.3505564332008362, "rewards/rejected": -1.5293989181518555, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 10.398165476971881, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.2192573994398117, "logits/rejected": -0.08377514034509659, "logps/chosen": -1.2447128295898438, "logps/rejected": -1.412784218788147, "loss": 1.5972, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2447128295898438, "rewards/margins": 0.16807132959365845, "rewards/rejected": -1.412784218788147, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 8.038138448432369, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.30199727416038513, "logits/rejected": -0.17887206375598907, "logps/chosen": -1.2027664184570312, "logps/rejected": -1.4390037059783936, "loss": 1.562, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2027664184570312, "rewards/margins": 0.23623719811439514, "rewards/rejected": -1.4390037059783936, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 8.273843305880611, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.30483612418174744, "logits/rejected": -0.1412568837404251, "logps/chosen": -1.2145841121673584, "logps/rejected": -1.5266047716140747, "loss": 1.5396, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2145841121673584, "rewards/margins": 0.3120204210281372, "rewards/rejected": -1.5266047716140747, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 10.10204457670056, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.25015076994895935, "logits/rejected": -0.21441802382469177, "logps/chosen": -1.3197141885757446, "logps/rejected": -1.5619823932647705, "loss": 1.6268, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3197141885757446, "rewards/margins": 0.24226808547973633, "rewards/rejected": -1.5619823932647705, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 15.832997721578597, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.31810659170150757, "logits/rejected": -0.13229572772979736, "logps/chosen": -1.1907761096954346, "logps/rejected": -1.4618098735809326, "loss": 1.5408, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1907761096954346, "rewards/margins": 0.27103400230407715, "rewards/rejected": -1.4618098735809326, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 6.913405186133222, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.37828677892684937, "logits/rejected": -0.21424618363380432, "logps/chosen": -1.2096235752105713, "logps/rejected": -1.5016435384750366, "loss": 1.53, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2096235752105713, "rewards/margins": 0.29201993346214294, "rewards/rejected": -1.5016435384750366, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 8.644851992143936, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.14419499039649963, "logits/rejected": -0.08842798322439194, "logps/chosen": -1.2097914218902588, "logps/rejected": -1.4539039134979248, "loss": 1.5729, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2097914218902588, "rewards/margins": 0.24411258101463318, "rewards/rejected": -1.4539039134979248, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 6.738111830624236, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.16246473789215088, "logits/rejected": -0.05987215042114258, "logps/chosen": -1.2946544885635376, "logps/rejected": -1.4763842821121216, "loss": 1.6436, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2946544885635376, "rewards/margins": 0.18172983825206757, "rewards/rejected": -1.4763842821121216, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 11.981029031922487, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.23299722373485565, "logits/rejected": -0.0805606096982956, "logps/chosen": -1.2046631574630737, "logps/rejected": -1.4061990976333618, "loss": 1.5615, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2046631574630737, "rewards/margins": 0.20153579115867615, "rewards/rejected": -1.4061990976333618, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 5.553668431643315, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.2729364037513733, "logits/rejected": -0.17603769898414612, "logps/chosen": -1.197695016860962, "logps/rejected": -1.4483814239501953, "loss": 1.5524, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.197695016860962, "rewards/margins": 0.25068649649620056, "rewards/rejected": -1.4483814239501953, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 7.683718213238564, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.31468385457992554, "logits/rejected": -0.23054513335227966, "logps/chosen": -1.2684756517410278, "logps/rejected": -1.5039026737213135, "loss": 1.599, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2684756517410278, "rewards/margins": 0.23542693257331848, "rewards/rejected": -1.5039026737213135, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 9.212426590229022, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.2610294222831726, "logits/rejected": -0.20872995257377625, "logps/chosen": -1.1865425109863281, "logps/rejected": -1.550987958908081, "loss": 1.5008, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1865425109863281, "rewards/margins": 0.364445298910141, "rewards/rejected": -1.550987958908081, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 8.081361937247129, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.2479631006717682, "logits/rejected": -0.09326585382223129, "logps/chosen": -1.1892836093902588, "logps/rejected": -1.4418988227844238, "loss": 1.5311, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1892836093902588, "rewards/margins": 0.25261521339416504, "rewards/rejected": -1.4418988227844238, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 8.992632637265395, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.23023590445518494, "logits/rejected": -0.12915799021720886, "logps/chosen": -1.2853100299835205, "logps/rejected": -1.4848616123199463, "loss": 1.63, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2853100299835205, "rewards/margins": 0.199551522731781, "rewards/rejected": -1.4848616123199463, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 8.50942739888827, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.2819350063800812, "logits/rejected": -0.19880495965480804, "logps/chosen": -1.2908051013946533, "logps/rejected": -1.4433648586273193, "loss": 1.6442, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2908051013946533, "rewards/margins": 0.15255983173847198, "rewards/rejected": -1.4433648586273193, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 7.09993439182778, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.18955400586128235, "logits/rejected": -0.16120107471942902, "logps/chosen": -1.2785166501998901, "logps/rejected": -1.5583308935165405, "loss": 1.5895, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2785166501998901, "rewards/margins": 0.2798142433166504, "rewards/rejected": -1.5583308935165405, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 11.962208827975902, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.2284051477909088, "logits/rejected": -0.14482834935188293, "logps/chosen": -1.1373264789581299, "logps/rejected": -1.4195890426635742, "loss": 1.512, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1373264789581299, "rewards/margins": 0.28226256370544434, "rewards/rejected": -1.4195890426635742, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 8.616815056941201, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.3416827917098999, "logits/rejected": -0.22021794319152832, "logps/chosen": -1.224382996559143, "logps/rejected": -1.4130111932754517, "loss": 1.5654, "rewards/accuracies": 0.625, "rewards/chosen": -1.224382996559143, "rewards/margins": 0.18862825632095337, "rewards/rejected": -1.4130111932754517, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 11.488081919358322, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.2214185744524002, "logits/rejected": -0.11094842851161957, "logps/chosen": -1.2285008430480957, "logps/rejected": -1.447881817817688, "loss": 1.5802, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2285008430480957, "rewards/margins": 0.2193809449672699, "rewards/rejected": -1.447881817817688, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 9.903452577728135, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.291803777217865, "logits/rejected": -0.17823100090026855, "logps/chosen": -1.174984335899353, "logps/rejected": -1.419689416885376, "loss": 1.5671, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.174984335899353, "rewards/margins": 0.2447052001953125, "rewards/rejected": -1.419689416885376, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.025222957134246826, "eval_logits/rejected": 0.0927000492811203, "eval_logps/chosen": -1.30295991897583, "eval_logps/rejected": -1.4993714094161987, "eval_loss": 1.6473075151443481, "eval_rewards/accuracies": 0.5719584822654724, "eval_rewards/chosen": -1.30295991897583, "eval_rewards/margins": 0.19641143083572388, "eval_rewards/rejected": -1.4993714094161987, "eval_runtime": 40.3052, "eval_samples_per_second": 33.37, "eval_steps_per_second": 8.361, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 8.17255312752598, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.1715945303440094, "logits/rejected": -0.15807819366455078, "logps/chosen": -1.2178270816802979, "logps/rejected": -1.4111155271530151, "loss": 1.5908, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2178270816802979, "rewards/margins": 0.1932884007692337, "rewards/rejected": -1.4111155271530151, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 8.538930468238522, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.22177401185035706, "logits/rejected": -0.16164055466651917, "logps/chosen": -1.222161054611206, "logps/rejected": -1.4369077682495117, "loss": 1.568, "rewards/accuracies": 0.59375, "rewards/chosen": -1.222161054611206, "rewards/margins": 0.21474671363830566, "rewards/rejected": -1.4369077682495117, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 7.820754902549526, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.22256644070148468, "logits/rejected": -0.1498219519853592, "logps/chosen": -1.2076025009155273, "logps/rejected": -1.5045273303985596, "loss": 1.5444, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2076025009155273, "rewards/margins": 0.29692476987838745, "rewards/rejected": -1.5045273303985596, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 10.773504638231776, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.25352656841278076, "logits/rejected": -0.2084973305463791, "logps/chosen": -1.1974619626998901, "logps/rejected": -1.5066121816635132, "loss": 1.5284, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1974619626998901, "rewards/margins": 0.3091502785682678, "rewards/rejected": -1.5066121816635132, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 8.96727868563191, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.19537881016731262, "logits/rejected": -0.09505072236061096, "logps/chosen": -1.2803305387496948, "logps/rejected": -1.519592046737671, "loss": 1.5941, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2803305387496948, "rewards/margins": 0.23926150798797607, "rewards/rejected": -1.519592046737671, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 9.775911172018159, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.30756497383117676, "logits/rejected": -0.1980385184288025, "logps/chosen": -1.242377519607544, "logps/rejected": -1.4381451606750488, "loss": 1.5894, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.242377519607544, "rewards/margins": 0.19576765596866608, "rewards/rejected": -1.4381451606750488, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 6.16390638342754, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.2471589744091034, "logits/rejected": -0.17163333296775818, "logps/chosen": -1.2506660223007202, "logps/rejected": -1.4531687498092651, "loss": 1.5882, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2506660223007202, "rewards/margins": 0.20250268280506134, "rewards/rejected": -1.4531687498092651, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 9.416370335380098, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.24961507320404053, "logits/rejected": -0.07604103535413742, "logps/chosen": -1.2434465885162354, "logps/rejected": -1.4642319679260254, "loss": 1.617, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2434465885162354, "rewards/margins": 0.22078534960746765, "rewards/rejected": -1.4642319679260254, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 6.819545176184575, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.16510526835918427, "logits/rejected": -0.13053981959819794, "logps/chosen": -1.1947574615478516, "logps/rejected": -1.3906234502792358, "loss": 1.5771, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1947574615478516, "rewards/margins": 0.1958661526441574, "rewards/rejected": -1.3906234502792358, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 7.398688825817576, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.26923614740371704, "logits/rejected": -0.16045144200325012, "logps/chosen": -1.2702943086624146, "logps/rejected": -1.473854422569275, "loss": 1.6096, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2702943086624146, "rewards/margins": 0.20355994999408722, "rewards/rejected": -1.473854422569275, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 8.283410973305017, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.23493504524230957, "logits/rejected": -0.06386389583349228, "logps/chosen": -1.2502615451812744, "logps/rejected": -1.4193909168243408, "loss": 1.6, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2502615451812744, "rewards/margins": 0.16912934184074402, "rewards/rejected": -1.4193909168243408, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 10.62770644227685, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.22654405236244202, "logits/rejected": -0.16856110095977783, "logps/chosen": -1.2229833602905273, "logps/rejected": -1.5214375257492065, "loss": 1.5809, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2229833602905273, "rewards/margins": 0.2984543740749359, "rewards/rejected": -1.5214375257492065, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 8.053002223568207, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.3435817360877991, "logits/rejected": -0.18410401046276093, "logps/chosen": -1.4058219194412231, "logps/rejected": -1.521140456199646, "loss": 1.7366, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4058219194412231, "rewards/margins": 0.1153184324502945, "rewards/rejected": -1.521140456199646, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 6.410906678426636, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.2007797211408615, "logits/rejected": -0.08656108379364014, "logps/chosen": -1.1756919622421265, "logps/rejected": -1.483074426651001, "loss": 1.5043, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1756919622421265, "rewards/margins": 0.30738240480422974, "rewards/rejected": -1.483074426651001, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 8.061132634743135, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.22029061615467072, "logits/rejected": -0.09311743080615997, "logps/chosen": -1.2549231052398682, "logps/rejected": -1.4947597980499268, "loss": 1.5859, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2549231052398682, "rewards/margins": 0.23983661830425262, "rewards/rejected": -1.4947597980499268, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 9.12189454063848, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.21285684406757355, "logits/rejected": -0.0851225033402443, "logps/chosen": -1.2914241552352905, "logps/rejected": -1.4883497953414917, "loss": 1.636, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2914241552352905, "rewards/margins": 0.19692568480968475, "rewards/rejected": -1.4883497953414917, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 7.9275295644342005, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.24292925000190735, "logits/rejected": -0.16514852643013, "logps/chosen": -1.194392204284668, "logps/rejected": -1.4477427005767822, "loss": 1.536, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.194392204284668, "rewards/margins": 0.2533505856990814, "rewards/rejected": -1.4477427005767822, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 6.94883158701022, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.2350497990846634, "logits/rejected": -0.12720319628715515, "logps/chosen": -1.2613648176193237, "logps/rejected": -1.454443097114563, "loss": 1.6147, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2613648176193237, "rewards/margins": 0.1930783987045288, "rewards/rejected": -1.454443097114563, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 6.377123568788396, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.27777573466300964, "logits/rejected": -0.12587803602218628, "logps/chosen": -1.3249562978744507, "logps/rejected": -1.5746320486068726, "loss": 1.6309, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3249562978744507, "rewards/margins": 0.24967575073242188, "rewards/rejected": -1.5746320486068726, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 7.856262939713161, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.24618788063526154, "logits/rejected": -0.2185472697019577, "logps/chosen": -1.2331881523132324, "logps/rejected": -1.558468222618103, "loss": 1.5617, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2331881523132324, "rewards/margins": 0.32528001070022583, "rewards/rejected": -1.558468222618103, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 8.297563864396986, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.22748489677906036, "logits/rejected": -0.11356830596923828, "logps/chosen": -1.2860530614852905, "logps/rejected": -1.5371191501617432, "loss": 1.6336, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2860530614852905, "rewards/margins": 0.2510661780834198, "rewards/rejected": -1.5371191501617432, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 5.8219485752748765, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.34974604845046997, "logits/rejected": -0.14192189276218414, "logps/chosen": -1.1785976886749268, "logps/rejected": -1.4980539083480835, "loss": 1.5294, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1785976886749268, "rewards/margins": 0.3194560408592224, "rewards/rejected": -1.4980539083480835, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 9.582578031201527, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.12283550202846527, "logits/rejected": -0.03970341011881828, "logps/chosen": -1.2159522771835327, "logps/rejected": -1.3907673358917236, "loss": 1.5766, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2159522771835327, "rewards/margins": 0.17481514811515808, "rewards/rejected": -1.3907673358917236, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 9.036023167800439, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.13198763132095337, "logits/rejected": -0.07314762473106384, "logps/chosen": -1.2157835960388184, "logps/rejected": -1.3626552820205688, "loss": 1.5912, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2157835960388184, "rewards/margins": 0.14687182009220123, "rewards/rejected": -1.3626552820205688, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 7.859939253955737, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.18645405769348145, "logits/rejected": -0.08493912220001221, "logps/chosen": -1.2615854740142822, "logps/rejected": -1.5630425214767456, "loss": 1.5806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2615854740142822, "rewards/margins": 0.301457017660141, "rewards/rejected": -1.5630425214767456, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 9.732168161037025, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.1656079888343811, "logits/rejected": -0.11995343863964081, "logps/chosen": -1.1951570510864258, "logps/rejected": -1.4527256488800049, "loss": 1.5305, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1951570510864258, "rewards/margins": 0.25756850838661194, "rewards/rejected": -1.4527256488800049, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 14.634501129963239, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.14711536467075348, "logits/rejected": -0.1679435521364212, "logps/chosen": -1.2382829189300537, "logps/rejected": -1.4174609184265137, "loss": 1.5958, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2382829189300537, "rewards/margins": 0.17917793989181519, "rewards/rejected": -1.4174609184265137, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 8.177412551257165, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.1711091846227646, "logits/rejected": -0.16347761452198029, "logps/chosen": -1.2568700313568115, "logps/rejected": -1.40218985080719, "loss": 1.6302, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2568700313568115, "rewards/margins": 0.14531996846199036, "rewards/rejected": -1.40218985080719, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 6.8081120753218825, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.31831425428390503, "logits/rejected": -0.1833570897579193, "logps/chosen": -1.1927130222320557, "logps/rejected": -1.4259960651397705, "loss": 1.5188, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1927130222320557, "rewards/margins": 0.233283132314682, "rewards/rejected": -1.4259960651397705, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 10.222984434332936, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.1781303584575653, "logits/rejected": -0.2083595246076584, "logps/chosen": -1.3082549571990967, "logps/rejected": -1.506154179573059, "loss": 1.6577, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3082549571990967, "rewards/margins": 0.19789907336235046, "rewards/rejected": -1.506154179573059, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 8.988075682315458, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.2713666558265686, "logits/rejected": -0.19798487424850464, "logps/chosen": -1.239227533340454, "logps/rejected": -1.455033779144287, "loss": 1.5997, "rewards/accuracies": 0.5625, "rewards/chosen": -1.239227533340454, "rewards/margins": 0.21580609679222107, "rewards/rejected": -1.455033779144287, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 9.768561838045548, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.23024603724479675, "logits/rejected": -0.0810357853770256, "logps/chosen": -1.240037202835083, "logps/rejected": -1.486128568649292, "loss": 1.6004, "rewards/accuracies": 0.59375, "rewards/chosen": -1.240037202835083, "rewards/margins": 0.24609121680259705, "rewards/rejected": -1.486128568649292, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 7.270990516203394, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.2629620432853699, "logits/rejected": -0.12923569977283478, "logps/chosen": -1.2358171939849854, "logps/rejected": -1.5153964757919312, "loss": 1.5649, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2358171939849854, "rewards/margins": 0.27957937121391296, "rewards/rejected": -1.5153964757919312, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 9.998922677571656, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.14417660236358643, "logits/rejected": -0.12314148247241974, "logps/chosen": -1.2316958904266357, "logps/rejected": -1.3660876750946045, "loss": 1.6054, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2316958904266357, "rewards/margins": 0.13439175486564636, "rewards/rejected": -1.3660876750946045, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 8.099282766800515, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.21907174587249756, "logits/rejected": -0.1412518322467804, "logps/chosen": -1.1827168464660645, "logps/rejected": -1.5288317203521729, "loss": 1.5092, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1827168464660645, "rewards/margins": 0.34611478447914124, "rewards/rejected": -1.5288317203521729, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 6.138067696587353, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.2404029667377472, "logits/rejected": -0.07165710628032684, "logps/chosen": -1.359360694885254, "logps/rejected": -1.547788381576538, "loss": 1.6632, "rewards/accuracies": 0.59375, "rewards/chosen": -1.359360694885254, "rewards/margins": 0.18842774629592896, "rewards/rejected": -1.547788381576538, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 7.384617991891714, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.18356771767139435, "logits/rejected": -0.12828607857227325, "logps/chosen": -1.2168136835098267, "logps/rejected": -1.3994461297988892, "loss": 1.5823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2168136835098267, "rewards/margins": 0.1826324164867401, "rewards/rejected": -1.3994461297988892, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 6.76595372965001, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.18755054473876953, "logits/rejected": -0.10399661958217621, "logps/chosen": -1.2633230686187744, "logps/rejected": -1.4344213008880615, "loss": 1.6064, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2633230686187744, "rewards/margins": 0.17109842598438263, "rewards/rejected": -1.4344213008880615, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 6.6443189435045795, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.33120718598365784, "logits/rejected": -0.17057207226753235, "logps/chosen": -1.18402898311615, "logps/rejected": -1.4530203342437744, "loss": 1.5123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.18402898311615, "rewards/margins": 0.26899123191833496, "rewards/rejected": -1.4530203342437744, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 7.705572513303001, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.25371435284614563, "logits/rejected": -0.04801352694630623, "logps/chosen": -1.2450672388076782, "logps/rejected": -1.6382758617401123, "loss": 1.5736, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2450672388076782, "rewards/margins": 0.3932085931301117, "rewards/rejected": -1.6382758617401123, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 9.116026612594158, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.1768065094947815, "logits/rejected": -0.15552476048469543, "logps/chosen": -1.3003408908843994, "logps/rejected": -1.5687168836593628, "loss": 1.6192, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3003408908843994, "rewards/margins": 0.2683759927749634, "rewards/rejected": -1.5687168836593628, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 7.581508989260258, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.16222859919071198, "logits/rejected": -0.06536058336496353, "logps/chosen": -1.22344172000885, "logps/rejected": -1.5434781312942505, "loss": 1.56, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.22344172000885, "rewards/margins": 0.3200364410877228, "rewards/rejected": -1.5434781312942505, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 7.875764868054461, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.24861487746238708, "logits/rejected": -0.11518082767724991, "logps/chosen": -1.2504818439483643, "logps/rejected": -1.4509363174438477, "loss": 1.5929, "rewards/accuracies": 0.625, "rewards/chosen": -1.2504818439483643, "rewards/margins": 0.2004544734954834, "rewards/rejected": -1.4509363174438477, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 10.540014977561935, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.20477533340454102, "logits/rejected": -0.1502620428800583, "logps/chosen": -1.1682026386260986, "logps/rejected": -1.3941516876220703, "loss": 1.5416, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1682026386260986, "rewards/margins": 0.22594909369945526, "rewards/rejected": -1.3941516876220703, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 12.9123691767093, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.20697562396526337, "logits/rejected": -0.1116500049829483, "logps/chosen": -1.272203803062439, "logps/rejected": -1.5195564031600952, "loss": 1.6129, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.272203803062439, "rewards/margins": 0.24735260009765625, "rewards/rejected": -1.5195564031600952, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 9.672107526238177, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.23657703399658203, "logits/rejected": -0.10220441967248917, "logps/chosen": -1.2120410203933716, "logps/rejected": -1.5323138236999512, "loss": 1.5388, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2120410203933716, "rewards/margins": 0.3202727437019348, "rewards/rejected": -1.5323138236999512, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 13.32645795297891, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.3036276698112488, "logits/rejected": -0.2224380522966385, "logps/chosen": -1.211949348449707, "logps/rejected": -1.5303452014923096, "loss": 1.5299, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.211949348449707, "rewards/margins": 0.31839582324028015, "rewards/rejected": -1.5303452014923096, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 8.30113982964633, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.24548538029193878, "logits/rejected": -0.1568877398967743, "logps/chosen": -1.2283847332000732, "logps/rejected": -1.5463618040084839, "loss": 1.5675, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2283847332000732, "rewards/margins": 0.3179771304130554, "rewards/rejected": -1.5463618040084839, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 7.124171847107691, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.07658245414495468, "logits/rejected": -0.08402254432439804, "logps/chosen": -1.2022011280059814, "logps/rejected": -1.5855541229248047, "loss": 1.5345, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2022011280059814, "rewards/margins": 0.383353054523468, "rewards/rejected": -1.5855541229248047, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 9.291362822335682, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.16195708513259888, "logits/rejected": -0.07419019192457199, "logps/chosen": -1.243790626525879, "logps/rejected": -1.559624433517456, "loss": 1.566, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.243790626525879, "rewards/margins": 0.3158337473869324, "rewards/rejected": -1.559624433517456, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 7.5983784368398055, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.24264466762542725, "logits/rejected": -0.09939133375883102, "logps/chosen": -1.2212355136871338, "logps/rejected": -1.3946906328201294, "loss": 1.5814, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2212355136871338, "rewards/margins": 0.17345522344112396, "rewards/rejected": -1.3946906328201294, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 8.042593806805142, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.22303660213947296, "logits/rejected": -0.15075257420539856, "logps/chosen": -1.2438392639160156, "logps/rejected": -1.5116565227508545, "loss": 1.568, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2438392639160156, "rewards/margins": 0.2678173780441284, "rewards/rejected": -1.5116565227508545, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 8.522258152746392, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.28066182136535645, "logits/rejected": -0.13141262531280518, "logps/chosen": -1.2957508563995361, "logps/rejected": -1.4685115814208984, "loss": 1.6427, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2957508563995361, "rewards/margins": 0.1727607548236847, "rewards/rejected": -1.4685115814208984, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 10.245778076063425, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.1834983080625534, "logits/rejected": -0.11405984312295914, "logps/chosen": -1.1761218309402466, "logps/rejected": -1.4256806373596191, "loss": 1.5236, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1761218309402466, "rewards/margins": 0.249558687210083, "rewards/rejected": -1.4256806373596191, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 8.151394853140006, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.15573295950889587, "logits/rejected": -0.1990688145160675, "logps/chosen": -1.1482346057891846, "logps/rejected": -1.3903645277023315, "loss": 1.5217, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1482346057891846, "rewards/margins": 0.24212996661663055, "rewards/rejected": -1.3903645277023315, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 7.154479535819093, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.19441083073616028, "logits/rejected": -0.11569847166538239, "logps/chosen": -1.2421464920043945, "logps/rejected": -1.416693925857544, "loss": 1.6012, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2421464920043945, "rewards/margins": 0.17454750835895538, "rewards/rejected": -1.416693925857544, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 8.165676270910435, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.17287132143974304, "logits/rejected": -0.04290672019124031, "logps/chosen": -1.2198317050933838, "logps/rejected": -1.4292641878128052, "loss": 1.577, "rewards/accuracies": 0.625, "rewards/chosen": -1.2198317050933838, "rewards/margins": 0.2094324380159378, "rewards/rejected": -1.4292641878128052, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 10.173550581014478, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.23846104741096497, "logits/rejected": -0.21791306138038635, "logps/chosen": -1.2163054943084717, "logps/rejected": -1.3821016550064087, "loss": 1.5776, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2163054943084717, "rewards/margins": 0.16579614579677582, "rewards/rejected": -1.3821016550064087, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 7.329033604357603, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.30749398469924927, "logits/rejected": -0.140707865357399, "logps/chosen": -1.1801255941390991, "logps/rejected": -1.4713468551635742, "loss": 1.5143, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1801255941390991, "rewards/margins": 0.29122108221054077, "rewards/rejected": -1.4713468551635742, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 7.202329760268409, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.22415634989738464, "logits/rejected": -0.13418297469615936, "logps/chosen": -1.2656097412109375, "logps/rejected": -1.3961628675460815, "loss": 1.6224, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2656097412109375, "rewards/margins": 0.1305532157421112, "rewards/rejected": -1.3961628675460815, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 6.303726793421413, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.1686837077140808, "logits/rejected": -0.17766132950782776, "logps/chosen": -1.2433160543441772, "logps/rejected": -1.4891825914382935, "loss": 1.5733, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2433160543441772, "rewards/margins": 0.24586646258831024, "rewards/rejected": -1.4891825914382935, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 9.119218389688942, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.25223469734191895, "logits/rejected": -0.17962117493152618, "logps/chosen": -1.2419198751449585, "logps/rejected": -1.5848525762557983, "loss": 1.5492, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2419198751449585, "rewards/margins": 0.34293264150619507, "rewards/rejected": -1.5848525762557983, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 9.600831043124986, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.2802756428718567, "logits/rejected": -0.17401473224163055, "logps/chosen": -1.2063791751861572, "logps/rejected": -1.5460056066513062, "loss": 1.534, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2063791751861572, "rewards/margins": 0.3396264612674713, "rewards/rejected": -1.5460056066513062, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 9.948839107290668, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.22035236656665802, "logits/rejected": -0.1833755075931549, "logps/chosen": -1.1798374652862549, "logps/rejected": -1.4181078672409058, "loss": 1.5562, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1798374652862549, "rewards/margins": 0.2382703721523285, "rewards/rejected": -1.4181078672409058, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 6.746792031918658, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.23057572543621063, "logits/rejected": -0.13543887436389923, "logps/chosen": -1.225454568862915, "logps/rejected": -1.559378981590271, "loss": 1.5537, "rewards/accuracies": 0.625, "rewards/chosen": -1.225454568862915, "rewards/margins": 0.33392423391342163, "rewards/rejected": -1.559378981590271, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 12.319687335187103, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.2521504759788513, "logits/rejected": -0.07061384618282318, "logps/chosen": -1.3217427730560303, "logps/rejected": -1.453894019126892, "loss": 1.6689, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3217427730560303, "rewards/margins": 0.13215124607086182, "rewards/rejected": -1.453894019126892, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 10.749252456170122, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.24004845321178436, "logits/rejected": -0.21623799204826355, "logps/chosen": -1.2044014930725098, "logps/rejected": -1.3565657138824463, "loss": 1.5856, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2044014930725098, "rewards/margins": 0.15216435492038727, "rewards/rejected": -1.3565657138824463, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 8.928875115417165, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.32898491621017456, "logits/rejected": -0.19997842609882355, "logps/chosen": -1.2778807878494263, "logps/rejected": -1.4223525524139404, "loss": 1.6185, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2778807878494263, "rewards/margins": 0.14447186887264252, "rewards/rejected": -1.4223525524139404, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 7.423246640192279, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.23562900722026825, "logits/rejected": -0.1592407524585724, "logps/chosen": -1.1718533039093018, "logps/rejected": -1.3776428699493408, "loss": 1.5518, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1718533039093018, "rewards/margins": 0.20578965544700623, "rewards/rejected": -1.3776428699493408, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 7.789031939079042, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.16586032509803772, "logits/rejected": -0.07216402888298035, "logps/chosen": -1.2948392629623413, "logps/rejected": -1.5134284496307373, "loss": 1.6148, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2948392629623413, "rewards/margins": 0.21858926117420197, "rewards/rejected": -1.5134284496307373, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 9.7236517150733, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.2562541365623474, "logits/rejected": -0.09349828958511353, "logps/chosen": -1.3202444314956665, "logps/rejected": -1.5408875942230225, "loss": 1.6399, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3202444314956665, "rewards/margins": 0.22064313292503357, "rewards/rejected": -1.5408875942230225, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 7.980336065142897, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.20060701668262482, "logits/rejected": -0.20425963401794434, "logps/chosen": -1.2538707256317139, "logps/rejected": -1.3540282249450684, "loss": 1.6204, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2538707256317139, "rewards/margins": 0.10015746206045151, "rewards/rejected": -1.3540282249450684, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 11.045387212434024, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.2866879999637604, "logits/rejected": -0.2636876106262207, "logps/chosen": -1.2187350988388062, "logps/rejected": -1.39274001121521, "loss": 1.5827, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2187350988388062, "rewards/margins": 0.17400483787059784, "rewards/rejected": -1.39274001121521, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 9.409135351575326, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.2014199197292328, "logits/rejected": -0.10790824890136719, "logps/chosen": -1.1877915859222412, "logps/rejected": -1.5239720344543457, "loss": 1.5295, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1877915859222412, "rewards/margins": 0.33618029952049255, "rewards/rejected": -1.5239720344543457, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 9.875788186643785, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.2554752826690674, "logits/rejected": -0.1442861109972, "logps/chosen": -1.2132186889648438, "logps/rejected": -1.4958226680755615, "loss": 1.5554, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2132186889648438, "rewards/margins": 0.2826038599014282, "rewards/rejected": -1.4958226680755615, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 8.273370345518318, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.31946608424186707, "logits/rejected": -0.12483116239309311, "logps/chosen": -1.1476317644119263, "logps/rejected": -1.49470055103302, "loss": 1.5048, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1476317644119263, "rewards/margins": 0.3470688462257385, "rewards/rejected": -1.49470055103302, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 9.002525495716036, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.2967630922794342, "logits/rejected": -0.17088404297828674, "logps/chosen": -1.2522767782211304, "logps/rejected": -1.4954627752304077, "loss": 1.5749, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2522767782211304, "rewards/margins": 0.24318604171276093, "rewards/rejected": -1.4954627752304077, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 6.989817783688385, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.23328891396522522, "logits/rejected": -0.20442518591880798, "logps/chosen": -1.2976136207580566, "logps/rejected": -1.4591684341430664, "loss": 1.637, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2976136207580566, "rewards/margins": 0.16155479848384857, "rewards/rejected": -1.4591684341430664, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 9.159023001208364, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.24357542395591736, "logits/rejected": -0.1504550278186798, "logps/chosen": -1.166772484779358, "logps/rejected": -1.4286892414093018, "loss": 1.5248, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.166772484779358, "rewards/margins": 0.26191678643226624, "rewards/rejected": -1.4286892414093018, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 9.005783452720413, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.2362523078918457, "logits/rejected": -0.1721603125333786, "logps/chosen": -1.2057607173919678, "logps/rejected": -1.5258591175079346, "loss": 1.5482, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2057607173919678, "rewards/margins": 0.32009822130203247, "rewards/rejected": -1.5258591175079346, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.016507524996995926, "eval_logits/rejected": 0.083260677754879, "eval_logps/chosen": -1.3030215501785278, "eval_logps/rejected": -1.4992130994796753, "eval_loss": 1.6474518775939941, "eval_rewards/accuracies": 0.5712166428565979, "eval_rewards/chosen": -1.3030215501785278, "eval_rewards/margins": 0.19619165360927582, "eval_rewards/rejected": -1.4992130994796753, "eval_runtime": 40.3232, "eval_samples_per_second": 33.355, "eval_steps_per_second": 8.357, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 1.6264250374623148, "train_runtime": 30148.9193, "train_samples_per_second": 5.949, "train_steps_per_second": 0.186 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }