diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,18170 +10,17036 @@ "log_history": [ { "epoch": 0.002676032781401572, - "grad_norm": 9.17970088825437, + "grad_norm": 11.487560113167183, "learning_rate": 8.9126559714795e-09, - "logits/chosen": -0.07006202638149261, - "logits/rejected": 0.1360432207584381, - "logps/chosen": -1.7161109447479248, - "logps/rejected": -1.8897171020507812, - "loss": 2.289, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.7161109447479248, - "rewards/margins": 0.17360590398311615, - "rewards/rejected": -1.8897171020507812, - "semantic_entropy": 0.6584368348121643, + "logits/chosen": -0.07002891600131989, + "logits/rejected": 0.1360647976398468, + "logps/chosen": -1.7161403894424438, + "logps/rejected": -1.8893934488296509, + "loss": 1.9598, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7161403894424438, + "rewards/margins": 0.17325332760810852, + "rewards/rejected": -1.8893934488296509, "step": 5 }, { "epoch": 0.005352065562803144, - "grad_norm": 22.707738390768604, + "grad_norm": 24.605044963130553, "learning_rate": 1.7825311942959e-08, - "logits/chosen": 0.006351391319185495, - "logits/rejected": 0.12548018991947174, - "logps/chosen": -1.801119089126587, - "logps/rejected": -1.8452117443084717, - "loss": 2.3709, + "logits/chosen": 0.011153340339660645, + "logits/rejected": 0.13156278431415558, + "logps/chosen": -1.8016386032104492, + "logps/rejected": -1.844559907913208, + "loss": 2.0517, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.801119089126587, - "rewards/margins": 0.04409261420369148, - "rewards/rejected": -1.8452117443084717, - "semantic_entropy": 0.6396836042404175, + "rewards/chosen": -1.8016386032104492, + "rewards/margins": 0.0429212786257267, + "rewards/rejected": -1.844559907913208, "step": 10 }, { "epoch": 0.008028098344204716, - "grad_norm": 20.020177676042728, + "grad_norm": 22.47805312136356, "learning_rate": 2.67379679144385e-08, - "logits/chosen": -0.024356648325920105, - "logits/rejected": 0.07323823869228363, - "logps/chosen": -1.6344894170761108, - "logps/rejected": -1.7645975351333618, - "loss": 2.2682, + "logits/chosen": -0.024431750178337097, + "logits/rejected": 0.07315035164356232, + "logps/chosen": -1.6345176696777344, + "logps/rejected": -1.7633349895477295, + "loss": 1.9217, "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.6344894170761108, - "rewards/margins": 0.13010787963867188, - "rewards/rejected": -1.7645975351333618, - "semantic_entropy": 0.693292498588562, + "rewards/chosen": -1.6345176696777344, + "rewards/margins": 0.1288173794746399, + "rewards/rejected": -1.7633349895477295, "step": 15 }, { "epoch": 0.010704131125606288, - "grad_norm": 10.276327712753156, + "grad_norm": 13.11760535567929, "learning_rate": 3.5650623885918e-08, - "logits/chosen": -0.02814987301826477, - "logits/rejected": 0.056642692536115646, - "logps/chosen": -1.725663185119629, - "logps/rejected": -1.8059051036834717, - "loss": 2.3382, + "logits/chosen": -0.03134971112012863, + "logits/rejected": 0.05292157083749771, + "logps/chosen": -1.7249581813812256, + "logps/rejected": -1.8054351806640625, + "loss": 2.0033, "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.725663185119629, - "rewards/margins": 0.08024205267429352, - "rewards/rejected": -1.8059051036834717, - "semantic_entropy": 0.6685150861740112, + "rewards/chosen": -1.7249581813812256, + "rewards/margins": 0.08047701418399811, + "rewards/rejected": -1.8054351806640625, "step": 20 }, { "epoch": 0.013380163907007862, - "grad_norm": 27.180149093707602, + "grad_norm": 30.312039716334574, "learning_rate": 4.45632798573975e-08, - "logits/chosen": -0.043658602982759476, - "logits/rejected": 0.040141694247722626, - "logps/chosen": -1.8687822818756104, - "logps/rejected": -1.7786893844604492, - "loss": 2.4973, + "logits/chosen": -0.04933280870318413, + "logits/rejected": 0.03452097624540329, + "logps/chosen": -1.8688116073608398, + "logps/rejected": -1.7789065837860107, + "loss": 2.1755, "rewards/accuracies": 0.3812499940395355, - "rewards/chosen": -1.8687822818756104, - "rewards/margins": -0.09009285271167755, - "rewards/rejected": -1.7786893844604492, - "semantic_entropy": 0.6433964967727661, + "rewards/chosen": -1.8688116073608398, + "rewards/margins": -0.08990499377250671, + "rewards/rejected": -1.7789065837860107, "step": 25 }, { "epoch": 0.016056196688409432, - "grad_norm": 24.409252182598028, + "grad_norm": 27.26040466944694, "learning_rate": 5.3475935828877e-08, - "logits/chosen": -0.08515056222677231, - "logits/rejected": 0.0056139142252504826, - "logps/chosen": -1.90885329246521, - "logps/rejected": -1.8324095010757446, - "loss": 2.4654, + "logits/chosen": -0.07853099703788757, + "logits/rejected": 0.013800591230392456, + "logps/chosen": -1.908769965171814, + "logps/rejected": -1.8326698541641235, + "loss": 2.1563, "rewards/accuracies": 0.4437499940395355, - "rewards/chosen": -1.90885329246521, - "rewards/margins": -0.07644428312778473, - "rewards/rejected": -1.8324095010757446, - "semantic_entropy": 0.6178733706474304, + "rewards/chosen": -1.908769965171814, + "rewards/margins": -0.0761001706123352, + "rewards/rejected": -1.8326698541641235, "step": 30 }, { "epoch": 0.018732229469811006, - "grad_norm": 16.574736077868895, + "grad_norm": 19.586132665797237, "learning_rate": 6.23885918003565e-08, - "logits/chosen": -0.04204554110765457, - "logits/rejected": 0.11867649853229523, - "logps/chosen": -1.845669150352478, - "logps/rejected": -1.9964662790298462, - "loss": 2.4259, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.845669150352478, - "rewards/margins": 0.15079709887504578, - "rewards/rejected": -1.9964662790298462, - "semantic_entropy": 0.6348816752433777, + "logits/chosen": -0.05052985996007919, + "logits/rejected": 0.10728694498538971, + "logps/chosen": -1.8463678359985352, + "logps/rejected": -1.9973781108856201, + "loss": 2.1094, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.8463678359985352, + "rewards/margins": 0.15101028978824615, + "rewards/rejected": -1.9973781108856201, "step": 35 }, { "epoch": 0.021408262251212576, - "grad_norm": 23.292130959882876, + "grad_norm": 25.702013601620546, "learning_rate": 7.1301247771836e-08, - "logits/chosen": 0.038313932716846466, - "logits/rejected": 0.2096748799085617, - "logps/chosen": -1.8787275552749634, - "logps/rejected": -1.7414252758026123, - "loss": 2.4792, - "rewards/accuracies": 0.45625001192092896, - "rewards/chosen": -1.8787275552749634, - "rewards/margins": -0.13730214536190033, - "rewards/rejected": -1.7414252758026123, - "semantic_entropy": 0.6435689926147461, + "logits/chosen": 0.036237478256225586, + "logits/rejected": 0.20881040394306183, + "logps/chosen": -1.8786709308624268, + "logps/rejected": -1.7416874170303345, + "loss": 2.1575, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.8786709308624268, + "rewards/margins": -0.13698363304138184, + "rewards/rejected": -1.7416874170303345, "step": 40 }, { "epoch": 0.02408429503261415, - "grad_norm": 20.477762582760594, + "grad_norm": 23.35974114943005, "learning_rate": 8.021390374331551e-08, - "logits/chosen": -0.003790763672441244, - "logits/rejected": 0.18562594056129456, - "logps/chosen": -1.8347675800323486, - "logps/rejected": -1.8684990406036377, - "loss": 2.4304, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.8347675800323486, - "rewards/margins": 0.033731609582901, - "rewards/rejected": -1.8684990406036377, - "semantic_entropy": 0.6498380899429321, + "logits/chosen": 0.027120601385831833, + "logits/rejected": 0.22463078796863556, + "logps/chosen": -1.8334786891937256, + "logps/rejected": -1.869138479232788, + "loss": 2.1041, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.8334786891937256, + "rewards/margins": 0.03565989434719086, + "rewards/rejected": -1.869138479232788, "step": 45 }, { "epoch": 0.026760327814015723, - "grad_norm": 24.218166746857367, + "grad_norm": 28.029289251958236, "learning_rate": 8.9126559714795e-08, - "logits/chosen": -0.044481705874204636, - "logits/rejected": 0.10558140277862549, - "logps/chosen": -1.8938385248184204, - "logps/rejected": -1.774763822555542, - "loss": 2.4776, + "logits/chosen": -0.05109367519617081, + "logits/rejected": 0.09518839418888092, + "logps/chosen": -1.8920953273773193, + "logps/rejected": -1.7738196849822998, + "loss": 2.1586, "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.8938385248184204, - "rewards/margins": -0.11907454580068588, - "rewards/rejected": -1.774763822555542, - "semantic_entropy": 0.6348689794540405, + "rewards/chosen": -1.8920953273773193, + "rewards/margins": -0.11827566474676132, + "rewards/rejected": -1.7738196849822998, "step": 50 }, { "epoch": 0.029436360595417294, - "grad_norm": 19.346768220162602, + "grad_norm": 22.434347301056413, "learning_rate": 9.80392156862745e-08, - "logits/chosen": -0.095049187541008, - "logits/rejected": 0.12877969443798065, - "logps/chosen": -1.8224636316299438, - "logps/rejected": -1.8576208353042603, - "loss": 2.4, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.8224636316299438, - "rewards/margins": 0.035157203674316406, - "rewards/rejected": -1.8576208353042603, - "semantic_entropy": 0.6459155082702637, + "logits/chosen": -0.11258158832788467, + "logits/rejected": 0.10594246536493301, + "logps/chosen": -1.8228343725204468, + "logps/rejected": -1.8576141595840454, + "loss": 2.0774, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.8228343725204468, + "rewards/margins": 0.034779977053403854, + "rewards/rejected": -1.8576141595840454, "step": 55 }, { "epoch": 0.032112393376818864, - "grad_norm": 22.14339393235795, + "grad_norm": 25.1075556418722, "learning_rate": 1.06951871657754e-07, - "logits/chosen": -0.06976853311061859, - "logits/rejected": 0.12213323265314102, - "logps/chosen": -1.7787249088287354, - "logps/rejected": -1.8833869695663452, - "loss": 2.3455, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.7787249088287354, - "rewards/margins": 0.1046619862318039, - "rewards/rejected": -1.8833869695663452, - "semantic_entropy": 0.6385133862495422, + "logits/chosen": -0.07487720251083374, + "logits/rejected": 0.11652688682079315, + "logps/chosen": -1.7783119678497314, + "logps/rejected": -1.882073998451233, + "loss": 2.026, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.7783119678497314, + "rewards/margins": 0.1037621945142746, + "rewards/rejected": -1.882073998451233, "step": 60 }, { "epoch": 0.03478842615822044, - "grad_norm": 21.514956821961622, + "grad_norm": 24.457172876439813, "learning_rate": 1.158645276292335e-07, - "logits/chosen": -0.02376721426844597, - "logits/rejected": 0.12463350594043732, - "logps/chosen": -1.6288169622421265, - "logps/rejected": -1.7579704523086548, - "loss": 2.2499, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.6288169622421265, - "rewards/margins": 0.12915340065956116, - "rewards/rejected": -1.7579704523086548, - "semantic_entropy": 0.6990243196487427, + "logits/chosen": -0.02016451396048069, + "logits/rejected": 0.12632247805595398, + "logps/chosen": -1.6278730630874634, + "logps/rejected": -1.7560151815414429, + "loss": 1.8995, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.6278730630874634, + "rewards/margins": 0.12814214825630188, + "rewards/rejected": -1.7560151815414429, "step": 65 }, { "epoch": 0.03746445893962201, - "grad_norm": 21.041922188660656, + "grad_norm": 24.621380365467704, "learning_rate": 1.24777183600713e-07, - "logits/chosen": -0.07015866041183472, - "logits/rejected": 0.08012659847736359, - "logps/chosen": -1.754617691040039, - "logps/rejected": -1.7989566326141357, - "loss": 2.3606, + "logits/chosen": -0.06752300262451172, + "logits/rejected": 0.08107473701238632, + "logps/chosen": -1.7528495788574219, + "logps/rejected": -1.7970850467681885, + "loss": 2.0302, "rewards/accuracies": 0.42500001192092896, - "rewards/chosen": -1.754617691040039, - "rewards/margins": 0.04433891549706459, - "rewards/rejected": -1.7989566326141357, - "semantic_entropy": 0.657590925693512, + "rewards/chosen": -1.7528495788574219, + "rewards/margins": 0.04423557221889496, + "rewards/rejected": -1.7970850467681885, "step": 70 }, { "epoch": 0.04014049172102358, - "grad_norm": 17.482112329039467, + "grad_norm": 20.855832452699083, "learning_rate": 1.3368983957219251e-07, - "logits/chosen": -0.04342065379023552, - "logits/rejected": 0.13985566794872284, - "logps/chosen": -1.7492595911026, - "logps/rejected": -2.0036559104919434, - "loss": 2.318, + "logits/chosen": -0.05488968640565872, + "logits/rejected": 0.12391235679388046, + "logps/chosen": -1.7454535961151123, + "logps/rejected": -2.000891923904419, + "loss": 1.9937, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.7492595911026, - "rewards/margins": 0.2543964982032776, - "rewards/rejected": -2.0036559104919434, - "semantic_entropy": 0.6419968605041504, + "rewards/chosen": -1.7454535961151123, + "rewards/margins": 0.2554382085800171, + "rewards/rejected": -2.000891923904419, "step": 75 }, { "epoch": 0.04281652450242515, - "grad_norm": 15.988514228088759, + "grad_norm": 19.142552795246292, "learning_rate": 1.42602495543672e-07, - "logits/chosen": -0.007324705831706524, - "logits/rejected": 0.09563705325126648, - "logps/chosen": -1.6806221008300781, - "logps/rejected": -1.7140804529190063, - "loss": 2.2999, + "logits/chosen": -0.015638595446944237, + "logits/rejected": 0.08477606624364853, + "logps/chosen": -1.6801780462265015, + "logps/rejected": -1.7129218578338623, + "loss": 1.9599, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.6806221008300781, - "rewards/margins": 0.03345843032002449, - "rewards/rejected": -1.7140804529190063, - "semantic_entropy": 0.6795748472213745, + "rewards/chosen": -1.6801780462265015, + "rewards/margins": 0.03274388611316681, + "rewards/rejected": -1.7129218578338623, "step": 80 }, { "epoch": 0.04549255728382673, - "grad_norm": 11.611154920796695, + "grad_norm": 14.356282609461584, "learning_rate": 1.5151515151515152e-07, - "logits/chosen": -0.17916560173034668, - "logits/rejected": 0.05491837114095688, - "logps/chosen": -1.7472093105316162, - "logps/rejected": -1.9139823913574219, - "loss": 2.3546, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.7472093105316162, - "rewards/margins": 0.1667732149362564, - "rewards/rejected": -1.9139823913574219, - "semantic_entropy": 0.6593233346939087, + "logits/chosen": -0.16840913891792297, + "logits/rejected": 0.07064966857433319, + "logps/chosen": -1.746219277381897, + "logps/rejected": -1.9129278659820557, + "loss": 2.0241, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.746219277381897, + "rewards/margins": 0.16670863330364227, + "rewards/rejected": -1.9129278659820557, "step": 85 }, { "epoch": 0.0481685900652283, - "grad_norm": 22.467095092939147, + "grad_norm": 25.731648485863353, "learning_rate": 1.6042780748663102e-07, - "logits/chosen": 0.07985838502645493, - "logits/rejected": 0.043650977313518524, - "logps/chosen": -1.6980218887329102, - "logps/rejected": -1.7370532751083374, - "loss": 2.3257, - "rewards/accuracies": 0.46875, - "rewards/chosen": -1.6980218887329102, - "rewards/margins": 0.03903146833181381, - "rewards/rejected": -1.7370532751083374, - "semantic_entropy": 0.6805222630500793, + "logits/chosen": 0.08056856691837311, + "logits/rejected": 0.04404681921005249, + "logps/chosen": -1.6959501504898071, + "logps/rejected": -1.7361692190170288, + "loss": 1.9836, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.6959501504898071, + "rewards/margins": 0.040219251066446304, + "rewards/rejected": -1.7361692190170288, "step": 90 }, { "epoch": 0.05084462284662987, - "grad_norm": 26.392041621319432, + "grad_norm": 30.275642149716607, "learning_rate": 1.693404634581105e-07, - "logits/chosen": -0.09472827613353729, - "logits/rejected": 0.051044244319200516, - "logps/chosen": -1.7366969585418701, - "logps/rejected": -1.8614689111709595, - "loss": 2.3268, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.7366969585418701, - "rewards/margins": 0.12477195262908936, - "rewards/rejected": -1.8614689111709595, - "semantic_entropy": 0.6559633612632751, + "logits/chosen": -0.0880376473069191, + "logits/rejected": 0.05738549306988716, + "logps/chosen": -1.7332429885864258, + "logps/rejected": -1.8584188222885132, + "loss": 1.9958, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7332429885864258, + "rewards/margins": 0.1251758486032486, + "rewards/rejected": -1.8584188222885132, "step": 95 }, { "epoch": 0.05352065562803145, - "grad_norm": 11.596621418184593, + "grad_norm": 13.92404217311914, "learning_rate": 1.7825311942959e-07, - "logits/chosen": -0.04228469356894493, - "logits/rejected": 0.019458714872598648, - "logps/chosen": -1.6304876804351807, - "logps/rejected": -1.7368942499160767, - "loss": 2.2454, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.6304876804351807, - "rewards/margins": 0.10640676319599152, - "rewards/rejected": -1.7368942499160767, - "semantic_entropy": 0.6917005777359009, + "logits/chosen": -0.06531379371881485, + "logits/rejected": -0.004916741047054529, + "logps/chosen": -1.6134307384490967, + "logps/rejected": -1.7194831371307373, + "loss": 1.8853, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.6134307384490967, + "rewards/margins": 0.10605257749557495, + "rewards/rejected": -1.7194831371307373, "step": 100 }, { "epoch": 0.05619668840943302, - "grad_norm": 16.464623708925348, + "grad_norm": 20.83071580061446, "learning_rate": 1.8716577540106952e-07, - "logits/chosen": 0.03008785843849182, - "logits/rejected": 0.056171614676713943, - "logps/chosen": -1.5428593158721924, - "logps/rejected": -1.7105385065078735, - "loss": 2.1781, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.5428593158721924, - "rewards/margins": 0.16767899692058563, - "rewards/rejected": -1.7105385065078735, - "semantic_entropy": 0.7140295505523682, + "logits/chosen": 0.024060076102614403, + "logits/rejected": 0.048125751316547394, + "logps/chosen": -1.5307506322860718, + "logps/rejected": -1.7004512548446655, + "loss": 1.8107, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5307506322860718, + "rewards/margins": 0.1697007268667221, + "rewards/rejected": -1.7004512548446655, "step": 105 }, { "epoch": 0.05887272119083459, - "grad_norm": 15.32604700768848, + "grad_norm": 18.9086884907858, "learning_rate": 1.96078431372549e-07, - "logits/chosen": -0.02091386541724205, - "logits/rejected": 0.07242826372385025, - "logps/chosen": -1.532142996788025, - "logps/rejected": -1.5921493768692017, - "loss": 2.2011, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.532142996788025, - "rewards/margins": 0.06000634282827377, - "rewards/rejected": -1.5921493768692017, - "semantic_entropy": 0.7271608710289001, + "logits/chosen": 0.0007185645517893136, + "logits/rejected": 0.0946788415312767, + "logps/chosen": -1.5208451747894287, + "logps/rejected": -1.5815619230270386, + "loss": 1.8283, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5208451747894287, + "rewards/margins": 0.06071670725941658, + "rewards/rejected": -1.5815619230270386, "step": 110 }, { "epoch": 0.06154875397223616, - "grad_norm": 17.01697563425044, + "grad_norm": 20.642901156172965, "learning_rate": 2.049910873440285e-07, - "logits/chosen": -0.007334655616432428, - "logits/rejected": 0.19188645482063293, - "logps/chosen": -1.5300090312957764, - "logps/rejected": -1.7706422805786133, - "loss": 2.1436, + "logits/chosen": 0.0076905665919184685, + "logits/rejected": 0.21130745112895966, + "logps/chosen": -1.5200780630111694, + "logps/rejected": -1.757401466369629, + "loss": 1.7815, "rewards/accuracies": 0.625, - "rewards/chosen": -1.5300090312957764, - "rewards/margins": 0.24063313007354736, - "rewards/rejected": -1.7706422805786133, - "semantic_entropy": 0.7088441848754883, + "rewards/chosen": -1.5200780630111694, + "rewards/margins": 0.23732347786426544, + "rewards/rejected": -1.757401466369629, "step": 115 }, { "epoch": 0.06422478675363773, - "grad_norm": 17.820280607333967, + "grad_norm": 22.300246680384884, "learning_rate": 2.13903743315508e-07, - "logits/chosen": -0.10667898505926132, - "logits/rejected": 0.058054011315107346, - "logps/chosen": -1.5706758499145508, - "logps/rejected": -1.6764158010482788, - "loss": 2.2182, + "logits/chosen": -0.08757462352514267, + "logits/rejected": 0.08249307423830032, + "logps/chosen": -1.5570647716522217, + "logps/rejected": -1.6597316265106201, + "loss": 1.8545, "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.5706758499145508, - "rewards/margins": 0.10573999583721161, - "rewards/rejected": -1.6764158010482788, - "semantic_entropy": 0.707051694393158, + "rewards/chosen": -1.5570647716522217, + "rewards/margins": 0.10266710817813873, + "rewards/rejected": -1.6597316265106201, "step": 120 }, { "epoch": 0.0669008195350393, - "grad_norm": 5.961991097766711, + "grad_norm": 7.451369428561403, "learning_rate": 2.2281639928698751e-07, - "logits/chosen": -0.08696900308132172, - "logits/rejected": 0.043225500732660294, - "logps/chosen": -1.5204510688781738, - "logps/rejected": -1.4943760633468628, - "loss": 2.2164, - "rewards/accuracies": 0.4625000059604645, - "rewards/chosen": -1.5204510688781738, - "rewards/margins": -0.026074940338730812, - "rewards/rejected": -1.4943760633468628, - "semantic_entropy": 0.7435601353645325, + "logits/chosen": -0.08727750927209854, + "logits/rejected": 0.04271925613284111, + "logps/chosen": -1.508993148803711, + "logps/rejected": -1.4839754104614258, + "loss": 1.8361, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.508993148803711, + "rewards/margins": -0.025017833337187767, + "rewards/rejected": -1.4839754104614258, "step": 125 }, { "epoch": 0.06957685231644088, - "grad_norm": 21.999826366025292, + "grad_norm": 26.668725802501854, "learning_rate": 2.31729055258467e-07, - "logits/chosen": 0.011612406000494957, - "logits/rejected": 0.1401824653148651, - "logps/chosen": -1.5498534440994263, - "logps/rejected": -1.6641845703125, - "loss": 2.1912, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.5498534440994263, - "rewards/margins": 0.11433116346597672, - "rewards/rejected": -1.6641845703125, - "semantic_entropy": 0.7085353136062622, + "logits/chosen": 0.032693587243556976, + "logits/rejected": 0.16499973833560944, + "logps/chosen": -1.53310227394104, + "logps/rejected": -1.6465816497802734, + "loss": 1.8239, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.53310227394104, + "rewards/margins": 0.11347933113574982, + "rewards/rejected": -1.6465816497802734, "step": 130 }, { "epoch": 0.07225288509784245, - "grad_norm": 21.874086544749737, + "grad_norm": 22.517570026833223, "learning_rate": 2.406417112299465e-07, - "logits/chosen": -0.07757744938135147, - "logits/rejected": 0.03269508481025696, - "logps/chosen": -1.5848090648651123, - "logps/rejected": -1.628740668296814, - "loss": 2.2373, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.5848090648651123, - "rewards/margins": 0.04393132030963898, - "rewards/rejected": -1.628740668296814, - "semantic_entropy": 0.7085167765617371, + "logits/chosen": -0.07050226628780365, + "logits/rejected": 0.04343460127711296, + "logps/chosen": -1.5483382940292358, + "logps/rejected": -1.595626950263977, + "loss": 1.8555, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5483382940292358, + "rewards/margins": 0.04728863015770912, + "rewards/rejected": -1.595626950263977, "step": 135 }, { "epoch": 0.07492891787924402, - "grad_norm": 11.387359947808873, + "grad_norm": 14.420706557361244, "learning_rate": 2.49554367201426e-07, - "logits/chosen": -0.058408986777067184, - "logits/rejected": 0.1023082286119461, - "logps/chosen": -1.5260465145111084, - "logps/rejected": -1.6099777221679688, - "loss": 2.1991, + "logits/chosen": -0.07175682485103607, + "logits/rejected": 0.08573417365550995, + "logps/chosen": -1.4752639532089233, + "logps/rejected": -1.5440343618392944, + "loss": 1.8151, "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.5260465145111084, - "rewards/margins": 0.08393123000860214, - "rewards/rejected": -1.6099777221679688, - "semantic_entropy": 0.7268117070198059, + "rewards/chosen": -1.4752639532089233, + "rewards/margins": 0.06877056509256363, + "rewards/rejected": -1.5440343618392944, "step": 140 }, { "epoch": 0.0776049506606456, - "grad_norm": 14.134217333680969, + "grad_norm": 13.975507283277274, "learning_rate": 2.5846702317290554e-07, - "logits/chosen": -0.04021311178803444, - "logits/rejected": 0.10950696468353271, - "logps/chosen": -1.394166111946106, - "logps/rejected": -1.4935468435287476, - "loss": 2.1092, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.394166111946106, - "rewards/margins": 0.09938089549541473, - "rewards/rejected": -1.4935468435287476, - "semantic_entropy": 0.7643908262252808, + "logits/chosen": -0.0945412889122963, + "logits/rejected": 0.04638366773724556, + "logps/chosen": -1.3633663654327393, + "logps/rejected": -1.4590461254119873, + "loss": 1.7122, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.3633663654327393, + "rewards/margins": 0.0956796258687973, + "rewards/rejected": -1.4590461254119873, "step": 145 }, { "epoch": 0.08028098344204716, - "grad_norm": 14.660549090593136, + "grad_norm": 14.532988699334254, "learning_rate": 2.6737967914438503e-07, - "logits/chosen": -0.11129583418369293, - "logits/rejected": 0.038066692650318146, - "logps/chosen": -1.321720838546753, - "logps/rejected": -1.3290258646011353, - "loss": 2.1164, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.321720838546753, - "rewards/margins": 0.0073050023056566715, - "rewards/rejected": -1.3290258646011353, - "semantic_entropy": 0.8203862905502319, + "logits/chosen": -0.12252895534038544, + "logits/rejected": 0.021989500150084496, + "logps/chosen": -1.2998199462890625, + "logps/rejected": -1.31027352809906, + "loss": 1.6962, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2998199462890625, + "rewards/margins": 0.010453557595610619, + "rewards/rejected": -1.31027352809906, "step": 150 }, { "epoch": 0.08295701622344874, - "grad_norm": 9.337305048028103, + "grad_norm": 10.076529575305797, "learning_rate": 2.762923351158645e-07, - "logits/chosen": -0.13234972953796387, - "logits/rejected": -0.0875357836484909, - "logps/chosen": -1.3184583187103271, - "logps/rejected": -1.4332258701324463, - "loss": 2.0793, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3184583187103271, - "rewards/margins": 0.11476727575063705, - "rewards/rejected": -1.4332258701324463, - "semantic_entropy": 0.7989853024482727, + "logits/chosen": -0.11439421027898788, + "logits/rejected": -0.07082248479127884, + "logps/chosen": -1.3111894130706787, + "logps/rejected": -1.4266737699508667, + "loss": 1.6755, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3111894130706787, + "rewards/margins": 0.1154845803976059, + "rewards/rejected": -1.4266737699508667, "step": 155 }, { "epoch": 0.0856330490048503, - "grad_norm": 9.617308878215045, + "grad_norm": 10.107152904622856, "learning_rate": 2.85204991087344e-07, - "logits/chosen": -0.21252942085266113, - "logits/rejected": -0.08288715779781342, - "logps/chosen": -1.4070971012115479, - "logps/rejected": -1.3882343769073486, - "loss": 2.182, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.4070971012115479, - "rewards/margins": -0.018862640485167503, - "rewards/rejected": -1.3882343769073486, - "semantic_entropy": 0.7836942076683044, + "logits/chosen": -0.20813456177711487, + "logits/rejected": -0.07859252393245697, + "logps/chosen": -1.4011151790618896, + "logps/rejected": -1.3801295757293701, + "loss": 1.7873, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.4011151790618896, + "rewards/margins": -0.020985547453165054, + "rewards/rejected": -1.3801295757293701, "step": 160 }, { "epoch": 0.08830908178625188, - "grad_norm": 12.078012222248885, + "grad_norm": 12.806131515610266, "learning_rate": 2.941176470588235e-07, - "logits/chosen": -0.11357660591602325, - "logits/rejected": 0.04716426134109497, - "logps/chosen": -1.3212950229644775, - "logps/rejected": -1.3990843296051025, - "loss": 2.1161, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.3212950229644775, - "rewards/margins": 0.07778936624526978, - "rewards/rejected": -1.3990843296051025, - "semantic_entropy": 0.7998541593551636, + "logits/chosen": -0.10335227102041245, + "logits/rejected": 0.0551244392991066, + "logps/chosen": -1.3156263828277588, + "logps/rejected": -1.3915605545043945, + "loss": 1.7149, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.3156263828277588, + "rewards/margins": 0.07593418657779694, + "rewards/rejected": -1.3915605545043945, "step": 165 }, { "epoch": 0.09098511456765346, - "grad_norm": 13.254483655244782, + "grad_norm": 12.849296884231892, "learning_rate": 3.0303030303030305e-07, - "logits/chosen": -0.13945594429969788, - "logits/rejected": -0.0916006937623024, - "logps/chosen": -1.4381730556488037, - "logps/rejected": -1.5070384740829468, - "loss": 2.1607, + "logits/chosen": -0.1526593118906021, + "logits/rejected": -0.10797767341136932, + "logps/chosen": -1.4327340126037598, + "logps/rejected": -1.5010325908660889, + "loss": 1.7766, "rewards/accuracies": 0.5, - "rewards/chosen": -1.4381730556488037, - "rewards/margins": 0.06886538118124008, - "rewards/rejected": -1.5070384740829468, - "semantic_entropy": 0.763955295085907, + "rewards/chosen": -1.4327340126037598, + "rewards/margins": 0.06829849630594254, + "rewards/rejected": -1.5010325908660889, "step": 170 }, { "epoch": 0.09366114734905502, - "grad_norm": 10.22284067289044, + "grad_norm": 10.642151115867977, "learning_rate": 3.1194295900178254e-07, - "logits/chosen": -0.009927010163664818, - "logits/rejected": -0.01633218303322792, - "logps/chosen": -1.3267039060592651, - "logps/rejected": -1.4154396057128906, - "loss": 2.1055, + "logits/chosen": -0.01714947633445263, + "logits/rejected": -0.023959076032042503, + "logps/chosen": -1.320685863494873, + "logps/rejected": -1.4081209897994995, + "loss": 1.7025, "rewards/accuracies": 0.5, - "rewards/chosen": -1.3267039060592651, - "rewards/margins": 0.08873560279607773, - "rewards/rejected": -1.4154396057128906, - "semantic_entropy": 0.7996432781219482, + "rewards/chosen": -1.320685863494873, + "rewards/margins": 0.08743523061275482, + "rewards/rejected": -1.4081209897994995, "step": 175 }, { "epoch": 0.0963371801304566, - "grad_norm": 7.5910930778984005, + "grad_norm": 8.019318830495136, "learning_rate": 3.2085561497326203e-07, - "logits/chosen": -0.060885727405548096, - "logits/rejected": -0.06494145840406418, - "logps/chosen": -1.3440454006195068, - "logps/rejected": -1.5446110963821411, - "loss": 2.0722, + "logits/chosen": -0.03761307895183563, + "logits/rejected": -0.038325823843479156, + "logps/chosen": -1.3384017944335938, + "logps/rejected": -1.539563775062561, + "loss": 1.6765, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3440454006195068, - "rewards/margins": 0.20056560635566711, - "rewards/rejected": -1.5446110963821411, - "semantic_entropy": 0.7831510901451111, + "rewards/chosen": -1.3384017944335938, + "rewards/margins": 0.20116198062896729, + "rewards/rejected": -1.539563775062561, "step": 180 }, { "epoch": 0.09901321291185818, - "grad_norm": 7.822879105238539, + "grad_norm": 8.494090911892958, "learning_rate": 3.297682709447415e-07, - "logits/chosen": -0.20560979843139648, - "logits/rejected": -0.12279047816991806, - "logps/chosen": -1.3355904817581177, - "logps/rejected": -1.3831441402435303, - "loss": 2.1299, + "logits/chosen": -0.19622287154197693, + "logits/rejected": -0.11487259715795517, + "logps/chosen": -1.331474781036377, + "logps/rejected": -1.3787868022918701, + "loss": 1.7307, "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.3355904817581177, - "rewards/margins": 0.04755370691418648, - "rewards/rejected": -1.3831441402435303, - "semantic_entropy": 0.7968094944953918, + "rewards/chosen": -1.331474781036377, + "rewards/margins": 0.04731215909123421, + "rewards/rejected": -1.3787868022918701, "step": 185 }, { "epoch": 0.10168924569325974, - "grad_norm": 9.9835519249466, + "grad_norm": 10.351738064447387, "learning_rate": 3.38680926916221e-07, - "logits/chosen": -0.10206165164709091, - "logits/rejected": 0.010062957182526588, - "logps/chosen": -1.2628173828125, - "logps/rejected": -1.3929227590560913, - "loss": 2.0604, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.2628173828125, - "rewards/margins": 0.13010530173778534, - "rewards/rejected": -1.3929227590560913, - "semantic_entropy": 0.8037274479866028, + "logits/chosen": -0.08686023950576782, + "logits/rejected": 0.02442360296845436, + "logps/chosen": -1.2566090822219849, + "logps/rejected": -1.3870621919631958, + "loss": 1.6552, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2566090822219849, + "rewards/margins": 0.13045313954353333, + "rewards/rejected": -1.3870621919631958, "step": 190 }, { "epoch": 0.10436527847466132, - "grad_norm": 7.658920533300685, + "grad_norm": 8.615804581626048, "learning_rate": 3.475935828877005e-07, - "logits/chosen": -0.031039467081427574, - "logits/rejected": 0.11408261954784393, - "logps/chosen": -1.2417497634887695, - "logps/rejected": -1.4050052165985107, - "loss": 2.0318, + "logits/chosen": -0.026592861860990524, + "logits/rejected": 0.11833508312702179, + "logps/chosen": -1.2394936084747314, + "logps/rejected": -1.401416540145874, + "loss": 1.6174, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2417497634887695, - "rewards/margins": 0.16325534880161285, - "rewards/rejected": -1.4050052165985107, - "semantic_entropy": 0.8272998929023743, + "rewards/chosen": -1.2394936084747314, + "rewards/margins": 0.16192278265953064, + "rewards/rejected": -1.401416540145874, "step": 195 }, { "epoch": 0.1070413112560629, - "grad_norm": 18.816124807543744, + "grad_norm": 21.517358062706677, "learning_rate": 3.5650623885918e-07, - "logits/chosen": -0.12287290394306183, - "logits/rejected": 0.005303362850099802, - "logps/chosen": -1.3640367984771729, - "logps/rejected": -1.4029943943023682, - "loss": 2.1351, + "logits/chosen": -0.09331141412258148, + "logits/rejected": 0.03843807801604271, + "logps/chosen": -1.3600974082946777, + "logps/rejected": -1.4013712406158447, + "loss": 1.7361, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3640367984771729, - "rewards/margins": 0.03895748034119606, - "rewards/rejected": -1.4029943943023682, - "semantic_entropy": 0.791954755783081, + "rewards/chosen": -1.3600974082946777, + "rewards/margins": 0.04127373918890953, + "rewards/rejected": -1.4013712406158447, "step": 200 }, { "epoch": 0.10971734403746446, - "grad_norm": 13.834762563320641, + "grad_norm": 15.348544529088636, "learning_rate": 3.654188948306595e-07, - "logits/chosen": -0.08338096737861633, - "logits/rejected": 0.053165972232818604, - "logps/chosen": -1.2766786813735962, - "logps/rejected": -1.346866488456726, - "loss": 2.0827, + "logits/chosen": -0.07399742305278778, + "logits/rejected": 0.06137485057115555, + "logps/chosen": -1.272519826889038, + "logps/rejected": -1.3434226512908936, + "loss": 1.6771, "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2766786813735962, - "rewards/margins": 0.07018790394067764, - "rewards/rejected": -1.346866488456726, - "semantic_entropy": 0.8064363598823547, + "rewards/chosen": -1.272519826889038, + "rewards/margins": 0.07090290635824203, + "rewards/rejected": -1.3434226512908936, "step": 205 }, { "epoch": 0.11239337681886603, - "grad_norm": 12.941469291971545, + "grad_norm": 12.32485215494046, "learning_rate": 3.7433155080213904e-07, - "logits/chosen": -0.19147761166095734, - "logits/rejected": -0.01435632724314928, - "logps/chosen": -1.3535292148590088, - "logps/rejected": -1.4615424871444702, - "loss": 2.1148, + "logits/chosen": -0.18402157723903656, + "logits/rejected": -0.010704070329666138, + "logps/chosen": -1.3493117094039917, + "logps/rejected": -1.4580633640289307, + "loss": 1.7154, "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3535292148590088, - "rewards/margins": 0.10801327228546143, - "rewards/rejected": -1.4615424871444702, - "semantic_entropy": 0.7920994162559509, + "rewards/chosen": -1.3493117094039917, + "rewards/margins": 0.10875160992145538, + "rewards/rejected": -1.4580633640289307, "step": 210 }, { "epoch": 0.1150694096002676, - "grad_norm": 10.314040391266644, + "grad_norm": 10.482989998891275, "learning_rate": 3.8324420677361853e-07, - "logits/chosen": -0.2077387273311615, - "logits/rejected": 0.02725372649729252, - "logps/chosen": -1.375613808631897, - "logps/rejected": -1.4290225505828857, - "loss": 2.1308, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.375613808631897, - "rewards/margins": 0.053408555686473846, - "rewards/rejected": -1.4290225505828857, - "semantic_entropy": 0.7987397313117981, + "logits/chosen": -0.19349366426467896, + "logits/rejected": 0.042197782546281815, + "logps/chosen": -1.3733896017074585, + "logps/rejected": -1.4248539209365845, + "loss": 1.7302, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3733896017074585, + "rewards/margins": 0.051464296877384186, + "rewards/rejected": -1.4248539209365845, "step": 215 }, { "epoch": 0.11774544238166917, - "grad_norm": 14.32317789711802, + "grad_norm": 16.868719953303888, "learning_rate": 3.92156862745098e-07, - "logits/chosen": -0.005137534346431494, - "logits/rejected": 0.0829891785979271, - "logps/chosen": -1.297531247138977, - "logps/rejected": -1.4413492679595947, - "loss": 2.062, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.297531247138977, - "rewards/margins": 0.14381805062294006, - "rewards/rejected": -1.4413492679595947, - "semantic_entropy": 0.8080397844314575, + "logits/chosen": 0.03658125922083855, + "logits/rejected": 0.13169129192829132, + "logps/chosen": -1.2940952777862549, + "logps/rejected": -1.436645269393921, + "loss": 1.6562, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2940952777862549, + "rewards/margins": 0.14254987239837646, + "rewards/rejected": -1.436645269393921, "step": 220 }, { "epoch": 0.12042147516307075, - "grad_norm": 7.505593107015226, + "grad_norm": 7.797019386575935, "learning_rate": 4.010695187165775e-07, - "logits/chosen": -0.13666629791259766, - "logits/rejected": 0.020686468109488487, - "logps/chosen": -1.3048591613769531, - "logps/rejected": -1.4310519695281982, - "loss": 2.0589, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3048591613769531, - "rewards/margins": 0.12619265913963318, - "rewards/rejected": -1.4310519695281982, - "semantic_entropy": 0.8000715970993042, + "logits/chosen": -0.10609889030456543, + "logits/rejected": 0.0540977343916893, + "logps/chosen": -1.301684856414795, + "logps/rejected": -1.4272652864456177, + "loss": 1.6564, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.301684856414795, + "rewards/margins": 0.12558043003082275, + "rewards/rejected": -1.4272652864456177, "step": 225 }, { "epoch": 0.12309750794447231, - "grad_norm": 7.547965569715929, + "grad_norm": 8.729087338113398, "learning_rate": 4.09982174688057e-07, - "logits/chosen": -0.05341342091560364, - "logits/rejected": 0.014276454225182533, - "logps/chosen": -1.3072868585586548, - "logps/rejected": -1.463921308517456, - "loss": 2.0822, + "logits/chosen": -0.03860308602452278, + "logits/rejected": 0.03049338422715664, + "logps/chosen": -1.3036974668502808, + "logps/rejected": -1.4600293636322021, + "loss": 1.6824, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3072868585586548, - "rewards/margins": 0.15663442015647888, - "rewards/rejected": -1.463921308517456, - "semantic_entropy": 0.7943694591522217, + "rewards/chosen": -1.3036974668502808, + "rewards/margins": 0.15633180737495422, + "rewards/rejected": -1.4600293636322021, "step": 230 }, { "epoch": 0.1257735407258739, - "grad_norm": 12.836607023053217, + "grad_norm": 12.197820443233079, "learning_rate": 4.188948306595365e-07, - "logits/chosen": -0.03887845203280449, - "logits/rejected": 0.08260010182857513, - "logps/chosen": -1.2770813703536987, - "logps/rejected": -1.4421340227127075, - "loss": 2.0463, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2770813703536987, - "rewards/margins": 0.1650528609752655, - "rewards/rejected": -1.4421340227127075, - "semantic_entropy": 0.8039000630378723, + "logits/chosen": -0.0038608163595199585, + "logits/rejected": 0.12154042720794678, + "logps/chosen": -1.272853136062622, + "logps/rejected": -1.437613844871521, + "loss": 1.6418, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.272853136062622, + "rewards/margins": 0.16476061940193176, + "rewards/rejected": -1.437613844871521, "step": 235 }, { "epoch": 0.12844957350727546, - "grad_norm": 5.810148561096096, + "grad_norm": 6.466615504548621, "learning_rate": 4.27807486631016e-07, - "logits/chosen": -0.049771975725889206, - "logits/rejected": 0.0695604532957077, - "logps/chosen": -1.293505072593689, - "logps/rejected": -1.4756234884262085, - "loss": 2.0815, + "logits/chosen": -0.03536298871040344, + "logits/rejected": 0.08561581373214722, + "logps/chosen": -1.2899919748306274, + "logps/rejected": -1.470727801322937, + "loss": 1.6874, "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.293505072593689, - "rewards/margins": 0.1821182668209076, - "rewards/rejected": -1.4756234884262085, - "semantic_entropy": 0.7817009091377258, + "rewards/chosen": -1.2899919748306274, + "rewards/margins": 0.18073596060276031, + "rewards/rejected": -1.470727801322937, "step": 240 }, { "epoch": 0.13112560628867703, - "grad_norm": 8.487258861235883, + "grad_norm": 8.822284856207192, "learning_rate": 4.3672014260249554e-07, - "logits/chosen": 0.0011432438623160124, - "logits/rejected": 0.11024985462427139, - "logps/chosen": -1.4130035638809204, - "logps/rejected": -1.4374676942825317, - "loss": 2.1648, + "logits/chosen": 0.008248868398368359, + "logits/rejected": 0.11743185669183731, + "logps/chosen": -1.4091993570327759, + "logps/rejected": -1.4329038858413696, + "loss": 1.7752, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4130035638809204, - "rewards/margins": 0.024464275687932968, - "rewards/rejected": -1.4374676942825317, - "semantic_entropy": 0.7734571099281311, + "rewards/chosen": -1.4091993570327759, + "rewards/margins": 0.02370438165962696, + "rewards/rejected": -1.4329038858413696, "step": 245 }, { "epoch": 0.1338016390700786, - "grad_norm": 10.014551196171814, + "grad_norm": 10.209906693953846, "learning_rate": 4.4563279857397503e-07, - "logits/chosen": -0.06559150665998459, - "logits/rejected": 0.09288983047008514, - "logps/chosen": -1.2952531576156616, - "logps/rejected": -1.3528729677200317, - "loss": 2.1092, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.2952531576156616, - "rewards/margins": 0.05761975795030594, - "rewards/rejected": -1.3528729677200317, - "semantic_entropy": 0.811192512512207, + "logits/chosen": -0.08582687377929688, + "logits/rejected": 0.06534568220376968, + "logps/chosen": -1.291361689567566, + "logps/rejected": -1.3486815690994263, + "loss": 1.6999, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.291361689567566, + "rewards/margins": 0.05731973797082901, + "rewards/rejected": -1.3486815690994263, "step": 250 }, { "epoch": 0.1364776718514802, - "grad_norm": 8.957601045394023, + "grad_norm": 9.005677794931367, "learning_rate": 4.545454545454545e-07, - "logits/chosen": -0.04644311964511871, - "logits/rejected": 0.08571015298366547, - "logps/chosen": -1.2608213424682617, - "logps/rejected": -1.3652595281600952, - "loss": 2.0626, + "logits/chosen": -0.05144655704498291, + "logits/rejected": 0.07802295684814453, + "logps/chosen": -1.258061408996582, + "logps/rejected": -1.3606371879577637, + "loss": 1.6505, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.2608213424682617, - "rewards/margins": 0.10443822294473648, - "rewards/rejected": -1.3652595281600952, - "semantic_entropy": 0.821729302406311, + "rewards/chosen": -1.258061408996582, + "rewards/margins": 0.10257569700479507, + "rewards/rejected": -1.3606371879577637, "step": 255 }, { "epoch": 0.13915370463288176, - "grad_norm": 7.633348470343695, + "grad_norm": 7.861754856216085, "learning_rate": 4.63458110516934e-07, - "logits/chosen": -0.25433415174484253, - "logits/rejected": -0.15580113232135773, - "logps/chosen": -1.347470760345459, - "logps/rejected": -1.5038115978240967, - "loss": 2.0622, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.347470760345459, - "rewards/margins": 0.15634091198444366, - "rewards/rejected": -1.5038115978240967, - "semantic_entropy": 0.7835784554481506, + "logits/chosen": -0.24945001304149628, + "logits/rejected": -0.1495785415172577, + "logps/chosen": -1.3438951969146729, + "logps/rejected": -1.4988113641738892, + "loss": 1.668, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3438951969146729, + "rewards/margins": 0.1549161821603775, + "rewards/rejected": -1.4988113641738892, "step": 260 }, { "epoch": 0.1418297374142833, - "grad_norm": 9.896790172976969, + "grad_norm": 11.356790191417975, "learning_rate": 4.723707664884135e-07, - "logits/chosen": -0.10201990604400635, - "logits/rejected": -0.018684420734643936, - "logps/chosen": -1.3355344533920288, - "logps/rejected": -1.501531720161438, - "loss": 2.067, + "logits/chosen": -0.11152307689189911, + "logits/rejected": -0.035057198256254196, + "logps/chosen": -1.3328602313995361, + "logps/rejected": -1.4962241649627686, + "loss": 1.6802, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3355344533920288, - "rewards/margins": 0.1659972369670868, - "rewards/rejected": -1.501531720161438, - "semantic_entropy": 0.7691968083381653, + "rewards/chosen": -1.3328602313995361, + "rewards/margins": 0.16336390376091003, + "rewards/rejected": -1.4962241649627686, "step": 265 }, { "epoch": 0.1445057701956849, - "grad_norm": 7.608482620241505, + "grad_norm": 8.00537843083759, "learning_rate": 4.81283422459893e-07, - "logits/chosen": -0.10369201004505157, - "logits/rejected": 0.01996661350131035, - "logps/chosen": -1.3129630088806152, - "logps/rejected": -1.409122109413147, - "loss": 2.0988, + "logits/chosen": -0.110984206199646, + "logits/rejected": 0.006986084394156933, + "logps/chosen": -1.309006929397583, + "logps/rejected": -1.4050885438919067, + "loss": 1.6973, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3129630088806152, - "rewards/margins": 0.09615901112556458, - "rewards/rejected": -1.409122109413147, - "semantic_entropy": 0.7943571209907532, + "rewards/chosen": -1.309006929397583, + "rewards/margins": 0.0960814505815506, + "rewards/rejected": -1.4050885438919067, "step": 270 }, { "epoch": 0.14718180297708647, - "grad_norm": 9.589428959834835, + "grad_norm": 10.137734751765233, "learning_rate": 4.901960784313725e-07, - "logits/chosen": -0.04439730569720268, - "logits/rejected": 0.05072161555290222, - "logps/chosen": -1.263169765472412, - "logps/rejected": -1.4157510995864868, - "loss": 2.0775, + "logits/chosen": -0.03913550823926926, + "logits/rejected": 0.05400773882865906, + "logps/chosen": -1.2604626417160034, + "logps/rejected": -1.410111904144287, + "loss": 1.6664, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.263169765472412, - "rewards/margins": 0.1525813192129135, - "rewards/rejected": -1.4157510995864868, - "semantic_entropy": 0.8186517953872681, + "rewards/chosen": -1.2604626417160034, + "rewards/margins": 0.14964918792247772, + "rewards/rejected": -1.410111904144287, "step": 275 }, { "epoch": 0.14985783575848804, - "grad_norm": 10.223827861187196, + "grad_norm": 10.412520382560047, "learning_rate": 4.99108734402852e-07, - "logits/chosen": -0.12224537134170532, - "logits/rejected": 0.020121756941080093, - "logps/chosen": -1.3141443729400635, - "logps/rejected": -1.4087907075881958, - "loss": 2.0781, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3141443729400635, - "rewards/margins": 0.09464634954929352, - "rewards/rejected": -1.4087907075881958, - "semantic_entropy": 0.8114911317825317, + "logits/chosen": -0.12299992144107819, + "logits/rejected": 0.020667919889092445, + "logps/chosen": -1.308929681777954, + "logps/rejected": -1.4028128385543823, + "loss": 1.668, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.308929681777954, + "rewards/margins": 0.09388315677642822, + "rewards/rejected": -1.4028128385543823, "step": 280 }, { "epoch": 0.15253386853988962, - "grad_norm": 9.044891488981245, + "grad_norm": 9.309541745069055, "learning_rate": 5.080213903743315e-07, - "logits/chosen": -0.09391235560178757, - "logits/rejected": 0.03823528066277504, - "logps/chosen": -1.341046929359436, - "logps/rejected": -1.4223486185073853, - "loss": 2.1028, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.341046929359436, - "rewards/margins": 0.08130180835723877, - "rewards/rejected": -1.4223486185073853, - "semantic_entropy": 0.7922171950340271, + "logits/chosen": -0.10612060874700546, + "logits/rejected": 0.021303869783878326, + "logps/chosen": -1.3394674062728882, + "logps/rejected": -1.416865587234497, + "loss": 1.7065, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3394674062728882, + "rewards/margins": 0.07739803940057755, + "rewards/rejected": -1.416865587234497, "step": 285 }, { "epoch": 0.1552099013212912, - "grad_norm": 8.115808422311316, + "grad_norm": 8.410936884911042, "learning_rate": 5.169340463458111e-07, - "logits/chosen": -0.12526479363441467, - "logits/rejected": 0.16373701393604279, - "logps/chosen": -1.3602113723754883, - "logps/rejected": -1.4761052131652832, - "loss": 2.0772, + "logits/chosen": -0.12678056955337524, + "logits/rejected": 0.15584680438041687, + "logps/chosen": -1.356748342514038, + "logps/rejected": -1.4695770740509033, + "loss": 1.6826, "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3602113723754883, - "rewards/margins": 0.11589386314153671, - "rewards/rejected": -1.4761052131652832, - "semantic_entropy": 0.7861930131912231, + "rewards/chosen": -1.356748342514038, + "rewards/margins": 0.112828828394413, + "rewards/rejected": -1.4695770740509033, "step": 290 }, { "epoch": 0.15788593410269275, - "grad_norm": 11.35002284350681, + "grad_norm": 12.629236143849868, "learning_rate": 5.258467023172905e-07, - "logits/chosen": -0.08692610263824463, - "logits/rejected": -0.03394615650177002, - "logps/chosen": -1.262650728225708, - "logps/rejected": -1.3980472087860107, - "loss": 2.0494, + "logits/chosen": -0.07856544852256775, + "logits/rejected": -0.02292429283261299, + "logps/chosen": -1.2590734958648682, + "logps/rejected": -1.3933165073394775, + "loss": 1.6434, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.262650728225708, - "rewards/margins": 0.13539646565914154, - "rewards/rejected": -1.3980472087860107, - "semantic_entropy": 0.8054231405258179, + "rewards/chosen": -1.2590734958648682, + "rewards/margins": 0.13424314558506012, + "rewards/rejected": -1.3933165073394775, "step": 295 }, { "epoch": 0.16056196688409433, - "grad_norm": 7.6555481324022905, + "grad_norm": 8.303889732551683, "learning_rate": 5.347593582887701e-07, - "logits/chosen": -0.09255888313055038, - "logits/rejected": 0.06662900745868683, - "logps/chosen": -1.3005911111831665, - "logps/rejected": -1.3753496408462524, - "loss": 2.107, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3005911111831665, - "rewards/margins": 0.0747586116194725, - "rewards/rejected": -1.3753496408462524, - "semantic_entropy": 0.8094614744186401, + "logits/chosen": -0.08285187184810638, + "logits/rejected": 0.07641416043043137, + "logps/chosen": -1.295358657836914, + "logps/rejected": -1.3709486722946167, + "loss": 1.6979, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.295358657836914, + "rewards/margins": 0.07559005171060562, + "rewards/rejected": -1.3709486722946167, "step": 300 }, { "epoch": 0.1632379996654959, - "grad_norm": 6.716780191320411, + "grad_norm": 7.078543768571527, "learning_rate": 5.436720142602496e-07, - "logits/chosen": -0.05763741210103035, - "logits/rejected": 0.008676018565893173, - "logps/chosen": -1.4021122455596924, - "logps/rejected": -1.4093527793884277, - "loss": 2.1533, + "logits/chosen": -0.0555441789329052, + "logits/rejected": 0.010712100192904472, + "logps/chosen": -1.3955779075622559, + "logps/rejected": -1.4059025049209595, + "loss": 1.7534, "rewards/accuracies": 0.5, - "rewards/chosen": -1.4021122455596924, - "rewards/margins": 0.007240760140120983, - "rewards/rejected": -1.4093527793884277, - "semantic_entropy": 0.7869191765785217, + "rewards/chosen": -1.3955779075622559, + "rewards/margins": 0.01032471377402544, + "rewards/rejected": -1.4059025049209595, "step": 305 }, { "epoch": 0.16591403244689748, - "grad_norm": 8.57327455330772, + "grad_norm": 8.740043566258406, "learning_rate": 5.52584670231729e-07, - "logits/chosen": -0.23644232749938965, - "logits/rejected": -0.1527770459651947, - "logps/chosen": -1.365864872932434, - "logps/rejected": -1.450217366218567, - "loss": 2.1311, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.365864872932434, - "rewards/margins": 0.0843525156378746, - "rewards/rejected": -1.450217366218567, - "semantic_entropy": 0.7928934097290039, + "logits/chosen": -0.23369836807250977, + "logits/rejected": -0.15311121940612793, + "logps/chosen": -1.3617355823516846, + "logps/rejected": -1.4438841342926025, + "loss": 1.7309, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3617355823516846, + "rewards/margins": 0.08214850723743439, + "rewards/rejected": -1.4438841342926025, "step": 310 }, { "epoch": 0.16859006522829906, - "grad_norm": 9.533260225990018, + "grad_norm": 10.299948854839913, "learning_rate": 5.614973262032086e-07, - "logits/chosen": -0.04432467371225357, - "logits/rejected": 0.09949810057878494, - "logps/chosen": -1.3562901020050049, - "logps/rejected": -1.5015990734100342, - "loss": 2.0876, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3562901020050049, - "rewards/margins": 0.14530906081199646, - "rewards/rejected": -1.5015990734100342, - "semantic_entropy": 0.7747354507446289, + "logits/chosen": -0.04319891706109047, + "logits/rejected": 0.1049783006310463, + "logps/chosen": -1.352993369102478, + "logps/rejected": -1.4952139854431152, + "loss": 1.6967, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.352993369102478, + "rewards/margins": 0.1422206461429596, + "rewards/rejected": -1.4952139854431152, "step": 315 }, { "epoch": 0.1712660980097006, - "grad_norm": 6.5495269716021705, + "grad_norm": 6.868177363644339, "learning_rate": 5.70409982174688e-07, - "logits/chosen": -0.09863855689764023, - "logits/rejected": 0.025040656328201294, - "logps/chosen": -1.3150360584259033, - "logps/rejected": -1.365338683128357, - "loss": 2.1136, + "logits/chosen": -0.09707468748092651, + "logits/rejected": 0.024421801790595055, + "logps/chosen": -1.3122655153274536, + "logps/rejected": -1.3613016605377197, + "loss": 1.7078, "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.3150360584259033, - "rewards/margins": 0.050302695482969284, - "rewards/rejected": -1.365338683128357, - "semantic_entropy": 0.8078792691230774, + "rewards/chosen": -1.3122655153274536, + "rewards/margins": 0.04903603345155716, + "rewards/rejected": -1.3613016605377197, "step": 320 }, { "epoch": 0.17394213079110218, - "grad_norm": 9.359341559750296, + "grad_norm": 9.643934962875473, "learning_rate": 5.793226381461676e-07, - "logits/chosen": -0.1586337685585022, - "logits/rejected": -0.054130129516124725, - "logps/chosen": -1.3173141479492188, - "logps/rejected": -1.5577419996261597, - "loss": 2.0609, + "logits/chosen": -0.1591241955757141, + "logits/rejected": -0.05875442177057266, + "logps/chosen": -1.3131062984466553, + "logps/rejected": -1.5493340492248535, + "loss": 1.6641, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3173141479492188, - "rewards/margins": 0.24042773246765137, - "rewards/rejected": -1.5577419996261597, - "semantic_entropy": 0.785629391670227, + "rewards/chosen": -1.3131062984466553, + "rewards/margins": 0.2362278401851654, + "rewards/rejected": -1.5493340492248535, "step": 325 }, { "epoch": 0.17661816357250376, - "grad_norm": 9.90455638090829, + "grad_norm": 10.01176669398041, "learning_rate": 5.88235294117647e-07, - "logits/chosen": -0.03396131470799446, - "logits/rejected": 0.106153704226017, - "logps/chosen": -1.3295948505401611, - "logps/rejected": -1.5008153915405273, - "loss": 2.0891, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3295948505401611, - "rewards/margins": 0.1712205708026886, - "rewards/rejected": -1.5008153915405273, - "semantic_entropy": 0.7894498705863953, + "logits/chosen": -0.06524165719747543, + "logits/rejected": 0.06379499286413193, + "logps/chosen": -1.3261775970458984, + "logps/rejected": -1.4934340715408325, + "loss": 1.6925, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3261775970458984, + "rewards/margins": 0.16725634038448334, + "rewards/rejected": -1.4934340715408325, "step": 330 }, { "epoch": 0.17929419635390534, - "grad_norm": 9.675886750939776, + "grad_norm": 9.575794458520054, "learning_rate": 5.971479500891266e-07, - "logits/chosen": 0.004796019289642572, - "logits/rejected": 0.10043302923440933, - "logps/chosen": -1.3419394493103027, - "logps/rejected": -1.375423789024353, - "loss": 2.1345, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3419394493103027, - "rewards/margins": 0.03348435088992119, - "rewards/rejected": -1.375423789024353, - "semantic_entropy": 0.8081732988357544, + "logits/chosen": -0.0011968165636062622, + "logits/rejected": 0.0859590619802475, + "logps/chosen": -1.3384212255477905, + "logps/rejected": -1.369568943977356, + "loss": 1.7283, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3384212255477905, + "rewards/margins": 0.031147807836532593, + "rewards/rejected": -1.369568943977356, "step": 335 }, { "epoch": 0.18197022913530692, - "grad_norm": 10.253171249347128, + "grad_norm": 11.07398297496942, "learning_rate": 6.060606060606061e-07, - "logits/chosen": -0.06982637941837311, - "logits/rejected": 0.06200215965509415, - "logps/chosen": -1.400742769241333, - "logps/rejected": -1.4839637279510498, - "loss": 2.1445, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.400742769241333, - "rewards/margins": 0.0832209438085556, - "rewards/rejected": -1.4839637279510498, - "semantic_entropy": 0.7617381811141968, + "logits/chosen": -0.045141976326704025, + "logits/rejected": 0.091048464179039, + "logps/chosen": -1.394019603729248, + "logps/rejected": -1.477367639541626, + "loss": 1.7579, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.394019603729248, + "rewards/margins": 0.0833478718996048, + "rewards/rejected": -1.477367639541626, "step": 340 }, { "epoch": 0.1846462619167085, - "grad_norm": 13.82680468025849, + "grad_norm": 9.720083946898317, "learning_rate": 6.149732620320855e-07, - "logits/chosen": 0.005692244973033667, - "logits/rejected": 0.03132449835538864, - "logps/chosen": -1.3126928806304932, - "logps/rejected": -1.452416181564331, - "loss": 2.0592, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3126928806304932, - "rewards/margins": 0.13972336053848267, - "rewards/rejected": -1.452416181564331, - "semantic_entropy": 0.802249550819397, + "logits/chosen": 0.029754549264907837, + "logits/rejected": 0.05618869513273239, + "logps/chosen": -1.3053172826766968, + "logps/rejected": -1.4416307210922241, + "loss": 1.6551, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3053172826766968, + "rewards/margins": 0.13631334900856018, + "rewards/rejected": -1.4416307210922241, "step": 345 }, { "epoch": 0.18732229469811004, - "grad_norm": 9.737973493965074, + "grad_norm": 10.175563681422924, "learning_rate": 6.238859180035651e-07, - "logits/chosen": -0.025805365294218063, - "logits/rejected": 0.05890653654932976, - "logps/chosen": -1.2869572639465332, - "logps/rejected": -1.4053207635879517, - "loss": 2.0759, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2869572639465332, - "rewards/margins": 0.11836358159780502, - "rewards/rejected": -1.4053207635879517, - "semantic_entropy": 0.8062549829483032, + "logits/chosen": -0.0037210776936262846, + "logits/rejected": 0.08502546697854996, + "logps/chosen": -1.2827966213226318, + "logps/rejected": -1.399217128753662, + "loss": 1.669, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2827966213226318, + "rewards/margins": 0.11642041057348251, + "rewards/rejected": -1.399217128753662, "step": 350 }, { "epoch": 0.18999832747951162, - "grad_norm": 11.011017446465962, + "grad_norm": 11.064201007283366, "learning_rate": 6.327985739750445e-07, - "logits/chosen": -0.08332131057977676, - "logits/rejected": 0.1384621560573578, - "logps/chosen": -1.3908402919769287, - "logps/rejected": -1.417327880859375, - "loss": 2.1468, + "logits/chosen": -0.09607705473899841, + "logits/rejected": 0.11924111843109131, + "logps/chosen": -1.386281967163086, + "logps/rejected": -1.4111428260803223, + "loss": 1.7495, "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.3908402919769287, - "rewards/margins": 0.02648766338825226, - "rewards/rejected": -1.417327880859375, - "semantic_entropy": 0.7860895395278931, + "rewards/chosen": -1.386281967163086, + "rewards/margins": 0.024860884994268417, + "rewards/rejected": -1.4111428260803223, "step": 355 }, { "epoch": 0.1926743602609132, - "grad_norm": 9.3029388405961, + "grad_norm": 9.868246722819997, "learning_rate": 6.417112299465241e-07, - "logits/chosen": -0.0865727886557579, - "logits/rejected": -0.012034505605697632, - "logps/chosen": -1.3202612400054932, - "logps/rejected": -1.440613031387329, - "loss": 2.1384, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3202612400054932, - "rewards/margins": 0.12035173177719116, - "rewards/rejected": -1.440613031387329, - "semantic_entropy": 0.8006309270858765, + "logits/chosen": -0.07199651002883911, + "logits/rejected": 0.0037381022702902555, + "logps/chosen": -1.315307855606079, + "logps/rejected": -1.4310729503631592, + "loss": 1.7349, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.315307855606079, + "rewards/margins": 0.11576519906520844, + "rewards/rejected": -1.4310729503631592, "step": 360 }, { "epoch": 0.19535039304231477, - "grad_norm": 11.887096782670497, + "grad_norm": 11.769854537932227, "learning_rate": 6.506238859180035e-07, - "logits/chosen": -0.022515257820487022, - "logits/rejected": 0.05280578136444092, - "logps/chosen": -1.2991836071014404, - "logps/rejected": -1.399091124534607, - "loss": 2.0964, + "logits/chosen": 0.0018426328897476196, + "logits/rejected": 0.07703053951263428, + "logps/chosen": -1.2928941249847412, + "logps/rejected": -1.3936841487884521, + "loss": 1.6828, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2991836071014404, - "rewards/margins": 0.09990767389535904, - "rewards/rejected": -1.399091124534607, - "semantic_entropy": 0.8175728917121887, + "rewards/chosen": -1.2928941249847412, + "rewards/margins": 0.10079008340835571, + "rewards/rejected": -1.3936841487884521, "step": 365 }, { "epoch": 0.19802642582371635, - "grad_norm": 7.41980630149422, + "grad_norm": 6.711024492277533, "learning_rate": 6.59536541889483e-07, - "logits/chosen": -0.03407667204737663, - "logits/rejected": 0.056151580065488815, - "logps/chosen": -1.294122338294983, - "logps/rejected": -1.3374214172363281, - "loss": 2.1084, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.294122338294983, - "rewards/margins": 0.04329918324947357, - "rewards/rejected": -1.3374214172363281, - "semantic_entropy": 0.8323699235916138, + "logits/chosen": -0.031435489654541016, + "logits/rejected": 0.04781597852706909, + "logps/chosen": -1.2883342504501343, + "logps/rejected": -1.3285871744155884, + "loss": 1.69, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2883342504501343, + "rewards/margins": 0.040252961218357086, + "rewards/rejected": -1.3285871744155884, "step": 370 }, { "epoch": 0.2007024586051179, - "grad_norm": 15.730415357023626, + "grad_norm": 15.976190113657609, "learning_rate": 6.684491978609626e-07, - "logits/chosen": -0.08421232551336288, - "logits/rejected": 0.06763456016778946, - "logps/chosen": -1.2762327194213867, - "logps/rejected": -1.4177181720733643, - "loss": 2.08, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2762327194213867, - "rewards/margins": 0.14148563146591187, - "rewards/rejected": -1.4177181720733643, - "semantic_entropy": 0.8105593919754028, + "logits/chosen": -0.08474372327327728, + "logits/rejected": 0.06233643367886543, + "logps/chosen": -1.2696669101715088, + "logps/rejected": -1.4098860025405884, + "loss": 1.6663, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2696669101715088, + "rewards/margins": 0.1402190923690796, + "rewards/rejected": -1.4098860025405884, "step": 375 }, { "epoch": 0.20337849138651948, - "grad_norm": 9.693161959090537, + "grad_norm": 9.503917608234607, "learning_rate": 6.77361853832442e-07, - "logits/chosen": -0.05102665349841118, - "logits/rejected": 0.027332540601491928, - "logps/chosen": -1.2896788120269775, - "logps/rejected": -1.4495028257369995, - "loss": 2.0546, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2896788120269775, - "rewards/margins": 0.1598241627216339, - "rewards/rejected": -1.4495028257369995, - "semantic_entropy": 0.8106985092163086, + "logits/chosen": -0.05148506909608841, + "logits/rejected": 0.024202097207307816, + "logps/chosen": -1.2852833271026611, + "logps/rejected": -1.4389188289642334, + "loss": 1.6491, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2852833271026611, + "rewards/margins": 0.15363556146621704, + "rewards/rejected": -1.4389188289642334, "step": 380 }, { "epoch": 0.20605452416792105, - "grad_norm": 5.182351383221804, + "grad_norm": 5.7477820941055615, "learning_rate": 6.862745098039216e-07, - "logits/chosen": -0.036210425198078156, - "logits/rejected": 0.03844881430268288, - "logps/chosen": -1.3870257139205933, - "logps/rejected": -1.3795428276062012, - "loss": 2.1431, + "logits/chosen": 0.006725990679115057, + "logits/rejected": 0.08241648972034454, + "logps/chosen": -1.3825719356536865, + "logps/rejected": -1.3719950914382935, + "loss": 1.7442, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3870257139205933, - "rewards/margins": -0.007482861168682575, - "rewards/rejected": -1.3795428276062012, - "semantic_entropy": 0.792412519454956, + "rewards/chosen": -1.3825719356536865, + "rewards/margins": -0.010577131994068623, + "rewards/rejected": -1.3719950914382935, "step": 385 }, { "epoch": 0.20873055694932263, - "grad_norm": 9.966131209763185, + "grad_norm": 10.60627748217004, "learning_rate": 6.95187165775401e-07, - "logits/chosen": 0.016994182020425797, - "logits/rejected": 0.1718176305294037, - "logps/chosen": -1.3804776668548584, - "logps/rejected": -1.4428892135620117, - "loss": 2.1521, + "logits/chosen": 0.04871059209108353, + "logits/rejected": 0.2094159871339798, + "logps/chosen": -1.373580813407898, + "logps/rejected": -1.433243989944458, + "loss": 1.7591, "rewards/accuracies": 0.46875, - "rewards/chosen": -1.3804776668548584, - "rewards/margins": 0.0624115951359272, - "rewards/rejected": -1.4428892135620117, - "semantic_entropy": 0.7761930823326111, + "rewards/chosen": -1.373580813407898, + "rewards/margins": 0.05966333672404289, + "rewards/rejected": -1.433243989944458, "step": 390 }, { "epoch": 0.2114065897307242, - "grad_norm": 7.097965730394271, + "grad_norm": 7.903747067272735, "learning_rate": 7.040998217468806e-07, - "logits/chosen": -0.080187126994133, - "logits/rejected": 0.07204331457614899, - "logps/chosen": -1.3347288370132446, - "logps/rejected": -1.3476696014404297, - "loss": 2.1163, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3347288370132446, - "rewards/margins": 0.01294087152928114, - "rewards/rejected": -1.3476696014404297, - "semantic_entropy": 0.8120431900024414, + "logits/chosen": -0.06710124015808105, + "logits/rejected": 0.08114752173423767, + "logps/chosen": -1.323225736618042, + "logps/rejected": -1.334172010421753, + "loss": 1.7033, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.323225736618042, + "rewards/margins": 0.010946071706712246, + "rewards/rejected": -1.334172010421753, "step": 395 }, { "epoch": 0.2140826225121258, - "grad_norm": 8.470660073637783, + "grad_norm": 7.656216532972495, "learning_rate": 7.1301247771836e-07, - "logits/chosen": 0.05173783749341965, - "logits/rejected": 0.14466162025928497, - "logps/chosen": -1.3133375644683838, - "logps/rejected": -1.4095127582550049, - "loss": 2.0607, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3133375644683838, - "rewards/margins": 0.09617514908313751, - "rewards/rejected": -1.4095127582550049, - "semantic_entropy": 0.804615318775177, + "logits/chosen": 0.04969733580946922, + "logits/rejected": 0.13665138185024261, + "logps/chosen": -1.3036470413208008, + "logps/rejected": -1.3962353467941284, + "loss": 1.6549, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3036470413208008, + "rewards/margins": 0.09258836507797241, + "rewards/rejected": -1.3962353467941284, "step": 400 }, { "epoch": 0.2140826225121258, - "eval_logits/chosen": 0.24520687758922577, - "eval_logits/rejected": 0.32969510555267334, - "eval_logps/chosen": -1.345367193222046, - "eval_logps/rejected": -1.475685954093933, - "eval_loss": 2.0924832820892334, + "eval_logits/chosen": 0.279936283826828, + "eval_logits/rejected": 0.3663715124130249, + "eval_logps/chosen": -1.3375245332717896, + "eval_logps/rejected": -1.4630897045135498, + "eval_loss": 1.6938890218734741, "eval_rewards/accuracies": 0.5563797950744629, - "eval_rewards/chosen": -1.345367193222046, - "eval_rewards/margins": 0.1303185522556305, - "eval_rewards/rejected": -1.475685954093933, - "eval_runtime": 35.1527, - "eval_samples_per_second": 38.262, - "eval_semantic_entropy": 0.7909919619560242, - "eval_steps_per_second": 9.587, + "eval_rewards/chosen": -1.3375245332717896, + "eval_rewards/margins": 0.12556517124176025, + "eval_rewards/rejected": -1.4630897045135498, + "eval_runtime": 41.3293, + "eval_samples_per_second": 32.543, + "eval_steps_per_second": 8.154, "step": 400 }, { "epoch": 0.21675865529352734, - "grad_norm": 9.553550584241732, + "grad_norm": 8.54179230229592, "learning_rate": 7.219251336898395e-07, - "logits/chosen": -0.039488695561885834, - "logits/rejected": 0.051676005125045776, - "logps/chosen": -1.319966197013855, - "logps/rejected": -1.3851897716522217, - "loss": 2.1136, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.319966197013855, - "rewards/margins": 0.06522355228662491, - "rewards/rejected": -1.3851897716522217, - "semantic_entropy": 0.8008989095687866, + "logits/chosen": 0.005595216061919928, + "logits/rejected": 0.10029877722263336, + "logps/chosen": -1.306983470916748, + "logps/rejected": -1.371383547782898, + "loss": 1.7059, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.306983470916748, + "rewards/margins": 0.06440006196498871, + "rewards/rejected": -1.371383547782898, "step": 405 }, { "epoch": 0.2194346880749289, - "grad_norm": 12.81546563496579, + "grad_norm": 10.367031723096446, "learning_rate": 7.30837789661319e-07, - "logits/chosen": -0.004357346799224615, - "logits/rejected": 0.12372720241546631, - "logps/chosen": -1.292946457862854, - "logps/rejected": -1.3807947635650635, - "loss": 2.0843, + "logits/chosen": 0.023940464481711388, + "logits/rejected": 0.15087561309337616, + "logps/chosen": -1.2818481922149658, + "logps/rejected": -1.3659837245941162, + "loss": 1.6734, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.292946457862854, - "rewards/margins": 0.08784836530685425, - "rewards/rejected": -1.3807947635650635, - "semantic_entropy": 0.8132365942001343, + "rewards/chosen": -1.2818481922149658, + "rewards/margins": 0.08413554728031158, + "rewards/rejected": -1.3659837245941162, "step": 410 }, { "epoch": 0.2221107208563305, - "grad_norm": 5.543288881603541, + "grad_norm": 6.056959954806769, "learning_rate": 7.397504456327985e-07, - "logits/chosen": -0.02848835289478302, - "logits/rejected": 0.003672828432172537, - "logps/chosen": -1.2885775566101074, - "logps/rejected": -1.446232557296753, - "loss": 2.0468, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2885775566101074, - "rewards/margins": 0.15765497088432312, - "rewards/rejected": -1.446232557296753, - "semantic_entropy": 0.8051525354385376, + "logits/chosen": -0.012898044660687447, + "logits/rejected": 0.018010448664426804, + "logps/chosen": -1.2827682495117188, + "logps/rejected": -1.4346555471420288, + "loss": 1.6427, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2827682495117188, + "rewards/margins": 0.15188735723495483, + "rewards/rejected": -1.4346555471420288, "step": 415 }, { "epoch": 0.22478675363773207, - "grad_norm": 7.998047741738567, + "grad_norm": 7.653903978061298, "learning_rate": 7.486631016042781e-07, - "logits/chosen": -0.023648854345083237, - "logits/rejected": 0.16269460320472717, - "logps/chosen": -1.2729085683822632, - "logps/rejected": -1.3676693439483643, - "loss": 2.0836, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.2729085683822632, - "rewards/margins": 0.09476063400506973, - "rewards/rejected": -1.3676693439483643, - "semantic_entropy": 0.8268201947212219, + "logits/chosen": -0.03181108459830284, + "logits/rejected": 0.14745911955833435, + "logps/chosen": -1.2670233249664307, + "logps/rejected": -1.356186866760254, + "loss": 1.6684, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2670233249664307, + "rewards/margins": 0.08916331827640533, + "rewards/rejected": -1.356186866760254, "step": 420 }, { "epoch": 0.22746278641913364, - "grad_norm": 6.633391520285902, + "grad_norm": 6.707108967213688, "learning_rate": 7.575757575757575e-07, - "logits/chosen": -0.042364347726106644, - "logits/rejected": 0.1575302630662918, - "logps/chosen": -1.3138656616210938, - "logps/rejected": -1.4691503047943115, - "loss": 2.0663, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3138656616210938, - "rewards/margins": 0.15528468787670135, - "rewards/rejected": -1.4691503047943115, - "semantic_entropy": 0.8047497868537903, + "logits/chosen": -0.07813435047864914, + "logits/rejected": 0.10958679765462875, + "logps/chosen": -1.3069932460784912, + "logps/rejected": -1.457546353340149, + "loss": 1.6623, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3069932460784912, + "rewards/margins": 0.15055301785469055, + "rewards/rejected": -1.457546353340149, "step": 425 }, { "epoch": 0.2301388192005352, - "grad_norm": 7.516999022167481, + "grad_norm": 7.3866104224848215, "learning_rate": 7.664884135472371e-07, - "logits/chosen": -0.08585725724697113, - "logits/rejected": 0.1032024621963501, - "logps/chosen": -1.3429131507873535, - "logps/rejected": -1.4755594730377197, - "loss": 2.0833, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3429131507873535, - "rewards/margins": 0.13264614343643188, - "rewards/rejected": -1.4755594730377197, - "semantic_entropy": 0.7835851907730103, + "logits/chosen": -0.07318639755249023, + "logits/rejected": 0.11783840507268906, + "logps/chosen": -1.339181661605835, + "logps/rejected": -1.4602099657058716, + "loss": 1.691, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.339181661605835, + "rewards/margins": 0.12102824449539185, + "rewards/rejected": -1.4602099657058716, "step": 430 }, { "epoch": 0.23281485198193677, - "grad_norm": 10.217305267561882, + "grad_norm": 12.019218413216004, "learning_rate": 7.754010695187165e-07, - "logits/chosen": -0.03247809037566185, - "logits/rejected": 0.05013108253479004, - "logps/chosen": -1.215358018875122, - "logps/rejected": -1.347867727279663, - "loss": 2.0371, + "logits/chosen": 0.009309527464210987, + "logits/rejected": 0.09513361006975174, + "logps/chosen": -1.207874059677124, + "logps/rejected": -1.3379977941513062, + "loss": 1.6147, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.215358018875122, - "rewards/margins": 0.1325097382068634, - "rewards/rejected": -1.347867727279663, - "semantic_entropy": 0.8345514535903931, + "rewards/chosen": -1.207874059677124, + "rewards/margins": 0.13012397289276123, + "rewards/rejected": -1.3379977941513062, "step": 435 }, { "epoch": 0.23549088476333835, - "grad_norm": 7.388443140413564, + "grad_norm": 7.553555665151894, "learning_rate": 7.84313725490196e-07, - "logits/chosen": -0.022949252277612686, - "logits/rejected": 0.061340272426605225, - "logps/chosen": -1.2916299104690552, - "logps/rejected": -1.3803313970565796, - "loss": 2.087, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.2916299104690552, - "rewards/margins": 0.08870140463113785, - "rewards/rejected": -1.3803313970565796, - "semantic_entropy": 0.8170446157455444, + "logits/chosen": 0.006134913768619299, + "logits/rejected": 0.09206631034612656, + "logps/chosen": -1.2856277227401733, + "logps/rejected": -1.3689930438995361, + "loss": 1.6755, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2856277227401733, + "rewards/margins": 0.08336522430181503, + "rewards/rejected": -1.3689930438995361, "step": 440 }, { "epoch": 0.23816691754473993, - "grad_norm": 8.493171472969088, + "grad_norm": 7.648705966898978, "learning_rate": 7.932263814616755e-07, - "logits/chosen": -0.045423250645399094, - "logits/rejected": 0.062299858778715134, - "logps/chosen": -1.318433403968811, - "logps/rejected": -1.4452886581420898, - "loss": 2.1018, + "logits/chosen": -0.02347562648355961, + "logits/rejected": 0.08584611117839813, + "logps/chosen": -1.3102935552597046, + "logps/rejected": -1.433739185333252, + "loss": 1.6955, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.318433403968811, - "rewards/margins": 0.12685513496398926, - "rewards/rejected": -1.4452886581420898, - "semantic_entropy": 0.7928746938705444, + "rewards/chosen": -1.3102935552597046, + "rewards/margins": 0.12344559282064438, + "rewards/rejected": -1.433739185333252, "step": 445 }, { "epoch": 0.2408429503261415, - "grad_norm": 7.634251232185677, + "grad_norm": 8.882655730190928, "learning_rate": 8.02139037433155e-07, - "logits/chosen": -0.004715797491371632, - "logits/rejected": 0.11216776072978973, - "logps/chosen": -1.3025964498519897, - "logps/rejected": -1.4417173862457275, - "loss": 2.064, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3025964498519897, - "rewards/margins": 0.13912081718444824, - "rewards/rejected": -1.4417173862457275, - "semantic_entropy": 0.8090659379959106, + "logits/chosen": -0.0030852502677589655, + "logits/rejected": 0.11593957990407944, + "logps/chosen": -1.2922875881195068, + "logps/rejected": -1.4286361932754517, + "loss": 1.6509, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2922875881195068, + "rewards/margins": 0.1363484412431717, + "rewards/rejected": -1.4286361932754517, "step": 450 }, { "epoch": 0.24351898310754308, - "grad_norm": 9.123904294480766, + "grad_norm": 9.557364259695511, "learning_rate": 8.110516934046346e-07, - "logits/chosen": -0.02629506029188633, - "logits/rejected": 0.058896519243717194, - "logps/chosen": -1.2463464736938477, - "logps/rejected": -1.4314829111099243, - "loss": 2.0277, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2463464736938477, - "rewards/margins": 0.18513646721839905, - "rewards/rejected": -1.4314829111099243, - "semantic_entropy": 0.819383978843689, + "logits/chosen": -0.013877347111701965, + "logits/rejected": 0.07154419273138046, + "logps/chosen": -1.2427895069122314, + "logps/rejected": -1.419550895690918, + "loss": 1.6117, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2427895069122314, + "rewards/margins": 0.1767614483833313, + "rewards/rejected": -1.419550895690918, "step": 455 }, { "epoch": 0.24619501588894463, - "grad_norm": 7.988610941036874, + "grad_norm": 8.275428432095703, "learning_rate": 8.19964349376114e-07, - "logits/chosen": -0.13996991515159607, - "logits/rejected": -0.02202531509101391, - "logps/chosen": -1.373464822769165, - "logps/rejected": -1.4234976768493652, - "loss": 2.134, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.373464822769165, - "rewards/margins": 0.05003304407000542, - "rewards/rejected": -1.4234976768493652, - "semantic_entropy": 0.7923992872238159, + "logits/chosen": -0.14314907789230347, + "logits/rejected": -0.029183436185121536, + "logps/chosen": -1.3672678470611572, + "logps/rejected": -1.4119517803192139, + "loss": 1.7353, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3672678470611572, + "rewards/margins": 0.04468398913741112, + "rewards/rejected": -1.4119517803192139, "step": 460 }, { "epoch": 0.2488710486703462, - "grad_norm": 9.581479715947285, + "grad_norm": 9.505344628861407, "learning_rate": 8.288770053475936e-07, - "logits/chosen": 0.11567505449056625, - "logits/rejected": 0.1285204142332077, - "logps/chosen": -1.2783681154251099, - "logps/rejected": -1.4448531866073608, - "loss": 2.0768, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2783681154251099, - "rewards/margins": 0.1664850413799286, - "rewards/rejected": -1.4448531866073608, - "semantic_entropy": 0.801103949546814, + "logits/chosen": 0.11223528534173965, + "logits/rejected": 0.12632213532924652, + "logps/chosen": -1.2721918821334839, + "logps/rejected": -1.4321367740631104, + "loss": 1.6727, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2721918821334839, + "rewards/margins": 0.15994493663311005, + "rewards/rejected": -1.4321367740631104, "step": 465 }, { "epoch": 0.2515470814517478, - "grad_norm": 9.216483509650814, + "grad_norm": 9.48905108499897, "learning_rate": 8.37789661319073e-07, - "logits/chosen": 0.15024113655090332, - "logits/rejected": 0.09940201789140701, - "logps/chosen": -1.239457130432129, - "logps/rejected": -1.4274377822875977, - "loss": 2.014, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.239457130432129, - "rewards/margins": 0.18798059225082397, - "rewards/rejected": -1.4274377822875977, - "semantic_entropy": 0.8202102780342102, + "logits/chosen": 0.14261920750141144, + "logits/rejected": 0.09591357409954071, + "logps/chosen": -1.2352259159088135, + "logps/rejected": -1.4124305248260498, + "loss": 1.6061, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2352259159088135, + "rewards/margins": 0.17720454931259155, + "rewards/rejected": -1.4124305248260498, "step": 470 }, { "epoch": 0.25422311423314936, - "grad_norm": 7.406586737551856, + "grad_norm": 6.899198780305083, "learning_rate": 8.467023172905525e-07, - "logits/chosen": -0.0853961706161499, - "logits/rejected": 0.046978384256362915, - "logps/chosen": -1.304258108139038, - "logps/rejected": -1.5052478313446045, - "loss": 2.0343, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.304258108139038, - "rewards/margins": 0.20098969340324402, - "rewards/rejected": -1.5052478313446045, - "semantic_entropy": 0.7981175184249878, + "logits/chosen": -0.0794256404042244, + "logits/rejected": 0.052232641726732254, + "logps/chosen": -1.301062822341919, + "logps/rejected": -1.4891085624694824, + "loss": 1.6342, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.301062822341919, + "rewards/margins": 0.1880457103252411, + "rewards/rejected": -1.4891085624694824, "step": 475 }, { "epoch": 0.2568991470145509, - "grad_norm": 12.42290439017345, + "grad_norm": 12.486878937681752, "learning_rate": 8.55614973262032e-07, - "logits/chosen": -0.06797462701797485, - "logits/rejected": 0.12427964061498642, - "logps/chosen": -1.2787773609161377, - "logps/rejected": -1.3450984954833984, - "loss": 2.1266, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2787773609161377, - "rewards/margins": 0.06632117927074432, - "rewards/rejected": -1.3450984954833984, - "semantic_entropy": 0.8253456950187683, + "logits/chosen": -0.055589865893125534, + "logits/rejected": 0.13701441884040833, + "logps/chosen": -1.2720788717269897, + "logps/rejected": -1.338479995727539, + "loss": 1.7002, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2720788717269897, + "rewards/margins": 0.06640110909938812, + "rewards/rejected": -1.338479995727539, "step": 480 }, { "epoch": 0.2595751797959525, - "grad_norm": 12.319073676190952, + "grad_norm": 11.29180900188912, "learning_rate": 8.645276292335115e-07, - "logits/chosen": -0.02665388025343418, - "logits/rejected": 0.007270003668963909, - "logps/chosen": -1.372032880783081, - "logps/rejected": -1.44978928565979, - "loss": 2.1258, + "logits/chosen": 0.008862579241394997, + "logits/rejected": 0.046294886618852615, + "logps/chosen": -1.3658168315887451, + "logps/rejected": -1.4353352785110474, + "loss": 1.7308, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.372032880783081, - "rewards/margins": 0.07775656878948212, - "rewards/rejected": -1.44978928565979, - "semantic_entropy": 0.7805415391921997, + "rewards/chosen": -1.3658168315887451, + "rewards/margins": 0.06951842457056046, + "rewards/rejected": -1.4353352785110474, "step": 485 }, { "epoch": 0.26225121257735406, - "grad_norm": 8.267686997372833, + "grad_norm": 8.596970816409131, "learning_rate": 8.734402852049911e-07, - "logits/chosen": -0.0180144514888525, - "logits/rejected": 0.04987755045294762, - "logps/chosen": -1.3115794658660889, - "logps/rejected": -1.3869727849960327, - "loss": 2.1172, + "logits/chosen": 0.03249656409025192, + "logits/rejected": 0.10003700107336044, + "logps/chosen": -1.3026869297027588, + "logps/rejected": -1.372564673423767, + "loss": 1.7021, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3115794658660889, - "rewards/margins": 0.07539341598749161, - "rewards/rejected": -1.3869727849960327, - "semantic_entropy": 0.81377112865448, + "rewards/chosen": -1.3026869297027588, + "rewards/margins": 0.06987786293029785, + "rewards/rejected": -1.372564673423767, "step": 490 }, { "epoch": 0.26492724535875567, - "grad_norm": 9.738357137083968, + "grad_norm": 9.491755847867198, "learning_rate": 8.823529411764705e-07, - "logits/chosen": -0.058453939855098724, - "logits/rejected": -0.04326649382710457, - "logps/chosen": -1.3211174011230469, - "logps/rejected": -1.4324402809143066, - "loss": 2.095, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3211174011230469, - "rewards/margins": 0.1113228052854538, - "rewards/rejected": -1.4324402809143066, - "semantic_entropy": 0.8082423210144043, + "logits/chosen": -0.019991319626569748, + "logits/rejected": -0.002196407411247492, + "logps/chosen": -1.3101972341537476, + "logps/rejected": -1.418125867843628, + "loss": 1.684, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3101972341537476, + "rewards/margins": 0.10792861878871918, + "rewards/rejected": -1.418125867843628, "step": 495 }, { "epoch": 0.2676032781401572, - "grad_norm": 8.372403169674499, + "grad_norm": 8.371010893036301, "learning_rate": 8.912655971479501e-07, - "logits/chosen": -0.06073238328099251, - "logits/rejected": 0.03806382417678833, - "logps/chosen": -1.234489917755127, - "logps/rejected": -1.3935258388519287, - "loss": 2.0403, + "logits/chosen": -0.057091616094112396, + "logits/rejected": 0.039482321590185165, + "logps/chosen": -1.2270276546478271, + "logps/rejected": -1.3793421983718872, + "loss": 1.6195, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.234489917755127, - "rewards/margins": 0.15903595089912415, - "rewards/rejected": -1.3935258388519287, - "semantic_entropy": 0.8348814249038696, + "rewards/chosen": -1.2270276546478271, + "rewards/margins": 0.1523144692182541, + "rewards/rejected": -1.3793421983718872, "step": 500 }, { "epoch": 0.27027931092155877, - "grad_norm": 9.329346348003517, + "grad_norm": 7.881144482280576, "learning_rate": 9.001782531194295e-07, - "logits/chosen": -0.0702035129070282, - "logits/rejected": 0.06675246357917786, - "logps/chosen": -1.3525861501693726, - "logps/rejected": -1.3896262645721436, - "loss": 2.0984, + "logits/chosen": -0.07503107190132141, + "logits/rejected": 0.05724053829908371, + "logps/chosen": -1.3419079780578613, + "logps/rejected": -1.3775643110275269, + "loss": 1.6939, "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3525861501693726, - "rewards/margins": 0.03703995421528816, - "rewards/rejected": -1.3896262645721436, - "semantic_entropy": 0.7991023063659668, + "rewards/chosen": -1.3419079780578613, + "rewards/margins": 0.03565652295947075, + "rewards/rejected": -1.3775643110275269, "step": 505 }, { "epoch": 0.2729553437029604, - "grad_norm": 7.793041739139555, + "grad_norm": 8.04083324264338, "learning_rate": 9.09090909090909e-07, - "logits/chosen": 0.0727655366063118, - "logits/rejected": 0.13345302641391754, - "logps/chosen": -1.3091163635253906, - "logps/rejected": -1.4701882600784302, - "loss": 2.0514, + "logits/chosen": 0.08807464689016342, + "logits/rejected": 0.14513877034187317, + "logps/chosen": -1.3025153875350952, + "logps/rejected": -1.4536783695220947, + "loss": 1.6515, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3091163635253906, - "rewards/margins": 0.1610718071460724, - "rewards/rejected": -1.4701882600784302, - "semantic_entropy": 0.7971314787864685, + "rewards/chosen": -1.3025153875350952, + "rewards/margins": 0.15116293728351593, + "rewards/rejected": -1.4536783695220947, "step": 510 }, { "epoch": 0.2756313764843619, - "grad_norm": 8.366167229049527, + "grad_norm": 8.680474424479687, "learning_rate": 9.180035650623885e-07, - "logits/chosen": 0.020826030522584915, - "logits/rejected": 0.11258859932422638, - "logps/chosen": -1.251171350479126, - "logps/rejected": -1.3965065479278564, - "loss": 2.0153, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.251171350479126, - "rewards/margins": 0.1453351080417633, - "rewards/rejected": -1.3965065479278564, - "semantic_entropy": 0.8115334510803223, + "logits/chosen": 0.035877007991075516, + "logits/rejected": 0.12638869881629944, + "logps/chosen": -1.2459619045257568, + "logps/rejected": -1.3836866617202759, + "loss": 1.6087, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2459619045257568, + "rewards/margins": 0.13772478699684143, + "rewards/rejected": -1.3836866617202759, "step": 515 }, { "epoch": 0.27830740926576353, - "grad_norm": 6.708105392677698, + "grad_norm": 6.892237240548103, "learning_rate": 9.26916221033868e-07, - "logits/chosen": -0.10605648905038834, - "logits/rejected": 0.02459835447371006, - "logps/chosen": -1.2906177043914795, - "logps/rejected": -1.3887439966201782, - "loss": 2.0989, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.2906177043914795, - "rewards/margins": 0.09812645614147186, - "rewards/rejected": -1.3887439966201782, - "semantic_entropy": 0.8078464269638062, + "logits/chosen": -0.07581953704357147, + "logits/rejected": 0.060358207672834396, + "logps/chosen": -1.2861921787261963, + "logps/rejected": -1.3751986026763916, + "loss": 1.6935, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2861921787261963, + "rewards/margins": 0.08900648355484009, + "rewards/rejected": -1.3751986026763916, "step": 520 }, { "epoch": 0.2809834420471651, - "grad_norm": 18.59523614539976, + "grad_norm": 19.55498923733445, "learning_rate": 9.358288770053476e-07, - "logits/chosen": 0.09513764083385468, - "logits/rejected": 0.16091565787792206, - "logps/chosen": -1.2693884372711182, - "logps/rejected": -1.4406367540359497, - "loss": 2.049, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2693884372711182, - "rewards/margins": 0.1712484359741211, - "rewards/rejected": -1.4406367540359497, - "semantic_entropy": 0.8163102269172668, + "logits/chosen": 0.11548501253128052, + "logits/rejected": 0.18100161850452423, + "logps/chosen": -1.2647411823272705, + "logps/rejected": -1.4218060970306396, + "loss": 1.6418, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2647411823272705, + "rewards/margins": 0.15706504881381989, + "rewards/rejected": -1.4218060970306396, "step": 525 }, { "epoch": 0.2836594748285666, - "grad_norm": 6.400397273187949, + "grad_norm": 6.294694914877995, "learning_rate": 9.44741532976827e-07, - "logits/chosen": 0.05827958509325981, - "logits/rejected": 0.14098814129829407, - "logps/chosen": -1.2453418970108032, - "logps/rejected": -1.3420130014419556, - "loss": 2.0805, + "logits/chosen": 0.08505548536777496, + "logits/rejected": 0.16642725467681885, + "logps/chosen": -1.2363998889923096, + "logps/rejected": -1.3269596099853516, + "loss": 1.6577, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.2453418970108032, - "rewards/margins": 0.09667106717824936, - "rewards/rejected": -1.3420130014419556, - "semantic_entropy": 0.8379393815994263, + "rewards/chosen": -1.2363998889923096, + "rewards/margins": 0.09055972844362259, + "rewards/rejected": -1.3269596099853516, "step": 530 }, { "epoch": 0.28633550760996823, - "grad_norm": 6.655980461093334, + "grad_norm": 7.066635125091955, "learning_rate": 9.536541889483066e-07, - "logits/chosen": -0.10052146762609482, - "logits/rejected": 0.15529409050941467, - "logps/chosen": -1.2592411041259766, - "logps/rejected": -1.3248337507247925, - "loss": 2.0783, + "logits/chosen": -0.07894889265298843, + "logits/rejected": 0.1766219586133957, + "logps/chosen": -1.2514185905456543, + "logps/rejected": -1.3117074966430664, + "loss": 1.6592, "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2592411041259766, - "rewards/margins": 0.06559250503778458, - "rewards/rejected": -1.3248337507247925, - "semantic_entropy": 0.8288901448249817, + "rewards/chosen": -1.2514185905456543, + "rewards/margins": 0.06028900295495987, + "rewards/rejected": -1.3117074966430664, "step": 535 }, { "epoch": 0.2890115403913698, - "grad_norm": 8.028377167353824, + "grad_norm": 7.622271735376781, "learning_rate": 9.62566844919786e-07, - "logits/chosen": 0.03284211829304695, - "logits/rejected": 0.10378841310739517, - "logps/chosen": -1.3858778476715088, - "logps/rejected": -1.4422986507415771, - "loss": 2.1258, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3858778476715088, - "rewards/margins": 0.056420810520648956, - "rewards/rejected": -1.4422986507415771, - "semantic_entropy": 0.7777124643325806, + "logits/chosen": 0.04230818152427673, + "logits/rejected": 0.11139015853404999, + "logps/chosen": -1.3790353536605835, + "logps/rejected": -1.427674412727356, + "loss": 1.7338, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3790353536605835, + "rewards/margins": 0.04863894730806351, + "rewards/rejected": -1.427674412727356, "step": 540 }, { "epoch": 0.2916875731727714, - "grad_norm": 7.5790293844104895, + "grad_norm": 7.395680847120547, "learning_rate": 9.714795008912655e-07, - "logits/chosen": -0.08762449026107788, - "logits/rejected": 0.1115502342581749, - "logps/chosen": -1.2886834144592285, - "logps/rejected": -1.4010090827941895, - "loss": 2.0677, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2886834144592285, - "rewards/margins": 0.11232553422451019, - "rewards/rejected": -1.4010090827941895, - "semantic_entropy": 0.8187274932861328, + "logits/chosen": -0.08460564911365509, + "logits/rejected": 0.11045414209365845, + "logps/chosen": -1.282997727394104, + "logps/rejected": -1.3860433101654053, + "loss": 1.6557, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.282997727394104, + "rewards/margins": 0.10304556041955948, + "rewards/rejected": -1.3860433101654053, "step": 545 }, { "epoch": 0.29436360595417294, - "grad_norm": 8.962981954490106, + "grad_norm": 8.761993018053895, "learning_rate": 9.80392156862745e-07, - "logits/chosen": 0.045073509216308594, - "logits/rejected": 0.11081943660974503, - "logps/chosen": -1.2883306741714478, - "logps/rejected": -1.4169273376464844, - "loss": 2.0432, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2883306741714478, - "rewards/margins": 0.12859681248664856, - "rewards/rejected": -1.4169273376464844, - "semantic_entropy": 0.8060620427131653, + "logits/chosen": 0.04571625217795372, + "logits/rejected": 0.10803280025720596, + "logps/chosen": -1.2807788848876953, + "logps/rejected": -1.402484655380249, + "loss": 1.637, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2807788848876953, + "rewards/margins": 0.12170571088790894, + "rewards/rejected": -1.402484655380249, "step": 550 }, { "epoch": 0.2970396387355745, - "grad_norm": 11.47050872584094, + "grad_norm": 11.10366030266417, "learning_rate": 9.893048128342244e-07, - "logits/chosen": -0.07750057429075241, - "logits/rejected": 0.04340776801109314, - "logps/chosen": -1.3622779846191406, - "logps/rejected": -1.4258826971054077, - "loss": 2.1151, + "logits/chosen": -0.06252036243677139, + "logits/rejected": 0.05841824412345886, + "logps/chosen": -1.3565037250518799, + "logps/rejected": -1.4139055013656616, + "loss": 1.7212, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3622779846191406, - "rewards/margins": 0.06360460817813873, - "rewards/rejected": -1.4258826971054077, - "semantic_entropy": 0.7788089513778687, + "rewards/chosen": -1.3565037250518799, + "rewards/margins": 0.057401906698942184, + "rewards/rejected": -1.4139055013656616, "step": 555 }, { "epoch": 0.2997156715169761, - "grad_norm": 10.029435787937103, + "grad_norm": 9.864439916293803, "learning_rate": 9.98217468805704e-07, - "logits/chosen": 0.05368703603744507, - "logits/rejected": 0.0629192367196083, - "logps/chosen": -1.2183544635772705, - "logps/rejected": -1.3473310470581055, - "loss": 2.0256, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2183544635772705, - "rewards/margins": 0.12897667288780212, - "rewards/rejected": -1.3473310470581055, - "semantic_entropy": 0.8247224688529968, + "logits/chosen": 0.04508183151483536, + "logits/rejected": 0.051397740840911865, + "logps/chosen": -1.2115132808685303, + "logps/rejected": -1.3322150707244873, + "loss": 1.6101, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2115132808685303, + "rewards/margins": 0.12070177495479584, + "rewards/rejected": -1.3322150707244873, "step": 560 }, { "epoch": 0.30239170429837764, - "grad_norm": 5.518574576054694, + "grad_norm": 6.165225503632382, "learning_rate": 9.999984476788462e-07, - "logits/chosen": 0.01707286201417446, - "logits/rejected": 0.06581038981676102, - "logps/chosen": -1.332328200340271, - "logps/rejected": -1.4508365392684937, - "loss": 2.1007, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.332328200340271, - "rewards/margins": 0.11850825697183609, - "rewards/rejected": -1.4508365392684937, - "semantic_entropy": 0.7935990691184998, + "logits/chosen": 0.02966439723968506, + "logits/rejected": 0.07653049379587173, + "logps/chosen": -1.3243474960327148, + "logps/rejected": -1.4375768899917603, + "loss": 1.6986, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3243474960327148, + "rewards/margins": 0.11322925984859467, + "rewards/rejected": -1.4375768899917603, "step": 565 }, { "epoch": 0.30506773707977924, - "grad_norm": 8.87344715447719, + "grad_norm": 8.854251236446792, "learning_rate": 9.999921413906797e-07, - "logits/chosen": -0.043978843837976456, - "logits/rejected": 0.16928060352802277, - "logps/chosen": -1.3080699443817139, - "logps/rejected": -1.3878307342529297, - "loss": 2.0981, + "logits/chosen": -0.04597383365035057, + "logits/rejected": 0.16426034271717072, + "logps/chosen": -1.3067337274551392, + "logps/rejected": -1.373793601989746, + "loss": 1.7024, "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.3080699443817139, - "rewards/margins": 0.07976074516773224, - "rewards/rejected": -1.3878307342529297, - "semantic_entropy": 0.7911024689674377, + "rewards/chosen": -1.3067337274551392, + "rewards/margins": 0.06705982983112335, + "rewards/rejected": -1.373793601989746, "step": 570 }, { "epoch": 0.3077437698611808, - "grad_norm": 6.60439384217086, + "grad_norm": 6.840144493353446, "learning_rate": 9.999809841765644e-07, - "logits/chosen": -0.004473379347473383, - "logits/rejected": 0.052548039704561234, - "logps/chosen": -1.2383952140808105, - "logps/rejected": -1.35186767578125, - "loss": 2.0731, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2383952140808105, - "rewards/margins": 0.11347250640392303, - "rewards/rejected": -1.35186767578125, - "semantic_entropy": 0.8186966180801392, + "logits/chosen": -0.013875524513423443, + "logits/rejected": 0.04112040624022484, + "logps/chosen": -1.2333790063858032, + "logps/rejected": -1.3387117385864258, + "loss": 1.6608, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2333790063858032, + "rewards/margins": 0.10533283650875092, + "rewards/rejected": -1.3387117385864258, "step": 575 }, { "epoch": 0.3104198026425824, - "grad_norm": 7.041622382443657, + "grad_norm": 7.4380198998326055, "learning_rate": 9.999649761447477e-07, - "logits/chosen": -0.039054978638887405, - "logits/rejected": 0.11595281213521957, - "logps/chosen": -1.2413501739501953, - "logps/rejected": -1.4057656526565552, - "loss": 2.0201, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2413501739501953, - "rewards/margins": 0.16441559791564941, - "rewards/rejected": -1.4057656526565552, - "semantic_entropy": 0.829367995262146, + "logits/chosen": -0.03013746812939644, + "logits/rejected": 0.11791408061981201, + "logps/chosen": -1.2293686866760254, + "logps/rejected": -1.387367844581604, + "loss": 1.6001, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2293686866760254, + "rewards/margins": 0.15799903869628906, + "rewards/rejected": -1.387367844581604, "step": 580 }, { "epoch": 0.31309583542398395, - "grad_norm": 7.080204952027214, + "grad_norm": 7.50710540185207, "learning_rate": 9.999441174505398e-07, - "logits/chosen": -0.07520152628421783, - "logits/rejected": 0.022036874666810036, - "logps/chosen": -1.378169298171997, - "logps/rejected": -1.4294803142547607, - "loss": 2.1316, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.378169298171997, - "rewards/margins": 0.05131111294031143, - "rewards/rejected": -1.4294803142547607, - "semantic_entropy": 0.7770919799804688, + "logits/chosen": -0.06569008529186249, + "logits/rejected": 0.03065279684960842, + "logps/chosen": -1.3712207078933716, + "logps/rejected": -1.4126875400543213, + "loss": 1.7422, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3712207078933716, + "rewards/margins": 0.041466858237981796, + "rewards/rejected": -1.4126875400543213, "step": 585 }, { "epoch": 0.3157718682053855, - "grad_norm": 17.319598964810336, + "grad_norm": 15.466465592760187, "learning_rate": 9.999184082963116e-07, - "logits/chosen": -0.0469600148499012, - "logits/rejected": 0.07453016191720963, - "logps/chosen": -1.3526276350021362, - "logps/rejected": -1.3841309547424316, - "loss": 2.1237, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3526276350021362, - "rewards/margins": 0.031503356993198395, - "rewards/rejected": -1.3841309547424316, - "semantic_entropy": 0.7946040034294128, + "logits/chosen": -0.04614231735467911, + "logits/rejected": 0.0724630355834961, + "logps/chosen": -1.3442991971969604, + "logps/rejected": -1.3700175285339355, + "loss": 1.7183, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3442991971969604, + "rewards/margins": 0.025718364864587784, + "rewards/rejected": -1.3700175285339355, "step": 590 }, { "epoch": 0.3184479009867871, - "grad_norm": 9.00235073415317, + "grad_norm": 8.758215512499829, "learning_rate": 9.998878489314937e-07, - "logits/chosen": 0.003964888397604227, - "logits/rejected": 0.12411659955978394, - "logps/chosen": -1.2833412885665894, - "logps/rejected": -1.350610613822937, - "loss": 2.0957, - "rewards/accuracies": 0.46875, - "rewards/chosen": -1.2833412885665894, - "rewards/margins": 0.06726942956447601, - "rewards/rejected": -1.350610613822937, - "semantic_entropy": 0.8191946148872375, + "logits/chosen": 0.018415410071611404, + "logits/rejected": 0.13863402605056763, + "logps/chosen": -1.2763307094573975, + "logps/rejected": -1.3321726322174072, + "loss": 1.6815, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.2763307094573975, + "rewards/margins": 0.05584187060594559, + "rewards/rejected": -1.3321726322174072, "step": 595 }, { "epoch": 0.32112393376818865, - "grad_norm": 7.922144554064656, + "grad_norm": 8.227548047255322, "learning_rate": 9.99852439652573e-07, - "logits/chosen": -0.051727332174777985, - "logits/rejected": 0.08876247704029083, - "logps/chosen": -1.2790896892547607, - "logps/rejected": -1.3022220134735107, - "loss": 2.1024, + "logits/chosen": -0.03972851112484932, + "logits/rejected": 0.097906194627285, + "logps/chosen": -1.269298791885376, + "logps/rejected": -1.2822902202606201, + "loss": 1.6859, "rewards/accuracies": 0.4625000059604645, - "rewards/chosen": -1.2790896892547607, - "rewards/margins": 0.023132145404815674, - "rewards/rejected": -1.3022220134735107, - "semantic_entropy": 0.8291428685188293, + "rewards/chosen": -1.269298791885376, + "rewards/margins": 0.012991341762244701, + "rewards/rejected": -1.2822902202606201, "step": 600 }, { "epoch": 0.32379996654959026, - "grad_norm": 9.965995704685826, + "grad_norm": 9.777490952887712, "learning_rate": 9.998121808030904e-07, - "logits/chosen": -0.09909897297620773, - "logits/rejected": -0.017181608825922012, - "logps/chosen": -1.3383922576904297, - "logps/rejected": -1.487484335899353, - "loss": 2.0822, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3383922576904297, - "rewards/margins": 0.1490919589996338, - "rewards/rejected": -1.487484335899353, - "semantic_entropy": 0.7997158765792847, + "logits/chosen": -0.07380813360214233, + "logits/rejected": 0.008721251972019672, + "logps/chosen": -1.3301905393600464, + "logps/rejected": -1.4657261371612549, + "loss": 1.6796, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3301905393600464, + "rewards/margins": 0.13553544878959656, + "rewards/rejected": -1.4657261371612549, "step": 605 }, { "epoch": 0.3264759993309918, - "grad_norm": 12.670537832160065, + "grad_norm": 12.299409373491619, "learning_rate": 9.997670727736379e-07, - "logits/chosen": 0.044879984110593796, - "logits/rejected": 0.19142109155654907, - "logps/chosen": -1.3152719736099243, - "logps/rejected": -1.384782314300537, - "loss": 2.1013, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3152719736099243, - "rewards/margins": 0.06951036304235458, - "rewards/rejected": -1.384782314300537, - "semantic_entropy": 0.7921839952468872, + "logits/chosen": 0.047155968844890594, + "logits/rejected": 0.187996968626976, + "logps/chosen": -1.307685136795044, + "logps/rejected": -1.3639806509017944, + "loss": 1.7023, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.307685136795044, + "rewards/margins": 0.056295741349458694, + "rewards/rejected": -1.3639806509017944, "step": 610 }, { "epoch": 0.32915203211239336, - "grad_norm": 5.2822164816811235, + "grad_norm": 5.6392494655949825, "learning_rate": 9.99717116001853e-07, - "logits/chosen": -0.03817535936832428, - "logits/rejected": 0.06552287191152573, - "logps/chosen": -1.313217282295227, - "logps/rejected": -1.4523591995239258, - "loss": 2.0817, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.313217282295227, - "rewards/margins": 0.13914184272289276, - "rewards/rejected": -1.4523591995239258, - "semantic_entropy": 0.8057335615158081, + "logits/chosen": -0.0546308234333992, + "logits/rejected": 0.04433388262987137, + "logps/chosen": -1.3067935705184937, + "logps/rejected": -1.43153977394104, + "loss": 1.6778, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3067935705184937, + "rewards/margins": 0.12474598735570908, + "rewards/rejected": -1.43153977394104, "step": 615 }, { "epoch": 0.33182806489379496, - "grad_norm": 7.158263688307781, + "grad_norm": 7.341889878119668, "learning_rate": 9.996623109724173e-07, - "logits/chosen": 0.05255531147122383, - "logits/rejected": 0.1168251484632492, - "logps/chosen": -1.3770633935928345, - "logps/rejected": -1.4801380634307861, - "loss": 2.0993, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3770633935928345, - "rewards/margins": 0.10307463258504868, - "rewards/rejected": -1.4801380634307861, - "semantic_entropy": 0.7853333353996277, + "logits/chosen": 0.06176387146115303, + "logits/rejected": 0.12294511497020721, + "logps/chosen": -1.3664953708648682, + "logps/rejected": -1.4626625776290894, + "loss": 1.7016, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.3664953708648682, + "rewards/margins": 0.09616713225841522, + "rewards/rejected": -1.4626625776290894, "step": 620 }, { "epoch": 0.3345040976751965, - "grad_norm": 9.250643921051761, + "grad_norm": 8.995369784784629, "learning_rate": 9.996026582170488e-07, - "logits/chosen": 0.060080431401729584, - "logits/rejected": 0.17307524383068085, - "logps/chosen": -1.2922842502593994, - "logps/rejected": -1.4193775653839111, - "loss": 2.0662, + "logits/chosen": 0.061516355723142624, + "logits/rejected": 0.16859427094459534, + "logps/chosen": -1.2846791744232178, + "logps/rejected": -1.4000781774520874, + "loss": 1.659, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2922842502593994, - "rewards/margins": 0.12709322571754456, - "rewards/rejected": -1.4193775653839111, - "semantic_entropy": 0.809944748878479, + "rewards/chosen": -1.2846791744232178, + "rewards/margins": 0.11539904773235321, + "rewards/rejected": -1.4000781774520874, "step": 625 }, { "epoch": 0.3371801304565981, - "grad_norm": 7.144016418620915, + "grad_norm": 7.600144746952444, "learning_rate": 9.995381583144996e-07, - "logits/chosen": -0.03415603190660477, - "logits/rejected": 0.06720946729183197, - "logps/chosen": -1.3283525705337524, - "logps/rejected": -1.4664812088012695, - "loss": 2.0554, + "logits/chosen": -0.007964099757373333, + "logits/rejected": 0.09590659290552139, + "logps/chosen": -1.3186440467834473, + "logps/rejected": -1.4482091665267944, + "loss": 1.6537, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3283525705337524, - "rewards/margins": 0.1381286084651947, - "rewards/rejected": -1.4664812088012695, - "semantic_entropy": 0.7949910759925842, + "rewards/chosen": -1.3186440467834473, + "rewards/margins": 0.12956508994102478, + "rewards/rejected": -1.4482091665267944, "step": 630 }, { "epoch": 0.33985616323799966, - "grad_norm": 6.276400394476788, + "grad_norm": 6.593150440126051, "learning_rate": 9.994688118905471e-07, - "logits/chosen": -0.024661913514137268, - "logits/rejected": 0.20788328349590302, - "logps/chosen": -1.3947147130966187, - "logps/rejected": -1.4336068630218506, - "loss": 2.1615, - "rewards/accuracies": 0.45625001192092896, - "rewards/chosen": -1.3947147130966187, - "rewards/margins": 0.03889207914471626, - "rewards/rejected": -1.4336068630218506, - "semantic_entropy": 0.7786458730697632, + "logits/chosen": 0.02119055762887001, + "logits/rejected": 0.25564703345298767, + "logps/chosen": -1.3833516836166382, + "logps/rejected": -1.409256935119629, + "loss": 1.7706, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.3833516836166382, + "rewards/margins": 0.02590516209602356, + "rewards/rejected": -1.409256935119629, "step": 635 }, { "epoch": 0.3425321960194012, - "grad_norm": 9.572705692674822, + "grad_norm": 8.898308653082006, "learning_rate": 9.993946196179912e-07, - "logits/chosen": -0.08139273524284363, - "logits/rejected": 0.1119336485862732, - "logps/chosen": -1.3305104970932007, - "logps/rejected": -1.4301774501800537, - "loss": 2.129, - "rewards/accuracies": 0.46875, - "rewards/chosen": -1.3305104970932007, - "rewards/margins": 0.09966699033975601, - "rewards/rejected": -1.4301774501800537, - "semantic_entropy": 0.7754771709442139, + "logits/chosen": -0.07971400767564774, + "logits/rejected": 0.10469271242618561, + "logps/chosen": -1.322509527206421, + "logps/rejected": -1.4098836183547974, + "loss": 1.7398, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.322509527206421, + "rewards/margins": 0.08737409114837646, + "rewards/rejected": -1.4098836183547974, "step": 640 }, { "epoch": 0.3452082288008028, - "grad_norm": 6.77104258950228, + "grad_norm": 7.389930536963129, "learning_rate": 9.993155822166455e-07, - "logits/chosen": -0.08386670053005219, - "logits/rejected": 0.001519903540611267, - "logps/chosen": -1.245597004890442, - "logps/rejected": -1.409886360168457, - "loss": 2.0635, + "logits/chosen": -0.07686187326908112, + "logits/rejected": 0.0053823827765882015, + "logps/chosen": -1.2334957122802734, + "logps/rejected": -1.385599136352539, + "loss": 1.6508, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.245597004890442, - "rewards/margins": 0.16428932547569275, - "rewards/rejected": -1.409886360168457, - "semantic_entropy": 0.8197149038314819, + "rewards/chosen": -1.2334957122802734, + "rewards/margins": 0.15210336446762085, + "rewards/rejected": -1.385599136352539, "step": 645 }, { "epoch": 0.34788426158220437, - "grad_norm": 10.81121162736983, + "grad_norm": 10.833609564944332, "learning_rate": 9.992317004533313e-07, - "logits/chosen": -0.026128700003027916, - "logits/rejected": 0.11188068240880966, - "logps/chosen": -1.3841702938079834, - "logps/rejected": -1.5400774478912354, - "loss": 2.1066, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3841702938079834, - "rewards/margins": 0.15590718388557434, - "rewards/rejected": -1.5400774478912354, - "semantic_entropy": 0.7806428670883179, + "logits/chosen": -0.028376352041959763, + "logits/rejected": 0.10559730231761932, + "logps/chosen": -1.3749666213989258, + "logps/rejected": -1.5166696310043335, + "loss": 1.7152, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3749666213989258, + "rewards/margins": 0.14170297980308533, + "rewards/rejected": -1.5166696310043335, "step": 650 }, { "epoch": 0.350560294363606, - "grad_norm": 7.821570622584799, + "grad_norm": 8.272238064740044, "learning_rate": 9.991429751418696e-07, - "logits/chosen": 0.04313744977116585, - "logits/rejected": 0.04878374561667442, - "logps/chosen": -1.3229328393936157, - "logps/rejected": -1.5045139789581299, - "loss": 2.0712, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3229328393936157, - "rewards/margins": 0.18158110976219177, - "rewards/rejected": -1.5045139789581299, - "semantic_entropy": 0.7957868576049805, + "logits/chosen": 0.06738613545894623, + "logits/rejected": 0.0676032230257988, + "logps/chosen": -1.312971830368042, + "logps/rejected": -1.483418583869934, + "loss": 1.671, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.312971830368042, + "rewards/margins": 0.17044667899608612, + "rewards/rejected": -1.483418583869934, "step": 655 }, { "epoch": 0.3532363271450075, - "grad_norm": 7.47285160939844, + "grad_norm": 7.0542822338071005, "learning_rate": 9.99049407143074e-07, - "logits/chosen": -0.0009026184561662376, - "logits/rejected": 0.12640218436717987, - "logps/chosen": -1.2817261219024658, - "logps/rejected": -1.3218984603881836, - "loss": 2.0897, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.2817261219024658, - "rewards/margins": 0.04017230123281479, - "rewards/rejected": -1.3218984603881836, - "semantic_entropy": 0.8074017763137817, + "logits/chosen": 0.012846325524151325, + "logits/rejected": 0.13787353038787842, + "logps/chosen": -1.2715208530426025, + "logps/rejected": -1.2986509799957275, + "loss": 1.681, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2715208530426025, + "rewards/margins": 0.02712990716099739, + "rewards/rejected": -1.2986509799957275, "step": 660 }, { "epoch": 0.35591235992640907, - "grad_norm": 8.036948339099359, + "grad_norm": 7.7826006640852095, "learning_rate": 9.989509973647416e-07, - "logits/chosen": 0.004626098088920116, - "logits/rejected": 0.14154979586601257, - "logps/chosen": -1.245429277420044, - "logps/rejected": -1.3789770603179932, - "loss": 2.0455, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.245429277420044, - "rewards/margins": 0.13354769349098206, - "rewards/rejected": -1.3789770603179932, - "semantic_entropy": 0.8267159461975098, + "logits/chosen": -0.0050326017662882805, + "logits/rejected": 0.12580212950706482, + "logps/chosen": -1.2370613813400269, + "logps/rejected": -1.357429027557373, + "loss": 1.6309, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2370613813400269, + "rewards/margins": 0.12036754935979843, + "rewards/rejected": -1.357429027557373, "step": 665 }, { "epoch": 0.3585883927078107, - "grad_norm": 8.52772335856538, + "grad_norm": 8.526708013653826, "learning_rate": 9.988477467616445e-07, - "logits/chosen": -0.016855059191584587, - "logits/rejected": 0.1922694444656372, - "logps/chosen": -1.2839257717132568, - "logps/rejected": -1.335697889328003, - "loss": 2.0814, + "logits/chosen": -0.04048728197813034, + "logits/rejected": 0.15382294356822968, + "logps/chosen": -1.273474931716919, + "logps/rejected": -1.3108594417572021, + "loss": 1.6689, "rewards/accuracies": 0.4625000059604645, - "rewards/chosen": -1.2839257717132568, - "rewards/margins": 0.05177216976881027, - "rewards/rejected": -1.335697889328003, - "semantic_entropy": 0.8241464495658875, + "rewards/chosen": -1.273474931716919, + "rewards/margins": 0.037384580820798874, + "rewards/rejected": -1.3108594417572021, "step": 670 }, { "epoch": 0.3612644254892122, - "grad_norm": 9.71077016531089, + "grad_norm": 10.337099243083788, "learning_rate": 9.987396563355205e-07, - "logits/chosen": -0.027623683214187622, - "logits/rejected": 0.05166678503155708, - "logps/chosen": -1.275935411453247, - "logps/rejected": -1.4907596111297607, - "loss": 2.0427, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.275935411453247, - "rewards/margins": 0.21482422947883606, - "rewards/rejected": -1.4907596111297607, - "semantic_entropy": 0.8094412088394165, + "logits/chosen": -0.030770743265748024, + "logits/rejected": 0.043833933770656586, + "logps/chosen": -1.2658917903900146, + "logps/rejected": -1.4739489555358887, + "loss": 1.6307, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2658917903900146, + "rewards/margins": 0.20805713534355164, + "rewards/rejected": -1.4739489555358887, "step": 675 }, { "epoch": 0.36394045827061383, - "grad_norm": 9.612038280767669, + "grad_norm": 9.013546350604763, "learning_rate": 9.986267271350631e-07, - "logits/chosen": 0.05635018274188042, - "logits/rejected": 0.21157081425189972, - "logps/chosen": -1.3226648569107056, - "logps/rejected": -1.3817113637924194, - "loss": 2.1285, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.3226648569107056, - "rewards/margins": 0.05904661491513252, - "rewards/rejected": -1.3817113637924194, - "semantic_entropy": 0.8033465147018433, + "logits/chosen": 0.07694297283887863, + "logits/rejected": 0.2321816235780716, + "logps/chosen": -1.3103058338165283, + "logps/rejected": -1.35648775100708, + "loss": 1.721, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.3103058338165283, + "rewards/margins": 0.046181898564100266, + "rewards/rejected": -1.35648775100708, "step": 680 }, { "epoch": 0.3666164910520154, - "grad_norm": 8.664165560897748, + "grad_norm": 9.238506803782625, "learning_rate": 9.985089602559123e-07, - "logits/chosen": 0.022344741970300674, - "logits/rejected": 0.1773933470249176, - "logps/chosen": -1.300033450126648, - "logps/rejected": -1.3612678050994873, - "loss": 2.112, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.300033450126648, - "rewards/margins": 0.06123412400484085, - "rewards/rejected": -1.3612678050994873, - "semantic_entropy": 0.8237413167953491, + "logits/chosen": 0.01726577617228031, + "logits/rejected": 0.1645032912492752, + "logps/chosen": -1.291521430015564, + "logps/rejected": -1.3422341346740723, + "loss": 1.6962, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.291521430015564, + "rewards/margins": 0.050712697207927704, + "rewards/rejected": -1.3422341346740723, "step": 685 }, { "epoch": 0.369292523833417, - "grad_norm": 9.941705389220667, + "grad_norm": 8.565753547349857, "learning_rate": 9.983863568406428e-07, - "logits/chosen": 0.05462765693664551, - "logits/rejected": 0.08798164129257202, - "logps/chosen": -1.3029800653457642, - "logps/rejected": -1.4281790256500244, - "loss": 2.0599, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3029800653457642, - "rewards/margins": 0.12519893050193787, - "rewards/rejected": -1.4281790256500244, - "semantic_entropy": 0.7990589141845703, + "logits/chosen": 0.03928017243742943, + "logits/rejected": 0.06918928772211075, + "logps/chosen": -1.2919032573699951, + "logps/rejected": -1.4055030345916748, + "loss": 1.6564, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2919032573699951, + "rewards/margins": 0.1135997399687767, + "rewards/rejected": -1.4055030345916748, "step": 690 }, { "epoch": 0.37196855661481854, - "grad_norm": 7.327915372973922, + "grad_norm": 7.189058484659085, "learning_rate": 9.982589180787532e-07, - "logits/chosen": 0.003890749765560031, - "logits/rejected": 0.0950247123837471, - "logps/chosen": -1.1939213275909424, - "logps/rejected": -1.3700560331344604, - "loss": 1.9896, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.1939213275909424, - "rewards/margins": 0.1761348396539688, - "rewards/rejected": -1.3700560331344604, - "semantic_entropy": 0.8320623636245728, + "logits/chosen": -0.00870530866086483, + "logits/rejected": 0.07581108063459396, + "logps/chosen": -1.1800092458724976, + "logps/rejected": -1.3516024351119995, + "loss": 1.5623, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.1800092458724976, + "rewards/margins": 0.1715930700302124, + "rewards/rejected": -1.3516024351119995, "step": 695 }, { "epoch": 0.3746445893962201, - "grad_norm": 6.387595285459338, + "grad_norm": 6.2701986940644625, "learning_rate": 9.981266452066553e-07, - "logits/chosen": -0.1137634664773941, - "logits/rejected": 0.020167222246527672, - "logps/chosen": -1.3625977039337158, - "logps/rejected": -1.4447782039642334, - "loss": 2.1061, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3625977039337158, - "rewards/margins": 0.082180455327034, - "rewards/rejected": -1.4447782039642334, - "semantic_entropy": 0.7831142544746399, + "logits/chosen": -0.11826181411743164, + "logits/rejected": 0.009017865173518658, + "logps/chosen": -1.3536287546157837, + "logps/rejected": -1.4267635345458984, + "loss": 1.7099, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3536287546157837, + "rewards/margins": 0.07313470542430878, + "rewards/rejected": -1.4267635345458984, "step": 700 }, { "epoch": 0.3773206221776217, - "grad_norm": 6.265710436146083, + "grad_norm": 6.504609947969548, "learning_rate": 9.979895395076608e-07, - "logits/chosen": -0.07541519403457642, - "logits/rejected": 0.09938128292560577, - "logps/chosen": -1.316994071006775, - "logps/rejected": -1.4729902744293213, - "loss": 2.0786, + "logits/chosen": -0.08614318817853928, + "logits/rejected": 0.07920202612876892, + "logps/chosen": -1.3100945949554443, + "logps/rejected": -1.4507384300231934, + "loss": 1.6808, "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.316994071006775, - "rewards/margins": 0.15599612891674042, - "rewards/rejected": -1.4729902744293213, - "semantic_entropy": 0.7971211671829224, + "rewards/chosen": -1.3100945949554443, + "rewards/margins": 0.1406438946723938, + "rewards/rejected": -1.4507384300231934, "step": 705 }, { "epoch": 0.37999665495902324, - "grad_norm": 7.845912676948312, + "grad_norm": 7.971345593043526, "learning_rate": 9.9784760231197e-07, - "logits/chosen": 0.02958468720316887, - "logits/rejected": 0.12384209781885147, - "logps/chosen": -1.2575891017913818, - "logps/rejected": -1.391495704650879, - "loss": 2.0457, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2575891017913818, - "rewards/margins": 0.13390661776065826, - "rewards/rejected": -1.391495704650879, - "semantic_entropy": 0.8254655003547668, + "logits/chosen": 0.062352318316698074, + "logits/rejected": 0.1580919325351715, + "logps/chosen": -1.2454324960708618, + "logps/rejected": -1.3666789531707764, + "loss": 1.6294, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2454324960708618, + "rewards/margins": 0.12124643474817276, + "rewards/rejected": -1.3666789531707764, "step": 710 }, { "epoch": 0.38267268774042484, - "grad_norm": 8.126446651554827, + "grad_norm": 8.56313993808138, "learning_rate": 9.97700834996658e-07, - "logits/chosen": -0.015873288735747337, - "logits/rejected": 0.1502937376499176, - "logps/chosen": -1.3387916088104248, - "logps/rejected": -1.4656779766082764, - "loss": 2.1058, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3387916088104248, - "rewards/margins": 0.12688623368740082, - "rewards/rejected": -1.4656779766082764, - "semantic_entropy": 0.790082573890686, + "logits/chosen": -0.025264907628297806, + "logits/rejected": 0.13151118159294128, + "logps/chosen": -1.323120355606079, + "logps/rejected": -1.4409905672073364, + "loss": 1.703, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.323120355606079, + "rewards/margins": 0.11787022650241852, + "rewards/rejected": -1.4409905672073364, "step": 715 }, { "epoch": 0.3853487205218264, - "grad_norm": 7.634151426689379, + "grad_norm": 7.449686788662097, "learning_rate": 9.97549238985662e-07, - "logits/chosen": 0.032521337270736694, - "logits/rejected": 0.21769611537456512, - "logps/chosen": -1.3835394382476807, - "logps/rejected": -1.4489504098892212, - "loss": 2.1219, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3835394382476807, - "rewards/margins": 0.0654108077287674, - "rewards/rejected": -1.4489504098892212, - "semantic_entropy": 0.7764595150947571, + "logits/chosen": 0.03104434348642826, + "logits/rejected": 0.20914196968078613, + "logps/chosen": -1.3684965372085571, + "logps/rejected": -1.4199926853179932, + "loss": 1.7275, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3684965372085571, + "rewards/margins": 0.05149605870246887, + "rewards/rejected": -1.4199926853179932, "step": 720 }, { "epoch": 0.38802475330322794, - "grad_norm": 9.067806855520102, + "grad_norm": 9.052793290445388, "learning_rate": 9.973928157497674e-07, - "logits/chosen": -0.07280922681093216, - "logits/rejected": 0.06471310555934906, - "logps/chosen": -1.2224928140640259, - "logps/rejected": -1.479270339012146, - "loss": 2.0026, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2224928140640259, - "rewards/margins": 0.2567773759365082, - "rewards/rejected": -1.479270339012146, - "semantic_entropy": 0.8193053007125854, + "logits/chosen": -0.07327831536531448, + "logits/rejected": 0.05761227756738663, + "logps/chosen": -1.2101671695709229, + "logps/rejected": -1.4562008380889893, + "loss": 1.5896, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2101671695709229, + "rewards/margins": 0.24603363871574402, + "rewards/rejected": -1.4562008380889893, "step": 725 }, { "epoch": 0.39070078608462955, - "grad_norm": 9.424853255347328, + "grad_norm": 9.535694817556166, "learning_rate": 9.972315668065927e-07, - "logits/chosen": -0.11625852435827255, - "logits/rejected": 0.04367467388510704, - "logps/chosen": -1.3176000118255615, - "logps/rejected": -1.418208122253418, - "loss": 2.0845, + "logits/chosen": -0.11410193145275116, + "logits/rejected": 0.04373040795326233, + "logps/chosen": -1.3051345348358154, + "logps/rejected": -1.393215298652649, + "loss": 1.6759, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3176000118255615, - "rewards/margins": 0.10060807317495346, - "rewards/rejected": -1.418208122253418, - "semantic_entropy": 0.8038949966430664, + "rewards/chosen": -1.3051345348358154, + "rewards/margins": 0.08808077871799469, + "rewards/rejected": -1.393215298652649, "step": 730 }, { "epoch": 0.3933768188660311, - "grad_norm": 6.511587191195001, + "grad_norm": 6.620606779481893, "learning_rate": 9.97065493720576e-07, - "logits/chosen": -0.06525146961212158, - "logits/rejected": 0.03639759123325348, - "logps/chosen": -1.338934063911438, - "logps/rejected": -1.4079347848892212, - "loss": 2.0846, + "logits/chosen": -0.09182188659906387, + "logits/rejected": 0.0029895335901528597, + "logps/chosen": -1.3273653984069824, + "logps/rejected": -1.3844788074493408, + "loss": 1.6854, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.338934063911438, - "rewards/margins": 0.06900075078010559, - "rewards/rejected": -1.4079347848892212, - "semantic_entropy": 0.7866891622543335, + "rewards/chosen": -1.3273653984069824, + "rewards/margins": 0.05711333081126213, + "rewards/rejected": -1.3844788074493408, "step": 735 }, { "epoch": 0.3960528516474327, - "grad_norm": 10.655773370348717, + "grad_norm": 10.697652848841608, "learning_rate": 9.968945981029594e-07, - "logits/chosen": -0.06840531527996063, - "logits/rejected": 0.09909432381391525, - "logps/chosen": -1.3917211294174194, - "logps/rejected": -1.4396966695785522, - "loss": 2.1392, - "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": -1.3917211294174194, - "rewards/margins": 0.04797549173235893, - "rewards/rejected": -1.4396966695785522, - "semantic_entropy": 0.7690067887306213, + "logits/chosen": -0.04950173944234848, + "logits/rejected": 0.1153801828622818, + "logps/chosen": -1.3779503107070923, + "logps/rejected": -1.4143047332763672, + "loss": 1.748, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.3779503107070923, + "rewards/margins": 0.03635428473353386, + "rewards/rejected": -1.4143047332763672, "step": 740 }, { "epoch": 0.39872888442883425, - "grad_norm": 7.2486898664496655, + "grad_norm": 7.083780993102439, "learning_rate": 9.967188816117726e-07, - "logits/chosen": 0.03993947058916092, - "logits/rejected": 0.1131952553987503, - "logps/chosen": -1.3727612495422363, - "logps/rejected": -1.5764000415802002, - "loss": 2.0991, + "logits/chosen": 0.0645158439874649, + "logits/rejected": 0.1322767436504364, + "logps/chosen": -1.3595610857009888, + "logps/rejected": -1.5478967428207397, + "loss": 1.7024, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3727612495422363, - "rewards/margins": 0.20363883674144745, - "rewards/rejected": -1.5764000415802002, - "semantic_entropy": 0.7843629717826843, + "rewards/chosen": -1.3595610857009888, + "rewards/margins": 0.18833574652671814, + "rewards/rejected": -1.5478967428207397, "step": 745 }, { "epoch": 0.4014049172102358, - "grad_norm": 6.977794397353747, + "grad_norm": 7.016282897067713, "learning_rate": 9.965383459518179e-07, - "logits/chosen": -0.020700041204690933, - "logits/rejected": 0.14366576075553894, - "logps/chosen": -1.3073142766952515, - "logps/rejected": -1.472053050994873, - "loss": 2.0539, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3073142766952515, - "rewards/margins": 0.16473881900310516, - "rewards/rejected": -1.472053050994873, - "semantic_entropy": 0.8046748042106628, + "logits/chosen": -0.014546563848853111, + "logits/rejected": 0.14593543112277985, + "logps/chosen": -1.295961618423462, + "logps/rejected": -1.4461864233016968, + "loss": 1.6495, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.295961618423462, + "rewards/margins": 0.15022492408752441, + "rewards/rejected": -1.4461864233016968, "step": 750 }, { "epoch": 0.4040809499916374, - "grad_norm": 5.541992821131799, + "grad_norm": 5.697543818514006, "learning_rate": 9.963529928746533e-07, - "logits/chosen": 0.03654784709215164, - "logits/rejected": 0.1720220446586609, - "logps/chosen": -1.3389575481414795, - "logps/rejected": -1.428675889968872, - "loss": 2.0955, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3389575481414795, - "rewards/margins": 0.08971838653087616, - "rewards/rejected": -1.428675889968872, - "semantic_entropy": 0.8020346760749817, + "logits/chosen": 0.015908140689134598, + "logits/rejected": 0.1339842528104782, + "logps/chosen": -1.3269898891448975, + "logps/rejected": -1.4011058807373047, + "loss": 1.6926, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3269898891448975, + "rewards/margins": 0.07411597669124603, + "rewards/rejected": -1.4011058807373047, "step": 755 }, { "epoch": 0.40675698277303896, - "grad_norm": 5.428036718467446, + "grad_norm": 5.948048365747885, "learning_rate": 9.961628241785746e-07, - "logits/chosen": -0.07116580754518509, - "logits/rejected": -0.0020129233598709106, - "logps/chosen": -1.3516285419464111, - "logps/rejected": -1.5027011632919312, - "loss": 2.0824, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3516285419464111, - "rewards/margins": 0.15107260644435883, - "rewards/rejected": -1.5027011632919312, - "semantic_entropy": 0.7827638387680054, + "logits/chosen": -0.06778191030025482, + "logits/rejected": -0.0030374140478670597, + "logps/chosen": -1.3420777320861816, + "logps/rejected": -1.4758747816085815, + "loss": 1.6879, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3420777320861816, + "rewards/margins": 0.13379700481891632, + "rewards/rejected": -1.4758747816085815, "step": 760 }, { "epoch": 0.40943301555444056, - "grad_norm": 14.644131619246702, + "grad_norm": 14.268347903411364, "learning_rate": 9.959678417085998e-07, - "logits/chosen": -0.04214087873697281, - "logits/rejected": 0.047335632145404816, - "logps/chosen": -1.315185308456421, - "logps/rejected": -1.4251818656921387, - "loss": 2.0729, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.315185308456421, - "rewards/margins": 0.10999642312526703, - "rewards/rejected": -1.4251818656921387, - "semantic_entropy": 0.8101986050605774, + "logits/chosen": -0.037238337099552155, + "logits/rejected": 0.049766041338443756, + "logps/chosen": -1.2987943887710571, + "logps/rejected": -1.3975147008895874, + "loss": 1.6594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2987943887710571, + "rewards/margins": 0.09872031211853027, + "rewards/rejected": -1.3975147008895874, "step": 765 }, { "epoch": 0.4121090483358421, - "grad_norm": 7.817281275347967, + "grad_norm": 7.701723969333382, "learning_rate": 9.957680473564493e-07, - "logits/chosen": 0.05925404280424118, - "logits/rejected": 0.18113121390342712, - "logps/chosen": -1.2680829763412476, - "logps/rejected": -1.492113471031189, - "loss": 2.018, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2680829763412476, - "rewards/margins": 0.22403042018413544, - "rewards/rejected": -1.492113471031189, - "semantic_entropy": 0.8082300424575806, + "logits/chosen": 0.058088578283786774, + "logits/rejected": 0.17226721346378326, + "logps/chosen": -1.2571780681610107, + "logps/rejected": -1.4605745077133179, + "loss": 1.6128, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2571780681610107, + "rewards/margins": 0.20339655876159668, + "rewards/rejected": -1.4605745077133179, "step": 770 }, { "epoch": 0.41478508111724366, - "grad_norm": 8.006770491529315, + "grad_norm": 9.191080049332337, "learning_rate": 9.95563443060529e-07, - "logits/chosen": -0.08509613573551178, - "logits/rejected": 0.08377712965011597, - "logps/chosen": -1.3459033966064453, - "logps/rejected": -1.5283472537994385, - "loss": 2.0884, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3459033966064453, - "rewards/margins": 0.18244390189647675, - "rewards/rejected": -1.5283472537994385, - "semantic_entropy": 0.7789994478225708, + "logits/chosen": -0.08166106045246124, + "logits/rejected": 0.08174169063568115, + "logps/chosen": -1.3375778198242188, + "logps/rejected": -1.5057597160339355, + "loss": 1.6961, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3375778198242188, + "rewards/margins": 0.16818185150623322, + "rewards/rejected": -1.5057597160339355, "step": 775 }, { "epoch": 0.41746111389864526, - "grad_norm": 6.993465817399214, + "grad_norm": 7.203825122320527, "learning_rate": 9.95354030805911e-07, - "logits/chosen": -0.13927313685417175, - "logits/rejected": 0.0057820407673716545, - "logps/chosen": -1.2701890468597412, - "logps/rejected": -1.435626745223999, - "loss": 2.0319, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2701890468597412, - "rewards/margins": 0.16543766856193542, - "rewards/rejected": -1.435626745223999, - "semantic_entropy": 0.8075081706047058, + "logits/chosen": -0.13217060267925262, + "logits/rejected": 0.009336207062005997, + "logps/chosen": -1.261022925376892, + "logps/rejected": -1.4133572578430176, + "loss": 1.6253, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.261022925376892, + "rewards/margins": 0.1523343175649643, + "rewards/rejected": -1.4133572578430176, "step": 780 }, { "epoch": 0.4201371466800468, - "grad_norm": 8.559119399764109, + "grad_norm": 8.499021010654502, "learning_rate": 9.951398126243133e-07, - "logits/chosen": 0.013558772392570972, - "logits/rejected": 0.13729970157146454, - "logps/chosen": -1.2590985298156738, - "logps/rejected": -1.4598546028137207, - "loss": 2.0322, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2590985298156738, - "rewards/margins": 0.2007559984922409, - "rewards/rejected": -1.4598546028137207, - "semantic_entropy": 0.8115525245666504, + "logits/chosen": 0.024778928607702255, + "logits/rejected": 0.14496475458145142, + "logps/chosen": -1.248429298400879, + "logps/rejected": -1.434415578842163, + "loss": 1.6252, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.248429298400879, + "rewards/margins": 0.18598642945289612, + "rewards/rejected": -1.434415578842163, "step": 785 }, { "epoch": 0.4228131794614484, - "grad_norm": 7.447630204541813, + "grad_norm": 6.789172055949601, "learning_rate": 9.94920790594082e-07, - "logits/chosen": -0.057849399745464325, - "logits/rejected": 0.06551474332809448, - "logps/chosen": -1.3101880550384521, - "logps/rejected": -1.3954278230667114, - "loss": 2.0839, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3101880550384521, - "rewards/margins": 0.08523982018232346, - "rewards/rejected": -1.3954278230667114, - "semantic_entropy": 0.8039585947990417, + "logits/chosen": -0.03894413262605667, + "logits/rejected": 0.08288483321666718, + "logps/chosen": -1.3003264665603638, + "logps/rejected": -1.3643810749053955, + "loss": 1.6826, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3003264665603638, + "rewards/margins": 0.0640547126531601, + "rewards/rejected": -1.3643810749053955, "step": 790 }, { "epoch": 0.42548921224284997, - "grad_norm": 6.021313817313798, + "grad_norm": 5.945139150631693, "learning_rate": 9.946969668401696e-07, - "logits/chosen": -0.07067342847585678, - "logits/rejected": 0.11503966152667999, - "logps/chosen": -1.2911334037780762, - "logps/rejected": -1.4451611042022705, - "loss": 2.0554, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2911334037780762, - "rewards/margins": 0.15402746200561523, - "rewards/rejected": -1.4451611042022705, - "semantic_entropy": 0.7996021509170532, + "logits/chosen": -0.05952250957489014, + "logits/rejected": 0.12411437183618546, + "logps/chosen": -1.2805763483047485, + "logps/rejected": -1.4107916355133057, + "loss": 1.6567, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2805763483047485, + "rewards/margins": 0.13021525740623474, + "rewards/rejected": -1.4107916355133057, "step": 795 }, { "epoch": 0.4281652450242516, - "grad_norm": 7.6340391184091345, + "grad_norm": 7.442932191018972, "learning_rate": 9.944683435341155e-07, - "logits/chosen": -0.03889727592468262, - "logits/rejected": 0.03886578604578972, - "logps/chosen": -1.2934906482696533, - "logps/rejected": -1.3591432571411133, - "loss": 2.0818, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2934906482696533, - "rewards/margins": 0.06565256416797638, - "rewards/rejected": -1.3591432571411133, - "semantic_entropy": 0.8230938911437988, + "logits/chosen": -0.013432202860713005, + "logits/rejected": 0.06116775795817375, + "logps/chosen": -1.2843835353851318, + "logps/rejected": -1.3335177898406982, + "loss": 1.6692, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2843835353851318, + "rewards/margins": 0.049134280532598495, + "rewards/rejected": -1.3335177898406982, "step": 800 }, { "epoch": 0.4281652450242516, - "eval_logits/chosen": 0.28142249584198, - "eval_logits/rejected": 0.3657718598842621, - "eval_logps/chosen": -1.3264150619506836, - "eval_logps/rejected": -1.4818079471588135, - "eval_loss": 2.0713980197906494, - "eval_rewards/accuracies": 0.5586053133010864, - "eval_rewards/chosen": -1.3264150619506836, - "eval_rewards/margins": 0.1553928405046463, - "eval_rewards/rejected": -1.4818079471588135, - "eval_runtime": 34.574, - "eval_samples_per_second": 38.902, - "eval_semantic_entropy": 0.7937625646591187, - "eval_steps_per_second": 9.747, + "eval_logits/chosen": 0.288900762796402, + "eval_logits/rejected": 0.3707810044288635, + "eval_logps/chosen": -1.3150991201400757, + "eval_logps/rejected": -1.4532443284988403, + "eval_loss": 1.6718252897262573, + "eval_rewards/accuracies": 0.5578634738922119, + "eval_rewards/chosen": -1.3150991201400757, + "eval_rewards/margins": 0.13814544677734375, + "eval_rewards/rejected": -1.4532443284988403, + "eval_runtime": 40.0611, + "eval_samples_per_second": 33.574, + "eval_steps_per_second": 8.412, "step": 800 }, { "epoch": 0.4308412778056531, - "grad_norm": 6.792990665970197, + "grad_norm": 7.448454933590522, "learning_rate": 9.942349228940236e-07, - "logits/chosen": -0.06414131075143814, - "logits/rejected": 0.09192129969596863, - "logps/chosen": -1.3388340473175049, - "logps/rejected": -1.5097267627716064, - "loss": 2.0773, + "logits/chosen": -0.08388860523700714, + "logits/rejected": 0.05477358028292656, + "logps/chosen": -1.3262841701507568, + "logps/rejected": -1.4756443500518799, + "loss": 1.6797, "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3388340473175049, - "rewards/margins": 0.17089280486106873, - "rewards/rejected": -1.5097267627716064, - "semantic_entropy": 0.7890151143074036, + "rewards/chosen": -1.3262841701507568, + "rewards/margins": 0.14936020970344543, + "rewards/rejected": -1.4756443500518799, "step": 805 }, { "epoch": 0.43351731058705467, - "grad_norm": 8.231160151703877, + "grad_norm": 7.629956912163664, "learning_rate": 9.939967071845424e-07, - "logits/chosen": 0.04229467362165451, - "logits/rejected": 0.11417990922927856, - "logps/chosen": -1.2498180866241455, - "logps/rejected": -1.394517183303833, - "loss": 2.0305, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.2498180866241455, - "rewards/margins": 0.14469924569129944, - "rewards/rejected": -1.394517183303833, - "semantic_entropy": 0.8242220878601074, + "logits/chosen": 0.03200405091047287, + "logits/rejected": 0.10098972171545029, + "logps/chosen": -1.2373231649398804, + "logps/rejected": -1.3669945001602173, + "loss": 1.6151, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2373231649398804, + "rewards/margins": 0.1296711266040802, + "rewards/rejected": -1.3669945001602173, "step": 810 }, { "epoch": 0.4361933433684563, - "grad_norm": 8.264111773098524, + "grad_norm": 8.098240198326735, "learning_rate": 9.937536987168413e-07, - "logits/chosen": 0.01688297465443611, - "logits/rejected": 0.1425141841173172, - "logps/chosen": -1.2386586666107178, - "logps/rejected": -1.4612213373184204, - "loss": 1.9811, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2386586666107178, - "rewards/margins": 0.22256258130073547, - "rewards/rejected": -1.4612213373184204, - "semantic_entropy": 0.8069499731063843, + "logits/chosen": 0.03276928886771202, + "logits/rejected": 0.15491512417793274, + "logps/chosen": -1.2305742502212524, + "logps/rejected": -1.4300734996795654, + "loss": 1.5791, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2305742502212524, + "rewards/margins": 0.19949916005134583, + "rewards/rejected": -1.4300734996795654, "step": 815 }, { "epoch": 0.4388693761498578, - "grad_norm": 7.653356935759378, + "grad_norm": 7.768530788641451, "learning_rate": 9.935058998485896e-07, - "logits/chosen": 0.041509293019771576, - "logits/rejected": 0.08839239925146103, - "logps/chosen": -1.2882188558578491, - "logps/rejected": -1.4596892595291138, - "loss": 2.0518, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2882188558578491, - "rewards/margins": 0.17147038877010345, - "rewards/rejected": -1.4596892595291138, - "semantic_entropy": 0.8028401136398315, + "logits/chosen": 0.040918465703725815, + "logits/rejected": 0.08255477994680405, + "logps/chosen": -1.27720046043396, + "logps/rejected": -1.4338114261627197, + "loss": 1.6462, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.27720046043396, + "rewards/margins": 0.15661077201366425, + "rewards/rejected": -1.4338114261627197, "step": 820 }, { "epoch": 0.44154540893125943, - "grad_norm": 8.096203198152919, + "grad_norm": 7.776726581191855, "learning_rate": 9.932533129839333e-07, - "logits/chosen": -0.03431471437215805, - "logits/rejected": 0.0898442417383194, - "logps/chosen": -1.2290394306182861, - "logps/rejected": -1.338889241218567, - "loss": 2.0372, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.2290394306182861, - "rewards/margins": 0.10984987020492554, - "rewards/rejected": -1.338889241218567, - "semantic_entropy": 0.8230849504470825, + "logits/chosen": -0.03660018369555473, + "logits/rejected": 0.0787377804517746, + "logps/chosen": -1.2200651168823242, + "logps/rejected": -1.3064601421356201, + "loss": 1.6296, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.2200651168823242, + "rewards/margins": 0.08639508485794067, + "rewards/rejected": -1.3064601421356201, "step": 825 }, { "epoch": 0.444221441712661, - "grad_norm": 6.980870065250988, + "grad_norm": 7.024572269311969, "learning_rate": 9.929959405734711e-07, - "logits/chosen": 0.06439422070980072, - "logits/rejected": 0.22203806042671204, - "logps/chosen": -1.3431024551391602, - "logps/rejected": -1.4144035577774048, - "loss": 2.0952, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.3431024551391602, - "rewards/margins": 0.07130113244056702, - "rewards/rejected": -1.4144035577774048, - "semantic_entropy": 0.7883853912353516, + "logits/chosen": 0.05554322525858879, + "logits/rejected": 0.20837612450122833, + "logps/chosen": -1.3340895175933838, + "logps/rejected": -1.3898991346359253, + "loss": 1.7005, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3340895175933838, + "rewards/margins": 0.05580978840589523, + "rewards/rejected": -1.3898991346359253, "step": 830 }, { "epoch": 0.44689747449406253, - "grad_norm": 10.31936880093806, + "grad_norm": 10.69377124180876, "learning_rate": 9.927337851142314e-07, - "logits/chosen": 0.0007377028232440352, - "logits/rejected": 0.12605439126491547, - "logps/chosen": -1.258170247077942, - "logps/rejected": -1.3931313753128052, - "loss": 2.0665, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.258170247077942, - "rewards/margins": 0.13496126234531403, - "rewards/rejected": -1.3931313753128052, - "semantic_entropy": 0.8210614323616028, + "logits/chosen": 0.009267118759453297, + "logits/rejected": 0.13208474218845367, + "logps/chosen": -1.2435743808746338, + "logps/rejected": -1.3662970066070557, + "loss": 1.6503, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2435743808746338, + "rewards/margins": 0.12272258847951889, + "rewards/rejected": -1.3662970066070557, "step": 835 }, { "epoch": 0.44957350727546413, - "grad_norm": 8.318944088325118, + "grad_norm": 8.144607331637909, "learning_rate": 9.924668491496474e-07, - "logits/chosen": -0.0008976459503173828, - "logits/rejected": 0.1518554389476776, - "logps/chosen": -1.2993545532226562, - "logps/rejected": -1.4758427143096924, - "loss": 2.0728, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2993545532226562, - "rewards/margins": 0.17648813128471375, - "rewards/rejected": -1.4758427143096924, - "semantic_entropy": 0.8037001490592957, + "logits/chosen": -0.006433224771171808, + "logits/rejected": 0.13204315304756165, + "logps/chosen": -1.2838013172149658, + "logps/rejected": -1.4483354091644287, + "loss": 1.6652, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2838013172149658, + "rewards/margins": 0.16453418135643005, + "rewards/rejected": -1.4483354091644287, "step": 840 }, { "epoch": 0.4522495400568657, - "grad_norm": 4.966451864110367, + "grad_norm": 5.55810909000904, "learning_rate": 9.92195135269533e-07, - "logits/chosen": 0.04422593489289284, - "logits/rejected": 0.10922133922576904, - "logps/chosen": -1.303062081336975, - "logps/rejected": -1.3727529048919678, - "loss": 2.0961, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.303062081336975, - "rewards/margins": 0.06969080865383148, - "rewards/rejected": -1.3727529048919678, - "semantic_entropy": 0.8001149892807007, + "logits/chosen": 0.047361455857753754, + "logits/rejected": 0.10631246864795685, + "logps/chosen": -1.2872891426086426, + "logps/rejected": -1.3429710865020752, + "loss": 1.6884, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2872891426086426, + "rewards/margins": 0.05568184331059456, + "rewards/rejected": -1.3429710865020752, "step": 845 }, { "epoch": 0.4549255728382673, - "grad_norm": 9.661166270747207, + "grad_norm": 9.11402763497159, "learning_rate": 9.919186461100574e-07, - "logits/chosen": 0.014062756672501564, - "logits/rejected": 0.07492586970329285, - "logps/chosen": -1.265263557434082, - "logps/rejected": -1.409030795097351, - "loss": 2.0278, + "logits/chosen": 0.01465144194662571, + "logits/rejected": 0.06983944028615952, + "logps/chosen": -1.2490416765213013, + "logps/rejected": -1.3807716369628906, + "loss": 1.612, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.265263557434082, - "rewards/margins": 0.1437673270702362, - "rewards/rejected": -1.409030795097351, - "semantic_entropy": 0.8205739259719849, + "rewards/chosen": -1.2490416765213013, + "rewards/margins": 0.13172993063926697, + "rewards/rejected": -1.3807716369628906, "step": 850 }, { "epoch": 0.45760160561966884, - "grad_norm": 6.229750804352298, + "grad_norm": 6.438125866962163, "learning_rate": 9.9163738435372e-07, - "logits/chosen": -0.012746746651828289, - "logits/rejected": 0.127157062292099, - "logps/chosen": -1.320353388786316, - "logps/rejected": -1.5081716775894165, - "loss": 2.0957, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.320353388786316, - "rewards/margins": 0.18781821429729462, - "rewards/rejected": -1.5081716775894165, - "semantic_entropy": 0.7857638597488403, + "logits/chosen": -0.03615623712539673, + "logits/rejected": 0.09094985574483871, + "logps/chosen": -1.3045870065689087, + "logps/rejected": -1.4764373302459717, + "loss": 1.6946, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3045870065689087, + "rewards/margins": 0.17185047268867493, + "rewards/rejected": -1.4764373302459717, "step": 855 }, { "epoch": 0.4602776384010704, - "grad_norm": 5.359816930546816, + "grad_norm": 5.863387820990229, "learning_rate": 9.913513527293234e-07, - "logits/chosen": -0.05882059782743454, - "logits/rejected": 0.0931854099035263, - "logps/chosen": -1.3550149202346802, - "logps/rejected": -1.555934190750122, - "loss": 2.0702, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3550149202346802, - "rewards/margins": 0.20091931521892548, - "rewards/rejected": -1.555934190750122, - "semantic_entropy": 0.7748786807060242, + "logits/chosen": -0.06574797630310059, + "logits/rejected": 0.07671912014484406, + "logps/chosen": -1.3422025442123413, + "logps/rejected": -1.524712324142456, + "loss": 1.6772, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3422025442123413, + "rewards/margins": 0.18250969052314758, + "rewards/rejected": -1.524712324142456, "step": 860 }, { "epoch": 0.462953671182472, - "grad_norm": 11.204908787425222, + "grad_norm": 10.92852653610282, "learning_rate": 9.910605540119474e-07, - "logits/chosen": -0.01107565127313137, - "logits/rejected": 0.07848058640956879, - "logps/chosen": -1.257359266281128, - "logps/rejected": -1.4614964723587036, - "loss": 2.0473, + "logits/chosen": 0.004780948162078857, + "logits/rejected": 0.09096747636795044, + "logps/chosen": -1.2448174953460693, + "logps/rejected": -1.4256184101104736, + "loss": 1.6361, "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.257359266281128, - "rewards/margins": 0.2041374146938324, - "rewards/rejected": -1.4614964723587036, - "semantic_entropy": 0.8138553500175476, + "rewards/chosen": -1.2448174953460693, + "rewards/margins": 0.18080079555511475, + "rewards/rejected": -1.4256184101104736, "step": 865 }, { "epoch": 0.46562970396387354, - "grad_norm": 7.420345286910088, + "grad_norm": 7.087121964770353, "learning_rate": 9.907649910229227e-07, - "logits/chosen": -0.10609734058380127, - "logits/rejected": 0.13778510689735413, - "logps/chosen": -1.3095638751983643, - "logps/rejected": -1.429024338722229, - "loss": 2.0781, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3095638751983643, - "rewards/margins": 0.11946046352386475, - "rewards/rejected": -1.429024338722229, - "semantic_entropy": 0.8033844232559204, + "logits/chosen": -0.08116715401411057, + "logits/rejected": 0.15623739361763, + "logps/chosen": -1.3013743162155151, + "logps/rejected": -1.3912999629974365, + "loss": 1.6759, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3013743162155151, + "rewards/margins": 0.08992559462785721, + "rewards/rejected": -1.3912999629974365, "step": 870 }, { "epoch": 0.46830573674527515, - "grad_norm": 6.920647449331957, + "grad_norm": 7.544115225961909, "learning_rate": 9.90464666629803e-07, - "logits/chosen": 0.015931006520986557, - "logits/rejected": 0.08639371395111084, - "logps/chosen": -1.3419044017791748, - "logps/rejected": -1.471523642539978, - "loss": 2.1269, + "logits/chosen": 0.01658749394118786, + "logits/rejected": 0.08199294656515121, + "logps/chosen": -1.3255009651184082, + "logps/rejected": -1.4433691501617432, + "loss": 1.7238, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3419044017791748, - "rewards/margins": 0.1296190768480301, - "rewards/rejected": -1.471523642539978, - "semantic_entropy": 0.7931000590324402, + "rewards/chosen": -1.3255009651184082, + "rewards/margins": 0.1178680881857872, + "rewards/rejected": -1.4433691501617432, "step": 875 }, { "epoch": 0.4709817695266767, - "grad_norm": 6.174422720488045, + "grad_norm": 6.029370913132447, "learning_rate": 9.901595837463363e-07, - "logits/chosen": 0.028076793998479843, - "logits/rejected": 0.18086400628089905, - "logps/chosen": -1.390434980392456, - "logps/rejected": -1.5196599960327148, - "loss": 2.1118, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.390434980392456, - "rewards/margins": 0.12922483682632446, - "rewards/rejected": -1.5196599960327148, - "semantic_entropy": 0.7693473696708679, + "logits/chosen": 0.019355863332748413, + "logits/rejected": 0.16283582150936127, + "logps/chosen": -1.3766533136367798, + "logps/rejected": -1.4846994876861572, + "loss": 1.7245, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3766533136367798, + "rewards/margins": 0.10804629325866699, + "rewards/rejected": -1.4846994876861572, "step": 880 }, { "epoch": 0.47365780230807825, - "grad_norm": 8.745523774867733, + "grad_norm": 8.629139935459667, "learning_rate": 9.898497453324384e-07, - "logits/chosen": -0.04087982326745987, - "logits/rejected": 0.0349409282207489, - "logps/chosen": -1.280450463294983, - "logps/rejected": -1.4879987239837646, - "loss": 2.0342, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.280450463294983, - "rewards/margins": 0.20754830539226532, - "rewards/rejected": -1.4879987239837646, - "semantic_entropy": 0.8025510907173157, + "logits/chosen": -0.060194194316864014, + "logits/rejected": 0.013257995247840881, + "logps/chosen": -1.269682765007019, + "logps/rejected": -1.453184723854065, + "loss": 1.6313, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.269682765007019, + "rewards/margins": 0.18350182473659515, + "rewards/rejected": -1.453184723854065, "step": 885 }, { "epoch": 0.47633383508947985, - "grad_norm": 7.976852885294416, + "grad_norm": 7.777444725456523, "learning_rate": 9.895351543941628e-07, - "logits/chosen": -0.15843790769577026, - "logits/rejected": -0.04290686175227165, - "logps/chosen": -1.3285924196243286, - "logps/rejected": -1.4766861200332642, - "loss": 2.0804, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3285924196243286, - "rewards/margins": 0.14809374511241913, - "rewards/rejected": -1.4766861200332642, - "semantic_entropy": 0.7809569239616394, + "logits/chosen": -0.14868099987506866, + "logits/rejected": -0.03934749215841293, + "logps/chosen": -1.3191107511520386, + "logps/rejected": -1.4460171461105347, + "loss": 1.6891, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3191107511520386, + "rewards/margins": 0.1269063651561737, + "rewards/rejected": -1.4460171461105347, "step": 890 }, { "epoch": 0.4790098678708814, - "grad_norm": 8.131211934482984, + "grad_norm": 7.654458430179175, "learning_rate": 9.892158139836724e-07, - "logits/chosen": 0.05946514755487442, - "logits/rejected": 0.17120105028152466, - "logps/chosen": -1.2260630130767822, - "logps/rejected": -1.3467620611190796, - "loss": 2.0329, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2260630130767822, - "rewards/margins": 0.12069927155971527, - "rewards/rejected": -1.3467620611190796, - "semantic_entropy": 0.8377145528793335, + "logits/chosen": 0.049044668674468994, + "logits/rejected": 0.15491130948066711, + "logps/chosen": -1.2157189846038818, + "logps/rejected": -1.3172571659088135, + "loss": 1.6145, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2157189846038818, + "rewards/margins": 0.10153820365667343, + "rewards/rejected": -1.3172571659088135, "step": 895 }, { "epoch": 0.481685900652283, - "grad_norm": 7.01576110260866, + "grad_norm": 7.237952882977966, "learning_rate": 9.88891727199209e-07, - "logits/chosen": -0.07405869662761688, - "logits/rejected": -0.011778157204389572, - "logps/chosen": -1.2250185012817383, - "logps/rejected": -1.4775097370147705, - "loss": 1.9937, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2250185012817383, - "rewards/margins": 0.25249117612838745, - "rewards/rejected": -1.4775097370147705, - "semantic_entropy": 0.8225153684616089, + "logits/chosen": -0.07838686555624008, + "logits/rejected": -0.023043682798743248, + "logps/chosen": -1.208280086517334, + "logps/rejected": -1.4431705474853516, + "loss": 1.5745, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.208280086517334, + "rewards/margins": 0.23489037156105042, + "rewards/rejected": -1.4431705474853516, "step": 900 }, { "epoch": 0.48436193343368455, - "grad_norm": 7.600017516249421, + "grad_norm": 7.124123089263012, "learning_rate": 9.885628971850641e-07, - "logits/chosen": 0.029972026124596596, - "logits/rejected": 0.2067229300737381, - "logps/chosen": -1.2975659370422363, - "logps/rejected": -1.4992420673370361, - "loss": 2.0551, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2975659370422363, - "rewards/margins": 0.20167620480060577, - "rewards/rejected": -1.4992420673370361, - "semantic_entropy": 0.7884010076522827, + "logits/chosen": 0.04647911712527275, + "logits/rejected": 0.21987970173358917, + "logps/chosen": -1.2849841117858887, + "logps/rejected": -1.466862678527832, + "loss": 1.6517, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2849841117858887, + "rewards/margins": 0.18187864124774933, + "rewards/rejected": -1.466862678527832, "step": 905 }, { "epoch": 0.48703796621508616, - "grad_norm": 5.169626708622108, + "grad_norm": 5.099811653065707, "learning_rate": 9.882293271315481e-07, - "logits/chosen": -0.04081651568412781, - "logits/rejected": 0.06294569373130798, - "logps/chosen": -1.3483736515045166, - "logps/rejected": -1.4402248859405518, - "loss": 2.1274, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3483736515045166, - "rewards/margins": 0.09185134619474411, - "rewards/rejected": -1.4402248859405518, - "semantic_entropy": 0.7705464959144592, + "logits/chosen": -0.02049119397997856, + "logits/rejected": 0.08140434324741364, + "logps/chosen": -1.3364083766937256, + "logps/rejected": -1.409537434577942, + "loss": 1.7328, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3364083766937256, + "rewards/margins": 0.07312921434640884, + "rewards/rejected": -1.409537434577942, "step": 910 }, { "epoch": 0.4897139989964877, - "grad_norm": 6.381739732007515, + "grad_norm": 6.450546306377393, "learning_rate": 9.878910202749589e-07, - "logits/chosen": -0.0046323067508637905, - "logits/rejected": 0.16577397286891937, - "logps/chosen": -1.270925521850586, - "logps/rejected": -1.4246925115585327, - "loss": 2.038, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.270925521850586, - "rewards/margins": 0.15376701951026917, - "rewards/rejected": -1.4246925115585327, - "semantic_entropy": 0.815610408782959, + "logits/chosen": -0.01597698964178562, + "logits/rejected": 0.1421457976102829, + "logps/chosen": -1.2553143501281738, + "logps/rejected": -1.3936388492584229, + "loss": 1.6273, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2553143501281738, + "rewards/margins": 0.13832440972328186, + "rewards/rejected": -1.3936388492584229, "step": 915 }, { "epoch": 0.49239003177788926, - "grad_norm": 10.446352300340633, + "grad_norm": 9.921746807110368, "learning_rate": 9.875479798975512e-07, - "logits/chosen": 0.08526929467916489, - "logits/rejected": 0.20687708258628845, - "logps/chosen": -1.2325282096862793, - "logps/rejected": -1.399568796157837, - "loss": 2.0354, + "logits/chosen": 0.09873722493648529, + "logits/rejected": 0.2159980982542038, + "logps/chosen": -1.2219921350479126, + "logps/rejected": -1.366690993309021, + "loss": 1.6169, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2325282096862793, - "rewards/margins": 0.16704055666923523, - "rewards/rejected": -1.399568796157837, - "semantic_entropy": 0.8271034955978394, + "rewards/chosen": -1.2219921350479126, + "rewards/margins": 0.144698828458786, + "rewards/rejected": -1.366690993309021, "step": 920 }, { "epoch": 0.49506606455929086, - "grad_norm": 7.057665238411657, + "grad_norm": 7.07195948209997, "learning_rate": 9.87200209327504e-07, - "logits/chosen": -0.04000019282102585, - "logits/rejected": 0.11704482138156891, - "logps/chosen": -1.3189841508865356, - "logps/rejected": -1.3870525360107422, - "loss": 2.076, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3189841508865356, - "rewards/margins": 0.06806856393814087, - "rewards/rejected": -1.3870525360107422, - "semantic_entropy": 0.802047848701477, + "logits/chosen": -0.04622996598482132, + "logits/rejected": 0.09983499348163605, + "logps/chosen": -1.3058781623840332, + "logps/rejected": -1.3538609743118286, + "loss": 1.6725, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3058781623840332, + "rewards/margins": 0.04798286780714989, + "rewards/rejected": -1.3538609743118286, "step": 925 }, { "epoch": 0.4977420973406924, - "grad_norm": 11.790706049371428, + "grad_norm": 11.693390251271618, "learning_rate": 9.868477119388894e-07, - "logits/chosen": -0.05107710883021355, - "logits/rejected": 0.05921541526913643, - "logps/chosen": -1.2975822687149048, - "logps/rejected": -1.510721206665039, - "loss": 2.0754, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2975822687149048, - "rewards/margins": 0.21313898265361786, - "rewards/rejected": -1.510721206665039, - "semantic_entropy": 0.7891716361045837, + "logits/chosen": -0.03761199861764908, + "logits/rejected": 0.06936880201101303, + "logps/chosen": -1.2870805263519287, + "logps/rejected": -1.4754420518875122, + "loss": 1.6781, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2870805263519287, + "rewards/margins": 0.1883614957332611, + "rewards/rejected": -1.4754420518875122, "step": 930 }, { "epoch": 0.500418130122094, - "grad_norm": 9.266305959833732, + "grad_norm": 8.89968298486334, "learning_rate": 9.864904911516383e-07, - "logits/chosen": 0.03296453505754471, - "logits/rejected": 0.07001104205846786, - "logps/chosen": -1.225716233253479, - "logps/rejected": -1.433119773864746, - "loss": 2.0381, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.225716233253479, - "rewards/margins": 0.20740346610546112, - "rewards/rejected": -1.433119773864746, - "semantic_entropy": 0.8156035542488098, + "logits/chosen": 0.03245195001363754, + "logits/rejected": 0.06252043694257736, + "logps/chosen": -1.2151482105255127, + "logps/rejected": -1.4019325971603394, + "loss": 1.6228, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2151482105255127, + "rewards/margins": 0.18678443133831024, + "rewards/rejected": -1.4019325971603394, "step": 935 }, { "epoch": 0.5030941629034956, - "grad_norm": 8.080697489828468, + "grad_norm": 8.443843103780504, "learning_rate": 9.861285504315084e-07, - "logits/chosen": 0.0014633402461186051, - "logits/rejected": 0.10632189363241196, - "logps/chosen": -1.2933341264724731, - "logps/rejected": -1.3772052526474, - "loss": 2.0787, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2933341264724731, - "rewards/margins": 0.08387112617492676, - "rewards/rejected": -1.3772052526474, - "semantic_entropy": 0.8132776021957397, + "logits/chosen": -0.024701697751879692, + "logits/rejected": 0.07754331827163696, + "logps/chosen": -1.2799854278564453, + "logps/rejected": -1.3430149555206299, + "loss": 1.6659, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2799854278564453, + "rewards/margins": 0.06302952766418457, + "rewards/rejected": -1.3430149555206299, "step": 940 }, { "epoch": 0.5057701956848971, - "grad_norm": 7.401792535316144, + "grad_norm": 7.616171348420136, "learning_rate": 9.857618932900502e-07, - "logits/chosen": -0.028624827042222023, - "logits/rejected": 0.09492968767881393, - "logps/chosen": -1.2746442556381226, - "logps/rejected": -1.4527875185012817, - "loss": 2.0347, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2746442556381226, - "rewards/margins": 0.1781432330608368, - "rewards/rejected": -1.4527875185012817, - "semantic_entropy": 0.8073099255561829, + "logits/chosen": -0.036475539207458496, + "logits/rejected": 0.07666192948818207, + "logps/chosen": -1.2560824155807495, + "logps/rejected": -1.4120190143585205, + "loss": 1.6173, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2560824155807495, + "rewards/margins": 0.15593667328357697, + "rewards/rejected": -1.4120190143585205, "step": 945 }, { "epoch": 0.5084462284662987, - "grad_norm": 6.120494404082912, + "grad_norm": 6.162783467034935, "learning_rate": 9.853905232845727e-07, - "logits/chosen": -0.014512471854686737, - "logits/rejected": 0.14616458117961884, - "logps/chosen": -1.3691775798797607, - "logps/rejected": -1.420702338218689, - "loss": 2.1453, - "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": -1.3691775798797607, - "rewards/margins": 0.051524870097637177, - "rewards/rejected": -1.420702338218689, - "semantic_entropy": 0.784176230430603, + "logits/chosen": -0.02106516622006893, + "logits/rejected": 0.12846365571022034, + "logps/chosen": -1.3557555675506592, + "logps/rejected": -1.384013295173645, + "loss": 1.749, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.3557555675506592, + "rewards/margins": 0.028257649391889572, + "rewards/rejected": -1.384013295173645, "step": 950 }, { "epoch": 0.5111222612477003, - "grad_norm": 5.814071967296598, + "grad_norm": 5.876016724260786, "learning_rate": 9.850144440181095e-07, - "logits/chosen": -0.013354201801121235, - "logits/rejected": 0.19118373095989227, - "logps/chosen": -1.353991985321045, - "logps/rejected": -1.4249238967895508, - "loss": 2.1352, + "logits/chosen": 0.01717526838183403, + "logits/rejected": 0.21609918773174286, + "logps/chosen": -1.3414275646209717, + "logps/rejected": -1.393541932106018, + "loss": 1.7376, "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.353991985321045, - "rewards/margins": 0.07093212008476257, - "rewards/rejected": -1.4249238967895508, - "semantic_entropy": 0.792451024055481, + "rewards/chosen": -1.3414275646209717, + "rewards/margins": 0.05211412161588669, + "rewards/rejected": -1.393541932106018, "step": 955 }, { "epoch": 0.5137982940291018, - "grad_norm": 6.959793634999091, + "grad_norm": 7.413105057921761, "learning_rate": 9.846336591393832e-07, - "logits/chosen": -0.039055611938238144, - "logits/rejected": 0.09675654023885727, - "logps/chosen": -1.3108570575714111, - "logps/rejected": -1.4037373065948486, - "loss": 2.0793, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3108570575714111, - "rewards/margins": 0.09288022667169571, - "rewards/rejected": -1.4037373065948486, - "semantic_entropy": 0.8116083145141602, + "logits/chosen": -0.042240239679813385, + "logits/rejected": 0.08787455409765244, + "logps/chosen": -1.2983901500701904, + "logps/rejected": -1.37651526927948, + "loss": 1.67, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2983901500701904, + "rewards/margins": 0.0781250074505806, + "rewards/rejected": -1.37651526927948, "step": 960 }, { "epoch": 0.5164743268105034, - "grad_norm": 6.933208594532436, + "grad_norm": 7.210572778613, "learning_rate": 9.842481723427704e-07, - "logits/chosen": 0.037173621356487274, - "logits/rejected": 0.02412385307252407, - "logps/chosen": -1.3575502634048462, - "logps/rejected": -1.5369902849197388, - "loss": 2.0761, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3575502634048462, - "rewards/margins": 0.1794399619102478, - "rewards/rejected": -1.5369902849197388, - "semantic_entropy": 0.7766879796981812, + "logits/chosen": 0.024538477882742882, + "logits/rejected": 0.007992692291736603, + "logps/chosen": -1.3409980535507202, + "logps/rejected": -1.4976381063461304, + "loss": 1.6831, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3409980535507202, + "rewards/margins": 0.15664002299308777, + "rewards/rejected": -1.4976381063461304, "step": 965 }, { "epoch": 0.519150359591905, - "grad_norm": 14.792974650803629, + "grad_norm": 14.130211133969373, "learning_rate": 9.838579873682658e-07, - "logits/chosen": 0.051386892795562744, - "logits/rejected": 0.05269744247198105, - "logps/chosen": -1.238379716873169, - "logps/rejected": -1.3489514589309692, - "loss": 2.0761, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.238379716873169, - "rewards/margins": 0.1105717197060585, - "rewards/rejected": -1.3489514589309692, - "semantic_entropy": 0.8146070241928101, + "logits/chosen": 0.05639838054776192, + "logits/rejected": 0.05497770383954048, + "logps/chosen": -1.2233648300170898, + "logps/rejected": -1.3103077411651611, + "loss": 1.661, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2233648300170898, + "rewards/margins": 0.08694292604923248, + "rewards/rejected": -1.3103077411651611, "step": 970 }, { "epoch": 0.5218263923733065, - "grad_norm": 5.9146769092326315, + "grad_norm": 6.923884257166216, "learning_rate": 9.834631080014457e-07, - "logits/chosen": -0.11935378611087799, - "logits/rejected": 0.04193400591611862, - "logps/chosen": -1.3102973699569702, - "logps/rejected": -1.422062873840332, - "loss": 2.0691, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3102973699569702, - "rewards/margins": 0.11176545917987823, - "rewards/rejected": -1.422062873840332, - "semantic_entropy": 0.7963775396347046, + "logits/chosen": -0.105961874127388, + "logits/rejected": 0.04743831977248192, + "logps/chosen": -1.293992042541504, + "logps/rejected": -1.3859124183654785, + "loss": 1.6687, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.293992042541504, + "rewards/margins": 0.0919203907251358, + "rewards/rejected": -1.3859124183654785, "step": 975 }, { "epoch": 0.5245024251547081, - "grad_norm": 12.142848553242148, + "grad_norm": 12.316637061932141, "learning_rate": 9.830635380734312e-07, - "logits/chosen": -0.10059794038534164, - "logits/rejected": 0.08547468483448029, - "logps/chosen": -1.3487987518310547, - "logps/rejected": -1.466112732887268, - "loss": 2.1156, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3487987518310547, - "rewards/margins": 0.11731388419866562, - "rewards/rejected": -1.466112732887268, - "semantic_entropy": 0.7900714874267578, + "logits/chosen": -0.1073160171508789, + "logits/rejected": 0.06393261253833771, + "logps/chosen": -1.3275487422943115, + "logps/rejected": -1.4250518083572388, + "loss": 1.7113, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3275487422943115, + "rewards/margins": 0.09750307351350784, + "rewards/rejected": -1.4250518083572388, "step": 980 }, { "epoch": 0.5271784579361097, - "grad_norm": 10.252170399799747, + "grad_norm": 9.038208075808697, "learning_rate": 9.826592814608517e-07, - "logits/chosen": 0.006975511554628611, - "logits/rejected": 0.18399174511432648, - "logps/chosen": -1.3139753341674805, - "logps/rejected": -1.4533307552337646, - "loss": 2.0657, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3139753341674805, - "rewards/margins": 0.13935549557209015, - "rewards/rejected": -1.4533307552337646, - "semantic_entropy": 0.791069507598877, + "logits/chosen": 0.007983547635376453, + "logits/rejected": 0.17008168995380402, + "logps/chosen": -1.2993818521499634, + "logps/rejected": -1.4190666675567627, + "loss": 1.6644, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2993818521499634, + "rewards/margins": 0.11968479305505753, + "rewards/rejected": -1.4190666675567627, "step": 985 }, { "epoch": 0.5298544907175113, - "grad_norm": 7.295258985513694, + "grad_norm": 7.199171931454207, "learning_rate": 9.822503420858067e-07, - "logits/chosen": 0.051158357411623, - "logits/rejected": 0.08825485408306122, - "logps/chosen": -1.186547040939331, - "logps/rejected": -1.4159901142120361, - "loss": 1.9773, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.186547040939331, - "rewards/margins": 0.2294432371854782, - "rewards/rejected": -1.4159901142120361, - "semantic_entropy": 0.8391534090042114, + "logits/chosen": 0.02937675081193447, + "logits/rejected": 0.06694046407938004, + "logps/chosen": -1.1760294437408447, + "logps/rejected": -1.3847202062606812, + "loss": 1.5567, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1760294437408447, + "rewards/margins": 0.20869088172912598, + "rewards/rejected": -1.3847202062606812, "step": 990 }, { "epoch": 0.5325305234989128, - "grad_norm": 7.682194736532205, + "grad_norm": 7.8135289022921, "learning_rate": 9.818367239158277e-07, - "logits/chosen": 0.06565249711275101, - "logits/rejected": 0.13260416686534882, - "logps/chosen": -1.3053300380706787, - "logps/rejected": -1.3484420776367188, - "loss": 2.1178, - "rewards/accuracies": 0.46875, - "rewards/chosen": -1.3053300380706787, - "rewards/margins": 0.04311198741197586, - "rewards/rejected": -1.3484420776367188, - "semantic_entropy": 0.8133613467216492, + "logits/chosen": 0.055761635303497314, + "logits/rejected": 0.11374112218618393, + "logps/chosen": -1.2928580045700073, + "logps/rejected": -1.311802864074707, + "loss": 1.7083, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.2928580045700073, + "rewards/margins": 0.01894478313624859, + "rewards/rejected": -1.311802864074707, "step": 995 }, { "epoch": 0.5352065562803144, - "grad_norm": 7.541848489315755, + "grad_norm": 7.405239214870988, "learning_rate": 9.8141843096384e-07, - "logits/chosen": 0.07169102132320404, - "logits/rejected": 0.17855951189994812, - "logps/chosen": -1.3297080993652344, - "logps/rejected": -1.4753717184066772, - "loss": 2.0654, + "logits/chosen": 0.05504922941327095, + "logits/rejected": 0.14741148054599762, + "logps/chosen": -1.3170385360717773, + "logps/rejected": -1.4345099925994873, + "loss": 1.6722, "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.3297080993652344, - "rewards/margins": 0.14566387236118317, - "rewards/rejected": -1.4753717184066772, - "semantic_entropy": 0.7911792993545532, + "rewards/chosen": -1.3170385360717773, + "rewards/margins": 0.11747147142887115, + "rewards/rejected": -1.4345099925994873, "step": 1000 }, { "epoch": 0.537882589061716, - "grad_norm": 9.710505906410269, + "grad_norm": 9.441656068563447, "learning_rate": 9.809954672881237e-07, - "logits/chosen": 0.025641867890954018, - "logits/rejected": 0.18236692249774933, - "logps/chosen": -1.336159586906433, - "logps/rejected": -1.445340871810913, - "loss": 2.1067, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.336159586906433, - "rewards/margins": 0.10918121039867401, - "rewards/rejected": -1.445340871810913, - "semantic_entropy": 0.7793572545051575, + "logits/chosen": 0.028905656188726425, + "logits/rejected": 0.17964079976081848, + "logps/chosen": -1.317063808441162, + "logps/rejected": -1.397226333618164, + "loss": 1.7064, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.317063808441162, + "rewards/margins": 0.08016278594732285, + "rewards/rejected": -1.397226333618164, "step": 1005 }, { "epoch": 0.5405586218431175, - "grad_norm": 6.04701898316921, + "grad_norm": 6.208741723671985, "learning_rate": 9.80567836992274e-07, - "logits/chosen": -0.006621755659580231, - "logits/rejected": 0.16392755508422852, - "logps/chosen": -1.1972862482070923, - "logps/rejected": -1.4301495552062988, - "loss": 1.9837, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.1972862482070923, - "rewards/margins": 0.23286327719688416, - "rewards/rejected": -1.4301495552062988, - "semantic_entropy": 0.8263761401176453, + "logits/chosen": 0.009346907958388329, + "logits/rejected": 0.17316535115242004, + "logps/chosen": -1.179496169090271, + "logps/rejected": -1.378589153289795, + "loss": 1.5714, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.179496169090271, + "rewards/margins": 0.1990930140018463, + "rewards/rejected": -1.378589153289795, "step": 1010 }, { "epoch": 0.5432346546245191, - "grad_norm": 9.21549679839962, + "grad_norm": 8.64120907951514, "learning_rate": 9.801355442251625e-07, - "logits/chosen": -0.013613523915410042, - "logits/rejected": 0.1519920378923416, - "logps/chosen": -1.263755202293396, - "logps/rejected": -1.4387528896331787, - "loss": 2.0395, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.263755202293396, - "rewards/margins": 0.17499780654907227, - "rewards/rejected": -1.4387528896331787, - "semantic_entropy": 0.8105853796005249, + "logits/chosen": -0.02329869009554386, + "logits/rejected": 0.12833422422409058, + "logps/chosen": -1.2491451501846313, + "logps/rejected": -1.4052848815917969, + "loss": 1.6333, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2491451501846313, + "rewards/margins": 0.15613953769207, + "rewards/rejected": -1.4052848815917969, "step": 1015 }, { "epoch": 0.5459106874059207, - "grad_norm": 9.448199550904986, + "grad_norm": 8.633383591041156, "learning_rate": 9.796985931808949e-07, - "logits/chosen": -0.008507566526532173, - "logits/rejected": 0.12182090431451797, - "logps/chosen": -1.316265344619751, - "logps/rejected": -1.4976341724395752, - "loss": 2.0655, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.316265344619751, - "rewards/margins": 0.1813688725233078, - "rewards/rejected": -1.4976341724395752, - "semantic_entropy": 0.7896450757980347, + "logits/chosen": -0.0002554789243731648, + "logits/rejected": 0.12322355806827545, + "logps/chosen": -1.3016084432601929, + "logps/rejected": -1.4578951597213745, + "loss": 1.6696, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3016084432601929, + "rewards/margins": 0.1562865972518921, + "rewards/rejected": -1.4578951597213745, "step": 1020 }, { "epoch": 0.5485867201873222, - "grad_norm": 8.559242488471353, + "grad_norm": 7.691560710071344, "learning_rate": 9.792569880987724e-07, - "logits/chosen": -0.05689584091305733, - "logits/rejected": 0.050126176327466965, - "logps/chosen": -1.2220516204833984, - "logps/rejected": -1.4843356609344482, - "loss": 1.9726, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2220516204833984, - "rewards/margins": 0.26228418946266174, - "rewards/rejected": -1.4843356609344482, - "semantic_entropy": 0.8148363828659058, + "logits/chosen": -0.05090395361185074, + "logits/rejected": 0.048211853951215744, + "logps/chosen": -1.2026124000549316, + "logps/rejected": -1.439203143119812, + "loss": 1.5598, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2026124000549316, + "rewards/margins": 0.2365906983613968, + "rewards/rejected": -1.439203143119812, "step": 1025 }, { "epoch": 0.5512627529687238, - "grad_norm": 7.995435761621511, + "grad_norm": 8.010467384551692, "learning_rate": 9.788107332632493e-07, - "logits/chosen": -0.04692408815026283, - "logits/rejected": 0.03308724984526634, - "logps/chosen": -1.2967350482940674, - "logps/rejected": -1.3965247869491577, - "loss": 2.0667, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.2967350482940674, - "rewards/margins": 0.09978990256786346, - "rewards/rejected": -1.3965247869491577, - "semantic_entropy": 0.8044675588607788, + "logits/chosen": -0.017558079212903976, + "logits/rejected": 0.06259426474571228, + "logps/chosen": -1.280426025390625, + "logps/rejected": -1.343672513961792, + "loss": 1.6636, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.280426025390625, + "rewards/margins": 0.06324651092290878, + "rewards/rejected": -1.343672513961792, "step": 1030 }, { "epoch": 0.5539387857501255, - "grad_norm": 6.868541007975739, + "grad_norm": 7.046839044633882, "learning_rate": 9.783598330038924e-07, - "logits/chosen": -0.04235472530126572, - "logits/rejected": 0.0641079992055893, - "logps/chosen": -1.3835198879241943, - "logps/rejected": -1.4320094585418701, - "loss": 2.1453, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3835198879241943, - "rewards/margins": 0.048489637672901154, - "rewards/rejected": -1.4320094585418701, - "semantic_entropy": 0.7668356895446777, + "logits/chosen": -0.03272116556763649, + "logits/rejected": 0.06766636669635773, + "logps/chosen": -1.3677171468734741, + "logps/rejected": -1.4003586769104004, + "loss": 1.7535, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3677171468734741, + "rewards/margins": 0.03264139965176582, + "rewards/rejected": -1.4003586769104004, "step": 1035 }, { "epoch": 0.5566148185315271, - "grad_norm": 7.118306635424307, + "grad_norm": 7.201284326111254, "learning_rate": 9.779042916953376e-07, - "logits/chosen": -0.023677151650190353, - "logits/rejected": 0.11396167427301407, - "logps/chosen": -1.3007886409759521, - "logps/rejected": -1.4533780813217163, - "loss": 2.063, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3007886409759521, - "rewards/margins": 0.15258948504924774, - "rewards/rejected": -1.4533780813217163, - "semantic_entropy": 0.8035913705825806, + "logits/chosen": 0.0016980856889858842, + "logits/rejected": 0.1325521320104599, + "logps/chosen": -1.2892191410064697, + "logps/rejected": -1.4091163873672485, + "loss": 1.6637, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2892191410064697, + "rewards/margins": 0.11989720165729523, + "rewards/rejected": -1.4091163873672485, "step": 1040 }, { "epoch": 0.5592908513129285, - "grad_norm": 6.137340844968411, + "grad_norm": 6.310168511316686, "learning_rate": 9.774441137572487e-07, - "logits/chosen": -0.07179170101881027, - "logits/rejected": 0.056475620716810226, - "logps/chosen": -1.2769570350646973, - "logps/rejected": -1.4823251962661743, - "loss": 2.0323, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.2769570350646973, - "rewards/margins": 0.20536811649799347, - "rewards/rejected": -1.4823251962661743, - "semantic_entropy": 0.7965216636657715, + "logits/chosen": -0.04989878088235855, + "logits/rejected": 0.07154484838247299, + "logps/chosen": -1.2653822898864746, + "logps/rejected": -1.4335815906524658, + "loss": 1.6333, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2653822898864746, + "rewards/margins": 0.16819944977760315, + "rewards/rejected": -1.4335815906524658, "step": 1045 }, { "epoch": 0.5619668840943302, - "grad_norm": 13.048109925378954, + "grad_norm": 12.552512510195202, "learning_rate": 9.76979303654274e-07, - "logits/chosen": -0.09428082406520844, - "logits/rejected": -0.0019208311568945646, - "logps/chosen": -1.3292375802993774, - "logps/rejected": -1.5085084438323975, - "loss": 2.0742, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3292375802993774, - "rewards/margins": 0.1792708784341812, - "rewards/rejected": -1.5085084438323975, - "semantic_entropy": 0.7879042029380798, + "logits/chosen": -0.09465078264474869, + "logits/rejected": -0.012076696380972862, + "logps/chosen": -1.3088659048080444, + "logps/rejected": -1.4423149824142456, + "loss": 1.678, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.3088659048080444, + "rewards/margins": 0.13344910740852356, + "rewards/rejected": -1.4423149824142456, "step": 1050 }, { "epoch": 0.5646429168757318, - "grad_norm": 11.948351218965916, + "grad_norm": 11.281832576670151, "learning_rate": 9.765098658960035e-07, - "logits/chosen": -0.023944038897752762, - "logits/rejected": 0.04848332703113556, - "logps/chosen": -1.3154340982437134, - "logps/rejected": -1.4888745546340942, - "loss": 2.0566, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3154340982437134, - "rewards/margins": 0.17344054579734802, - "rewards/rejected": -1.4888745546340942, - "semantic_entropy": 0.7741323709487915, + "logits/chosen": -0.008454844355583191, + "logits/rejected": 0.06004105880856514, + "logps/chosen": -1.2986464500427246, + "logps/rejected": -1.4412143230438232, + "loss": 1.6622, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2986464500427246, + "rewards/margins": 0.14256784319877625, + "rewards/rejected": -1.4412143230438232, "step": 1055 }, { "epoch": 0.5673189496571333, - "grad_norm": 8.531396445978677, + "grad_norm": 8.998794142616624, "learning_rate": 9.76035805036924e-07, - "logits/chosen": 0.01059248112142086, - "logits/rejected": 0.175526961684227, - "logps/chosen": -1.3848090171813965, - "logps/rejected": -1.5161800384521484, - "loss": 2.1052, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3848090171813965, - "rewards/margins": 0.13137102127075195, - "rewards/rejected": -1.5161800384521484, - "semantic_entropy": 0.7662585973739624, + "logits/chosen": 0.02145402505993843, + "logits/rejected": 0.17861071228981018, + "logps/chosen": -1.3666810989379883, + "logps/rejected": -1.471959114074707, + "loss": 1.7192, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3666810989379883, + "rewards/margins": 0.10527794063091278, + "rewards/rejected": -1.471959114074707, "step": 1060 }, { "epoch": 0.5699949824385349, - "grad_norm": 6.393750476767566, + "grad_norm": 5.910688973070728, "learning_rate": 9.755571256763764e-07, - "logits/chosen": 0.028286850079894066, - "logits/rejected": 0.14960213005542755, - "logps/chosen": -1.2523295879364014, - "logps/rejected": -1.4868011474609375, - "loss": 1.9998, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2523295879364014, - "rewards/margins": 0.23447155952453613, - "rewards/rejected": -1.4868011474609375, - "semantic_entropy": 0.7917420864105225, + "logits/chosen": 0.061039119958877563, + "logits/rejected": 0.17979125678539276, + "logps/chosen": -1.2307828664779663, + "logps/rejected": -1.4314491748809814, + "loss": 1.6004, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2307828664779663, + "rewards/margins": 0.20066621899604797, + "rewards/rejected": -1.4314491748809814, "step": 1065 }, { "epoch": 0.5726710152199365, - "grad_norm": 7.772674162303587, + "grad_norm": 7.079875986443437, "learning_rate": 9.750738324585097e-07, - "logits/chosen": -0.0999111533164978, - "logits/rejected": 0.1293635070323944, - "logps/chosen": -1.3048207759857178, - "logps/rejected": -1.5056029558181763, - "loss": 2.0207, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3048207759857178, - "rewards/margins": 0.20078222453594208, - "rewards/rejected": -1.5056029558181763, - "semantic_entropy": 0.7988306879997253, + "logits/chosen": -0.1030394583940506, + "logits/rejected": 0.11257269233465195, + "logps/chosen": -1.2871735095977783, + "logps/rejected": -1.462566614151001, + "loss": 1.6165, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2871735095977783, + "rewards/margins": 0.17539313435554504, + "rewards/rejected": -1.462566614151001, "step": 1070 }, { "epoch": 0.5753470480013381, - "grad_norm": 7.3703453263060315, + "grad_norm": 6.974332495983628, "learning_rate": 9.74585930072237e-07, - "logits/chosen": -0.029549255967140198, - "logits/rejected": 0.0868106484413147, - "logps/chosen": -1.2544111013412476, - "logps/rejected": -1.4883978366851807, - "loss": 2.0379, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2544111013412476, - "rewards/margins": 0.23398666083812714, - "rewards/rejected": -1.4883978366851807, - "semantic_entropy": 0.8126281499862671, + "logits/chosen": -0.025203552097082138, + "logits/rejected": 0.0852215439081192, + "logps/chosen": -1.2349945306777954, + "logps/rejected": -1.438617467880249, + "loss": 1.6265, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2349945306777954, + "rewards/margins": 0.20362301170825958, + "rewards/rejected": -1.438617467880249, "step": 1075 }, { "epoch": 0.5780230807827396, - "grad_norm": 9.372783611133503, + "grad_norm": 7.500831024647689, "learning_rate": 9.740934232511892e-07, - "logits/chosen": -0.0778825581073761, - "logits/rejected": 0.02420179545879364, - "logps/chosen": -1.3782198429107666, - "logps/rejected": -1.4401485919952393, - "loss": 2.1265, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.3782198429107666, - "rewards/margins": 0.06192876026034355, - "rewards/rejected": -1.4401485919952393, - "semantic_entropy": 0.7773522138595581, + "logits/chosen": -0.11578289419412613, + "logits/rejected": -0.024845128878951073, + "logps/chosen": -1.3552777767181396, + "logps/rejected": -1.3890711069107056, + "loss": 1.7297, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.3552777767181396, + "rewards/margins": 0.03379334136843681, + "rewards/rejected": -1.3890711069107056, "step": 1080 }, { "epoch": 0.5806991135641412, - "grad_norm": 6.910970222784744, + "grad_norm": 6.994719249130065, "learning_rate": 9.735963167736698e-07, - "logits/chosen": 0.008498705923557281, - "logits/rejected": 0.16839997470378876, - "logps/chosen": -1.335448980331421, - "logps/rejected": -1.3817907571792603, - "loss": 2.1352, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.335448980331421, - "rewards/margins": 0.04634168744087219, - "rewards/rejected": -1.3817907571792603, - "semantic_entropy": 0.8023829460144043, + "logits/chosen": -0.01449662446975708, + "logits/rejected": 0.1305081993341446, + "logps/chosen": -1.3126853704452515, + "logps/rejected": -1.344916582107544, + "loss": 1.7248, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.3126853704452515, + "rewards/margins": 0.03223109990358353, + "rewards/rejected": -1.344916582107544, "step": 1085 }, { "epoch": 0.5833751463455428, - "grad_norm": 6.445622120943113, + "grad_norm": 5.89657380761943, "learning_rate": 9.730946154626078e-07, - "logits/chosen": -0.009309718385338783, - "logits/rejected": 0.08693398535251617, - "logps/chosen": -1.3074315786361694, - "logps/rejected": -1.3497371673583984, - "loss": 2.1139, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.3074315786361694, - "rewards/margins": 0.04230565205216408, - "rewards/rejected": -1.3497371673583984, - "semantic_entropy": 0.8121057748794556, + "logits/chosen": -0.006966861430555582, + "logits/rejected": 0.08126161247491837, + "logps/chosen": -1.2898008823394775, + "logps/rejected": -1.3175550699234009, + "loss": 1.6988, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.2898008823394775, + "rewards/margins": 0.02775425836443901, + "rewards/rejected": -1.3175550699234009, "step": 1090 }, { "epoch": 0.5860511791269443, - "grad_norm": 10.138338076160649, + "grad_norm": 9.83209660948282, "learning_rate": 9.725883241855117e-07, - "logits/chosen": -0.11470967531204224, - "logits/rejected": 0.01609751023352146, - "logps/chosen": -1.2618416547775269, - "logps/rejected": -1.4212886095046997, - "loss": 2.0482, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2618416547775269, - "rewards/margins": 0.15944695472717285, - "rewards/rejected": -1.4212886095046997, - "semantic_entropy": 0.811863899230957, + "logits/chosen": -0.14097405970096588, + "logits/rejected": -0.023608043789863586, + "logps/chosen": -1.242818832397461, + "logps/rejected": -1.3774590492248535, + "loss": 1.6302, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.242818832397461, + "rewards/margins": 0.1346401870250702, + "rewards/rejected": -1.3774590492248535, "step": 1095 }, { "epoch": 0.5887272119083459, - "grad_norm": 8.91871654543862, + "grad_norm": 8.709679661390545, "learning_rate": 9.720774478544218e-07, - "logits/chosen": -0.00520699005573988, - "logits/rejected": 0.09051401168107986, - "logps/chosen": -1.1748051643371582, - "logps/rejected": -1.500089406967163, - "loss": 1.954, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.1748051643371582, - "rewards/margins": 0.3252841532230377, - "rewards/rejected": -1.500089406967163, - "semantic_entropy": 0.8329869508743286, + "logits/chosen": -0.002015142235904932, + "logits/rejected": 0.08891472965478897, + "logps/chosen": -1.1542670726776123, + "logps/rejected": -1.448899745941162, + "loss": 1.5282, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1542670726776123, + "rewards/margins": 0.29463261365890503, + "rewards/rejected": -1.448899745941162, "step": 1100 }, { "epoch": 0.5914032446897475, - "grad_norm": 7.263517263520201, + "grad_norm": 7.005717163731625, "learning_rate": 9.715619914258624e-07, - "logits/chosen": -0.043882619589567184, - "logits/rejected": 0.029377352446317673, - "logps/chosen": -1.297758936882019, - "logps/rejected": -1.424983263015747, - "loss": 2.0699, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.297758936882019, - "rewards/margins": 0.1272241324186325, - "rewards/rejected": -1.424983263015747, - "semantic_entropy": 0.799700140953064, + "logits/chosen": -0.04424266517162323, + "logits/rejected": 0.01915797032415867, + "logps/chosen": -1.2812672853469849, + "logps/rejected": -1.3807398080825806, + "loss": 1.6659, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2812672853469849, + "rewards/margins": 0.09947264194488525, + "rewards/rejected": -1.3807398080825806, "step": 1105 }, { "epoch": 0.594079277471149, - "grad_norm": 9.534682687394518, + "grad_norm": 8.61150436242116, "learning_rate": 9.710419599007937e-07, - "logits/chosen": -0.00919708888977766, - "logits/rejected": 0.11027137190103531, - "logps/chosen": -1.2790029048919678, - "logps/rejected": -1.341855764389038, - "loss": 2.076, + "logits/chosen": -0.007998655550181866, + "logits/rejected": 0.10622884333133698, + "logps/chosen": -1.2612874507904053, + "logps/rejected": -1.2952603101730347, + "loss": 1.6641, "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.2790029048919678, - "rewards/margins": 0.06285303086042404, - "rewards/rejected": -1.341855764389038, - "semantic_entropy": 0.8141870498657227, + "rewards/chosen": -1.2612874507904053, + "rewards/margins": 0.03397286683320999, + "rewards/rejected": -1.2952603101730347, "step": 1110 }, { "epoch": 0.5967553102525506, - "grad_norm": 11.77533862230331, + "grad_norm": 12.059532727951146, "learning_rate": 9.705173583245643e-07, - "logits/chosen": 0.05369449406862259, - "logits/rejected": 0.16988304257392883, - "logps/chosen": -1.2029147148132324, - "logps/rejected": -1.4237674474716187, - "loss": 1.9884, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2029147148132324, - "rewards/margins": 0.2208527773618698, - "rewards/rejected": -1.4237674474716187, - "semantic_entropy": 0.8146249651908875, + "logits/chosen": 0.0511791817843914, + "logits/rejected": 0.15223221480846405, + "logps/chosen": -1.1839970350265503, + "logps/rejected": -1.3715298175811768, + "loss": 1.5661, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1839970350265503, + "rewards/margins": 0.18753281235694885, + "rewards/rejected": -1.3715298175811768, "step": 1115 }, { "epoch": 0.5994313430339522, - "grad_norm": 8.183547599673236, + "grad_norm": 8.460088417331862, "learning_rate": 9.699881917868609e-07, - "logits/chosen": -0.13708457350730896, - "logits/rejected": -0.037772614508867264, - "logps/chosen": -1.2671456336975098, - "logps/rejected": -1.4331334829330444, - "loss": 2.0164, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2671456336975098, - "rewards/margins": 0.16598792374134064, - "rewards/rejected": -1.4331334829330444, - "semantic_entropy": 0.8121305704116821, + "logits/chosen": -0.13747264444828033, + "logits/rejected": -0.044413208961486816, + "logps/chosen": -1.2539507150650024, + "logps/rejected": -1.3953502178192139, + "loss": 1.6098, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2539507150650024, + "rewards/margins": 0.14139962196350098, + "rewards/rejected": -1.3953502178192139, "step": 1120 }, { "epoch": 0.6021073758153538, - "grad_norm": 9.638683526837815, + "grad_norm": 9.32199269477602, "learning_rate": 9.694544654216594e-07, - "logits/chosen": -0.11906708776950836, - "logits/rejected": 0.0555792935192585, - "logps/chosen": -1.2739698886871338, - "logps/rejected": -1.4571704864501953, - "loss": 2.0407, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2739698886871338, - "rewards/margins": 0.18320061266422272, - "rewards/rejected": -1.4571704864501953, - "semantic_entropy": 0.8070036172866821, + "logits/chosen": -0.1262091100215912, + "logits/rejected": 0.03779655322432518, + "logps/chosen": -1.2518672943115234, + "logps/rejected": -1.3900775909423828, + "loss": 1.6304, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2518672943115234, + "rewards/margins": 0.13821040093898773, + "rewards/rejected": -1.3900775909423828, "step": 1125 }, { "epoch": 0.6047834085967553, - "grad_norm": 7.966285318311775, + "grad_norm": 7.3873109008422615, "learning_rate": 9.689161844071755e-07, - "logits/chosen": 0.04917113855481148, - "logits/rejected": 0.10522178560495377, - "logps/chosen": -1.310528039932251, - "logps/rejected": -1.4806125164031982, - "loss": 2.0596, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.310528039932251, - "rewards/margins": 0.17008444666862488, - "rewards/rejected": -1.4806125164031982, - "semantic_entropy": 0.8099870681762695, + "logits/chosen": 0.04846873879432678, + "logits/rejected": 0.09871874004602432, + "logps/chosen": -1.2924776077270508, + "logps/rejected": -1.4353249073028564, + "loss": 1.6502, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2924776077270508, + "rewards/margins": 0.1428474634885788, + "rewards/rejected": -1.4353249073028564, "step": 1130 }, { "epoch": 0.6074594413781569, - "grad_norm": 10.549909775271933, + "grad_norm": 10.565256810319221, "learning_rate": 9.683733539658138e-07, - "logits/chosen": -0.02498762123286724, - "logits/rejected": 0.12934377789497375, - "logps/chosen": -1.325805902481079, - "logps/rejected": -1.5380194187164307, - "loss": 2.046, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.325805902481079, - "rewards/margins": 0.21221347153186798, - "rewards/rejected": -1.5380194187164307, - "semantic_entropy": 0.7939231991767883, + "logits/chosen": -0.029242968186736107, + "logits/rejected": 0.11600425094366074, + "logps/chosen": -1.3042986392974854, + "logps/rejected": -1.4900563955307007, + "loss": 1.6411, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3042986392974854, + "rewards/margins": 0.18575787544250488, + "rewards/rejected": -1.4900563955307007, "step": 1135 }, { "epoch": 0.6101354741595585, - "grad_norm": 8.453341612686003, + "grad_norm": 7.840932704918959, "learning_rate": 9.678259793641178e-07, - "logits/chosen": -0.04770956188440323, - "logits/rejected": -0.01541087031364441, - "logps/chosen": -1.301532506942749, - "logps/rejected": -1.348470687866211, - "loss": 2.1, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.301532506942749, - "rewards/margins": 0.04693824052810669, - "rewards/rejected": -1.348470687866211, - "semantic_entropy": 0.8088309168815613, + "logits/chosen": -0.021750206127762794, + "logits/rejected": 0.005784572567790747, + "logps/chosen": -1.2818795442581177, + "logps/rejected": -1.3069055080413818, + "loss": 1.6906, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.2818795442581177, + "rewards/margins": 0.025025952607393265, + "rewards/rejected": -1.3069055080413818, "step": 1140 }, { "epoch": 0.61281150694096, - "grad_norm": 7.110999863323454, + "grad_norm": 7.417020729970113, "learning_rate": 9.672740659127183e-07, - "logits/chosen": -0.15768319368362427, - "logits/rejected": -0.049283333122730255, - "logps/chosen": -1.320922613143921, - "logps/rejected": -1.4701412916183472, - "loss": 2.0717, + "logits/chosen": -0.1478433907032013, + "logits/rejected": -0.04975400120019913, + "logps/chosen": -1.3073512315750122, + "logps/rejected": -1.417533040046692, + "loss": 1.6762, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.320922613143921, - "rewards/margins": 0.14921870827674866, - "rewards/rejected": -1.4701412916183472, - "semantic_entropy": 0.7909232378005981, + "rewards/chosen": -1.3073512315750122, + "rewards/margins": 0.1101817637681961, + "rewards/rejected": -1.417533040046692, "step": 1145 }, { "epoch": 0.6154875397223616, - "grad_norm": 7.890663915423388, + "grad_norm": 8.074345294334199, "learning_rate": 9.667176189662818e-07, - "logits/chosen": -0.13520444929599762, - "logits/rejected": -0.0030995309352874756, - "logps/chosen": -1.2132972478866577, - "logps/rejected": -1.3859599828720093, - "loss": 2.0013, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2132972478866577, - "rewards/margins": 0.1726626455783844, - "rewards/rejected": -1.3859599828720093, - "semantic_entropy": 0.8403644561767578, + "logits/chosen": -0.13061223924160004, + "logits/rejected": -0.005057701375335455, + "logps/chosen": -1.1960101127624512, + "logps/rejected": -1.3409080505371094, + "loss": 1.5788, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.1960101127624512, + "rewards/margins": 0.144897922873497, + "rewards/rejected": -1.3409080505371094, "step": 1150 }, { "epoch": 0.6181635725037632, - "grad_norm": 6.803336058728524, + "grad_norm": 6.857872596234929, "learning_rate": 9.661566439234592e-07, - "logits/chosen": -0.015679482370615005, - "logits/rejected": 0.06638450920581818, - "logps/chosen": -1.3122317790985107, - "logps/rejected": -1.4226499795913696, - "loss": 2.0871, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3122317790985107, - "rewards/margins": 0.11041836440563202, - "rewards/rejected": -1.4226499795913696, - "semantic_entropy": 0.8015656471252441, + "logits/chosen": -0.008821931667625904, + "logits/rejected": 0.064828060567379, + "logps/chosen": -1.2959681749343872, + "logps/rejected": -1.3864507675170898, + "loss": 1.6786, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2959681749343872, + "rewards/margins": 0.09048257768154144, + "rewards/rejected": -1.3864507675170898, "step": 1155 }, { "epoch": 0.6208396052851648, - "grad_norm": 7.659599498658578, + "grad_norm": 7.673039625152903, "learning_rate": 9.655911462268327e-07, - "logits/chosen": 0.052912432700395584, - "logits/rejected": 0.13948659598827362, - "logps/chosen": -1.2624635696411133, - "logps/rejected": -1.39972984790802, - "loss": 2.0293, + "logits/chosen": 0.021009016782045364, + "logits/rejected": 0.08989116549491882, + "logps/chosen": -1.2484314441680908, + "logps/rejected": -1.3629471063613892, + "loss": 1.6193, "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2624635696411133, - "rewards/margins": 0.13726648688316345, - "rewards/rejected": -1.39972984790802, - "semantic_entropy": 0.8163033723831177, + "rewards/chosen": -1.2484314441680908, + "rewards/margins": 0.11451568454504013, + "rewards/rejected": -1.3629471063613892, "step": 1160 }, { "epoch": 0.6235156380665663, - "grad_norm": 5.930536297989213, + "grad_norm": 5.934848176767732, "learning_rate": 9.650211313628636e-07, - "logits/chosen": -0.006730362772941589, - "logits/rejected": 0.06927184015512466, - "logps/chosen": -1.2054722309112549, - "logps/rejected": -1.3947350978851318, - "loss": 1.9887, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2054722309112549, - "rewards/margins": 0.18926294147968292, - "rewards/rejected": -1.3947350978851318, - "semantic_entropy": 0.8181927800178528, + "logits/chosen": -0.02263835072517395, + "logits/rejected": 0.0471782460808754, + "logps/chosen": -1.187469244003296, + "logps/rejected": -1.3563673496246338, + "loss": 1.5709, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.187469244003296, + "rewards/margins": 0.16889812052249908, + "rewards/rejected": -1.3563673496246338, "step": 1165 }, { "epoch": 0.6261916708479679, - "grad_norm": 6.258386860243564, + "grad_norm": 5.886224416634032, "learning_rate": 9.644466048618386e-07, - "logits/chosen": -0.05160114914178848, - "logits/rejected": 0.10188324749469757, - "logps/chosen": -1.422415018081665, - "logps/rejected": -1.477027416229248, - "loss": 2.1507, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.422415018081665, - "rewards/margins": 0.05461234971880913, - "rewards/rejected": -1.477027416229248, - "semantic_entropy": 0.7627812027931213, + "logits/chosen": -0.059956144541502, + "logits/rejected": 0.08554370701313019, + "logps/chosen": -1.4021120071411133, + "logps/rejected": -1.421523094177246, + "loss": 1.7618, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4021120071411133, + "rewards/margins": 0.019411057233810425, + "rewards/rejected": -1.421523094177246, "step": 1170 }, { "epoch": 0.6288677036293695, - "grad_norm": 5.970398580196082, + "grad_norm": 6.185455085126803, "learning_rate": 9.63867572297816e-07, - "logits/chosen": -0.024037795141339302, - "logits/rejected": 0.14671321213245392, - "logps/chosen": -1.252068281173706, - "logps/rejected": -1.376847743988037, - "loss": 2.0549, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.252068281173706, - "rewards/margins": 0.12477940320968628, - "rewards/rejected": -1.376847743988037, - "semantic_entropy": 0.8178479075431824, + "logits/chosen": -0.04533197358250618, + "logits/rejected": 0.10845015197992325, + "logps/chosen": -1.2356188297271729, + "logps/rejected": -1.3273508548736572, + "loss": 1.6465, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2356188297271729, + "rewards/margins": 0.09173201024532318, + "rewards/rejected": -1.3273508548736572, "step": 1175 }, { "epoch": 0.631543736410771, - "grad_norm": 6.256496318286121, + "grad_norm": 5.862756658253257, "learning_rate": 9.632840392885727e-07, - "logits/chosen": -0.0464647002518177, - "logits/rejected": 0.0785941556096077, - "logps/chosen": -1.3559671640396118, - "logps/rejected": -1.552578091621399, - "loss": 2.0598, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3559671640396118, - "rewards/margins": 0.19661085307598114, - "rewards/rejected": -1.552578091621399, - "semantic_entropy": 0.7601912021636963, + "logits/chosen": -0.046646635979413986, + "logits/rejected": 0.06485871970653534, + "logps/chosen": -1.3361549377441406, + "logps/rejected": -1.5056731700897217, + "loss": 1.6712, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3361549377441406, + "rewards/margins": 0.16951832175254822, + "rewards/rejected": -1.5056731700897217, "step": 1180 }, { "epoch": 0.6342197691921726, - "grad_norm": 9.894684261717916, + "grad_norm": 9.160651550896507, "learning_rate": 9.626960114955483e-07, - "logits/chosen": 0.01621159166097641, - "logits/rejected": 0.14661137759685516, - "logps/chosen": -1.3828543424606323, - "logps/rejected": -1.5047622919082642, - "loss": 2.1197, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3828543424606323, - "rewards/margins": 0.12190792709589005, - "rewards/rejected": -1.5047622919082642, - "semantic_entropy": 0.769521951675415, + "logits/chosen": -0.009948519058525562, + "logits/rejected": 0.1008104532957077, + "logps/chosen": -1.3717749118804932, + "logps/rejected": -1.4444175958633423, + "loss": 1.7385, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.3717749118804932, + "rewards/margins": 0.07264275848865509, + "rewards/rejected": -1.4444175958633423, "step": 1185 }, { "epoch": 0.6368958019735742, - "grad_norm": 8.232140475800872, + "grad_norm": 8.485100054672055, "learning_rate": 9.621034946237909e-07, - "logits/chosen": -0.0651417151093483, - "logits/rejected": 0.06812240183353424, - "logps/chosen": -1.3452720642089844, - "logps/rejected": -1.5318175554275513, - "loss": 2.0596, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3452720642089844, - "rewards/margins": 0.18654534220695496, - "rewards/rejected": -1.5318175554275513, - "semantic_entropy": 0.7749825716018677, + "logits/chosen": -0.06173746660351753, + "logits/rejected": 0.060471467673778534, + "logps/chosen": -1.328833818435669, + "logps/rejected": -1.476729393005371, + "loss": 1.6721, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.328833818435669, + "rewards/margins": 0.1478956639766693, + "rewards/rejected": -1.476729393005371, "step": 1190 }, { "epoch": 0.6395718347549757, - "grad_norm": 10.119925156565927, + "grad_norm": 9.67704832986553, "learning_rate": 9.615064944219021e-07, - "logits/chosen": 0.003726619528606534, - "logits/rejected": 0.12318499386310577, - "logps/chosen": -1.2271159887313843, - "logps/rejected": -1.4569180011749268, - "loss": 1.9712, + "logits/chosen": -0.012681236490607262, + "logits/rejected": 0.0875493735074997, + "logps/chosen": -1.2063080072402954, + "logps/rejected": -1.4012012481689453, + "loss": 1.5614, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2271159887313843, - "rewards/margins": 0.22980189323425293, - "rewards/rejected": -1.4569180011749268, - "semantic_entropy": 0.8198936581611633, + "rewards/chosen": -1.2063080072402954, + "rewards/margins": 0.19489315152168274, + "rewards/rejected": -1.4012012481689453, "step": 1195 }, { "epoch": 0.6422478675363773, - "grad_norm": 8.765807332648222, + "grad_norm": 7.578613594936711, "learning_rate": 9.609050166819803e-07, - "logits/chosen": -0.046974822878837585, - "logits/rejected": 0.017429247498512268, - "logps/chosen": -1.269205927848816, - "logps/rejected": -1.436964750289917, - "loss": 2.0333, + "logits/chosen": -0.06550656259059906, + "logits/rejected": -0.013155996799468994, + "logps/chosen": -1.2424876689910889, + "logps/rejected": -1.388684868812561, + "loss": 1.6206, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.269205927848816, - "rewards/margins": 0.1677587926387787, - "rewards/rejected": -1.436964750289917, - "semantic_entropy": 0.8055108189582825, + "rewards/chosen": -1.2424876689910889, + "rewards/margins": 0.1461973935365677, + "rewards/rejected": -1.388684868812561, "step": 1200 }, { "epoch": 0.6422478675363773, - "eval_logits/chosen": 0.32719656825065613, - "eval_logits/rejected": 0.4174032211303711, - "eval_logps/chosen": -1.3249365091323853, - "eval_logps/rejected": -1.5070233345031738, - "eval_loss": 2.0610902309417725, - "eval_rewards/accuracies": 0.5734421610832214, - "eval_rewards/chosen": -1.3249365091323853, - "eval_rewards/margins": 0.18208687007427216, - "eval_rewards/rejected": -1.5070233345031738, - "eval_runtime": 34.5017, - "eval_samples_per_second": 38.984, - "eval_semantic_entropy": 0.7899767756462097, - "eval_steps_per_second": 9.768, + "eval_logits/chosen": 0.2713717818260193, + "eval_logits/rejected": 0.3522588908672333, + "eval_logps/chosen": -1.308337688446045, + "eval_logps/rejected": -1.4521799087524414, + "eval_loss": 1.6640232801437378, + "eval_rewards/accuracies": 0.5563797950744629, + "eval_rewards/chosen": -1.308337688446045, + "eval_rewards/margins": 0.14384222030639648, + "eval_rewards/rejected": -1.4521799087524414, + "eval_runtime": 40.2855, + "eval_samples_per_second": 33.387, + "eval_steps_per_second": 8.365, "step": 1200 }, { "epoch": 0.6449239003177789, - "grad_norm": 10.989286448969933, + "grad_norm": 9.902098500941502, "learning_rate": 9.602990672395653e-07, - "logits/chosen": -0.12443922460079193, - "logits/rejected": 0.04555289074778557, - "logps/chosen": -1.2795411348342896, - "logps/rejected": -1.4252101182937622, - "loss": 2.0401, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2795411348342896, - "rewards/margins": 0.14566898345947266, - "rewards/rejected": -1.4252101182937622, - "semantic_entropy": 0.8055251836776733, + "logits/chosen": -0.13126561045646667, + "logits/rejected": 0.023626195266842842, + "logps/chosen": -1.2611668109893799, + "logps/rejected": -1.3815600872039795, + "loss": 1.6263, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2611668109893799, + "rewards/margins": 0.12039327621459961, + "rewards/rejected": -1.3815600872039795, "step": 1205 }, { "epoch": 0.6475999330991805, - "grad_norm": 10.284102990108195, + "grad_norm": 9.735377329344027, "learning_rate": 9.59688651973581e-07, - "logits/chosen": -0.07771871238946915, - "logits/rejected": 0.10380265861749649, - "logps/chosen": -1.3103466033935547, - "logps/rejected": -1.4392341375350952, - "loss": 2.1027, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3103466033935547, - "rewards/margins": 0.12888756394386292, - "rewards/rejected": -1.4392341375350952, - "semantic_entropy": 0.7999576926231384, + "logits/chosen": -0.05382559448480606, + "logits/rejected": 0.12032803148031235, + "logps/chosen": -1.2983372211456299, + "logps/rejected": -1.4043587446212769, + "loss": 1.6978, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2983372211456299, + "rewards/margins": 0.1060214415192604, + "rewards/rejected": -1.4043587446212769, "step": 1210 }, { "epoch": 0.650275965880582, - "grad_norm": 5.798825984707924, + "grad_norm": 6.387580672860148, "learning_rate": 9.590737768062792e-07, - "logits/chosen": -0.11275134980678558, - "logits/rejected": 0.0032500834204256535, - "logps/chosen": -1.3132580518722534, - "logps/rejected": -1.3747795820236206, - "loss": 2.0875, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3132580518722534, - "rewards/margins": 0.061521708965301514, - "rewards/rejected": -1.3747795820236206, - "semantic_entropy": 0.8179332613945007, + "logits/chosen": -0.0932522639632225, + "logits/rejected": 0.019932487979531288, + "logps/chosen": -1.2985928058624268, + "logps/rejected": -1.3247510194778442, + "loss": 1.6794, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2985928058624268, + "rewards/margins": 0.02615833841264248, + "rewards/rejected": -1.3247510194778442, "step": 1215 }, { "epoch": 0.6529519986619836, - "grad_norm": 9.076422127592291, + "grad_norm": 8.286558491291323, "learning_rate": 9.584544477031816e-07, - "logits/chosen": 0.07115252315998077, - "logits/rejected": 0.17380015552043915, - "logps/chosen": -1.232552170753479, - "logps/rejected": -1.386908769607544, - "loss": 2.0484, + "logits/chosen": 0.06500460207462311, + "logits/rejected": 0.160376638174057, + "logps/chosen": -1.2172410488128662, + "logps/rejected": -1.3483210802078247, + "loss": 1.6241, "rewards/accuracies": 0.53125, - "rewards/chosen": -1.232552170753479, - "rewards/margins": 0.15435653924942017, - "rewards/rejected": -1.386908769607544, - "semantic_entropy": 0.8239533305168152, + "rewards/chosen": -1.2172410488128662, + "rewards/margins": 0.13107988238334656, + "rewards/rejected": -1.3483210802078247, "step": 1220 }, { "epoch": 0.6556280314433852, - "grad_norm": 6.438707549082974, + "grad_norm": 5.9781859984561345, "learning_rate": 9.578306706730215e-07, - "logits/chosen": -0.16714417934417725, - "logits/rejected": 0.03010159358382225, - "logps/chosen": -1.3243261575698853, - "logps/rejected": -1.446779489517212, - "loss": 2.0669, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3243261575698853, - "rewards/margins": 0.12245325744152069, - "rewards/rejected": -1.446779489517212, - "semantic_entropy": 0.7870798110961914, + "logits/chosen": -0.13773970305919647, + "logits/rejected": 0.055517297238111496, + "logps/chosen": -1.3097951412200928, + "logps/rejected": -1.4055107831954956, + "loss": 1.6675, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3097951412200928, + "rewards/margins": 0.09571562707424164, + "rewards/rejected": -1.4055107831954956, "step": 1225 }, { "epoch": 0.6583040642247867, - "grad_norm": 8.726078412835406, + "grad_norm": 8.031756035684708, "learning_rate": 9.572024517676865e-07, - "logits/chosen": -0.07110466808080673, - "logits/rejected": 0.03021205961704254, - "logps/chosen": -1.2625112533569336, - "logps/rejected": -1.4501873254776, - "loss": 2.0438, + "logits/chosen": -0.06108025461435318, + "logits/rejected": 0.04098379611968994, + "logps/chosen": -1.2422094345092773, + "logps/rejected": -1.4047267436981201, + "loss": 1.6224, "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2625112533569336, - "rewards/margins": 0.18767592310905457, - "rewards/rejected": -1.4501873254776, - "semantic_entropy": 0.8156733512878418, + "rewards/chosen": -1.2422094345092773, + "rewards/margins": 0.16251742839813232, + "rewards/rejected": -1.4047267436981201, "step": 1230 }, { "epoch": 0.6609800970061883, - "grad_norm": 6.3660145593188995, + "grad_norm": 6.818886107239191, "learning_rate": 9.565697970821593e-07, - "logits/chosen": -0.040026407688856125, - "logits/rejected": 0.08541294187307358, - "logps/chosen": -1.3209205865859985, - "logps/rejected": -1.4224975109100342, - "loss": 2.0924, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3209205865859985, - "rewards/margins": 0.10157684236764908, - "rewards/rejected": -1.4224975109100342, - "semantic_entropy": 0.7984825372695923, + "logits/chosen": -0.030785422772169113, + "logits/rejected": 0.08796034008264542, + "logps/chosen": -1.2995566129684448, + "logps/rejected": -1.3784334659576416, + "loss": 1.6868, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2995566129684448, + "rewards/margins": 0.07887676358222961, + "rewards/rejected": -1.3784334659576416, "step": 1235 }, { "epoch": 0.6636561297875899, - "grad_norm": 9.056017527039876, + "grad_norm": 8.55024697740653, "learning_rate": 9.559327127544585e-07, - "logits/chosen": -0.1611124575138092, - "logits/rejected": -0.035345181822776794, - "logps/chosen": -1.2943966388702393, - "logps/rejected": -1.4472328424453735, - "loss": 2.0583, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2943966388702393, - "rewards/margins": 0.15283603966236115, - "rewards/rejected": -1.4472328424453735, - "semantic_entropy": 0.8084890246391296, + "logits/chosen": -0.156296044588089, + "logits/rejected": -0.04075910523533821, + "logps/chosen": -1.2702831029891968, + "logps/rejected": -1.3911727666854858, + "loss": 1.6497, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2702831029891968, + "rewards/margins": 0.12088973820209503, + "rewards/rejected": -1.3911727666854858, "step": 1240 }, { "epoch": 0.6663321625689914, - "grad_norm": 8.337516442690244, + "grad_norm": 7.909529033039376, "learning_rate": 9.552912049655789e-07, - "logits/chosen": -0.07143320143222809, - "logits/rejected": 0.09474059194326401, - "logps/chosen": -1.3714230060577393, - "logps/rejected": -1.4419299364089966, - "loss": 2.1097, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.3714230060577393, - "rewards/margins": 0.07050693780183792, - "rewards/rejected": -1.4419299364089966, - "semantic_entropy": 0.7875084280967712, + "logits/chosen": -0.05753855034708977, + "logits/rejected": 0.09787220507860184, + "logps/chosen": -1.3558070659637451, + "logps/rejected": -1.3945585489273071, + "loss": 1.7134, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.3558070659637451, + "rewards/margins": 0.03875157609581947, + "rewards/rejected": -1.3945585489273071, "step": 1245 }, { "epoch": 0.669008195350393, - "grad_norm": 11.557141587121166, + "grad_norm": 11.207775248078786, "learning_rate": 9.546452799394315e-07, - "logits/chosen": -0.06920245289802551, - "logits/rejected": 0.1126919835805893, - "logps/chosen": -1.355477213859558, - "logps/rejected": -1.4375073909759521, - "loss": 2.109, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.355477213859558, - "rewards/margins": 0.0820300504565239, - "rewards/rejected": -1.4375073909759521, - "semantic_entropy": 0.7804659605026245, + "logits/chosen": -0.059803556650877, + "logits/rejected": 0.10910670459270477, + "logps/chosen": -1.3280284404754639, + "logps/rejected": -1.3814871311187744, + "loss": 1.7036, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.3280284404754639, + "rewards/margins": 0.053458768874406815, + "rewards/rejected": -1.3814871311187744, "step": 1250 }, { "epoch": 0.6716842281317946, - "grad_norm": 10.611897366182086, + "grad_norm": 9.396884172972708, "learning_rate": 9.539949439427846e-07, - "logits/chosen": -0.07876632362604141, - "logits/rejected": 0.041526369750499725, - "logps/chosen": -1.3036260604858398, - "logps/rejected": -1.4317988157272339, - "loss": 2.0595, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3036260604858398, - "rewards/margins": 0.12817277014255524, - "rewards/rejected": -1.4317988157272339, - "semantic_entropy": 0.8029844164848328, + "logits/chosen": -0.06721973419189453, + "logits/rejected": 0.04472237452864647, + "logps/chosen": -1.2888884544372559, + "logps/rejected": -1.3872915506362915, + "loss": 1.6554, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2888884544372559, + "rewards/margins": 0.0984029546380043, + "rewards/rejected": -1.3872915506362915, "step": 1255 }, { "epoch": 0.6743602609131962, - "grad_norm": 7.570396919067868, + "grad_norm": 7.7532438287878245, "learning_rate": 9.533402032852002e-07, - "logits/chosen": -0.09820413589477539, - "logits/rejected": 0.027981841936707497, - "logps/chosen": -1.2500559091567993, - "logps/rejected": -1.462172269821167, - "loss": 2.0205, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2500559091567993, - "rewards/margins": 0.21211643517017365, - "rewards/rejected": -1.462172269821167, - "semantic_entropy": 0.8136337995529175, + "logits/chosen": -0.10559892654418945, + "logits/rejected": 0.011072332970798016, + "logps/chosen": -1.2306454181671143, + "logps/rejected": -1.4112110137939453, + "loss": 1.6035, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2306454181671143, + "rewards/margins": 0.18056556582450867, + "rewards/rejected": -1.4112110137939453, "step": 1260 }, { "epoch": 0.6770362936945977, - "grad_norm": 7.519925985649294, + "grad_norm": 6.734623913255203, "learning_rate": 9.526810643189754e-07, - "logits/chosen": -0.0017353773582726717, - "logits/rejected": 0.13616499304771423, - "logps/chosen": -1.289548397064209, - "logps/rejected": -1.444461464881897, - "loss": 2.0378, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.289548397064209, - "rewards/margins": 0.15491308271884918, - "rewards/rejected": -1.444461464881897, - "semantic_entropy": 0.8108280301094055, + "logits/chosen": -0.03286564350128174, + "logits/rejected": 0.08090507984161377, + "logps/chosen": -1.2756887674331665, + "logps/rejected": -1.387162446975708, + "loss": 1.6341, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2756887674331665, + "rewards/margins": 0.11147379875183105, + "rewards/rejected": -1.387162446975708, "step": 1265 }, { "epoch": 0.6797123264759993, - "grad_norm": 7.8869932020973454, + "grad_norm": 8.00249171934378, "learning_rate": 9.52017533439079e-07, - "logits/chosen": -0.06916630268096924, - "logits/rejected": 0.03533398360013962, - "logps/chosen": -1.260999083518982, - "logps/rejected": -1.4440901279449463, - "loss": 2.0333, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.260999083518982, - "rewards/margins": 0.1830909699201584, - "rewards/rejected": -1.4440901279449463, - "semantic_entropy": 0.8088814616203308, + "logits/chosen": -0.07302910834550858, + "logits/rejected": 0.02558300271630287, + "logps/chosen": -1.2368438243865967, + "logps/rejected": -1.3913614749908447, + "loss": 1.6175, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2368438243865967, + "rewards/margins": 0.15451772511005402, + "rewards/rejected": -1.3913614749908447, "step": 1270 }, { "epoch": 0.6823883592574009, - "grad_norm": 5.869146541625044, + "grad_norm": 5.610957995849928, "learning_rate": 9.513496170830909e-07, - "logits/chosen": -0.04706912487745285, - "logits/rejected": 0.05594928190112114, - "logps/chosen": -1.3073713779449463, - "logps/rejected": -1.4249076843261719, - "loss": 2.0741, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.3073713779449463, - "rewards/margins": 0.11753638088703156, - "rewards/rejected": -1.4249076843261719, - "semantic_entropy": 0.8061819076538086, + "logits/chosen": -0.06308968365192413, + "logits/rejected": 0.03248829022049904, + "logps/chosen": -1.295707106590271, + "logps/rejected": -1.3742659091949463, + "loss": 1.6766, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.295707106590271, + "rewards/margins": 0.07855904847383499, + "rewards/rejected": -1.3742659091949463, "step": 1275 }, { "epoch": 0.6850643920388024, - "grad_norm": 8.181696193081255, + "grad_norm": 8.34191349008034, "learning_rate": 9.506773217311382e-07, - "logits/chosen": -0.06857715547084808, - "logits/rejected": 0.07334651052951813, - "logps/chosen": -1.381034016609192, - "logps/rejected": -1.4767224788665771, - "loss": 2.1219, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.381034016609192, - "rewards/margins": 0.09568850696086884, - "rewards/rejected": -1.4767224788665771, - "semantic_entropy": 0.7766083478927612, + "logits/chosen": -0.07875625044107437, + "logits/rejected": 0.04786789044737816, + "logps/chosen": -1.3658252954483032, + "logps/rejected": -1.4279983043670654, + "loss": 1.7333, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.3658252954483032, + "rewards/margins": 0.062173031270504, + "rewards/rejected": -1.4279983043670654, "step": 1280 }, { "epoch": 0.687740424820204, - "grad_norm": 9.44924573864344, + "grad_norm": 7.628569943813764, "learning_rate": 9.500006539058334e-07, - "logits/chosen": -0.018108461052179337, - "logits/rejected": 0.1121261939406395, - "logps/chosen": -1.2533257007598877, - "logps/rejected": -1.3792779445648193, - "loss": 2.0225, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2533257007598877, - "rewards/margins": 0.12595216929912567, - "rewards/rejected": -1.3792779445648193, - "semantic_entropy": 0.8158677816390991, + "logits/chosen": -0.04173643887042999, + "logits/rejected": 0.0719747543334961, + "logps/chosen": -1.2409158945083618, + "logps/rejected": -1.3405625820159912, + "loss": 1.6127, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2409158945083618, + "rewards/margins": 0.09964674711227417, + "rewards/rejected": -1.3405625820159912, "step": 1285 }, { "epoch": 0.6904164576016056, - "grad_norm": 7.4959004943032825, + "grad_norm": 7.976144585160272, "learning_rate": 9.493196201722109e-07, - "logits/chosen": -0.15983861684799194, - "logits/rejected": -0.022212965413928032, - "logps/chosen": -1.3256019353866577, - "logps/rejected": -1.3897264003753662, - "loss": 2.1197, + "logits/chosen": -0.17471951246261597, + "logits/rejected": -0.04663420841097832, + "logps/chosen": -1.3102699518203735, + "logps/rejected": -1.3529913425445557, + "loss": 1.7179, "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.3256019353866577, - "rewards/margins": 0.06412459164857864, - "rewards/rejected": -1.3897264003753662, - "semantic_entropy": 0.7991577386856079, + "rewards/chosen": -1.3102699518203735, + "rewards/margins": 0.042721252888441086, + "rewards/rejected": -1.3529913425445557, "step": 1290 }, { "epoch": 0.6930924903830072, - "grad_norm": 6.065432032119847, + "grad_norm": 6.085502180077309, "learning_rate": 9.486342271376628e-07, - "logits/chosen": -0.07144840061664581, - "logits/rejected": -0.05996709316968918, - "logps/chosen": -1.2956889867782593, - "logps/rejected": -1.4863463640213013, - "loss": 2.0367, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2956889867782593, - "rewards/margins": 0.1906573474407196, - "rewards/rejected": -1.4863463640213013, - "semantic_entropy": 0.7861794233322144, + "logits/chosen": -0.07361548393964767, + "logits/rejected": -0.06373373419046402, + "logps/chosen": -1.2772166728973389, + "logps/rejected": -1.4289699792861938, + "loss": 1.6439, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2772166728973389, + "rewards/margins": 0.15175335109233856, + "rewards/rejected": -1.4289699792861938, "step": 1295 }, { "epoch": 0.6957685231644087, - "grad_norm": 7.768340559958128, + "grad_norm": 7.4475181215635375, "learning_rate": 9.479444814518755e-07, - "logits/chosen": -0.057303112000226974, - "logits/rejected": 0.17862965166568756, - "logps/chosen": -1.2886030673980713, - "logps/rejected": -1.4833580255508423, - "loss": 2.0291, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2886030673980713, - "rewards/margins": 0.19475503265857697, - "rewards/rejected": -1.4833580255508423, - "semantic_entropy": 0.7993043661117554, + "logits/chosen": -0.0670512318611145, + "logits/rejected": 0.14799444377422333, + "logps/chosen": -1.27315354347229, + "logps/rejected": -1.4144771099090576, + "loss": 1.6302, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.27315354347229, + "rewards/margins": 0.14132359623908997, + "rewards/rejected": -1.4144771099090576, "step": 1300 }, { "epoch": 0.6984445559458103, - "grad_norm": 6.829811228292914, + "grad_norm": 7.023481889009539, "learning_rate": 9.472503898067645e-07, - "logits/chosen": 0.05778735876083374, - "logits/rejected": 0.10782381147146225, - "logps/chosen": -1.3004398345947266, - "logps/rejected": -1.4733632802963257, - "loss": 2.0328, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3004398345947266, - "rewards/margins": 0.1729235202074051, - "rewards/rejected": -1.4733632802963257, - "semantic_entropy": 0.8007766008377075, + "logits/chosen": 0.011247454211115837, + "logits/rejected": 0.05740055441856384, + "logps/chosen": -1.2823988199234009, + "logps/rejected": -1.4183152914047241, + "loss": 1.6301, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2823988199234009, + "rewards/margins": 0.13591650128364563, + "rewards/rejected": -1.4183152914047241, "step": 1305 }, { "epoch": 0.701120588727212, - "grad_norm": 6.806301756870639, + "grad_norm": 7.044884087209908, "learning_rate": 9.465519589364099e-07, - "logits/chosen": 0.03991737216711044, - "logits/rejected": 0.11657501757144928, - "logps/chosen": -1.310382604598999, - "logps/rejected": -1.4569346904754639, - "loss": 2.0875, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.310382604598999, - "rewards/margins": 0.1465521603822708, - "rewards/rejected": -1.4569346904754639, - "semantic_entropy": 0.7917109131813049, + "logits/chosen": 0.00906024593859911, + "logits/rejected": 0.07833532989025116, + "logps/chosen": -1.2953308820724487, + "logps/rejected": -1.419379472732544, + "loss": 1.6834, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2953308820724487, + "rewards/margins": 0.1240486279129982, + "rewards/rejected": -1.419379472732544, "step": 1310 }, { "epoch": 0.7037966215086134, - "grad_norm": 6.290006408474125, + "grad_norm": 6.3756912656805484, "learning_rate": 9.458491956169914e-07, - "logits/chosen": -0.06548132747411728, - "logits/rejected": 0.10813206434249878, - "logps/chosen": -1.2515592575073242, - "logps/rejected": -1.487073302268982, - "loss": 2.0074, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2515592575073242, - "rewards/margins": 0.23551401495933533, - "rewards/rejected": -1.487073302268982, - "semantic_entropy": 0.8081199526786804, + "logits/chosen": -0.073188416659832, + "logits/rejected": 0.08738872408866882, + "logps/chosen": -1.237274408340454, + "logps/rejected": -1.440292239189148, + "loss": 1.6012, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.237274408340454, + "rewards/margins": 0.20301775634288788, + "rewards/rejected": -1.440292239189148, "step": 1315 }, { "epoch": 0.706472654290015, - "grad_norm": 5.8407006796066785, + "grad_norm": 5.800572640904087, "learning_rate": 9.451421066667215e-07, - "logits/chosen": -0.15501976013183594, - "logits/rejected": 0.03836467117071152, - "logps/chosen": -1.2408206462860107, - "logps/rejected": -1.441028118133545, - "loss": 2.0022, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2408206462860107, - "rewards/margins": 0.20020751655101776, - "rewards/rejected": -1.441028118133545, - "semantic_entropy": 0.8187205195426941, + "logits/chosen": -0.16412648558616638, + "logits/rejected": 0.015692204236984253, + "logps/chosen": -1.2207753658294678, + "logps/rejected": -1.3832472562789917, + "loss": 1.5895, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2207753658294678, + "rewards/margins": 0.1624719202518463, + "rewards/rejected": -1.3832472562789917, "step": 1320 }, { "epoch": 0.7091486870714167, - "grad_norm": 12.171979565537207, + "grad_norm": 11.079064346768625, "learning_rate": 9.444306989457805e-07, - "logits/chosen": -0.00010145604755962268, - "logits/rejected": 0.10248501598834991, - "logps/chosen": -1.3242946863174438, - "logps/rejected": -1.4491575956344604, - "loss": 2.0964, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3242946863174438, - "rewards/margins": 0.12486305087804794, - "rewards/rejected": -1.4491575956344604, - "semantic_entropy": 0.7907122373580933, + "logits/chosen": -0.03358171135187149, + "logits/rejected": 0.056316912174224854, + "logps/chosen": -1.2931864261627197, + "logps/rejected": -1.380968689918518, + "loss": 1.6817, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2931864261627197, + "rewards/margins": 0.08778227865695953, + "rewards/rejected": -1.380968689918518, "step": 1325 }, { "epoch": 0.7118247198528181, - "grad_norm": 5.813429350518074, + "grad_norm": 5.522926070818182, "learning_rate": 9.437149793562489e-07, - "logits/chosen": -0.030075322836637497, - "logits/rejected": 0.08605314791202545, - "logps/chosen": -1.2983765602111816, - "logps/rejected": -1.3950271606445312, - "loss": 2.0709, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2983765602111816, - "rewards/margins": 0.09665055572986603, - "rewards/rejected": -1.3950271606445312, - "semantic_entropy": 0.8118602633476257, + "logits/chosen": -0.07296119630336761, + "logits/rejected": 0.03256779909133911, + "logps/chosen": -1.2829877138137817, + "logps/rejected": -1.357912302017212, + "loss": 1.6631, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2829877138137817, + "rewards/margins": 0.07492466270923615, + "rewards/rejected": -1.357912302017212, "step": 1330 }, { "epoch": 0.7145007526342197, - "grad_norm": 6.511695432503269, + "grad_norm": 6.39437906575835, "learning_rate": 9.429949548420417e-07, - "logits/chosen": -0.010198342613875866, - "logits/rejected": 0.0688561499118805, - "logps/chosen": -1.3720452785491943, - "logps/rejected": -1.5215284824371338, - "loss": 2.0767, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3720452785491943, - "rewards/margins": 0.14948336780071259, - "rewards/rejected": -1.5215284824371338, - "semantic_entropy": 0.7785710096359253, + "logits/chosen": -0.022577999159693718, + "logits/rejected": 0.04554181545972824, + "logps/chosen": -1.3532822132110596, + "logps/rejected": -1.4808286428451538, + "loss": 1.6798, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3532822132110596, + "rewards/margins": 0.12754635512828827, + "rewards/rejected": -1.4808286428451538, "step": 1335 }, { "epoch": 0.7171767854156214, - "grad_norm": 7.43508239763397, + "grad_norm": 7.854902159500145, "learning_rate": 9.422706323888396e-07, - "logits/chosen": -0.021798694506287575, - "logits/rejected": 0.007907522842288017, - "logps/chosen": -1.3327934741973877, - "logps/rejected": -1.4697725772857666, - "loss": 2.0901, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3327934741973877, - "rewards/margins": 0.1369791030883789, - "rewards/rejected": -1.4697725772857666, - "semantic_entropy": 0.7791727185249329, + "logits/chosen": -0.03494944050908089, + "logits/rejected": -0.010035835206508636, + "logps/chosen": -1.3224153518676758, + "logps/rejected": -1.4352055788040161, + "loss": 1.6933, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3224153518676758, + "rewards/margins": 0.11279022693634033, + "rewards/rejected": -1.4352055788040161, "step": 1340 }, { "epoch": 0.719852818197023, - "grad_norm": 5.620204494767762, + "grad_norm": 5.1287235604030545, "learning_rate": 9.415420190240225e-07, - "logits/chosen": 0.012555956840515137, - "logits/rejected": 0.18066881597042084, - "logps/chosen": -1.3168660402297974, - "logps/rejected": -1.4390350580215454, - "loss": 2.0643, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3168660402297974, - "rewards/margins": 0.1221691146492958, - "rewards/rejected": -1.4390350580215454, - "semantic_entropy": 0.7964397668838501, + "logits/chosen": 0.011534917168319225, + "logits/rejected": 0.16566434502601624, + "logps/chosen": -1.3075754642486572, + "logps/rejected": -1.392835259437561, + "loss": 1.6681, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3075754642486572, + "rewards/margins": 0.08525966107845306, + "rewards/rejected": -1.392835259437561, "step": 1345 }, { "epoch": 0.7225288509784245, - "grad_norm": 7.437201739326167, + "grad_norm": 7.552550459316424, "learning_rate": 9.408091218166002e-07, - "logits/chosen": 0.035775817930698395, - "logits/rejected": 0.09171895682811737, - "logps/chosen": -1.307445764541626, - "logps/rejected": -1.3502302169799805, - "loss": 2.1128, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.307445764541626, - "rewards/margins": 0.04278445243835449, - "rewards/rejected": -1.3502302169799805, - "semantic_entropy": 0.8082399368286133, + "logits/chosen": -0.008674606680870056, + "logits/rejected": 0.03636609762907028, + "logps/chosen": -1.292672872543335, + "logps/rejected": -1.3186228275299072, + "loss": 1.7023, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.292672872543335, + "rewards/margins": 0.025949766859412193, + "rewards/rejected": -1.3186228275299072, "step": 1350 }, { "epoch": 0.7252048837598261, - "grad_norm": 6.421590530442405, + "grad_norm": 7.132339621039502, "learning_rate": 9.400719478771449e-07, - "logits/chosen": -0.00508537283167243, - "logits/rejected": 0.272622287273407, - "logps/chosen": -1.363750696182251, - "logps/rejected": -1.4495420455932617, - "loss": 2.121, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.363750696182251, - "rewards/margins": 0.08579148352146149, - "rewards/rejected": -1.4495420455932617, - "semantic_entropy": 0.7730823755264282, + "logits/chosen": -0.049724649637937546, + "logits/rejected": 0.19820116460323334, + "logps/chosen": -1.345320701599121, + "logps/rejected": -1.4055614471435547, + "loss": 1.7265, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.345320701599121, + "rewards/margins": 0.060240667313337326, + "rewards/rejected": -1.4055614471435547, "step": 1355 }, { "epoch": 0.7278809165412277, - "grad_norm": 8.172558548172619, + "grad_norm": 8.479499284806872, "learning_rate": 9.393305043577209e-07, - "logits/chosen": -0.1188274472951889, - "logits/rejected": 0.022789832204580307, - "logps/chosen": -1.3701941967010498, - "logps/rejected": -1.5333750247955322, - "loss": 2.0899, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3701941967010498, - "rewards/margins": 0.16318091750144958, - "rewards/rejected": -1.5333750247955322, - "semantic_entropy": 0.7707791924476624, + "logits/chosen": -0.13432948291301727, + "logits/rejected": -0.0047289221547544, + "logps/chosen": -1.3591015338897705, + "logps/rejected": -1.484825849533081, + "loss": 1.7062, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3591015338897705, + "rewards/margins": 0.12572428584098816, + "rewards/rejected": -1.484825849533081, "step": 1360 }, { "epoch": 0.7305569493226292, - "grad_norm": 5.194706880837731, + "grad_norm": 5.378054207471065, "learning_rate": 9.38584798451817e-07, - "logits/chosen": 0.0019635481294244528, - "logits/rejected": 0.13548122346401215, - "logps/chosen": -1.303626298904419, - "logps/rejected": -1.4488534927368164, - "loss": 2.0556, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.303626298904419, - "rewards/margins": 0.14522729814052582, - "rewards/rejected": -1.4488534927368164, - "semantic_entropy": 0.7995003461837769, + "logits/chosen": 9.254701581085101e-05, + "logits/rejected": 0.12194955348968506, + "logps/chosen": -1.2895641326904297, + "logps/rejected": -1.4120676517486572, + "loss": 1.6517, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2895641326904297, + "rewards/margins": 0.12250330299139023, + "rewards/rejected": -1.4120676517486572, "step": 1365 }, { "epoch": 0.7332329821040308, - "grad_norm": 8.017472283818565, + "grad_norm": 8.339099044480037, "learning_rate": 9.37834837394275e-07, - "logits/chosen": -0.009090607985854149, - "logits/rejected": 0.10668311268091202, - "logps/chosen": -1.3580187559127808, - "logps/rejected": -1.578385591506958, - "loss": 2.0559, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3580187559127808, - "rewards/margins": 0.2203669250011444, - "rewards/rejected": -1.578385591506958, - "semantic_entropy": 0.7690521478652954, + "logits/chosen": -0.03549307957291603, + "logits/rejected": 0.06424375623464584, + "logps/chosen": -1.344766616821289, + "logps/rejected": -1.5251480340957642, + "loss": 1.6706, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.344766616821289, + "rewards/margins": 0.18038137257099152, + "rewards/rejected": -1.5251480340957642, "step": 1370 }, { "epoch": 0.7359090148854324, - "grad_norm": 6.748183181544977, + "grad_norm": 6.481678501748973, "learning_rate": 9.370806284612203e-07, - "logits/chosen": -0.04126691818237305, - "logits/rejected": 0.09672950208187103, - "logps/chosen": -1.2818526029586792, - "logps/rejected": -1.540055513381958, - "loss": 2.0156, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2818526029586792, - "rewards/margins": 0.25820282101631165, - "rewards/rejected": -1.540055513381958, - "semantic_entropy": 0.7934702038764954, + "logits/chosen": -0.090664342045784, + "logits/rejected": 0.026456937193870544, + "logps/chosen": -1.264943242073059, + "logps/rejected": -1.4818670749664307, + "loss": 1.6211, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.264943242073059, + "rewards/margins": 0.21692386269569397, + "rewards/rejected": -1.4818670749664307, "step": 1375 }, { "epoch": 0.738585047666834, - "grad_norm": 9.607687872946466, + "grad_norm": 9.402893063974487, "learning_rate": 9.363221789699912e-07, - "logits/chosen": -0.09153338521718979, - "logits/rejected": 0.023612957447767258, - "logps/chosen": -1.307871699333191, - "logps/rejected": -1.3920269012451172, - "loss": 2.1003, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.307871699333191, - "rewards/margins": 0.08415510505437851, - "rewards/rejected": -1.3920269012451172, - "semantic_entropy": 0.8024643659591675, + "logits/chosen": -0.10575218498706818, + "logits/rejected": -0.005930374842137098, + "logps/chosen": -1.2926733493804932, + "logps/rejected": -1.3471362590789795, + "loss": 1.6957, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2926733493804932, + "rewards/margins": 0.05446278303861618, + "rewards/rejected": -1.3471362590789795, "step": 1380 }, { "epoch": 0.7412610804482355, - "grad_norm": 7.5314238148632295, + "grad_norm": 7.4118705202897255, "learning_rate": 9.355594962790682e-07, - "logits/chosen": -0.0758143737912178, - "logits/rejected": 0.04460468143224716, - "logps/chosen": -1.2546486854553223, - "logps/rejected": -1.397021770477295, - "loss": 2.0542, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2546486854553223, - "rewards/margins": 0.14237311482429504, - "rewards/rejected": -1.397021770477295, - "semantic_entropy": 0.8238178491592407, + "logits/chosen": -0.1039794534444809, + "logits/rejected": -4.317015554988757e-05, + "logps/chosen": -1.2439075708389282, + "logps/rejected": -1.3433864116668701, + "loss": 1.6463, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2439075708389282, + "rewards/margins": 0.09947877377271652, + "rewards/rejected": -1.3433864116668701, "step": 1385 }, { "epoch": 0.7439371132296371, - "grad_norm": 9.691312986451734, + "grad_norm": 9.343791160069676, "learning_rate": 9.34792587788002e-07, - "logits/chosen": 0.007501109037548304, - "logits/rejected": 0.11551366001367569, - "logps/chosen": -1.3173161745071411, - "logps/rejected": -1.4741413593292236, - "loss": 2.0667, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3173161745071411, - "rewards/margins": 0.15682531893253326, - "rewards/rejected": -1.4741413593292236, - "semantic_entropy": 0.7868748903274536, + "logits/chosen": -0.012524081394076347, + "logits/rejected": 0.08128078281879425, + "logps/chosen": -1.2931407690048218, + "logps/rejected": -1.4175056219100952, + "loss": 1.6637, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2931407690048218, + "rewards/margins": 0.12436497211456299, + "rewards/rejected": -1.4175056219100952, "step": 1390 }, { "epoch": 0.7466131460110387, - "grad_norm": 6.04815777040883, + "grad_norm": 6.2264638302612125, "learning_rate": 9.34021460937342e-07, - "logits/chosen": 0.007738134823739529, - "logits/rejected": 0.09423503279685974, - "logps/chosen": -1.2795543670654297, - "logps/rejected": -1.3807176351547241, - "loss": 2.0639, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2795543670654297, - "rewards/margins": 0.10116332769393921, - "rewards/rejected": -1.3807176351547241, - "semantic_entropy": 0.807846188545227, + "logits/chosen": 0.00481851352378726, + "logits/rejected": 0.08721502125263214, + "logps/chosen": -1.2536060810089111, + "logps/rejected": -1.3298468589782715, + "loss": 1.6526, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2536060810089111, + "rewards/margins": 0.0762406438589096, + "rewards/rejected": -1.3298468589782715, "step": 1395 }, { "epoch": 0.7492891787924402, - "grad_norm": 5.210066011422572, + "grad_norm": 5.1453513483583695, "learning_rate": 9.332461232085646e-07, - "logits/chosen": -0.1578998863697052, - "logits/rejected": -0.03726965934038162, - "logps/chosen": -1.3505934476852417, - "logps/rejected": -1.4699254035949707, - "loss": 2.0896, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3505934476852417, - "rewards/margins": 0.11933205276727676, - "rewards/rejected": -1.4699254035949707, - "semantic_entropy": 0.7852131128311157, + "logits/chosen": -0.19782808423042297, + "logits/rejected": -0.09469230473041534, + "logps/chosen": -1.3321107625961304, + "logps/rejected": -1.3970056772232056, + "loss": 1.6925, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3321107625961304, + "rewards/margins": 0.06489496678113937, + "rewards/rejected": -1.3970056772232056, "step": 1400 }, { "epoch": 0.7519652115738418, - "grad_norm": 8.145037317688463, + "grad_norm": 5.777001331063325, "learning_rate": 9.324665821239998e-07, - "logits/chosen": -0.08859096467494965, - "logits/rejected": 0.07373972237110138, - "logps/chosen": -1.2071774005889893, - "logps/rejected": -1.505427598953247, - "loss": 1.981, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2071774005889893, - "rewards/margins": 0.29825007915496826, - "rewards/rejected": -1.505427598953247, - "semantic_entropy": 0.8246868252754211, + "logits/chosen": -0.05654817074537277, + "logits/rejected": 0.10098656266927719, + "logps/chosen": -1.1896693706512451, + "logps/rejected": -1.40790855884552, + "loss": 1.5696, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1896693706512451, + "rewards/margins": 0.21823914349079132, + "rewards/rejected": -1.40790855884552, "step": 1405 }, { "epoch": 0.7546412443552434, - "grad_norm": 6.162440905857332, + "grad_norm": 5.61242400636659, "learning_rate": 9.316828452467583e-07, - "logits/chosen": -0.12044434249401093, - "logits/rejected": 0.0488150492310524, - "logps/chosen": -1.3291029930114746, - "logps/rejected": -1.5076282024383545, - "loss": 2.0544, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3291029930114746, - "rewards/margins": 0.17852529883384705, - "rewards/rejected": -1.5076282024383545, - "semantic_entropy": 0.7849763631820679, + "logits/chosen": -0.10295641422271729, + "logits/rejected": 0.05629279464483261, + "logps/chosen": -1.2972595691680908, + "logps/rejected": -1.4456944465637207, + "loss": 1.6474, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2972595691680908, + "rewards/margins": 0.14843474328517914, + "rewards/rejected": -1.4456944465637207, "step": 1410 }, { "epoch": 0.7573172771366449, - "grad_norm": 10.836969712453396, + "grad_norm": 10.179841050216837, "learning_rate": 9.30894920180659e-07, - "logits/chosen": -0.03222806006669998, - "logits/rejected": 0.10289813578128815, - "logps/chosen": -1.3662335872650146, - "logps/rejected": -1.3669545650482178, - "loss": 2.1405, + "logits/chosen": -0.007915811613202095, + "logits/rejected": 0.11887457221746445, + "logps/chosen": -1.3415172100067139, + "logps/rejected": -1.3081309795379639, + "loss": 1.7415, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3662335872650146, - "rewards/margins": 0.000720818352419883, - "rewards/rejected": -1.3669545650482178, - "semantic_entropy": 0.7890567779541016, + "rewards/chosen": -1.3415172100067139, + "rewards/margins": -0.033386241644620895, + "rewards/rejected": -1.3081309795379639, "step": 1415 }, { "epoch": 0.7599933099180465, - "grad_norm": 6.394489482016526, + "grad_norm": 5.916410428318361, "learning_rate": 9.301028145701543e-07, - "logits/chosen": 0.004710095934569836, - "logits/rejected": 0.127283975481987, - "logps/chosen": -1.2482645511627197, - "logps/rejected": -1.5582225322723389, - "loss": 2.01, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.2482645511627197, - "rewards/margins": 0.3099580407142639, - "rewards/rejected": -1.5582225322723389, - "semantic_entropy": 0.8073336482048035, + "logits/chosen": -0.02033943310379982, + "logits/rejected": 0.08537691831588745, + "logps/chosen": -1.2250733375549316, + "logps/rejected": -1.4753800630569458, + "loss": 1.5933, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2250733375549316, + "rewards/margins": 0.2503066658973694, + "rewards/rejected": -1.4753800630569458, "step": 1420 }, { "epoch": 0.7626693426994481, - "grad_norm": 6.29413562581532, + "grad_norm": 6.290863299930604, "learning_rate": 9.293065361002563e-07, - "logits/chosen": 0.010683017782866955, - "logits/rejected": 0.08055853843688965, - "logps/chosen": -1.2729721069335938, - "logps/rejected": -1.573007345199585, - "loss": 2.0105, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2729721069335938, - "rewards/margins": 0.3000350594520569, - "rewards/rejected": -1.573007345199585, - "semantic_entropy": 0.7954605221748352, + "logits/chosen": -0.011968791484832764, + "logits/rejected": 0.040803950279951096, + "logps/chosen": -1.2538902759552002, + "logps/rejected": -1.514215111732483, + "loss": 1.6073, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2538902759552002, + "rewards/margins": 0.26032498478889465, + "rewards/rejected": -1.514215111732483, "step": 1425 }, { "epoch": 0.7653453754808497, - "grad_norm": 7.1164925104779, + "grad_norm": 6.750555821696019, "learning_rate": 9.285060924964622e-07, - "logits/chosen": -0.10673652589321136, - "logits/rejected": 0.014233958907425404, - "logps/chosen": -1.3369324207305908, - "logps/rejected": -1.4339196681976318, - "loss": 2.0844, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3369324207305908, - "rewards/margins": 0.09698706865310669, - "rewards/rejected": -1.4339196681976318, - "semantic_entropy": 0.7893370389938354, + "logits/chosen": -0.10954097658395767, + "logits/rejected": 0.0022288993932306767, + "logps/chosen": -1.317549467086792, + "logps/rejected": -1.3856089115142822, + "loss": 1.6836, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.317549467086792, + "rewards/margins": 0.06805931776762009, + "rewards/rejected": -1.3856089115142822, "step": 1430 }, { "epoch": 0.7680214082622512, - "grad_norm": 7.812013678094771, + "grad_norm": 7.482354045279545, "learning_rate": 9.277014915246792e-07, - "logits/chosen": 0.032980311661958694, - "logits/rejected": 0.08035765588283539, - "logps/chosen": -1.2653449773788452, - "logps/rejected": -1.5054515600204468, - "loss": 2.0513, + "logits/chosen": 0.003342367708683014, + "logits/rejected": 0.04428841918706894, + "logps/chosen": -1.254521131515503, + "logps/rejected": -1.4428226947784424, + "loss": 1.6476, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2653449773788452, - "rewards/margins": 0.2401064932346344, - "rewards/rejected": -1.5054515600204468, - "semantic_entropy": 0.8130581974983215, + "rewards/chosen": -1.254521131515503, + "rewards/margins": 0.18830154836177826, + "rewards/rejected": -1.4428226947784424, "step": 1435 }, { "epoch": 0.7706974410436528, - "grad_norm": 6.118702707949794, + "grad_norm": 5.8446392499081625, "learning_rate": 9.268927409911498e-07, - "logits/chosen": -0.07677887380123138, - "logits/rejected": 0.010664084926247597, - "logps/chosen": -1.324356198310852, - "logps/rejected": -1.4014503955841064, - "loss": 2.0742, + "logits/chosen": -0.08462724834680557, + "logits/rejected": -0.008226044476032257, + "logps/chosen": -1.3154375553131104, + "logps/rejected": -1.3610209226608276, + "loss": 1.6737, "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.324356198310852, - "rewards/margins": 0.07709423452615738, - "rewards/rejected": -1.4014503955841064, - "semantic_entropy": 0.8042744398117065, + "rewards/chosen": -1.3154375553131104, + "rewards/margins": 0.04558344930410385, + "rewards/rejected": -1.3610209226608276, "step": 1440 }, { "epoch": 0.7733734738250544, - "grad_norm": 8.801912177647397, + "grad_norm": 8.253023221142572, "learning_rate": 9.260798487423749e-07, - "logits/chosen": -0.10331074148416519, - "logits/rejected": 0.0976705476641655, - "logps/chosen": -1.3601049184799194, - "logps/rejected": -1.4674112796783447, - "loss": 2.1, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3601049184799194, - "rewards/margins": 0.10730626434087753, - "rewards/rejected": -1.4674112796783447, - "semantic_entropy": 0.7811123728752136, + "logits/chosen": -0.14412081241607666, + "logits/rejected": 0.03400403633713722, + "logps/chosen": -1.3450403213500977, + "logps/rejected": -1.4334951639175415, + "loss": 1.7044, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3450403213500977, + "rewards/margins": 0.08845490962266922, + "rewards/rejected": -1.4334951639175415, "step": 1445 }, { "epoch": 0.7760495066064559, - "grad_norm": 16.238716470237314, + "grad_norm": 16.894600835872414, "learning_rate": 9.252628226650389e-07, - "logits/chosen": 0.009607335552573204, - "logits/rejected": 0.09555571526288986, - "logps/chosen": -1.2560529708862305, - "logps/rejected": -1.3869524002075195, - "loss": 2.0566, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2560529708862305, - "rewards/margins": 0.13089947402477264, - "rewards/rejected": -1.3869524002075195, - "semantic_entropy": 0.8172558546066284, + "logits/chosen": -0.0017465263372287154, + "logits/rejected": 0.07787595689296722, + "logps/chosen": -1.2358486652374268, + "logps/rejected": -1.349094271659851, + "loss": 1.6353, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2358486652374268, + "rewards/margins": 0.11324580013751984, + "rewards/rejected": -1.349094271659851, "step": 1450 }, { "epoch": 0.7787255393878575, - "grad_norm": 9.370488834506501, + "grad_norm": 9.338591447617187, "learning_rate": 9.244416706859321e-07, - "logits/chosen": -0.02865959145128727, - "logits/rejected": 0.12177982181310654, - "logps/chosen": -1.2793071269989014, - "logps/rejected": -1.4887911081314087, - "loss": 2.0322, + "logits/chosen": -0.038293130695819855, + "logits/rejected": 0.09338172525167465, + "logps/chosen": -1.2623484134674072, + "logps/rejected": -1.4510595798492432, + "loss": 1.6238, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2793071269989014, - "rewards/margins": 0.2094838172197342, - "rewards/rejected": -1.4887911081314087, - "semantic_entropy": 0.7951102256774902, + "rewards/chosen": -1.2623484134674072, + "rewards/margins": 0.1887110322713852, + "rewards/rejected": -1.4510595798492432, "step": 1455 }, { "epoch": 0.7814015721692591, - "grad_norm": 6.519181163142868, + "grad_norm": 5.6439465725538955, "learning_rate": 9.23616400771875e-07, - "logits/chosen": -0.017458543181419373, - "logits/rejected": 0.12942534685134888, - "logps/chosen": -1.2433431148529053, - "logps/rejected": -1.4446688890457153, - "loss": 2.0067, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2433431148529053, - "rewards/margins": 0.20132584869861603, - "rewards/rejected": -1.4446688890457153, - "semantic_entropy": 0.816464900970459, + "logits/chosen": -0.042051661759614944, + "logits/rejected": 0.08378352224826813, + "logps/chosen": -1.2224422693252563, + "logps/rejected": -1.3968486785888672, + "loss": 1.5918, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2224422693252563, + "rewards/margins": 0.17440657317638397, + "rewards/rejected": -1.3968486785888672, "step": 1460 }, { "epoch": 0.7840776049506607, - "grad_norm": 6.515758245208593, + "grad_norm": 6.489533578561058, "learning_rate": 9.227870209296395e-07, - "logits/chosen": 0.008127368986606598, - "logits/rejected": 0.10638369619846344, - "logps/chosen": -1.356247067451477, - "logps/rejected": -1.5012516975402832, - "loss": 2.1029, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.356247067451477, - "rewards/margins": 0.14500463008880615, - "rewards/rejected": -1.5012516975402832, - "semantic_entropy": 0.7735931277275085, + "logits/chosen": -0.021588444709777832, + "logits/rejected": 0.06242724508047104, + "logps/chosen": -1.3425042629241943, + "logps/rejected": -1.4560575485229492, + "loss": 1.7156, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3425042629241943, + "rewards/margins": 0.113553486764431, + "rewards/rejected": -1.4560575485229492, "step": 1465 }, { "epoch": 0.7867536377320622, - "grad_norm": 8.64957671159049, + "grad_norm": 8.645614434625807, "learning_rate": 9.219535392058728e-07, - "logits/chosen": -0.08659469336271286, - "logits/rejected": -0.0552179217338562, - "logps/chosen": -1.3174679279327393, - "logps/rejected": -1.4319454431533813, - "loss": 2.0845, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3174679279327393, - "rewards/margins": 0.11447747051715851, - "rewards/rejected": -1.4319454431533813, - "semantic_entropy": 0.7948979735374451, + "logits/chosen": -0.10292228311300278, + "logits/rejected": -0.07323513925075531, + "logps/chosen": -1.3078405857086182, + "logps/rejected": -1.382354497909546, + "loss": 1.6892, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3078405857086182, + "rewards/margins": 0.07451382279396057, + "rewards/rejected": -1.382354497909546, "step": 1470 }, { "epoch": 0.7894296705134638, - "grad_norm": 5.8600076062762465, + "grad_norm": 5.772767333891252, "learning_rate": 9.211159636870181e-07, - "logits/chosen": -0.08313588052988052, - "logits/rejected": 0.07544960081577301, - "logps/chosen": -1.3022918701171875, - "logps/rejected": -1.4836384057998657, - "loss": 2.0271, + "logits/chosen": -0.09033913165330887, + "logits/rejected": 0.052205026149749756, + "logps/chosen": -1.2833424806594849, + "logps/rejected": -1.4271941184997559, + "loss": 1.6229, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3022918701171875, - "rewards/margins": 0.18134646117687225, - "rewards/rejected": -1.4836384057998657, - "semantic_entropy": 0.7985160946846008, + "rewards/chosen": -1.2833424806594849, + "rewards/margins": 0.14385172724723816, + "rewards/rejected": -1.4271941184997559, "step": 1475 }, { "epoch": 0.7921057032948654, - "grad_norm": 7.922496666081785, + "grad_norm": 7.526256119076799, "learning_rate": 9.202743024992367e-07, - "logits/chosen": -0.013608187437057495, - "logits/rejected": 0.08938765525817871, - "logps/chosen": -1.2842402458190918, - "logps/rejected": -1.5133750438690186, - "loss": 2.0415, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2842402458190918, - "rewards/margins": 0.22913464903831482, - "rewards/rejected": -1.5133750438690186, - "semantic_entropy": 0.7954760789871216, + "logits/chosen": -0.027691710740327835, + "logits/rejected": 0.06575385481119156, + "logps/chosen": -1.2705562114715576, + "logps/rejected": -1.4572803974151611, + "loss": 1.642, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2705562114715576, + "rewards/margins": 0.18672427535057068, + "rewards/rejected": -1.4572803974151611, "step": 1480 }, { "epoch": 0.7947817360762669, - "grad_norm": 8.575669716734673, + "grad_norm": 7.680511985762157, "learning_rate": 9.194285638083293e-07, - "logits/chosen": 0.0062799095176160336, - "logits/rejected": 0.15394124388694763, - "logps/chosen": -1.3310755491256714, - "logps/rejected": -1.5266507863998413, - "loss": 2.0564, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3310755491256714, - "rewards/margins": 0.19557540118694305, - "rewards/rejected": -1.5266507863998413, - "semantic_entropy": 0.783862292766571, + "logits/chosen": -0.007962608709931374, + "logits/rejected": 0.12913981080055237, + "logps/chosen": -1.3152211904525757, + "logps/rejected": -1.4535510540008545, + "loss": 1.6639, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3152211904525757, + "rewards/margins": 0.13832971453666687, + "rewards/rejected": -1.4535510540008545, "step": 1485 }, { "epoch": 0.7974577688576685, - "grad_norm": 9.957844967407029, + "grad_norm": 10.528838663469754, "learning_rate": 9.185787558196562e-07, - "logits/chosen": -0.07122926414012909, - "logits/rejected": 0.032294441014528275, - "logps/chosen": -1.312718391418457, - "logps/rejected": -1.3787847757339478, - "loss": 2.0974, + "logits/chosen": -0.06781430542469025, + "logits/rejected": 0.02455865405499935, + "logps/chosen": -1.2909950017929077, + "logps/rejected": -1.3279635906219482, + "loss": 1.6853, "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.312718391418457, - "rewards/margins": 0.0660664290189743, - "rewards/rejected": -1.3787847757339478, - "semantic_entropy": 0.8111189007759094, + "rewards/chosen": -1.2909950017929077, + "rewards/margins": 0.036968667060136795, + "rewards/rejected": -1.3279635906219482, "step": 1490 }, { "epoch": 0.8001338016390701, - "grad_norm": 9.791559868001285, + "grad_norm": 9.310501631330935, "learning_rate": 9.177248867780583e-07, - "logits/chosen": -0.05186697095632553, - "logits/rejected": 0.04654816910624504, - "logps/chosen": -1.4116029739379883, - "logps/rejected": -1.4615944623947144, - "loss": 2.1464, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.4116029739379883, - "rewards/margins": 0.04999139904975891, - "rewards/rejected": -1.4615944623947144, - "semantic_entropy": 0.7640504837036133, + "logits/chosen": -0.05040599778294563, + "logits/rejected": 0.04599171131849289, + "logps/chosen": -1.3820037841796875, + "logps/rejected": -1.4067412614822388, + "loss": 1.7502, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3820037841796875, + "rewards/margins": 0.024737408384680748, + "rewards/rejected": -1.4067412614822388, "step": 1495 }, { "epoch": 0.8028098344204716, - "grad_norm": 9.880609014831158, + "grad_norm": 9.32817945324196, "learning_rate": 9.168669649677769e-07, - "logits/chosen": -0.0890929326415062, - "logits/rejected": 0.006459700874984264, - "logps/chosen": -1.2926914691925049, - "logps/rejected": -1.4606952667236328, - "loss": 2.0543, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2926914691925049, - "rewards/margins": 0.16800378262996674, - "rewards/rejected": -1.4606952667236328, - "semantic_entropy": 0.7951599359512329, + "logits/chosen": -0.07473565638065338, + "logits/rejected": 0.014921599999070168, + "logps/chosen": -1.2716801166534424, + "logps/rejected": -1.4177004098892212, + "loss": 1.6462, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2716801166534424, + "rewards/margins": 0.14602023363113403, + "rewards/rejected": -1.4177004098892212, "step": 1500 }, { "epoch": 0.8054858672018732, - "grad_norm": 9.372810455385036, + "grad_norm": 8.872541653326993, "learning_rate": 9.16004998712373e-07, - "logits/chosen": 0.004992163274437189, - "logits/rejected": 0.06641830503940582, - "logps/chosen": -1.2445884943008423, - "logps/rejected": -1.5159660577774048, - "loss": 1.9905, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2445884943008423, - "rewards/margins": 0.2713775336742401, - "rewards/rejected": -1.5159660577774048, - "semantic_entropy": 0.8220226168632507, + "logits/chosen": 0.016116593033075333, + "logits/rejected": 0.06647348403930664, + "logps/chosen": -1.2263656854629517, + "logps/rejected": -1.446277379989624, + "loss": 1.5796, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2263656854629517, + "rewards/margins": 0.21991190314292908, + "rewards/rejected": -1.446277379989624, "step": 1505 }, { "epoch": 0.8081618999832748, - "grad_norm": 6.017423772699667, + "grad_norm": 6.140760178845314, "learning_rate": 9.151389963746472e-07, - "logits/chosen": -0.09114827960729599, - "logits/rejected": 0.16105875372886658, - "logps/chosen": -1.3403730392456055, - "logps/rejected": -1.5040563344955444, - "loss": 2.0586, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3403730392456055, - "rewards/margins": 0.16368329524993896, - "rewards/rejected": -1.5040563344955444, - "semantic_entropy": 0.770714282989502, + "logits/chosen": -0.07571863383054733, + "logits/rejected": 0.15782687067985535, + "logps/chosen": -1.3236324787139893, + "logps/rejected": -1.4447602033615112, + "loss": 1.6687, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3236324787139893, + "rewards/margins": 0.12112772464752197, + "rewards/rejected": -1.4447602033615112, "step": 1510 }, { "epoch": 0.8108379327646764, - "grad_norm": 6.37267342623801, + "grad_norm": 6.274424281796091, "learning_rate": 9.142689663565577e-07, - "logits/chosen": -0.005736993160098791, - "logits/rejected": 0.06145425885915756, - "logps/chosen": -1.2793912887573242, - "logps/rejected": -1.4553543329238892, - "loss": 2.0492, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.2793912887573242, - "rewards/margins": 0.17596301436424255, - "rewards/rejected": -1.4553543329238892, - "semantic_entropy": 0.8051084280014038, + "logits/chosen": -0.0122830243781209, + "logits/rejected": 0.04894590005278587, + "logps/chosen": -1.2596652507781982, + "logps/rejected": -1.3989272117614746, + "loss": 1.6466, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2596652507781982, + "rewards/margins": 0.13926205039024353, + "rewards/rejected": -1.3989272117614746, "step": 1515 }, { "epoch": 0.8135139655460779, - "grad_norm": 9.942639013117367, + "grad_norm": 8.012759631154838, "learning_rate": 9.133949170991397e-07, - "logits/chosen": -0.007551294751465321, - "logits/rejected": 0.0692036896944046, - "logps/chosen": -1.317042589187622, - "logps/rejected": -1.4655804634094238, - "loss": 2.0592, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.317042589187622, - "rewards/margins": 0.14853790402412415, - "rewards/rejected": -1.4655804634094238, - "semantic_entropy": 0.7813259959220886, + "logits/chosen": -0.020154908299446106, + "logits/rejected": 0.04682103544473648, + "logps/chosen": -1.2988132238388062, + "logps/rejected": -1.4243242740631104, + "loss": 1.6557, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2988132238388062, + "rewards/margins": 0.12551096081733704, + "rewards/rejected": -1.4243242740631104, "step": 1520 }, { "epoch": 0.8161899983274795, - "grad_norm": 7.054321515604811, + "grad_norm": 7.330468460048957, "learning_rate": 9.125168570824231e-07, - "logits/chosen": -0.038256287574768066, - "logits/rejected": 0.13500620424747467, - "logps/chosen": -1.3114440441131592, - "logps/rejected": -1.4267799854278564, - "loss": 2.1041, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3114440441131592, - "rewards/margins": 0.11533576250076294, - "rewards/rejected": -1.4267799854278564, - "semantic_entropy": 0.804587185382843, + "logits/chosen": -0.044300626963377, + "logits/rejected": 0.10973195731639862, + "logps/chosen": -1.2899428606033325, + "logps/rejected": -1.3735395669937134, + "loss": 1.689, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2899428606033325, + "rewards/margins": 0.08359669148921967, + "rewards/rejected": -1.3735395669937134, "step": 1525 }, { "epoch": 0.8188660311088811, - "grad_norm": 8.248911243528406, + "grad_norm": 9.830201816802699, "learning_rate": 9.116347948253496e-07, - "logits/chosen": -0.03580842912197113, - "logits/rejected": 0.06639468669891357, - "logps/chosen": -1.3471962213516235, - "logps/rejected": -1.468531608581543, - "loss": 2.0808, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3471962213516235, - "rewards/margins": 0.12133540958166122, - "rewards/rejected": -1.468531608581543, - "semantic_entropy": 0.7958540916442871, + "logits/chosen": -0.06985237449407578, + "logits/rejected": 0.011676972731947899, + "logps/chosen": -1.3224313259124756, + "logps/rejected": -1.4183366298675537, + "loss": 1.671, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3224313259124756, + "rewards/margins": 0.09590514004230499, + "rewards/rejected": -1.4183366298675537, "step": 1530 }, { "epoch": 0.8215420638902826, - "grad_norm": 9.11613499956825, + "grad_norm": 9.685933820455805, "learning_rate": 9.107487388856916e-07, - "logits/chosen": -0.07226817309856415, - "logits/rejected": 0.0961885154247284, - "logps/chosen": -1.2631547451019287, - "logps/rejected": -1.463041067123413, - "loss": 2.0131, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2631547451019287, - "rewards/margins": 0.19988617300987244, - "rewards/rejected": -1.463041067123413, - "semantic_entropy": 0.7994940876960754, + "logits/chosen": -0.08251698315143585, + "logits/rejected": 0.06552572548389435, + "logps/chosen": -1.245185136795044, + "logps/rejected": -1.4116127490997314, + "loss": 1.6071, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.245185136795044, + "rewards/margins": 0.1664276123046875, + "rewards/rejected": -1.4116127490997314, "step": 1535 }, { "epoch": 0.8242180966716842, - "grad_norm": 8.024790708944156, + "grad_norm": 8.791302067711484, "learning_rate": 9.098586978599673e-07, - "logits/chosen": 0.0032311968971043825, - "logits/rejected": 0.15999026596546173, - "logps/chosen": -1.2885329723358154, - "logps/rejected": -1.5601550340652466, - "loss": 2.0041, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2885329723358154, - "rewards/margins": 0.2716220021247864, - "rewards/rejected": -1.5601550340652466, - "semantic_entropy": 0.7936614751815796, + "logits/chosen": -0.032307274639606476, + "logits/rejected": 0.1064511314034462, + "logps/chosen": -1.273343801498413, + "logps/rejected": -1.4894983768463135, + "loss": 1.6085, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.273343801498413, + "rewards/margins": 0.21615469455718994, + "rewards/rejected": -1.4894983768463135, "step": 1540 }, { "epoch": 0.8268941294530858, - "grad_norm": 6.98164321849856, + "grad_norm": 6.908997857266348, "learning_rate": 9.089646803833588e-07, - "logits/chosen": 0.008408062160015106, - "logits/rejected": 0.16597135365009308, - "logps/chosen": -1.2991634607315063, - "logps/rejected": -1.4175589084625244, - "loss": 2.0821, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2991634607315063, - "rewards/margins": 0.11839548498392105, - "rewards/rejected": -1.4175589084625244, - "semantic_entropy": 0.8139492273330688, + "logits/chosen": -0.013830190524458885, + "logits/rejected": 0.12852010130882263, + "logps/chosen": -1.2819262742996216, + "logps/rejected": -1.3675658702850342, + "loss": 1.6723, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2819262742996216, + "rewards/margins": 0.08563955128192902, + "rewards/rejected": -1.3675658702850342, "step": 1545 }, { "epoch": 0.8295701622344873, - "grad_norm": 7.811228517744147, + "grad_norm": 8.030389353007761, "learning_rate": 9.080666951296276e-07, - "logits/chosen": -0.13311918079853058, - "logits/rejected": 0.13327452540397644, - "logps/chosen": -1.3553602695465088, - "logps/rejected": -1.5140632390975952, - "loss": 2.0947, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3553602695465088, - "rewards/margins": 0.1587028056383133, - "rewards/rejected": -1.5140632390975952, - "semantic_entropy": 0.7813464403152466, + "logits/chosen": -0.14801613986492157, + "logits/rejected": 0.08842134475708008, + "logps/chosen": -1.3376647233963013, + "logps/rejected": -1.4502404928207397, + "loss": 1.7009, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3376647233963013, + "rewards/margins": 0.11257576942443848, + "rewards/rejected": -1.4502404928207397, "step": 1550 }, { "epoch": 0.8322461950158889, - "grad_norm": 4.678684985118599, + "grad_norm": 4.847521377767916, "learning_rate": 9.071647508110305e-07, - "logits/chosen": -0.09198766946792603, - "logits/rejected": 0.15760457515716553, - "logps/chosen": -1.387629508972168, - "logps/rejected": -1.5720300674438477, - "loss": 2.1029, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.387629508972168, - "rewards/margins": 0.18440048396587372, - "rewards/rejected": -1.5720300674438477, - "semantic_entropy": 0.7744709849357605, + "logits/chosen": -0.11283471435308456, + "logits/rejected": 0.10331887006759644, + "logps/chosen": -1.3693912029266357, + "logps/rejected": -1.5027391910552979, + "loss": 1.7083, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3693912029266357, + "rewards/margins": 0.13334789872169495, + "rewards/rejected": -1.5027391910552979, "step": 1555 }, { "epoch": 0.8349222277972905, - "grad_norm": 7.168853445949985, + "grad_norm": 7.5086666418473715, "learning_rate": 9.062588561782354e-07, - "logits/chosen": 0.026187364012002945, - "logits/rejected": 0.09108327329158783, - "logps/chosen": -1.3502163887023926, - "logps/rejected": -1.5217186212539673, - "loss": 2.0609, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3502163887023926, - "rewards/margins": 0.17150214314460754, - "rewards/rejected": -1.5217186212539673, - "semantic_entropy": 0.7796697616577148, + "logits/chosen": -0.018067115917801857, + "logits/rejected": 0.037450969219207764, + "logps/chosen": -1.32191002368927, + "logps/rejected": -1.46347975730896, + "loss": 1.66, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.32191002368927, + "rewards/margins": 0.14156992733478546, + "rewards/rejected": -1.46347975730896, "step": 1560 }, { "epoch": 0.8375982605786921, - "grad_norm": 6.101638528531929, + "grad_norm": 6.464434757263405, "learning_rate": 9.053490200202358e-07, - "logits/chosen": 0.002669230103492737, - "logits/rejected": 0.09444611519575119, - "logps/chosen": -1.3471533060073853, - "logps/rejected": -1.483074426651001, - "loss": 2.0915, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3471533060073853, - "rewards/margins": 0.13592123985290527, - "rewards/rejected": -1.483074426651001, - "semantic_entropy": 0.7881175875663757, + "logits/chosen": -0.01471424289047718, + "logits/rejected": 0.06980814039707184, + "logps/chosen": -1.3211361169815063, + "logps/rejected": -1.4248106479644775, + "loss": 1.6868, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3211361169815063, + "rewards/margins": 0.10367457568645477, + "rewards/rejected": -1.4248106479644775, "step": 1565 }, { "epoch": 0.8402742933600936, - "grad_norm": 9.652107285256822, + "grad_norm": 7.922400117981282, "learning_rate": 9.044352511642661e-07, - "logits/chosen": 0.0475144125521183, - "logits/rejected": 0.05616613104939461, - "logps/chosen": -1.246050477027893, - "logps/rejected": -1.417238712310791, - "loss": 2.0355, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.246050477027893, - "rewards/margins": 0.17118819057941437, - "rewards/rejected": -1.417238712310791, - "semantic_entropy": 0.8253181576728821, + "logits/chosen": 0.017235392704606056, + "logits/rejected": 0.025248417630791664, + "logps/chosen": -1.2190972566604614, + "logps/rejected": -1.36015784740448, + "loss": 1.6115, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2190972566604614, + "rewards/margins": 0.1410606950521469, + "rewards/rejected": -1.36015784740448, "step": 1570 }, { "epoch": 0.8429503261414952, - "grad_norm": 8.0454186757764, + "grad_norm": 8.04143104023758, "learning_rate": 9.03517558475716e-07, - "logits/chosen": 0.008113816380500793, - "logits/rejected": 0.11139838397502899, - "logps/chosen": -1.3103927373886108, - "logps/rejected": -1.4039795398712158, - "loss": 2.0769, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3103927373886108, - "rewards/margins": 0.09358695894479752, - "rewards/rejected": -1.4039795398712158, - "semantic_entropy": 0.8116000294685364, + "logits/chosen": -0.03161366656422615, + "logits/rejected": 0.05319889262318611, + "logps/chosen": -1.2939759492874146, + "logps/rejected": -1.35179603099823, + "loss": 1.6676, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2939759492874146, + "rewards/margins": 0.05782013386487961, + "rewards/rejected": -1.35179603099823, "step": 1575 }, { "epoch": 0.8456263589228968, - "grad_norm": 6.658449200158449, + "grad_norm": 6.655102165682414, "learning_rate": 9.025959508580436e-07, - "logits/chosen": 0.0631398856639862, - "logits/rejected": 0.29851242899894714, - "logps/chosen": -1.3078067302703857, - "logps/rejected": -1.4754230976104736, - "loss": 2.0336, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3078067302703857, - "rewards/margins": 0.16761623322963715, - "rewards/rejected": -1.4754230976104736, - "semantic_entropy": 0.8020066022872925, + "logits/chosen": 0.026246452704072, + "logits/rejected": 0.23437242209911346, + "logps/chosen": -1.2864961624145508, + "logps/rejected": -1.4211158752441406, + "loss": 1.6252, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2864961624145508, + "rewards/margins": 0.13461963832378387, + "rewards/rejected": -1.4211158752441406, "step": 1580 }, { "epoch": 0.8483023917042983, - "grad_norm": 6.859574582510765, + "grad_norm": 7.037479275050492, "learning_rate": 9.016704372526905e-07, - "logits/chosen": 0.017910093069076538, - "logits/rejected": 0.1749291867017746, - "logps/chosen": -1.2329002618789673, - "logps/rejected": -1.524070143699646, - "loss": 1.9995, + "logits/chosen": -0.013667059130966663, + "logits/rejected": 0.12505964934825897, + "logps/chosen": -1.2189918756484985, + "logps/rejected": -1.4661953449249268, + "loss": 1.5914, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2329002618789673, - "rewards/margins": 0.29117000102996826, - "rewards/rejected": -1.524070143699646, - "semantic_entropy": 0.8060545921325684, + "rewards/chosen": -1.2189918756484985, + "rewards/margins": 0.24720346927642822, + "rewards/rejected": -1.4661953449249268, "step": 1585 }, { "epoch": 0.8509784244856999, - "grad_norm": 7.229156363043744, + "grad_norm": 7.432156146892551, "learning_rate": 9.007410266389934e-07, - "logits/chosen": -0.04009784385561943, - "logits/rejected": 0.05179458111524582, - "logps/chosen": -1.2556885480880737, - "logps/rejected": -1.3836594820022583, - "loss": 2.0359, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2556885480880737, - "rewards/margins": 0.1279708296060562, - "rewards/rejected": -1.3836594820022583, - "semantic_entropy": 0.8144699931144714, + "logits/chosen": -0.07939986884593964, + "logits/rejected": 0.00025191306485794485, + "logps/chosen": -1.2421469688415527, + "logps/rejected": -1.3436261415481567, + "loss": 1.6291, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2421469688415527, + "rewards/margins": 0.10147915035486221, + "rewards/rejected": -1.3436261415481567, "step": 1590 }, { "epoch": 0.8536544572671015, - "grad_norm": 9.068092685958812, + "grad_norm": 9.242470163562162, "learning_rate": 8.998077280340981e-07, - "logits/chosen": 0.03531334921717644, - "logits/rejected": 0.11247481405735016, - "logps/chosen": -1.3818706274032593, - "logps/rejected": -1.414201021194458, - "loss": 2.1353, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.3818706274032593, - "rewards/margins": 0.0323302187025547, - "rewards/rejected": -1.414201021194458, - "semantic_entropy": 0.7828024625778198, + "logits/chosen": 0.012076860293745995, + "logits/rejected": 0.08173094689846039, + "logps/chosen": -1.3606398105621338, + "logps/rejected": -1.3600795269012451, + "loss": 1.7323, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.3606398105621338, + "rewards/margins": -0.0005601089214906096, + "rewards/rejected": -1.3600795269012451, "step": 1595 }, { "epoch": 0.8563304900485031, - "grad_norm": 6.487284359042391, + "grad_norm": 6.88825620139616, "learning_rate": 8.988705504928722e-07, - "logits/chosen": -0.07598348706960678, - "logits/rejected": 0.10973948240280151, - "logps/chosen": -1.3424265384674072, - "logps/rejected": -1.5661863088607788, - "loss": 2.0467, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3424265384674072, - "rewards/margins": 0.223759725689888, - "rewards/rejected": -1.5661863088607788, - "semantic_entropy": 0.7749485969543457, + "logits/chosen": -0.1084945797920227, + "logits/rejected": 0.05396001413464546, + "logps/chosen": -1.326180338859558, + "logps/rejected": -1.500441551208496, + "loss": 1.6566, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.326180338859558, + "rewards/margins": 0.17426112294197083, + "rewards/rejected": -1.500441551208496, "step": 1600 }, { "epoch": 0.8563304900485031, - "eval_logits/chosen": 0.2518528699874878, - "eval_logits/rejected": 0.33622872829437256, - "eval_logps/chosen": -1.3256367444992065, - "eval_logps/rejected": -1.5169671773910522, - "eval_loss": 2.056356430053711, - "eval_rewards/accuracies": 0.5764095187187195, - "eval_rewards/chosen": -1.3256367444992065, - "eval_rewards/margins": 0.19133047759532928, - "eval_rewards/rejected": -1.5169671773910522, - "eval_runtime": 34.6083, - "eval_samples_per_second": 38.864, - "eval_semantic_entropy": 0.7919985055923462, - "eval_steps_per_second": 9.738, + "eval_logits/chosen": 0.27639397978782654, + "eval_logits/rejected": 0.35783126950263977, + "eval_logps/chosen": -1.3096314668655396, + "eval_logps/rejected": -1.4584813117980957, + "eval_loss": 1.659972906112671, + "eval_rewards/accuracies": 0.5593471527099609, + "eval_rewards/chosen": -1.3096314668655396, + "eval_rewards/margins": 0.14884990453720093, + "eval_rewards/rejected": -1.4584813117980957, + "eval_runtime": 40.3391, + "eval_samples_per_second": 33.342, + "eval_steps_per_second": 8.354, "step": 1600 }, { "epoch": 0.8590065228299046, - "grad_norm": 5.214394261380477, + "grad_norm": 5.1888977883125165, "learning_rate": 8.979295031078157e-07, - "logits/chosen": -0.07590194046497345, - "logits/rejected": 0.14043793082237244, - "logps/chosen": -1.2876328229904175, - "logps/rejected": -1.5351704359054565, - "loss": 2.012, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2876328229904175, - "rewards/margins": 0.24753758311271667, - "rewards/rejected": -1.5351704359054565, - "semantic_entropy": 0.8124788403511047, + "logits/chosen": -0.09929139912128448, + "logits/rejected": 0.09505654871463776, + "logps/chosen": -1.2664982080459595, + "logps/rejected": -1.482974886894226, + "loss": 1.5998, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2664982080459595, + "rewards/margins": 0.21647676825523376, + "rewards/rejected": -1.482974886894226, "step": 1605 }, { "epoch": 0.8616825556113062, - "grad_norm": 5.950197629128667, + "grad_norm": 5.823354557557946, "learning_rate": 8.969845950089751e-07, - "logits/chosen": -0.09720132499933243, - "logits/rejected": 0.07550375908613205, - "logps/chosen": -1.269771933555603, - "logps/rejected": -1.4900038242340088, - "loss": 2.0197, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.269771933555603, - "rewards/margins": 0.22023192048072815, - "rewards/rejected": -1.4900038242340088, - "semantic_entropy": 0.8048882484436035, + "logits/chosen": -0.10225830227136612, + "logits/rejected": 0.05890607833862305, + "logps/chosen": -1.2548028230667114, + "logps/rejected": -1.4133626222610474, + "loss": 1.6205, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2548028230667114, + "rewards/margins": 0.158559650182724, + "rewards/rejected": -1.4133626222610474, "step": 1610 }, { "epoch": 0.8643585883927078, - "grad_norm": 7.206569071318787, + "grad_norm": 7.225956675808371, "learning_rate": 8.960358353638526e-07, - "logits/chosen": -0.05517633631825447, - "logits/rejected": 0.043794918805360794, - "logps/chosen": -1.3460371494293213, - "logps/rejected": -1.574812889099121, - "loss": 2.0663, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3460371494293213, - "rewards/margins": 0.2287757396697998, - "rewards/rejected": -1.574812889099121, - "semantic_entropy": 0.7761720418930054, + "logits/chosen": -0.06003696471452713, + "logits/rejected": 0.027661174535751343, + "logps/chosen": -1.3222529888153076, + "logps/rejected": -1.5003571510314941, + "loss": 1.6707, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3222529888153076, + "rewards/margins": 0.1781042069196701, + "rewards/rejected": -1.5003571510314941, "step": 1615 }, { "epoch": 0.8670346211741093, - "grad_norm": 6.789900956245166, + "grad_norm": 6.830752926640301, "learning_rate": 8.950832333773184e-07, - "logits/chosen": -0.002611640142276883, - "logits/rejected": 0.12673768401145935, - "logps/chosen": -1.2162271738052368, - "logps/rejected": -1.507699966430664, - "loss": 1.9731, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2162271738052368, - "rewards/margins": 0.2914729714393616, - "rewards/rejected": -1.507699966430664, - "semantic_entropy": 0.8164950609207153, + "logits/chosen": 0.0059167868457734585, + "logits/rejected": 0.12878912687301636, + "logps/chosen": -1.190798044204712, + "logps/rejected": -1.4466768503189087, + "loss": 1.5546, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.190798044204712, + "rewards/margins": 0.25587883591651917, + "rewards/rejected": -1.4466768503189087, "step": 1620 }, { "epoch": 0.869710653955511, - "grad_norm": 7.59597640909701, + "grad_norm": 8.251986204131775, "learning_rate": 8.941267982915213e-07, - "logits/chosen": 0.08257068693637848, - "logits/rejected": 0.1271582841873169, - "logps/chosen": -1.3797576427459717, - "logps/rejected": -1.525801658630371, - "loss": 2.0991, + "logits/chosen": 0.07970758527517319, + "logits/rejected": 0.12433300167322159, + "logps/chosen": -1.3536022901535034, + "logps/rejected": -1.475412130355835, + "loss": 1.7004, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3797576427459717, - "rewards/margins": 0.14604386687278748, - "rewards/rejected": -1.525801658630371, - "semantic_entropy": 0.7759956121444702, + "rewards/chosen": -1.3536022901535034, + "rewards/margins": 0.12180988490581512, + "rewards/rejected": -1.475412130355835, "step": 1625 }, { "epoch": 0.8723866867369126, - "grad_norm": 8.434961396653323, + "grad_norm": 7.521930965978341, "learning_rate": 8.931665393857983e-07, - "logits/chosen": 0.0038153603672981262, - "logits/rejected": 0.15201179683208466, - "logps/chosen": -1.3320643901824951, - "logps/rejected": -1.4746291637420654, - "loss": 2.0713, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3320643901824951, - "rewards/margins": 0.1425647884607315, - "rewards/rejected": -1.4746291637420654, - "semantic_entropy": 0.7873384952545166, + "logits/chosen": 0.007930627092719078, + "logits/rejected": 0.14260335266590118, + "logps/chosen": -1.312232255935669, + "logps/rejected": -1.3909637928009033, + "loss": 1.6768, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.312232255935669, + "rewards/margins": 0.07873149961233139, + "rewards/rejected": -1.3909637928009033, "step": 1630 }, { "epoch": 0.875062719518314, - "grad_norm": 10.044887524931466, + "grad_norm": 10.993310009459227, "learning_rate": 8.922024659765861e-07, - "logits/chosen": -0.0778026208281517, - "logits/rejected": 0.035470522940158844, - "logps/chosen": -1.2441432476043701, - "logps/rejected": -1.4577016830444336, - "loss": 2.0174, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2441432476043701, - "rewards/margins": 0.2135585993528366, - "rewards/rejected": -1.4577016830444336, - "semantic_entropy": 0.8132489919662476, + "logits/chosen": -0.07959005236625671, + "logits/rejected": 0.022915149107575417, + "logps/chosen": -1.2250585556030273, + "logps/rejected": -1.3788336515426636, + "loss": 1.6106, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2250585556030273, + "rewards/margins": 0.15377524495124817, + "rewards/rejected": -1.3788336515426636, "step": 1635 }, { "epoch": 0.8777387522997157, - "grad_norm": 6.639101328261095, + "grad_norm": 6.924128529614549, "learning_rate": 8.912345874173288e-07, - "logits/chosen": -0.0712386816740036, - "logits/rejected": 0.037735819816589355, - "logps/chosen": -1.2489250898361206, - "logps/rejected": -1.4933502674102783, - "loss": 2.0323, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2489250898361206, - "rewards/margins": 0.24442513287067413, - "rewards/rejected": -1.4933502674102783, - "semantic_entropy": 0.8062291145324707, + "logits/chosen": -0.0716363936662674, + "logits/rejected": 0.020317738875746727, + "logps/chosen": -1.2232820987701416, + "logps/rejected": -1.3890739679336548, + "loss": 1.6274, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2232820987701416, + "rewards/margins": 0.1657918244600296, + "rewards/rejected": -1.3890739679336548, "step": 1640 }, { "epoch": 0.8804147850811173, - "grad_norm": 11.912160547547236, + "grad_norm": 10.01123514535389, "learning_rate": 8.902629130983885e-07, - "logits/chosen": 0.0011784828966483474, - "logits/rejected": 0.05601752549409866, - "logps/chosen": -1.2473455667495728, - "logps/rejected": -1.4221245050430298, - "loss": 2.0301, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2473455667495728, - "rewards/margins": 0.17477881908416748, - "rewards/rejected": -1.4221245050430298, - "semantic_entropy": 0.825102686882019, + "logits/chosen": -0.0073282113298773766, + "logits/rejected": 0.03696933388710022, + "logps/chosen": -1.2226999998092651, + "logps/rejected": -1.3663519620895386, + "loss": 1.6106, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2226999998092651, + "rewards/margins": 0.1436520665884018, + "rewards/rejected": -1.3663519620895386, "step": 1645 }, { "epoch": 0.8830908178625189, - "grad_norm": 10.842471269722136, + "grad_norm": 11.142210987143999, "learning_rate": 8.892874524469537e-07, - "logits/chosen": 0.07691788673400879, - "logits/rejected": 0.13914301991462708, - "logps/chosen": -1.270114779472351, - "logps/rejected": -1.4903980493545532, - "loss": 2.0299, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.270114779472351, - "rewards/margins": 0.22028322517871857, - "rewards/rejected": -1.4903980493545532, - "semantic_entropy": 0.7986249327659607, + "logits/chosen": 0.06746591627597809, + "logits/rejected": 0.1274791657924652, + "logps/chosen": -1.2502325773239136, + "logps/rejected": -1.4318780899047852, + "loss": 1.6284, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2502325773239136, + "rewards/margins": 0.18164536356925964, + "rewards/rejected": -1.4318780899047852, "step": 1650 }, { "epoch": 0.8857668506439204, - "grad_norm": 9.732866542987194, + "grad_norm": 10.041962734713945, "learning_rate": 8.883082149269478e-07, - "logits/chosen": -0.026226142421364784, - "logits/rejected": 0.08298458904027939, - "logps/chosen": -1.3272168636322021, - "logps/rejected": -1.469519019126892, - "loss": 2.1097, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3272168636322021, - "rewards/margins": 0.14230214059352875, - "rewards/rejected": -1.469519019126892, - "semantic_entropy": 0.7965654730796814, + "logits/chosen": -0.07328319549560547, + "logits/rejected": 0.01814514584839344, + "logps/chosen": -1.3024415969848633, + "logps/rejected": -1.3887003660202026, + "loss": 1.7004, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3024415969848633, + "rewards/margins": 0.0862586721777916, + "rewards/rejected": -1.3887003660202026, "step": 1655 }, { "epoch": 0.888442883425322, - "grad_norm": 6.200821594341999, + "grad_norm": 6.492097536398011, "learning_rate": 8.873252100389377e-07, - "logits/chosen": 0.053123731166124344, - "logits/rejected": 0.06377711892127991, - "logps/chosen": -1.256958246231079, - "logps/rejected": -1.4503594636917114, - "loss": 2.0351, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.256958246231079, - "rewards/margins": 0.19340117275714874, - "rewards/rejected": -1.4503594636917114, - "semantic_entropy": 0.8116283416748047, + "logits/chosen": 0.004718318581581116, + "logits/rejected": 0.0009179293992929161, + "logps/chosen": -1.2343924045562744, + "logps/rejected": -1.4021685123443604, + "loss": 1.6224, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2343924045562744, + "rewards/margins": 0.16777613759040833, + "rewards/rejected": -1.4021685123443604, "step": 1660 }, { "epoch": 0.8911189162067236, - "grad_norm": 7.294249630027461, + "grad_norm": 6.711455617048511, "learning_rate": 8.863384473200411e-07, - "logits/chosen": 0.02938307449221611, - "logits/rejected": 0.09694571793079376, - "logps/chosen": -1.3356560468673706, - "logps/rejected": -1.4853121042251587, - "loss": 2.0737, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3356560468673706, - "rewards/margins": 0.14965587854385376, - "rewards/rejected": -1.4853121042251587, - "semantic_entropy": 0.7866509556770325, + "logits/chosen": -0.04459734261035919, + "logits/rejected": 0.012989720329642296, + "logps/chosen": -1.3119922876358032, + "logps/rejected": -1.4353927373886108, + "loss": 1.6688, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3119922876358032, + "rewards/margins": 0.12340055406093597, + "rewards/rejected": -1.4353927373886108, "step": 1665 }, { "epoch": 0.8937949489881251, - "grad_norm": 6.133334888255327, + "grad_norm": 6.329821330332764, "learning_rate": 8.853479363438342e-07, - "logits/chosen": 0.06209275871515274, - "logits/rejected": 0.22910866141319275, - "logps/chosen": -1.360039234161377, - "logps/rejected": -1.456312894821167, - "loss": 2.1464, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.360039234161377, - "rewards/margins": 0.09627362340688705, - "rewards/rejected": -1.456312894821167, - "semantic_entropy": 0.7923996448516846, + "logits/chosen": -0.0069259097799658775, + "logits/rejected": 0.13729408383369446, + "logps/chosen": -1.3276126384735107, + "logps/rejected": -1.3940479755401611, + "loss": 1.7322, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.3276126384735107, + "rewards/margins": 0.06643550097942352, + "rewards/rejected": -1.3940479755401611, "step": 1670 }, { "epoch": 0.8964709817695267, - "grad_norm": 5.3571289027092766, + "grad_norm": 5.243545096090104, "learning_rate": 8.843536867202588e-07, - "logits/chosen": 0.058256424963474274, - "logits/rejected": 0.2694912850856781, - "logps/chosen": -1.3328102827072144, - "logps/rejected": -1.606268286705017, - "loss": 2.075, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3328102827072144, - "rewards/margins": 0.27345791459083557, - "rewards/rejected": -1.606268286705017, - "semantic_entropy": 0.7753814458847046, + "logits/chosen": -0.015339165925979614, + "logits/rejected": 0.16871802508831024, + "logps/chosen": -1.3177587985992432, + "logps/rejected": -1.5341602563858032, + "loss": 1.6806, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3177587985992432, + "rewards/margins": 0.2164015769958496, + "rewards/rejected": -1.5341602563858032, "step": 1675 }, { "epoch": 0.8991470145509283, - "grad_norm": 9.514567268775329, + "grad_norm": 7.139112205969165, "learning_rate": 8.833557080955292e-07, - "logits/chosen": -0.06700204312801361, - "logits/rejected": 0.04518015310168266, - "logps/chosen": -1.3658771514892578, - "logps/rejected": -1.5113369226455688, - "loss": 2.083, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3658771514892578, - "rewards/margins": 0.14545971155166626, - "rewards/rejected": -1.5113369226455688, - "semantic_entropy": 0.7836839556694031, + "logits/chosen": -0.10702624171972275, + "logits/rejected": -0.009430733509361744, + "logps/chosen": -1.346219778060913, + "logps/rejected": -1.455933928489685, + "loss": 1.6872, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.346219778060913, + "rewards/margins": 0.10971428453922272, + "rewards/rejected": -1.455933928489685, "step": 1680 }, { "epoch": 0.9018230473323299, - "grad_norm": 8.79669468104146, + "grad_norm": 8.788895829020646, "learning_rate": 8.823540101520381e-07, - "logits/chosen": -0.04735895246267319, - "logits/rejected": 0.17872780561447144, - "logps/chosen": -1.3166942596435547, - "logps/rejected": -1.4989925622940063, - "loss": 2.0616, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3166942596435547, - "rewards/margins": 0.18229825794696808, - "rewards/rejected": -1.4989925622940063, - "semantic_entropy": 0.8012421727180481, + "logits/chosen": -0.09646473824977875, + "logits/rejected": 0.10258965194225311, + "logps/chosen": -1.2965043783187866, + "logps/rejected": -1.4185608625411987, + "loss": 1.6591, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2965043783187866, + "rewards/margins": 0.1220565065741539, + "rewards/rejected": -1.4185608625411987, "step": 1685 }, { "epoch": 0.9044990801137314, - "grad_norm": 7.070667857103815, + "grad_norm": 7.074812329175778, "learning_rate": 8.813486026082637e-07, - "logits/chosen": -0.050979845225811005, - "logits/rejected": 0.13490286469459534, - "logps/chosen": -1.2526905536651611, - "logps/rejected": -1.4703716039657593, - "loss": 2.0054, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2526905536651611, - "rewards/margins": 0.21768124401569366, - "rewards/rejected": -1.4703716039657593, - "semantic_entropy": 0.8110173940658569, + "logits/chosen": -0.08094370365142822, + "logits/rejected": 0.08374644815921783, + "logps/chosen": -1.2347418069839478, + "logps/rejected": -1.4043574333190918, + "loss": 1.5944, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2347418069839478, + "rewards/margins": 0.16961567103862762, + "rewards/rejected": -1.4043574333190918, "step": 1690 }, { "epoch": 0.907175112895133, - "grad_norm": 12.349116290155283, + "grad_norm": 11.325862698178144, "learning_rate": 8.803394952186742e-07, - "logits/chosen": -0.17546625435352325, - "logits/rejected": -0.032556891441345215, - "logps/chosen": -1.3627218008041382, - "logps/rejected": -1.4914048910140991, - "loss": 2.085, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3627218008041382, - "rewards/margins": 0.12868306040763855, - "rewards/rejected": -1.4914048910140991, - "semantic_entropy": 0.7780641317367554, + "logits/chosen": -0.2112002819776535, + "logits/rejected": -0.09278073161840439, + "logps/chosen": -1.3451100587844849, + "logps/rejected": -1.4331865310668945, + "loss": 1.693, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3451100587844849, + "rewards/margins": 0.08807633072137833, + "rewards/rejected": -1.4331865310668945, "step": 1695 }, { "epoch": 0.9098511456765346, - "grad_norm": 9.21235392234306, + "grad_norm": 8.409412894002354, "learning_rate": 8.793266977736342e-07, - "logits/chosen": -0.01133874524384737, - "logits/rejected": -0.05470942333340645, - "logps/chosen": -1.3494858741760254, - "logps/rejected": -1.4131577014923096, - "loss": 2.1016, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.3494858741760254, - "rewards/margins": 0.0636717677116394, - "rewards/rejected": -1.4131577014923096, - "semantic_entropy": 0.7917024493217468, + "logits/chosen": -0.054956771433353424, + "logits/rejected": -0.09063141793012619, + "logps/chosen": -1.3343979120254517, + "logps/rejected": -1.3663153648376465, + "loss": 1.7033, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3343979120254517, + "rewards/margins": 0.03191746026277542, + "rewards/rejected": -1.3663153648376465, "step": 1700 }, { "epoch": 0.9125271784579361, - "grad_norm": 8.882528510651557, + "grad_norm": 8.440378996313939, "learning_rate": 8.783102200993085e-07, - "logits/chosen": -0.002578629646450281, - "logits/rejected": 0.13472047448158264, - "logps/chosen": -1.3230177164077759, - "logps/rejected": -1.435495138168335, - "loss": 2.0797, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3230177164077759, - "rewards/margins": 0.11247744411230087, - "rewards/rejected": -1.435495138168335, - "semantic_entropy": 0.8014957308769226, + "logits/chosen": -0.007442918606102467, + "logits/rejected": 0.11806105077266693, + "logps/chosen": -1.3040060997009277, + "logps/rejected": -1.3856302499771118, + "loss": 1.6761, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3040060997009277, + "rewards/margins": 0.08162431418895721, + "rewards/rejected": -1.3856302499771118, "step": 1705 }, { "epoch": 0.9152032112393377, - "grad_norm": 7.565053404244324, + "grad_norm": 7.30589729352165, "learning_rate": 8.772900720575683e-07, - "logits/chosen": -0.04457225650548935, - "logits/rejected": 0.03571139648556709, - "logps/chosen": -1.2638635635375977, - "logps/rejected": -1.428993582725525, - "loss": 2.0241, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2638635635375977, - "rewards/margins": 0.16512994468212128, - "rewards/rejected": -1.428993582725525, - "semantic_entropy": 0.8144906163215637, + "logits/chosen": -0.07810314744710922, + "logits/rejected": -0.009394729509949684, + "logps/chosen": -1.243417501449585, + "logps/rejected": -1.3742151260375977, + "loss": 1.6096, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.243417501449585, + "rewards/margins": 0.13079769909381866, + "rewards/rejected": -1.3742151260375977, "step": 1710 }, { "epoch": 0.9178792440207393, - "grad_norm": 6.65669576170742, + "grad_norm": 6.627437386798884, "learning_rate": 8.762662635458944e-07, - "logits/chosen": -0.02387130819261074, - "logits/rejected": 0.1807420551776886, - "logps/chosen": -1.3656136989593506, - "logps/rejected": -1.4667924642562866, - "loss": 2.1195, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3656136989593506, - "rewards/margins": 0.10117874294519424, - "rewards/rejected": -1.4667924642562866, - "semantic_entropy": 0.7991411089897156, + "logits/chosen": -0.03734375163912773, + "logits/rejected": 0.15281075239181519, + "logps/chosen": -1.3413758277893066, + "logps/rejected": -1.4052072763442993, + "loss": 1.7142, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3413758277893066, + "rewards/margins": 0.0638314038515091, + "rewards/rejected": -1.4052072763442993, "step": 1715 }, { "epoch": 0.9205552768021408, - "grad_norm": 6.801503977545353, + "grad_norm": 6.446085230868156, "learning_rate": 8.752388044972811e-07, - "logits/chosen": 0.005317692644894123, - "logits/rejected": 0.06661224365234375, - "logps/chosen": -1.20073401927948, - "logps/rejected": -1.474448561668396, - "loss": 1.9924, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.20073401927948, - "rewards/margins": 0.2737146019935608, - "rewards/rejected": -1.474448561668396, - "semantic_entropy": 0.8212075233459473, + "logits/chosen": -0.07835554331541061, + "logits/rejected": -0.030281897634267807, + "logps/chosen": -1.1814202070236206, + "logps/rejected": -1.4106985330581665, + "loss": 1.573, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1814202070236206, + "rewards/margins": 0.22927825152873993, + "rewards/rejected": -1.4106985330581665, "step": 1720 }, { "epoch": 0.9232313095835424, - "grad_norm": 6.786794245858191, + "grad_norm": 6.826260743987488, "learning_rate": 8.74207704880141e-07, - "logits/chosen": -0.014214864000678062, - "logits/rejected": 0.08873619884252548, - "logps/chosen": -1.3316367864608765, - "logps/rejected": -1.5858074426651, - "loss": 2.0367, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3316367864608765, - "rewards/margins": 0.2541707754135132, - "rewards/rejected": -1.5858074426651, - "semantic_entropy": 0.7817917466163635, + "logits/chosen": -0.08342117816209793, + "logits/rejected": 0.0009416237589903176, + "logps/chosen": -1.3154761791229248, + "logps/rejected": -1.509495735168457, + "loss": 1.6444, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3154761791229248, + "rewards/margins": 0.1940193474292755, + "rewards/rejected": -1.509495735168457, "step": 1725 }, { "epoch": 0.925907342364944, - "grad_norm": 9.01368242373458, + "grad_norm": 9.427502099175387, "learning_rate": 8.731729746982068e-07, - "logits/chosen": 0.07874997705221176, - "logits/rejected": 0.1306711733341217, - "logps/chosen": -1.3039653301239014, - "logps/rejected": -1.4204844236373901, - "loss": 2.0723, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3039653301239014, - "rewards/margins": 0.11651904881000519, - "rewards/rejected": -1.4204844236373901, - "semantic_entropy": 0.7985782027244568, + "logits/chosen": 0.021935302764177322, + "logits/rejected": 0.061028145253658295, + "logps/chosen": -1.2851629257202148, + "logps/rejected": -1.3662439584732056, + "loss": 1.6694, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2851629257202148, + "rewards/margins": 0.08108110725879669, + "rewards/rejected": -1.3662439584732056, "step": 1730 }, { "epoch": 0.9285833751463456, - "grad_norm": 7.397503812781015, + "grad_norm": 7.23708158585448, "learning_rate": 8.721346239904355e-07, - "logits/chosen": -0.09926575422286987, - "logits/rejected": 0.058978479355573654, - "logps/chosen": -1.2411620616912842, - "logps/rejected": -1.6494128704071045, - "loss": 1.957, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2411620616912842, - "rewards/margins": 0.40825071930885315, - "rewards/rejected": -1.6494128704071045, - "semantic_entropy": 0.801047146320343, + "logits/chosen": -0.13660050928592682, + "logits/rejected": -0.009003483690321445, + "logps/chosen": -1.2197661399841309, + "logps/rejected": -1.5537761449813843, + "loss": 1.5576, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2197661399841309, + "rewards/margins": 0.33400994539260864, + "rewards/rejected": -1.5537761449813843, "step": 1735 }, { "epoch": 0.9312594079277471, - "grad_norm": 7.049995899457256, + "grad_norm": 6.690179189666134, "learning_rate": 8.710926628309101e-07, - "logits/chosen": -0.020289301872253418, - "logits/rejected": 0.11928510665893555, - "logps/chosen": -1.2865359783172607, - "logps/rejected": -1.4603403806686401, - "loss": 2.0347, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2865359783172607, - "rewards/margins": 0.17380447685718536, - "rewards/rejected": -1.4603403806686401, - "semantic_entropy": 0.8009761571884155, + "logits/chosen": -0.11041511595249176, + "logits/rejected": -0.007809894625097513, + "logps/chosen": -1.2693771123886108, + "logps/rejected": -1.3839272260665894, + "loss": 1.6346, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2693771123886108, + "rewards/margins": 0.11454999446868896, + "rewards/rejected": -1.3839272260665894, "step": 1740 }, { "epoch": 0.9339354407091487, - "grad_norm": 6.429018945726928, + "grad_norm": 6.875548426964406, "learning_rate": 8.700471013287424e-07, - "logits/chosen": 0.053223930299282074, - "logits/rejected": 0.06973040848970413, - "logps/chosen": -1.2947371006011963, - "logps/rejected": -1.4536296129226685, - "loss": 2.0479, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2947371006011963, - "rewards/margins": 0.1588924676179886, - "rewards/rejected": -1.4536296129226685, - "semantic_entropy": 0.8022210001945496, + "logits/chosen": -0.03753243386745453, + "logits/rejected": -0.023988056927919388, + "logps/chosen": -1.2683788537979126, + "logps/rejected": -1.3910294771194458, + "loss": 1.6375, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2683788537979126, + "rewards/margins": 0.12265037000179291, + "rewards/rejected": -1.3910294771194458, "step": 1745 }, { "epoch": 0.9366114734905503, - "grad_norm": 10.83261920976737, + "grad_norm": 10.913820939420868, "learning_rate": 8.689979496279746e-07, - "logits/chosen": -0.05656473711133003, - "logits/rejected": 0.0013455540174618363, - "logps/chosen": -1.3060810565948486, - "logps/rejected": -1.5460128784179688, - "loss": 2.0247, + "logits/chosen": -0.11481869220733643, + "logits/rejected": -0.06308847665786743, + "logps/chosen": -1.266616702079773, + "logps/rejected": -1.4824692010879517, + "loss": 1.6015, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3060810565948486, - "rewards/margins": 0.2399318963289261, - "rewards/rejected": -1.5460128784179688, - "semantic_entropy": 0.7921985387802124, + "rewards/chosen": -1.266616702079773, + "rewards/margins": 0.2158524990081787, + "rewards/rejected": -1.4824692010879517, "step": 1750 }, { "epoch": 0.9392875062719518, - "grad_norm": 5.858542388966641, + "grad_norm": 5.804834151418999, "learning_rate": 8.679452179074811e-07, - "logits/chosen": -0.03769116848707199, - "logits/rejected": 0.05500766634941101, - "logps/chosen": -1.2704050540924072, - "logps/rejected": -1.412957787513733, - "loss": 2.0511, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2704050540924072, - "rewards/margins": 0.14255282282829285, - "rewards/rejected": -1.412957787513733, - "semantic_entropy": 0.8166753053665161, + "logits/chosen": -0.10651911795139313, + "logits/rejected": -0.03721824288368225, + "logps/chosen": -1.2545320987701416, + "logps/rejected": -1.3591195344924927, + "loss": 1.6401, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2545320987701416, + "rewards/margins": 0.10458721220493317, + "rewards/rejected": -1.3591195344924927, "step": 1755 }, { "epoch": 0.9419635390533534, - "grad_norm": 5.872711066743105, + "grad_norm": 6.638379460376084, "learning_rate": 8.668889163808698e-07, - "logits/chosen": -0.010468644089996815, - "logits/rejected": 0.11073984205722809, - "logps/chosen": -1.2528936862945557, - "logps/rejected": -1.4464694261550903, - "loss": 2.0165, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2528936862945557, - "rewards/margins": 0.1935756951570511, - "rewards/rejected": -1.4464694261550903, - "semantic_entropy": 0.8102561235427856, + "logits/chosen": -0.0438620001077652, + "logits/rejected": 0.054749589413404465, + "logps/chosen": -1.2415411472320557, + "logps/rejected": -1.392091989517212, + "loss": 1.6126, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2415411472320557, + "rewards/margins": 0.1505509614944458, + "rewards/rejected": -1.392091989517212, "step": 1760 }, { "epoch": 0.944639571834755, - "grad_norm": 6.529911274017464, + "grad_norm": 6.540934099687025, "learning_rate": 8.658290552963827e-07, - "logits/chosen": -0.0010960951913148165, - "logits/rejected": 0.01580260880291462, - "logps/chosen": -1.2851160764694214, - "logps/rejected": -1.490731954574585, - "loss": 2.0604, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2851160764694214, - "rewards/margins": 0.20561587810516357, - "rewards/rejected": -1.490731954574585, - "semantic_entropy": 0.8163287043571472, + "logits/chosen": -0.03682807832956314, + "logits/rejected": -0.020194586366415024, + "logps/chosen": -1.2712775468826294, + "logps/rejected": -1.428292989730835, + "loss": 1.6514, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2712775468826294, + "rewards/margins": 0.15701545774936676, + "rewards/rejected": -1.428292989730835, "step": 1765 }, { "epoch": 0.9473156046161565, - "grad_norm": 5.639046399121027, + "grad_norm": 5.755972920864021, "learning_rate": 8.647656449367966e-07, - "logits/chosen": -0.009508803486824036, - "logits/rejected": 0.13738605380058289, - "logps/chosen": -1.3116872310638428, - "logps/rejected": -1.4082788228988647, - "loss": 2.0579, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3116872310638428, - "rewards/margins": 0.09659181535243988, - "rewards/rejected": -1.4082788228988647, - "semantic_entropy": 0.7913161516189575, + "logits/chosen": -0.03591788560152054, + "logits/rejected": 0.09994689375162125, + "logps/chosen": -1.296185851097107, + "logps/rejected": -1.3564786911010742, + "loss": 1.6628, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.296185851097107, + "rewards/margins": 0.06029283255338669, + "rewards/rejected": -1.3564786911010742, "step": 1770 }, { "epoch": 0.9499916373975581, - "grad_norm": 9.492998520557995, + "grad_norm": 8.516675145333354, "learning_rate": 8.636986956193235e-07, - "logits/chosen": -0.060478221625089645, - "logits/rejected": 0.026750242337584496, - "logps/chosen": -1.262905478477478, - "logps/rejected": -1.4266475439071655, - "loss": 2.0257, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.262905478477478, - "rewards/margins": 0.16374202072620392, - "rewards/rejected": -1.4266475439071655, - "semantic_entropy": 0.8194987177848816, + "logits/chosen": -0.10436888039112091, + "logits/rejected": -0.04029594361782074, + "logps/chosen": -1.2394301891326904, + "logps/rejected": -1.3689510822296143, + "loss": 1.6142, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2394301891326904, + "rewards/margins": 0.12952089309692383, + "rewards/rejected": -1.3689510822296143, "step": 1775 }, { "epoch": 0.9526676701789597, - "grad_norm": 6.987088294233002, + "grad_norm": 6.289999309014636, "learning_rate": 8.626282176955104e-07, - "logits/chosen": -0.061791181564331055, - "logits/rejected": 0.05312754586338997, - "logps/chosen": -1.2885668277740479, - "logps/rejected": -1.507681131362915, - "loss": 2.0293, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2885668277740479, - "rewards/margins": 0.2191143035888672, - "rewards/rejected": -1.507681131362915, - "semantic_entropy": 0.8043468594551086, + "logits/chosen": -0.09868212044239044, + "logits/rejected": 0.0011174462269991636, + "logps/chosen": -1.2685974836349487, + "logps/rejected": -1.431061029434204, + "loss": 1.6255, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2685974836349487, + "rewards/margins": 0.16246357560157776, + "rewards/rejected": -1.431061029434204, "step": 1780 }, { "epoch": 0.9553437029603613, - "grad_norm": 7.2172640331427695, + "grad_norm": 7.183464870616547, "learning_rate": 8.615542215511389e-07, - "logits/chosen": 0.04582851380109787, - "logits/rejected": 0.11913655698299408, - "logps/chosen": -1.2383428812026978, - "logps/rejected": -1.3574765920639038, - "loss": 2.058, + "logits/chosen": 0.006868282798677683, + "logits/rejected": 0.06984652578830719, + "logps/chosen": -1.2149683237075806, + "logps/rejected": -1.309650182723999, + "loss": 1.6388, "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2383428812026978, - "rewards/margins": 0.11913367360830307, - "rewards/rejected": -1.3574765920639038, - "semantic_entropy": 0.8183929324150085, + "rewards/chosen": -1.2149683237075806, + "rewards/margins": 0.09468193352222443, + "rewards/rejected": -1.309650182723999, "step": 1785 }, { "epoch": 0.9580197357417628, - "grad_norm": 8.066411676659767, + "grad_norm": 7.560385247647893, "learning_rate": 8.604767176061241e-07, - "logits/chosen": 0.0605916753411293, - "logits/rejected": 0.10191599279642105, - "logps/chosen": -1.311759114265442, - "logps/rejected": -1.4463961124420166, - "loss": 2.0579, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.311759114265442, - "rewards/margins": 0.13463714718818665, - "rewards/rejected": -1.4463961124420166, - "semantic_entropy": 0.7989506721496582, + "logits/chosen": 0.004362165927886963, + "logits/rejected": 0.0364392027258873, + "logps/chosen": -1.2910511493682861, + "logps/rejected": -1.4012508392333984, + "loss": 1.653, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2910511493682861, + "rewards/margins": 0.11019964516162872, + "rewards/rejected": -1.4012508392333984, "step": 1790 }, { "epoch": 0.9606957685231644, - "grad_norm": 7.885711535048902, + "grad_norm": 7.882374537885048, "learning_rate": 8.593957163144141e-07, - "logits/chosen": -0.08106566965579987, - "logits/rejected": 0.053579796105623245, - "logps/chosen": -1.2748843431472778, - "logps/rejected": -1.4692093133926392, - "loss": 2.0301, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2748843431472778, - "rewards/margins": 0.19432485103607178, - "rewards/rejected": -1.4692093133926392, - "semantic_entropy": 0.8063615560531616, + "logits/chosen": -0.09401813894510269, + "logits/rejected": 0.024964818730950356, + "logps/chosen": -1.2600610256195068, + "logps/rejected": -1.4081159830093384, + "loss": 1.6276, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2600610256195068, + "rewards/margins": 0.1480550318956375, + "rewards/rejected": -1.4081159830093384, "step": 1795 }, { "epoch": 0.963371801304566, - "grad_norm": 5.108281485121351, + "grad_norm": 5.2752952279039445, "learning_rate": 8.58311228163888e-07, - "logits/chosen": 0.007795440498739481, - "logits/rejected": 0.08127208799123764, - "logps/chosen": -1.273447871208191, - "logps/rejected": -1.4030935764312744, - "loss": 2.0492, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.273447871208191, - "rewards/margins": 0.12964563071727753, - "rewards/rejected": -1.4030935764312744, - "semantic_entropy": 0.8049715757369995, + "logits/chosen": -0.0389956533908844, + "logits/rejected": 0.017822042107582092, + "logps/chosen": -1.2581193447113037, + "logps/rejected": -1.3478018045425415, + "loss": 1.6489, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2581193447113037, + "rewards/margins": 0.08968259394168854, + "rewards/rejected": -1.3478018045425415, "step": 1800 }, { "epoch": 0.9660478340859675, - "grad_norm": 6.028916082322847, + "grad_norm": 5.94588261820782, "learning_rate": 8.57223263676255e-07, - "logits/chosen": -0.12820856273174286, - "logits/rejected": -0.002626272616907954, - "logps/chosen": -1.2371407747268677, - "logps/rejected": -1.5107574462890625, - "loss": 1.9814, + "logits/chosen": -0.16185346245765686, + "logits/rejected": -0.05402858182787895, + "logps/chosen": -1.218846082687378, + "logps/rejected": -1.4322948455810547, + "loss": 1.5672, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2371407747268677, - "rewards/margins": 0.2736164927482605, - "rewards/rejected": -1.5107574462890625, - "semantic_entropy": 0.8165399432182312, + "rewards/chosen": -1.218846082687378, + "rewards/margins": 0.21344876289367676, + "rewards/rejected": -1.4322948455810547, "step": 1805 }, { "epoch": 0.9687238668673691, - "grad_norm": 7.801922833669752, + "grad_norm": 7.631863260969375, "learning_rate": 8.561318334069511e-07, - "logits/chosen": -0.01773199997842312, - "logits/rejected": 0.11925344169139862, - "logps/chosen": -1.2662723064422607, - "logps/rejected": -1.418873906135559, - "loss": 2.0459, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2662723064422607, - "rewards/margins": 0.15260140597820282, - "rewards/rejected": -1.418873906135559, - "semantic_entropy": 0.8175581097602844, + "logits/chosen": -0.038693930953741074, + "logits/rejected": 0.08398672938346863, + "logps/chosen": -1.2442553043365479, + "logps/rejected": -1.3523961305618286, + "loss": 1.6342, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2442553043365479, + "rewards/margins": 0.10814078152179718, + "rewards/rejected": -1.3523961305618286, "step": 1810 }, { "epoch": 0.9713998996487707, - "grad_norm": 6.630422780032403, + "grad_norm": 6.160883134645266, "learning_rate": 8.550369479450375e-07, - "logits/chosen": -0.039072971791028976, - "logits/rejected": 0.09387413412332535, - "logps/chosen": -1.2889964580535889, - "logps/rejected": -1.4401779174804688, - "loss": 2.0326, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2889964580535889, - "rewards/margins": 0.15118160843849182, - "rewards/rejected": -1.4401779174804688, - "semantic_entropy": 0.8038996458053589, + "logits/chosen": -0.08054449409246445, + "logits/rejected": 0.023082170635461807, + "logps/chosen": -1.2673180103302002, + "logps/rejected": -1.3657923936843872, + "loss": 1.6303, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2673180103302002, + "rewards/margins": 0.09847430884838104, + "rewards/rejected": -1.3657923936843872, "step": 1815 }, { "epoch": 0.9740759324301723, - "grad_norm": 7.132366412010857, + "grad_norm": 7.3685895699380515, "learning_rate": 8.539386179130977e-07, - "logits/chosen": -0.010481951758265495, - "logits/rejected": 0.05843520164489746, - "logps/chosen": -1.2933626174926758, - "logps/rejected": -1.4336893558502197, - "loss": 2.0552, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2933626174926758, - "rewards/margins": 0.1403266340494156, - "rewards/rejected": -1.4336893558502197, - "semantic_entropy": 0.8076707124710083, + "logits/chosen": -0.057333867996931076, + "logits/rejected": 0.0019186340505257249, + "logps/chosen": -1.2531261444091797, + "logps/rejected": -1.3625218868255615, + "loss": 1.6317, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2531261444091797, + "rewards/margins": 0.10939564555883408, + "rewards/rejected": -1.3625218868255615, "step": 1820 }, { "epoch": 0.9767519652115738, - "grad_norm": 8.28316100471642, + "grad_norm": 7.955820131517521, "learning_rate": 8.528368539671347e-07, - "logits/chosen": -0.06624146550893784, - "logits/rejected": 0.08388527482748032, - "logps/chosen": -1.2708795070648193, - "logps/rejected": -1.5188466310501099, - "loss": 2.0311, + "logits/chosen": -0.11731233447790146, + "logits/rejected": 0.002155174035578966, + "logps/chosen": -1.2458884716033936, + "logps/rejected": -1.417532205581665, + "loss": 1.6204, "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2708795070648193, - "rewards/margins": 0.24796704947948456, - "rewards/rejected": -1.5188466310501099, - "semantic_entropy": 0.8095704913139343, + "rewards/chosen": -1.2458884716033936, + "rewards/margins": 0.17164357006549835, + "rewards/rejected": -1.417532205581665, "step": 1825 }, { "epoch": 0.9794279979929754, - "grad_norm": 5.971238453093119, + "grad_norm": 6.122737863351037, "learning_rate": 8.51731666796467e-07, - "logits/chosen": 0.1276635229587555, - "logits/rejected": 0.15978917479515076, - "logps/chosen": -1.339593768119812, - "logps/rejected": -1.4184544086456299, - "loss": 2.0979, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.339593768119812, - "rewards/margins": 0.07886041700839996, - "rewards/rejected": -1.4184544086456299, - "semantic_entropy": 0.7924166917800903, + "logits/chosen": 0.03416532278060913, + "logits/rejected": 0.053548503667116165, + "logps/chosen": -1.3238328695297241, + "logps/rejected": -1.3591514825820923, + "loss": 1.7031, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3238328695297241, + "rewards/margins": 0.035318635404109955, + "rewards/rejected": -1.3591514825820923, "step": 1830 }, { "epoch": 0.982104030774377, - "grad_norm": 6.927656624192063, + "grad_norm": 7.675621169957029, "learning_rate": 8.506230671236254e-07, - "logits/chosen": -0.013933452777564526, - "logits/rejected": 0.051707565784454346, - "logps/chosen": -1.3016908168792725, - "logps/rejected": -1.355043649673462, - "loss": 2.08, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3016908168792725, - "rewards/margins": 0.05335277318954468, - "rewards/rejected": -1.355043649673462, - "semantic_entropy": 0.8091516494750977, + "logits/chosen": -0.06183997914195061, + "logits/rejected": -0.016029536724090576, + "logps/chosen": -1.2829948663711548, + "logps/rejected": -1.3081411123275757, + "loss": 1.6731, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2829948663711548, + "rewards/margins": 0.02514636516571045, + "rewards/rejected": -1.3081411123275757, "step": 1835 }, { "epoch": 0.9847800635557785, - "grad_norm": 6.708651387780945, + "grad_norm": 6.871647817378736, "learning_rate": 8.495110657042488e-07, - "logits/chosen": 0.022884510457515717, - "logits/rejected": 0.12576675415039062, - "logps/chosen": -1.3016811609268188, - "logps/rejected": -1.5378698110580444, - "loss": 2.0291, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3016811609268188, - "rewards/margins": 0.23618856072425842, - "rewards/rejected": -1.5378698110580444, - "semantic_entropy": 0.8028618097305298, + "logits/chosen": -0.02273593842983246, + "logits/rejected": 0.05914074927568436, + "logps/chosen": -1.279006838798523, + "logps/rejected": -1.465643286705017, + "loss": 1.6271, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.279006838798523, + "rewards/margins": 0.18663650751113892, + "rewards/rejected": -1.465643286705017, "step": 1840 }, { "epoch": 0.9874560963371801, - "grad_norm": 8.94078484600404, + "grad_norm": 9.202293879728787, "learning_rate": 8.483956733269799e-07, - "logits/chosen": -0.034597523510456085, - "logits/rejected": 0.05511583760380745, - "logps/chosen": -1.322399377822876, - "logps/rejected": -1.4499645233154297, - "loss": 2.0718, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.322399377822876, - "rewards/margins": 0.1275651454925537, - "rewards/rejected": -1.4499645233154297, - "semantic_entropy": 0.7913865447044373, + "logits/chosen": -0.09814430773258209, + "logits/rejected": -0.019922833889722824, + "logps/chosen": -1.301499366760254, + "logps/rejected": -1.3693764209747314, + "loss": 1.676, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.301499366760254, + "rewards/margins": 0.06787705421447754, + "rewards/rejected": -1.3693764209747314, "step": 1845 }, { "epoch": 0.9901321291185817, - "grad_norm": 7.903504654248402, + "grad_norm": 6.264497155893721, "learning_rate": 8.472769008133602e-07, - "logits/chosen": -0.1613004505634308, - "logits/rejected": -0.03331545740365982, - "logps/chosen": -1.3431861400604248, - "logps/rejected": -1.3820966482162476, - "loss": 2.1119, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.3431861400604248, - "rewards/margins": 0.03891069442033768, - "rewards/rejected": -1.3820966482162476, - "semantic_entropy": 0.8067256808280945, + "logits/chosen": -0.18302765488624573, + "logits/rejected": -0.07154009491205215, + "logps/chosen": -1.3125717639923096, + "logps/rejected": -1.3157421350479126, + "loss": 1.6926, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.3125717639923096, + "rewards/margins": 0.0031705095898360014, + "rewards/rejected": -1.3157421350479126, "step": 1850 }, { "epoch": 0.9928081618999832, - "grad_norm": 6.836428084610912, + "grad_norm": 6.488223481774972, "learning_rate": 8.461547590177259e-07, - "logits/chosen": -0.02012103796005249, - "logits/rejected": 0.08856208622455597, - "logps/chosen": -1.2511141300201416, - "logps/rejected": -1.4271875619888306, - "loss": 2.032, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2511141300201416, - "rewards/margins": 0.17607346177101135, - "rewards/rejected": -1.4271875619888306, - "semantic_entropy": 0.80317622423172, + "logits/chosen": -0.08081166446208954, + "logits/rejected": 0.0005839228397235274, + "logps/chosen": -1.2326545715332031, + "logps/rejected": -1.369754433631897, + "loss": 1.6284, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2326545715332031, + "rewards/margins": 0.13709980249404907, + "rewards/rejected": -1.369754433631897, "step": 1855 }, { "epoch": 0.9954841946813848, - "grad_norm": 8.191047160297591, + "grad_norm": 7.904758128328132, "learning_rate": 8.450292588271014e-07, - "logits/chosen": -0.010422793217003345, - "logits/rejected": 0.07045050710439682, - "logps/chosen": -1.3515708446502686, - "logps/rejected": -1.4826161861419678, - "loss": 2.0701, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3515708446502686, - "rewards/margins": 0.13104534149169922, - "rewards/rejected": -1.4826161861419678, - "semantic_entropy": 0.7868179082870483, + "logits/chosen": -0.04447736218571663, + "logits/rejected": 0.01741199567914009, + "logps/chosen": -1.3151956796646118, + "logps/rejected": -1.4005998373031616, + "loss": 1.6629, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3151956796646118, + "rewards/margins": 0.08540423959493637, + "rewards/rejected": -1.4005998373031616, "step": 1860 }, { "epoch": 0.9981602274627864, - "grad_norm": 7.277078462725191, + "grad_norm": 6.880587886858402, "learning_rate": 8.439004111610945e-07, - "logits/chosen": -0.04057621955871582, - "logits/rejected": 0.03624313324689865, - "logps/chosen": -1.207397699356079, - "logps/rejected": -1.4739655256271362, - "loss": 1.989, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.207397699356079, - "rewards/margins": 0.2665678858757019, - "rewards/rejected": -1.4739655256271362, - "semantic_entropy": 0.8229459524154663, + "logits/chosen": -0.07799827307462692, + "logits/rejected": -0.016821760684251785, + "logps/chosen": -1.185073733329773, + "logps/rejected": -1.4089863300323486, + "loss": 1.5735, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.185073733329773, + "rewards/margins": 0.22391244769096375, + "rewards/rejected": -1.4089863300323486, "step": 1865 }, { "epoch": 1.000836260244188, - "grad_norm": 7.269625131401338, + "grad_norm": 7.095370270812544, "learning_rate": 8.427682269717901e-07, - "logits/chosen": -0.061928071081638336, - "logits/rejected": 0.09646090120077133, - "logps/chosen": -1.3524137735366821, - "logps/rejected": -1.469217300415039, - "loss": 2.0911, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.3524137735366821, - "rewards/margins": 0.11680366098880768, - "rewards/rejected": -1.469217300415039, - "semantic_entropy": 0.7895178198814392, + "logits/chosen": -0.13477222621440887, + "logits/rejected": -0.005845165345817804, + "logps/chosen": -1.3289659023284912, + "logps/rejected": -1.377912163734436, + "loss": 1.6938, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.3289659023284912, + "rewards/margins": 0.048946212977170944, + "rewards/rejected": -1.377912163734436, "step": 1870 }, { "epoch": 1.0035122930255895, - "grad_norm": 6.234940546892951, + "grad_norm": 5.750529158786094, "learning_rate": 8.416327172436446e-07, - "logits/chosen": -0.0901242271065712, - "logits/rejected": 0.04721410945057869, - "logps/chosen": -1.3372011184692383, - "logps/rejected": -1.4556587934494019, - "loss": 2.0741, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3372011184692383, - "rewards/margins": 0.1184576004743576, - "rewards/rejected": -1.4556587934494019, - "semantic_entropy": 0.8036667704582214, + "logits/chosen": -0.1507563441991806, + "logits/rejected": -0.0380830354988575, + "logps/chosen": -1.3004181385040283, + "logps/rejected": -1.3883024454116821, + "loss": 1.6578, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3004181385040283, + "rewards/margins": 0.08788414299488068, + "rewards/rejected": -1.3883024454116821, "step": 1875 }, { "epoch": 1.0061883258069912, - "grad_norm": 10.862639044006587, + "grad_norm": 7.777439085414502, "learning_rate": 8.404938929933778e-07, - "logits/chosen": 0.009253310039639473, - "logits/rejected": 0.1762274205684662, - "logps/chosen": -1.2881697416305542, - "logps/rejected": -1.610016107559204, - "loss": 2.0204, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2881697416305542, - "rewards/margins": 0.3218461871147156, - "rewards/rejected": -1.610016107559204, - "semantic_entropy": 0.7875820398330688, + "logits/chosen": -0.03777514398097992, + "logits/rejected": 0.11495566368103027, + "logps/chosen": -1.254887342453003, + "logps/rejected": -1.5188519954681396, + "loss": 1.616, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.254887342453003, + "rewards/margins": 0.26396480202674866, + "rewards/rejected": -1.5188519954681396, "step": 1880 }, { "epoch": 1.0088643585883927, - "grad_norm": 7.426144032081365, + "grad_norm": 7.538747422203276, "learning_rate": 8.39351765269868e-07, - "logits/chosen": -0.04300857335329056, - "logits/rejected": 0.03298963978886604, - "logps/chosen": -1.2392830848693848, - "logps/rejected": -1.4619297981262207, - "loss": 2.0175, + "logits/chosen": -0.08718445152044296, + "logits/rejected": -0.020811621099710464, + "logps/chosen": -1.2131198644638062, + "logps/rejected": -1.3882570266723633, + "loss": 1.5975, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2392830848693848, - "rewards/margins": 0.2226467877626419, - "rewards/rejected": -1.4619297981262207, - "semantic_entropy": 0.8182765245437622, + "rewards/chosen": -1.2131198644638062, + "rewards/margins": 0.17513707280158997, + "rewards/rejected": -1.3882570266723633, "step": 1885 }, { "epoch": 1.0115403913697942, - "grad_norm": 6.766335194681512, + "grad_norm": 6.472314903623082, "learning_rate": 8.382063451540431e-07, - "logits/chosen": -0.025482019409537315, - "logits/rejected": 0.16996082663536072, - "logps/chosen": -1.2850615978240967, - "logps/rejected": -1.4646522998809814, - "loss": 2.07, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2850615978240967, - "rewards/margins": 0.17959053814411163, - "rewards/rejected": -1.4646522998809814, - "semantic_entropy": 0.7993168830871582, + "logits/chosen": -0.09305468946695328, + "logits/rejected": 0.07835519313812256, + "logps/chosen": -1.268439531326294, + "logps/rejected": -1.4149713516235352, + "loss": 1.6652, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.268439531326294, + "rewards/margins": 0.1465318351984024, + "rewards/rejected": -1.4149713516235352, "step": 1890 }, { "epoch": 1.014216424151196, - "grad_norm": 8.042008089739506, + "grad_norm": 7.459073670556534, "learning_rate": 8.370576437587742e-07, - "logits/chosen": 0.036351293325424194, - "logits/rejected": 0.09470394253730774, - "logps/chosen": -1.2692261934280396, - "logps/rejected": -1.434579610824585, - "loss": 2.0235, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2692261934280396, - "rewards/margins": 0.16535338759422302, - "rewards/rejected": -1.434579610824585, - "semantic_entropy": 0.809338390827179, + "logits/chosen": -0.03299138322472572, + "logits/rejected": 0.019478967413306236, + "logps/chosen": -1.2478959560394287, + "logps/rejected": -1.388780951499939, + "loss": 1.61, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2478959560394287, + "rewards/margins": 0.14088508486747742, + "rewards/rejected": -1.388780951499939, "step": 1895 }, { "epoch": 1.0168924569325974, - "grad_norm": 6.394105938084402, + "grad_norm": 7.2190581936132885, "learning_rate": 8.359056722287674e-07, - "logits/chosen": -0.10826126486063004, - "logits/rejected": 0.14603324234485626, - "logps/chosen": -1.3070743083953857, - "logps/rejected": -1.4380731582641602, - "loss": 2.063, + "logits/chosen": -0.1465390920639038, + "logits/rejected": 0.07248485833406448, + "logps/chosen": -1.2892343997955322, + "logps/rejected": -1.3898210525512695, + "loss": 1.6613, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3070743083953857, - "rewards/margins": 0.13099884986877441, - "rewards/rejected": -1.4380731582641602, - "semantic_entropy": 0.7941070795059204, + "rewards/chosen": -1.2892343997955322, + "rewards/margins": 0.10058672726154327, + "rewards/rejected": -1.3898210525512695, "step": 1900 }, { "epoch": 1.019568489713999, - "grad_norm": 6.349824057450553, + "grad_norm": 6.660162026732205, "learning_rate": 8.347504417404553e-07, - "logits/chosen": -0.007749214768409729, - "logits/rejected": 0.13887669146060944, - "logps/chosen": -1.3314285278320312, - "logps/rejected": -1.4583871364593506, - "loss": 2.0747, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3314285278320312, - "rewards/margins": 0.12695881724357605, - "rewards/rejected": -1.4583871364593506, - "semantic_entropy": 0.791080117225647, + "logits/chosen": -0.07367981225252151, + "logits/rejected": 0.04928433522582054, + "logps/chosen": -1.3099291324615479, + "logps/rejected": -1.4067552089691162, + "loss": 1.6726, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3099291324615479, + "rewards/margins": 0.0968259871006012, + "rewards/rejected": -1.4067552089691162, "step": 1905 }, { "epoch": 1.0222445224954007, - "grad_norm": 7.4786128948638515, + "grad_norm": 7.458385903072796, "learning_rate": 8.335919635018893e-07, - "logits/chosen": -0.10085531324148178, - "logits/rejected": 0.018569733947515488, - "logps/chosen": -1.2909607887268066, - "logps/rejected": -1.4696437120437622, - "loss": 2.0521, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2909607887268066, - "rewards/margins": 0.1786828637123108, - "rewards/rejected": -1.4696437120437622, - "semantic_entropy": 0.791267454624176, + "logits/chosen": -0.16138581931591034, + "logits/rejected": -0.05747319385409355, + "logps/chosen": -1.2732274532318115, + "logps/rejected": -1.4249045848846436, + "loss": 1.6463, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2732274532318115, + "rewards/margins": 0.15167725086212158, + "rewards/rejected": -1.4249045848846436, "step": 1910 }, { "epoch": 1.0249205552768021, - "grad_norm": 5.803922148585013, + "grad_norm": 5.044798526395231, "learning_rate": 8.324302487526303e-07, - "logits/chosen": -0.04810156300663948, - "logits/rejected": 0.024439021944999695, - "logps/chosen": -1.2567697763442993, - "logps/rejected": -1.391143560409546, - "loss": 2.035, + "logits/chosen": -0.12010698020458221, + "logits/rejected": -0.06151549890637398, + "logps/chosen": -1.23702073097229, + "logps/rejected": -1.3350869417190552, + "loss": 1.6219, "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2567697763442993, - "rewards/margins": 0.13437364995479584, - "rewards/rejected": -1.391143560409546, - "semantic_entropy": 0.8175903558731079, + "rewards/chosen": -1.23702073097229, + "rewards/margins": 0.0980663076043129, + "rewards/rejected": -1.3350869417190552, "step": 1915 }, { "epoch": 1.0275965880582036, - "grad_norm": 6.84748741629461, + "grad_norm": 6.461444915897312, "learning_rate": 8.312653087636398e-07, - "logits/chosen": -0.08552031219005585, - "logits/rejected": -0.009279889054596424, - "logps/chosen": -1.1754251718521118, - "logps/rejected": -1.4189541339874268, - "loss": 1.956, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1754251718521118, - "rewards/margins": 0.24352887272834778, - "rewards/rejected": -1.4189541339874268, - "semantic_entropy": 0.8310561180114746, + "logits/chosen": -0.13163354992866516, + "logits/rejected": -0.070621058344841, + "logps/chosen": -1.1589525938034058, + "logps/rejected": -1.3587532043457031, + "loss": 1.5428, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1589525938034058, + "rewards/margins": 0.19980056583881378, + "rewards/rejected": -1.3587532043457031, "step": 1920 }, { "epoch": 1.0302726208396054, - "grad_norm": 7.9402409829710425, + "grad_norm": 7.789203310879303, "learning_rate": 8.300971548371711e-07, - "logits/chosen": -0.2018776386976242, - "logits/rejected": 0.003242161823436618, - "logps/chosen": -1.3566595315933228, - "logps/rejected": -1.4750354290008545, - "loss": 2.0812, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3566595315933228, - "rewards/margins": 0.11837591975927353, - "rewards/rejected": -1.4750354290008545, - "semantic_entropy": 0.7911876440048218, + "logits/chosen": -0.2395746260881424, + "logits/rejected": -0.06477358937263489, + "logps/chosen": -1.3387569189071655, + "logps/rejected": -1.4158661365509033, + "loss": 1.6874, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3387569189071655, + "rewards/margins": 0.07710927724838257, + "rewards/rejected": -1.4158661365509033, "step": 1925 }, { "epoch": 1.0329486536210069, - "grad_norm": 7.251246592815628, + "grad_norm": 6.64609691322332, "learning_rate": 8.289257983066582e-07, - "logits/chosen": -0.1044299378991127, - "logits/rejected": 0.03499078005552292, - "logps/chosen": -1.2216999530792236, - "logps/rejected": -1.4600203037261963, - "loss": 1.9767, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2216999530792236, - "rewards/margins": 0.23832039535045624, - "rewards/rejected": -1.4600203037261963, - "semantic_entropy": 0.8182482719421387, + "logits/chosen": -0.14642703533172607, + "logits/rejected": -0.025876719504594803, + "logps/chosen": -1.2055902481079102, + "logps/rejected": -1.3968555927276611, + "loss": 1.5728, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2055902481079102, + "rewards/margins": 0.19126519560813904, + "rewards/rejected": -1.3968555927276611, "step": 1930 }, { "epoch": 1.0356246864024083, - "grad_norm": 8.134051978549717, + "grad_norm": 6.594534161726122, "learning_rate": 8.277512505366077e-07, - "logits/chosen": -0.1392371654510498, - "logits/rejected": 0.031945519149303436, - "logps/chosen": -1.2987267971038818, - "logps/rejected": -1.5219374895095825, - "loss": 2.0236, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2987267971038818, - "rewards/margins": 0.22321060299873352, - "rewards/rejected": -1.5219374895095825, - "semantic_entropy": 0.8038732409477234, + "logits/chosen": -0.17431317269802094, + "logits/rejected": -0.02574203535914421, + "logps/chosen": -1.272735834121704, + "logps/rejected": -1.4293947219848633, + "loss": 1.6216, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.272735834121704, + "rewards/margins": 0.1566590517759323, + "rewards/rejected": -1.4293947219848633, "step": 1935 }, { "epoch": 1.03830071918381, - "grad_norm": 8.837560923069026, + "grad_norm": 7.331159146339648, "learning_rate": 8.265735229224868e-07, - "logits/chosen": -0.07545419037342072, - "logits/rejected": 0.03142388537526131, - "logps/chosen": -1.3016760349273682, - "logps/rejected": -1.5333623886108398, - "loss": 2.0343, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3016760349273682, - "rewards/margins": 0.23168623447418213, - "rewards/rejected": -1.5333623886108398, - "semantic_entropy": 0.7905539870262146, + "logits/chosen": -0.09168613702058792, + "logits/rejected": 0.0004261195717845112, + "logps/chosen": -1.2771458625793457, + "logps/rejected": -1.4360650777816772, + "loss": 1.642, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2771458625793457, + "rewards/margins": 0.1589193344116211, + "rewards/rejected": -1.4360650777816772, "step": 1940 }, { "epoch": 1.0409767519652116, - "grad_norm": 13.135999992185194, + "grad_norm": 7.080897027099071, "learning_rate": 8.253926268906144e-07, - "logits/chosen": -0.14592385292053223, - "logits/rejected": 0.019411101937294006, - "logps/chosen": -1.2989904880523682, - "logps/rejected": -1.5625553131103516, - "loss": 2.0003, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2989904880523682, - "rewards/margins": 0.2635648250579834, - "rewards/rejected": -1.5625553131103516, - "semantic_entropy": 0.7920844554901123, + "logits/chosen": -0.18290378153324127, + "logits/rejected": -0.03943333774805069, + "logps/chosen": -1.2643144130706787, + "logps/rejected": -1.4040201902389526, + "loss": 1.6022, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2643144130706787, + "rewards/margins": 0.13970568776130676, + "rewards/rejected": -1.4040201902389526, "step": 1945 }, { "epoch": 1.043652784746613, - "grad_norm": 5.779607975582833, + "grad_norm": 6.1550230568679645, "learning_rate": 8.242085738980487e-07, - "logits/chosen": -0.06912654638290405, - "logits/rejected": 0.13862493634223938, - "logps/chosen": -1.3420753479003906, - "logps/rejected": -1.5418751239776611, - "loss": 2.0447, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3420753479003906, - "rewards/margins": 0.19979973137378693, - "rewards/rejected": -1.5418751239776611, - "semantic_entropy": 0.7756515741348267, + "logits/chosen": -0.11934999376535416, + "logits/rejected": 0.055243026465177536, + "logps/chosen": -1.310400128364563, + "logps/rejected": -1.4350045919418335, + "loss": 1.6469, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.310400128364563, + "rewards/margins": 0.12460446357727051, + "rewards/rejected": -1.4350045919418335, "step": 1950 }, { "epoch": 1.0463288175280148, - "grad_norm": 7.5077098873095665, + "grad_norm": 7.148429628311151, "learning_rate": 8.230213754324772e-07, - "logits/chosen": -0.013625724241137505, - "logits/rejected": 0.055890560150146484, - "logps/chosen": -1.2228213548660278, - "logps/rejected": -1.4776057004928589, - "loss": 1.97, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2228213548660278, - "rewards/margins": 0.2547842860221863, - "rewards/rejected": -1.4776057004928589, - "semantic_entropy": 0.8201814889907837, + "logits/chosen": -0.08163203299045563, + "logits/rejected": -0.026378247886896133, + "logps/chosen": -1.2024458646774292, + "logps/rejected": -1.4101860523223877, + "loss": 1.555, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2024458646774292, + "rewards/margins": 0.2077401578426361, + "rewards/rejected": -1.4101860523223877, "step": 1955 }, { "epoch": 1.0490048503094163, - "grad_norm": 7.321166037812535, + "grad_norm": 6.742860934814458, "learning_rate": 8.218310430121045e-07, - "logits/chosen": -0.07903581112623215, - "logits/rejected": -0.04870419204235077, - "logps/chosen": -1.2837450504302979, - "logps/rejected": -1.4462873935699463, - "loss": 2.0399, + "logits/chosen": -0.1711881458759308, + "logits/rejected": -0.14690272510051727, + "logps/chosen": -1.2601053714752197, + "logps/rejected": -1.3863235712051392, + "loss": 1.6365, "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2837450504302979, - "rewards/margins": 0.1625424325466156, - "rewards/rejected": -1.4462873935699463, - "semantic_entropy": 0.789503276348114, + "rewards/chosen": -1.2601053714752197, + "rewards/margins": 0.1262180358171463, + "rewards/rejected": -1.3863235712051392, "step": 1960 }, { "epoch": 1.051680883090818, - "grad_norm": 7.931221556919084, + "grad_norm": 7.520939323766149, "learning_rate": 8.20637588185541e-07, - "logits/chosen": 0.03140250965952873, - "logits/rejected": 0.10349716991186142, - "logps/chosen": -1.226862907409668, - "logps/rejected": -1.5837271213531494, - "loss": 1.942, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.226862907409668, - "rewards/margins": 0.35686445236206055, - "rewards/rejected": -1.5837271213531494, - "semantic_entropy": 0.8064924478530884, + "logits/chosen": -0.06285201013088226, + "logits/rejected": -0.008772213943302631, + "logps/chosen": -1.2074944972991943, + "logps/rejected": -1.4803524017333984, + "loss": 1.5389, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2074944972991943, + "rewards/margins": 0.27285805344581604, + "rewards/rejected": -1.4803524017333984, "step": 1965 }, { "epoch": 1.0543569158722195, - "grad_norm": 8.900927053864605, + "grad_norm": 8.444048221481284, "learning_rate": 8.194410225316906e-07, - "logits/chosen": -0.04091443866491318, - "logits/rejected": 0.10504456609487534, - "logps/chosen": -1.2857110500335693, - "logps/rejected": -1.512974500656128, - "loss": 2.0268, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2857110500335693, - "rewards/margins": 0.2272634208202362, - "rewards/rejected": -1.512974500656128, - "semantic_entropy": 0.7864609956741333, + "logits/chosen": -0.12451770156621933, + "logits/rejected": -0.010346454568207264, + "logps/chosen": -1.2716000080108643, + "logps/rejected": -1.442080020904541, + "loss": 1.6334, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2716000080108643, + "rewards/margins": 0.1704799234867096, + "rewards/rejected": -1.442080020904541, "step": 1970 }, { "epoch": 1.057032948653621, - "grad_norm": 7.377180436735642, + "grad_norm": 7.439698493505291, "learning_rate": 8.182413576596385e-07, - "logits/chosen": 0.031790219247341156, - "logits/rejected": 0.12696322798728943, - "logps/chosen": -1.224799394607544, - "logps/rejected": -1.4564673900604248, - "loss": 1.9856, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.224799394607544, - "rewards/margins": 0.23166804015636444, - "rewards/rejected": -1.4564673900604248, - "semantic_entropy": 0.8123728036880493, + "logits/chosen": -0.06397426128387451, + "logits/rejected": 0.01084409561008215, + "logps/chosen": -1.2019623517990112, + "logps/rejected": -1.3936353921890259, + "loss": 1.5667, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2019623517990112, + "rewards/margins": 0.19167283177375793, + "rewards/rejected": -1.3936353921890259, "step": 1975 }, { "epoch": 1.0597089814350227, - "grad_norm": 8.157139086496546, + "grad_norm": 7.66250379267884, "learning_rate": 8.170386052085389e-07, - "logits/chosen": 0.032434239983558655, - "logits/rejected": 0.1504647582769394, - "logps/chosen": -1.2983589172363281, - "logps/rejected": -1.518869400024414, - "loss": 2.0384, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2983589172363281, - "rewards/margins": 0.2205105721950531, - "rewards/rejected": -1.518869400024414, - "semantic_entropy": 0.7908387184143066, + "logits/chosen": -0.014584923163056374, + "logits/rejected": 0.0913209542632103, + "logps/chosen": -1.276272177696228, + "logps/rejected": -1.4327203035354614, + "loss": 1.6424, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.276272177696228, + "rewards/margins": 0.15644793212413788, + "rewards/rejected": -1.4327203035354614, "step": 1980 }, { "epoch": 1.0623850142164242, - "grad_norm": 7.536876492898136, + "grad_norm": 7.6835509995952345, "learning_rate": 8.158327768475008e-07, - "logits/chosen": -0.02274470403790474, - "logits/rejected": 0.13517804443836212, - "logps/chosen": -1.3373349905014038, - "logps/rejected": -1.4611749649047852, - "loss": 2.0847, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3373349905014038, - "rewards/margins": 0.1238398551940918, - "rewards/rejected": -1.4611749649047852, - "semantic_entropy": 0.782866358757019, + "logits/chosen": -0.10186652094125748, + "logits/rejected": 0.030913090333342552, + "logps/chosen": -1.2977344989776611, + "logps/rejected": -1.3779385089874268, + "loss": 1.6738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2977344989776611, + "rewards/margins": 0.08020380884408951, + "rewards/rejected": -1.3779385089874268, "step": 1985 }, { "epoch": 1.0650610469978257, - "grad_norm": 7.9100389576496, + "grad_norm": 7.38233771315589, "learning_rate": 8.146238842754767e-07, - "logits/chosen": -0.04121997952461243, - "logits/rejected": 0.05382202938199043, - "logps/chosen": -1.326257586479187, - "logps/rejected": -1.5107661485671997, - "loss": 2.0481, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.326257586479187, - "rewards/margins": 0.18450842797756195, - "rewards/rejected": -1.5107661485671997, - "semantic_entropy": 0.7959458827972412, + "logits/chosen": -0.127910777926445, + "logits/rejected": -0.053719568997621536, + "logps/chosen": -1.301751732826233, + "logps/rejected": -1.4438529014587402, + "loss": 1.6436, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.301751732826233, + "rewards/margins": 0.1421012133359909, + "rewards/rejected": -1.4438529014587402, "step": 1990 }, { "epoch": 1.0677370797792274, - "grad_norm": 7.393372917797862, + "grad_norm": 7.261162820145197, "learning_rate": 8.134119392211476e-07, - "logits/chosen": 0.04782380536198616, - "logits/rejected": 0.19727477431297302, - "logps/chosen": -1.2395175695419312, - "logps/rejected": -1.5628464221954346, - "loss": 1.9818, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2395175695419312, - "rewards/margins": 0.3233289122581482, - "rewards/rejected": -1.5628464221954346, - "semantic_entropy": 0.8064082860946655, + "logits/chosen": -0.023806992918252945, + "logits/rejected": 0.10135696083307266, + "logps/chosen": -1.2175931930541992, + "logps/rejected": -1.4716691970825195, + "loss": 1.576, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2175931930541992, + "rewards/margins": 0.25407615303993225, + "rewards/rejected": -1.4716691970825195, "step": 1995 }, { "epoch": 1.0704131125606289, - "grad_norm": 15.049418095336085, + "grad_norm": 11.282402041281472, "learning_rate": 8.121969534428094e-07, - "logits/chosen": -0.048318177461624146, - "logits/rejected": 0.09915760904550552, - "logps/chosen": -1.3366820812225342, - "logps/rejected": -1.4741013050079346, - "loss": 2.1246, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3366820812225342, - "rewards/margins": 0.13741900026798248, - "rewards/rejected": -1.4741013050079346, - "semantic_entropy": 0.799536943435669, + "logits/chosen": -0.10262684524059296, + "logits/rejected": 0.02830558642745018, + "logps/chosen": -1.3009599447250366, + "logps/rejected": -1.3559610843658447, + "loss": 1.7104, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.3009599447250366, + "rewards/margins": 0.05500108003616333, + "rewards/rejected": -1.3559610843658447, "step": 2000 }, { "epoch": 1.0704131125606289, - "eval_logits/chosen": 0.36331358551979065, - "eval_logits/rejected": 0.4596547782421112, - "eval_logps/chosen": -1.3185980319976807, - "eval_logps/rejected": -1.5339993238449097, - "eval_loss": 2.0504531860351562, - "eval_rewards/accuracies": 0.577151358127594, - "eval_rewards/chosen": -1.3185980319976807, - "eval_rewards/margins": 0.21540121734142303, - "eval_rewards/rejected": -1.5339993238449097, - "eval_runtime": 34.7693, - "eval_samples_per_second": 38.684, - "eval_semantic_entropy": 0.7883589863777161, - "eval_steps_per_second": 9.692, + "eval_logits/chosen": 0.17809128761291504, + "eval_logits/rejected": 0.2528059482574463, + "eval_logps/chosen": -1.3006004095077515, + "eval_logps/rejected": -1.4569226503372192, + "eval_loss": 1.6553415060043335, + "eval_rewards/accuracies": 0.5660237669944763, + "eval_rewards/chosen": -1.3006004095077515, + "eval_rewards/margins": 0.15632230043411255, + "eval_rewards/rejected": -1.4569226503372192, + "eval_runtime": 40.4167, + "eval_samples_per_second": 33.278, + "eval_steps_per_second": 8.338, "step": 2000 }, { "epoch": 1.0730891453420304, - "grad_norm": 9.046281060813591, + "grad_norm": 8.336898237104945, "learning_rate": 8.109789387282599e-07, - "logits/chosen": 0.03845970332622528, - "logits/rejected": 0.11954043060541153, - "logps/chosen": -1.3330227136611938, - "logps/rejected": -1.4478117227554321, - "loss": 2.0881, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3330227136611938, - "rewards/margins": 0.11478898674249649, - "rewards/rejected": -1.4478117227554321, - "semantic_entropy": 0.805001437664032, + "logits/chosen": -0.08942779153585434, + "logits/rejected": -0.022323116660118103, + "logps/chosen": -1.3037192821502686, + "logps/rejected": -1.3781172037124634, + "loss": 1.6777, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3037192821502686, + "rewards/margins": 0.07439792156219482, + "rewards/rejected": -1.3781172037124634, "step": 2005 }, { "epoch": 1.075765178123432, - "grad_norm": 10.256124217400894, + "grad_norm": 9.214908562514987, "learning_rate": 8.097579068946827e-07, - "logits/chosen": 0.04520503804087639, - "logits/rejected": 0.15148118138313293, - "logps/chosen": -1.2535759210586548, - "logps/rejected": -1.433410882949829, - "loss": 2.0351, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2535759210586548, - "rewards/margins": 0.17983505129814148, - "rewards/rejected": -1.433410882949829, - "semantic_entropy": 0.8159645199775696, + "logits/chosen": -0.02387317083775997, + "logits/rejected": 0.06140471249818802, + "logps/chosen": -1.2356733083724976, + "logps/rejected": -1.3744370937347412, + "loss": 1.6234, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2356733083724976, + "rewards/margins": 0.13876380026340485, + "rewards/rejected": -1.3744370937347412, "step": 2010 }, { "epoch": 1.0784412109048336, - "grad_norm": 7.344453328822589, + "grad_norm": 6.725378431673041, "learning_rate": 8.085338697885344e-07, - "logits/chosen": -0.02262263372540474, - "logits/rejected": 0.12338912487030029, - "logps/chosen": -1.2342798709869385, - "logps/rejected": -1.444035530090332, - "loss": 2.0145, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2342798709869385, - "rewards/margins": 0.20975565910339355, - "rewards/rejected": -1.444035530090332, - "semantic_entropy": 0.8241077661514282, + "logits/chosen": -0.09659741073846817, + "logits/rejected": 0.035605426877737045, + "logps/chosen": -1.2066147327423096, + "logps/rejected": -1.3792082071304321, + "loss": 1.5918, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2066147327423096, + "rewards/margins": 0.17259351909160614, + "rewards/rejected": -1.3792082071304321, "step": 2015 }, { "epoch": 1.081117243686235, - "grad_norm": 7.496592903248984, + "grad_norm": 7.312873076103676, "learning_rate": 8.073068392854282e-07, - "logits/chosen": -0.049902092665433884, - "logits/rejected": 0.13317637145519257, - "logps/chosen": -1.325439691543579, - "logps/rejected": -1.5331872701644897, - "loss": 2.0448, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.325439691543579, - "rewards/margins": 0.2077476978302002, - "rewards/rejected": -1.5331872701644897, - "semantic_entropy": 0.7916399240493774, + "logits/chosen": -0.1383553296327591, + "logits/rejected": -0.005746991373598576, + "logps/chosen": -1.3074740171432495, + "logps/rejected": -1.4675743579864502, + "loss": 1.6512, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3074740171432495, + "rewards/margins": 0.16010025143623352, + "rewards/rejected": -1.4675743579864502, "step": 2020 }, { "epoch": 1.0837932764676368, - "grad_norm": 6.372316019210651, + "grad_norm": 7.354426390150471, "learning_rate": 8.060768272900193e-07, - "logits/chosen": 0.04581315070390701, - "logits/rejected": 0.1767156422138214, - "logps/chosen": -1.2968662977218628, - "logps/rejected": -1.532001256942749, - "loss": 2.024, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2968662977218628, - "rewards/margins": 0.2351350039243698, - "rewards/rejected": -1.532001256942749, - "semantic_entropy": 0.7861354351043701, + "logits/chosen": -0.03547487407922745, + "logits/rejected": 0.06639555096626282, + "logps/chosen": -1.279608964920044, + "logps/rejected": -1.4450252056121826, + "loss": 1.6361, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.279608964920044, + "rewards/margins": 0.1654161959886551, + "rewards/rejected": -1.4450252056121826, "step": 2025 }, { "epoch": 1.0864693092490383, - "grad_norm": 6.788766807266465, + "grad_norm": 6.417914554300857, "learning_rate": 8.0484384573589e-07, - "logits/chosen": -0.04738964885473251, - "logits/rejected": -0.025576937943696976, - "logps/chosen": -1.2473602294921875, - "logps/rejected": -1.433471918106079, - "loss": 2.0237, + "logits/chosen": -0.12844283878803253, + "logits/rejected": -0.11059192568063736, + "logps/chosen": -1.2247509956359863, + "logps/rejected": -1.3813955783843994, + "loss": 1.6081, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2473602294921875, - "rewards/margins": 0.18611164391040802, - "rewards/rejected": -1.433471918106079, - "semantic_entropy": 0.8121569752693176, + "rewards/chosen": -1.2247509956359863, + "rewards/margins": 0.15664449334144592, + "rewards/rejected": -1.3813955783843994, "step": 2030 }, { "epoch": 1.0891453420304398, - "grad_norm": 7.346242709370745, + "grad_norm": 6.961956058534192, "learning_rate": 8.03607906585432e-07, - "logits/chosen": -0.06825561821460724, - "logits/rejected": 0.0949849784374237, - "logps/chosen": -1.247241735458374, - "logps/rejected": -1.4357866048812866, - "loss": 2.0112, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.247241735458374, - "rewards/margins": 0.1885448694229126, - "rewards/rejected": -1.4357866048812866, - "semantic_entropy": 0.8164774179458618, + "logits/chosen": -0.13266727328300476, + "logits/rejected": 0.008277666755020618, + "logps/chosen": -1.2267061471939087, + "logps/rejected": -1.3689435720443726, + "loss": 1.5991, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2267061471939087, + "rewards/margins": 0.142237588763237, + "rewards/rejected": -1.3689435720443726, "step": 2035 }, { "epoch": 1.0918213748118415, - "grad_norm": 9.962051708647408, + "grad_norm": 8.991338068218205, "learning_rate": 8.023690218297329e-07, - "logits/chosen": -0.1509312242269516, - "logits/rejected": -0.08851532638072968, - "logps/chosen": -1.274482011795044, - "logps/rejected": -1.4187238216400146, - "loss": 2.0289, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.274482011795044, - "rewards/margins": 0.14424176514148712, - "rewards/rejected": -1.4187238216400146, - "semantic_entropy": 0.7936094403266907, + "logits/chosen": -0.19082000851631165, + "logits/rejected": -0.1373734176158905, + "logps/chosen": -1.2447383403778076, + "logps/rejected": -1.355181097984314, + "loss": 1.6218, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2447383403778076, + "rewards/margins": 0.11044273525476456, + "rewards/rejected": -1.355181097984314, "step": 2040 }, { "epoch": 1.094497407593243, - "grad_norm": 11.622046675876497, + "grad_norm": 10.371267959736507, "learning_rate": 8.01127203488458e-07, - "logits/chosen": -0.0111696757376194, - "logits/rejected": 0.03334783762693405, - "logps/chosen": -1.2602503299713135, - "logps/rejected": -1.4635040760040283, - "loss": 2.025, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2602503299713135, - "rewards/margins": 0.20325377583503723, - "rewards/rejected": -1.4635040760040283, - "semantic_entropy": 0.8120501637458801, + "logits/chosen": -0.0713386982679367, + "logits/rejected": -0.03666887432336807, + "logps/chosen": -1.242963433265686, + "logps/rejected": -1.4130642414093018, + "loss": 1.6154, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.242963433265686, + "rewards/margins": 0.17010077834129333, + "rewards/rejected": -1.4130642414093018, "step": 2045 }, { "epoch": 1.0971734403746445, - "grad_norm": 9.748815999988883, + "grad_norm": 8.380066198572631, "learning_rate": 7.998824636097339e-07, - "logits/chosen": -0.12438543885946274, - "logits/rejected": -0.0023122006095945835, - "logps/chosen": -1.3013404607772827, - "logps/rejected": -1.4263468980789185, - "loss": 2.0826, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3013404607772827, - "rewards/margins": 0.12500649690628052, - "rewards/rejected": -1.4263468980789185, - "semantic_entropy": 0.7970176339149475, + "logits/chosen": -0.15726640820503235, + "logits/rejected": -0.06450683623552322, + "logps/chosen": -1.2815487384796143, + "logps/rejected": -1.3583753108978271, + "loss": 1.6766, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2815487384796143, + "rewards/margins": 0.07682657241821289, + "rewards/rejected": -1.3583753108978271, "step": 2050 }, { "epoch": 1.0998494731560462, - "grad_norm": 8.91959075076058, + "grad_norm": 8.343216936885899, "learning_rate": 7.986348142700328e-07, - "logits/chosen": -0.023914365097880363, - "logits/rejected": 0.09678036719560623, - "logps/chosen": -1.2690775394439697, - "logps/rejected": -1.427825689315796, - "loss": 2.0212, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2690775394439697, - "rewards/margins": 0.15874823927879333, - "rewards/rejected": -1.427825689315796, - "semantic_entropy": 0.8028534054756165, + "logits/chosen": -0.06073587015271187, + "logits/rejected": 0.04076801612973213, + "logps/chosen": -1.2568342685699463, + "logps/rejected": -1.366917610168457, + "loss": 1.6255, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2568342685699463, + "rewards/margins": 0.1100834459066391, + "rewards/rejected": -1.366917610168457, "step": 2055 }, { "epoch": 1.1025255059374477, - "grad_norm": 9.230908880998141, + "grad_norm": 8.089157458893371, "learning_rate": 7.973842675740539e-07, - "logits/chosen": -0.013653213158249855, - "logits/rejected": 0.03546537458896637, - "logps/chosen": -1.3021272420883179, - "logps/rejected": -1.5597717761993408, - "loss": 2.0238, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3021272420883179, - "rewards/margins": 0.257644385099411, - "rewards/rejected": -1.5597717761993408, - "semantic_entropy": 0.7928792238235474, + "logits/chosen": -0.03909459710121155, + "logits/rejected": -0.0011225551133975387, + "logps/chosen": -1.2822999954223633, + "logps/rejected": -1.475353717803955, + "loss": 1.6247, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2822999954223633, + "rewards/margins": 0.193053737282753, + "rewards/rejected": -1.475353717803955, "step": 2060 }, { "epoch": 1.1052015387188494, - "grad_norm": 8.470596064865148, + "grad_norm": 7.708362290838383, "learning_rate": 7.961308356546066e-07, - "logits/chosen": -0.05723114684224129, - "logits/rejected": 0.06854735314846039, - "logps/chosen": -1.2799628973007202, - "logps/rejected": -1.4498138427734375, - "loss": 2.0457, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2799628973007202, - "rewards/margins": 0.16985103487968445, - "rewards/rejected": -1.4498138427734375, - "semantic_entropy": 0.8039534687995911, + "logits/chosen": -0.07062169164419174, + "logits/rejected": 0.03906797990202904, + "logps/chosen": -1.2624900341033936, + "logps/rejected": -1.367241382598877, + "loss": 1.6437, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2624900341033936, + "rewards/margins": 0.10475137084722519, + "rewards/rejected": -1.367241382598877, "step": 2065 }, { "epoch": 1.107877571500251, - "grad_norm": 6.879708138429533, + "grad_norm": 8.132968939547782, "learning_rate": 7.948745306724931e-07, - "logits/chosen": -0.06296677142381668, - "logits/rejected": 0.08676932007074356, - "logps/chosen": -1.2351148128509521, - "logps/rejected": -1.5215286016464233, - "loss": 1.9843, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2351148128509521, - "rewards/margins": 0.28641384840011597, - "rewards/rejected": -1.5215286016464233, - "semantic_entropy": 0.8108504414558411, + "logits/chosen": -0.10409660637378693, + "logits/rejected": 0.02609199844300747, + "logps/chosen": -1.2147948741912842, + "logps/rejected": -1.4452593326568604, + "loss": 1.5803, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2147948741912842, + "rewards/margins": 0.2304643839597702, + "rewards/rejected": -1.4452593326568604, "step": 2070 }, { "epoch": 1.1105536042816524, - "grad_norm": 10.281724019392756, + "grad_norm": 7.762202574182165, "learning_rate": 7.936153648163897e-07, - "logits/chosen": -0.0729776918888092, - "logits/rejected": 0.02537897601723671, - "logps/chosen": -1.285667061805725, - "logps/rejected": -1.5334974527359009, - "loss": 2.0094, + "logits/chosen": -0.14026805758476257, + "logits/rejected": -0.05574485659599304, + "logps/chosen": -1.257171630859375, + "logps/rejected": -1.452803373336792, + "loss": 1.6068, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.285667061805725, - "rewards/margins": 0.2478303164243698, - "rewards/rejected": -1.5334974527359009, - "semantic_entropy": 0.7966839075088501, + "rewards/chosen": -1.257171630859375, + "rewards/margins": 0.19563186168670654, + "rewards/rejected": -1.452803373336792, "step": 2075 }, { "epoch": 1.1132296370630541, - "grad_norm": 5.658066237322829, + "grad_norm": 5.722052395771005, "learning_rate": 7.92353350302729e-07, - "logits/chosen": -0.105830118060112, - "logits/rejected": 0.049424778670072556, - "logps/chosen": -1.2078402042388916, - "logps/rejected": -1.5260502099990845, - "loss": 1.9342, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2078402042388916, - "rewards/margins": 0.3182099163532257, - "rewards/rejected": -1.5260502099990845, - "semantic_entropy": 0.8191325068473816, + "logits/chosen": -0.1834503412246704, + "logits/rejected": -0.05271712690591812, + "logps/chosen": -1.1889479160308838, + "logps/rejected": -1.4604111909866333, + "loss": 1.5221, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1889479160308838, + "rewards/margins": 0.27146315574645996, + "rewards/rejected": -1.4604111909866333, "step": 2080 }, { "epoch": 1.1159056698444556, - "grad_norm": 12.694433127022538, + "grad_norm": 11.535482204762738, "learning_rate": 7.910884993755816e-07, - "logits/chosen": -0.10474447160959244, - "logits/rejected": 0.0023083791602402925, - "logps/chosen": -1.2569905519485474, - "logps/rejected": -1.488846778869629, - "loss": 2.0114, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2569905519485474, - "rewards/margins": 0.23185637593269348, - "rewards/rejected": -1.488846778869629, - "semantic_entropy": 0.8038360476493835, + "logits/chosen": -0.15596617758274078, + "logits/rejected": -0.06275282800197601, + "logps/chosen": -1.237905740737915, + "logps/rejected": -1.3934085369110107, + "loss": 1.6117, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.237905740737915, + "rewards/margins": 0.15550284087657928, + "rewards/rejected": -1.3934085369110107, "step": 2085 }, { "epoch": 1.118581702625857, - "grad_norm": 9.38006102694871, + "grad_norm": 8.74537384643787, "learning_rate": 7.898208243065367e-07, - "logits/chosen": -0.16282737255096436, - "logits/rejected": -0.1553858518600464, - "logps/chosen": -1.237786054611206, - "logps/rejected": -1.4228355884552002, - "loss": 2.0154, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.237786054611206, - "rewards/margins": 0.18504968285560608, - "rewards/rejected": -1.4228355884552002, - "semantic_entropy": 0.8132780194282532, + "logits/chosen": -0.21057133376598358, + "logits/rejected": -0.2055818736553192, + "logps/chosen": -1.2205783128738403, + "logps/rejected": -1.3647421598434448, + "loss": 1.6084, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2205783128738403, + "rewards/margins": 0.14416398108005524, + "rewards/rejected": -1.3647421598434448, "step": 2090 }, { "epoch": 1.1212577354072588, - "grad_norm": 6.012799912827595, + "grad_norm": 4.88317103034377, "learning_rate": 7.88550337394583e-07, - "logits/chosen": -0.0706481784582138, - "logits/rejected": 0.06993953138589859, - "logps/chosen": -1.393044114112854, - "logps/rejected": -1.5504987239837646, - "loss": 2.1148, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.393044114112854, - "rewards/margins": 0.157454714179039, - "rewards/rejected": -1.5504987239837646, - "semantic_entropy": 0.7678667902946472, + "logits/chosen": -0.11465215682983398, + "logits/rejected": 0.0019001305336132646, + "logps/chosen": -1.3683791160583496, + "logps/rejected": -1.4810022115707397, + "loss": 1.7229, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3683791160583496, + "rewards/margins": 0.11262323707342148, + "rewards/rejected": -1.4810022115707397, "step": 2095 }, { "epoch": 1.1239337681886603, - "grad_norm": 9.165376113792247, + "grad_norm": 8.94810065146586, "learning_rate": 7.872770509659905e-07, - "logits/chosen": 0.02150576561689377, - "logits/rejected": 0.07556779682636261, - "logps/chosen": -1.3992087841033936, - "logps/rejected": -1.5149985551834106, - "loss": 2.107, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3992087841033936, - "rewards/margins": 0.1157897338271141, - "rewards/rejected": -1.5149985551834106, - "semantic_entropy": 0.756324052810669, + "logits/chosen": -0.07218264043331146, + "logits/rejected": -0.04129823297262192, + "logps/chosen": -1.3749353885650635, + "logps/rejected": -1.4446145296096802, + "loss": 1.7206, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3749353885650635, + "rewards/margins": 0.06967911124229431, + "rewards/rejected": -1.4446145296096802, "step": 2100 }, { "epoch": 1.1266098009700618, - "grad_norm": 7.418887878348668, + "grad_norm": 7.16251528042221, "learning_rate": 7.860009773741896e-07, - "logits/chosen": 0.025624120607972145, - "logits/rejected": 0.14256080985069275, - "logps/chosen": -1.3226193189620972, - "logps/rejected": -1.5097525119781494, - "loss": 2.0465, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3226193189620972, - "rewards/margins": 0.18713310360908508, - "rewards/rejected": -1.5097525119781494, - "semantic_entropy": 0.787027895450592, + "logits/chosen": -0.052335310727357864, + "logits/rejected": 0.04018264636397362, + "logps/chosen": -1.300136923789978, + "logps/rejected": -1.4270994663238525, + "loss": 1.6535, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.300136923789978, + "rewards/margins": 0.12696246802806854, + "rewards/rejected": -1.4270994663238525, "step": 2105 }, { "epoch": 1.1292858337514635, - "grad_norm": 9.246072581454596, + "grad_norm": 8.329444300535107, "learning_rate": 7.84722128999652e-07, - "logits/chosen": -0.04906727001070976, - "logits/rejected": 0.11425349861383438, - "logps/chosen": -1.2612407207489014, - "logps/rejected": -1.6420915126800537, - "loss": 1.9762, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2612407207489014, - "rewards/margins": 0.3808509409427643, - "rewards/rejected": -1.6420915126800537, - "semantic_entropy": 0.7930108904838562, + "logits/chosen": -0.13611355423927307, + "logits/rejected": 0.005605706479400396, + "logps/chosen": -1.243820309638977, + "logps/rejected": -1.5600693225860596, + "loss": 1.5738, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.243820309638977, + "rewards/margins": 0.3162487745285034, + "rewards/rejected": -1.5600693225860596, "step": 2110 }, { "epoch": 1.131961866532865, - "grad_norm": 8.482976009607345, + "grad_norm": 7.9381413036119435, "learning_rate": 7.834405182497699e-07, - "logits/chosen": 0.058322250843048096, - "logits/rejected": 0.11610279232263565, - "logps/chosen": -1.2708790302276611, - "logps/rejected": -1.4597584009170532, - "loss": 2.0116, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2708790302276611, - "rewards/margins": 0.1888793557882309, - "rewards/rejected": -1.4597584009170532, - "semantic_entropy": 0.7919198274612427, + "logits/chosen": -0.03070439025759697, + "logits/rejected": 0.010776013135910034, + "logps/chosen": -1.2483736276626587, + "logps/rejected": -1.3772048950195312, + "loss": 1.6129, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2483736276626587, + "rewards/margins": 0.12883132696151733, + "rewards/rejected": -1.3772048950195312, "step": 2115 }, { "epoch": 1.1346378993142665, - "grad_norm": 8.859454340426378, + "grad_norm": 8.610968983850155, "learning_rate": 7.821561575587368e-07, - "logits/chosen": -0.02575806714594364, - "logits/rejected": 0.013861206360161304, - "logps/chosen": -1.313529372215271, - "logps/rejected": -1.4739990234375, - "loss": 2.0587, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.313529372215271, - "rewards/margins": 0.16046956181526184, - "rewards/rejected": -1.4739990234375, - "semantic_entropy": 0.7935684323310852, + "logits/chosen": -0.15365591645240784, + "logits/rejected": -0.1279035359621048, + "logps/chosen": -1.2922980785369873, + "logps/rejected": -1.4172145128250122, + "loss": 1.6552, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2922980785369873, + "rewards/margins": 0.12491631507873535, + "rewards/rejected": -1.4172145128250122, "step": 2120 }, { "epoch": 1.1373139320956682, - "grad_norm": 6.909830428655394, + "grad_norm": 6.9824498893646885, "learning_rate": 7.808690593874254e-07, - "logits/chosen": -0.008657902479171753, - "logits/rejected": 0.05696826055645943, - "logps/chosen": -1.2222939729690552, - "logps/rejected": -1.5031594038009644, - "loss": 1.9572, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2222939729690552, - "rewards/margins": 0.2808656096458435, - "rewards/rejected": -1.5031594038009644, - "semantic_entropy": 0.8107255101203918, + "logits/chosen": -0.10298570245504379, + "logits/rejected": -0.05632113292813301, + "logps/chosen": -1.2003002166748047, + "logps/rejected": -1.4210450649261475, + "loss": 1.5523, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2003002166748047, + "rewards/margins": 0.2207448035478592, + "rewards/rejected": -1.4210450649261475, "step": 2125 }, { "epoch": 1.1399899648770697, - "grad_norm": 10.724106791378182, + "grad_norm": 8.891391222995903, "learning_rate": 7.79579236223268e-07, - "logits/chosen": -0.0033174841664731503, - "logits/rejected": 0.23977334797382355, - "logps/chosen": -1.316141963005066, - "logps/rejected": -1.5274380445480347, - "loss": 2.0362, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.316141963005066, - "rewards/margins": 0.21129612624645233, - "rewards/rejected": -1.5274380445480347, - "semantic_entropy": 0.788119912147522, + "logits/chosen": -0.08387793600559235, + "logits/rejected": 0.10497613251209259, + "logps/chosen": -1.2898766994476318, + "logps/rejected": -1.453222632408142, + "loss": 1.6373, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2898766994476318, + "rewards/margins": 0.1633458435535431, + "rewards/rejected": -1.453222632408142, "step": 2130 }, { "epoch": 1.1426659976584714, - "grad_norm": 7.046612138948062, + "grad_norm": 6.476456909040568, "learning_rate": 7.782867005801346e-07, - "logits/chosen": -0.00464610755443573, - "logits/rejected": 0.1614435464143753, - "logps/chosen": -1.315813422203064, - "logps/rejected": -1.5444300174713135, - "loss": 2.0404, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.315813422203064, - "rewards/margins": 0.22861655056476593, - "rewards/rejected": -1.5444300174713135, - "semantic_entropy": 0.7877010107040405, + "logits/chosen": -0.08066694438457489, + "logits/rejected": 0.04436764121055603, + "logps/chosen": -1.2959989309310913, + "logps/rejected": -1.446028470993042, + "loss": 1.6487, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2959989309310913, + "rewards/margins": 0.15002937614917755, + "rewards/rejected": -1.446028470993042, "step": 2135 }, { "epoch": 1.145342030439873, - "grad_norm": 8.597150217290602, + "grad_norm": 7.992391927922851, "learning_rate": 7.769914649982117e-07, - "logits/chosen": -0.055145204067230225, - "logits/rejected": 0.08367632329463959, - "logps/chosen": -1.29514479637146, - "logps/rejected": -1.4626885652542114, - "loss": 2.0509, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.29514479637146, - "rewards/margins": 0.16754384338855743, - "rewards/rejected": -1.4626885652542114, - "semantic_entropy": 0.8096902966499329, + "logits/chosen": -0.1058705598115921, + "logits/rejected": 0.00794590637087822, + "logps/chosen": -1.2681523561477661, + "logps/rejected": -1.3907512426376343, + "loss": 1.6402, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2681523561477661, + "rewards/margins": 0.12259910255670547, + "rewards/rejected": -1.3907512426376343, "step": 2140 }, { "epoch": 1.1480180632212744, - "grad_norm": 8.339971979435767, + "grad_norm": 7.766977663702184, "learning_rate": 7.756935420438803e-07, - "logits/chosen": -0.004235397092998028, - "logits/rejected": 0.08324207365512848, - "logps/chosen": -1.1817706823349, - "logps/rejected": -1.4775861501693726, - "loss": 1.954, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.1817706823349, - "rewards/margins": 0.29581543803215027, - "rewards/rejected": -1.4775861501693726, - "semantic_entropy": 0.819560170173645, + "logits/chosen": -0.07947366684675217, + "logits/rejected": -0.01362493634223938, + "logps/chosen": -1.1541688442230225, + "logps/rejected": -1.3922502994537354, + "loss": 1.5354, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.1541688442230225, + "rewards/margins": 0.23808152973651886, + "rewards/rejected": -1.3922502994537354, "step": 2145 }, { "epoch": 1.1506940960026761, - "grad_norm": 5.499676423671464, + "grad_norm": 5.489764217394201, "learning_rate": 7.743929443095951e-07, - "logits/chosen": -0.0550382137298584, - "logits/rejected": 0.001202210783958435, - "logps/chosen": -1.3445276021957397, - "logps/rejected": -1.547282099723816, - "loss": 2.0586, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3445276021957397, - "rewards/margins": 0.2027544230222702, - "rewards/rejected": -1.547282099723816, - "semantic_entropy": 0.7729575634002686, + "logits/chosen": -0.09513584524393082, + "logits/rejected": -0.053471989929676056, + "logps/chosen": -1.3207443952560425, + "logps/rejected": -1.4349052906036377, + "loss": 1.6716, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3207443952560425, + "rewards/margins": 0.1141607016324997, + "rewards/rejected": -1.4349052906036377, "step": 2150 }, { "epoch": 1.1533701287840776, - "grad_norm": 6.473506673489069, + "grad_norm": 6.564842400730169, "learning_rate": 7.730896844137609e-07, - "logits/chosen": 0.031207948923110962, - "logits/rejected": 0.09783817827701569, - "logps/chosen": -1.3366258144378662, - "logps/rejected": -1.5274697542190552, - "loss": 2.0341, + "logits/chosen": -0.04338967055082321, + "logits/rejected": 0.007275203708559275, + "logps/chosen": -1.3063982725143433, + "logps/rejected": -1.4485085010528564, + "loss": 1.6369, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3366258144378662, - "rewards/margins": 0.19084373116493225, - "rewards/rejected": -1.5274697542190552, - "semantic_entropy": 0.7779184579849243, + "rewards/chosen": -1.3063982725143433, + "rewards/margins": 0.1421101987361908, + "rewards/rejected": -1.4485085010528564, "step": 2155 }, { "epoch": 1.1560461615654791, - "grad_norm": 8.835825021503956, + "grad_norm": 8.656188129297469, "learning_rate": 7.717837750006106e-07, - "logits/chosen": -0.0980040580034256, - "logits/rejected": 0.01028486154973507, - "logps/chosen": -1.2548919916152954, - "logps/rejected": -1.509281039237976, - "loss": 1.9958, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2548919916152954, - "rewards/margins": 0.2543889880180359, - "rewards/rejected": -1.509281039237976, - "semantic_entropy": 0.7877534627914429, + "logits/chosen": -0.14563269913196564, + "logits/rejected": -0.050481997430324554, + "logps/chosen": -1.225486397743225, + "logps/rejected": -1.425831913948059, + "loss": 1.5927, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.225486397743225, + "rewards/margins": 0.20034563541412354, + "rewards/rejected": -1.425831913948059, "step": 2160 }, { "epoch": 1.1587221943468808, - "grad_norm": 8.271327898672205, + "grad_norm": 6.986966699998474, "learning_rate": 7.704752287400832e-07, - "logits/chosen": -0.0610247440636158, - "logits/rejected": 0.11114144325256348, - "logps/chosen": -1.2976696491241455, - "logps/rejected": -1.5627309083938599, - "loss": 2.0145, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2976696491241455, - "rewards/margins": 0.2650611996650696, - "rewards/rejected": -1.5627309083938599, - "semantic_entropy": 0.7958112359046936, + "logits/chosen": -0.11772508919239044, + "logits/rejected": 0.03461534529924393, + "logps/chosen": -1.2676708698272705, + "logps/rejected": -1.477513074874878, + "loss": 1.6074, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2676708698272705, + "rewards/margins": 0.20984220504760742, + "rewards/rejected": -1.477513074874878, "step": 2165 }, { "epoch": 1.1613982271282823, - "grad_norm": 7.367775189859744, + "grad_norm": 5.522124273534204, "learning_rate": 7.691640583277004e-07, - "logits/chosen": -0.036937564611434937, - "logits/rejected": 0.12986062467098236, - "logps/chosen": -1.2528033256530762, - "logps/rejected": -1.5739809274673462, - "loss": 1.9942, + "logits/chosen": -0.10463432222604752, + "logits/rejected": 0.03196742385625839, + "logps/chosen": -1.2275731563568115, + "logps/rejected": -1.4633052349090576, + "loss": 1.5896, "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2528033256530762, - "rewards/margins": 0.32117748260498047, - "rewards/rejected": -1.5739809274673462, - "semantic_entropy": 0.8013005256652832, + "rewards/chosen": -1.2275731563568115, + "rewards/margins": 0.23573215305805206, + "rewards/rejected": -1.4633052349090576, "step": 2170 }, { "epoch": 1.1640742599096838, - "grad_norm": 6.321138290200016, + "grad_norm": 6.115225759248435, "learning_rate": 7.678502764844433e-07, - "logits/chosen": -0.06491942703723907, - "logits/rejected": 0.10677101463079453, - "logps/chosen": -1.3190665245056152, - "logps/rejected": -1.4719650745391846, - "loss": 2.0565, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3190665245056152, - "rewards/margins": 0.1528986245393753, - "rewards/rejected": -1.4719650745391846, - "semantic_entropy": 0.7967050075531006, + "logits/chosen": -0.11995580047369003, + "logits/rejected": 0.019036246463656425, + "logps/chosen": -1.300389051437378, + "logps/rejected": -1.3865238428115845, + "loss": 1.6644, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.300389051437378, + "rewards/margins": 0.08613482862710953, + "rewards/rejected": -1.3865238428115845, "step": 2175 }, { "epoch": 1.1667502926910855, - "grad_norm": 7.660731521759613, + "grad_norm": 6.201062151895898, "learning_rate": 7.665338959566288e-07, - "logits/chosen": -0.033620767295360565, - "logits/rejected": 0.047455258667469025, - "logps/chosen": -1.2621140480041504, - "logps/rejected": -1.4954121112823486, - "loss": 2.0003, + "logits/chosen": -0.11636760085821152, + "logits/rejected": -0.05718974396586418, + "logps/chosen": -1.234830617904663, + "logps/rejected": -1.407841444015503, + "loss": 1.5975, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2621140480041504, - "rewards/margins": 0.2332981824874878, - "rewards/rejected": -1.4954121112823486, - "semantic_entropy": 0.8004210591316223, + "rewards/chosen": -1.234830617904663, + "rewards/margins": 0.17301085591316223, + "rewards/rejected": -1.407841444015503, "step": 2180 }, { "epoch": 1.169426325472487, - "grad_norm": 9.018605412309794, + "grad_norm": 8.382011760189442, "learning_rate": 7.652149295157868e-07, - "logits/chosen": 0.06298673897981644, - "logits/rejected": 0.208059623837471, - "logps/chosen": -1.2907410860061646, - "logps/rejected": -1.4562091827392578, - "loss": 2.0281, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2907410860061646, - "rewards/margins": 0.16546815633773804, - "rewards/rejected": -1.4562091827392578, - "semantic_entropy": 0.7983787655830383, + "logits/chosen": -0.04539335146546364, + "logits/rejected": 0.0673140436410904, + "logps/chosen": -1.265032172203064, + "logps/rejected": -1.3793548345565796, + "loss": 1.6196, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.265032172203064, + "rewards/margins": 0.11432279646396637, + "rewards/rejected": -1.3793548345565796, "step": 2185 }, { "epoch": 1.1721023582538885, - "grad_norm": 7.8766805185523046, + "grad_norm": 6.889993205894781, "learning_rate": 7.638933899585354e-07, - "logits/chosen": 0.14884532988071442, - "logits/rejected": 0.19815945625305176, - "logps/chosen": -1.2859556674957275, - "logps/rejected": -1.4986616373062134, - "loss": 2.0256, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2859556674957275, - "rewards/margins": 0.21270573139190674, - "rewards/rejected": -1.4986616373062134, - "semantic_entropy": 0.7964907884597778, + "logits/chosen": 0.018428370356559753, + "logits/rejected": 0.05882607027888298, + "logps/chosen": -1.25342857837677, + "logps/rejected": -1.3916927576065063, + "loss": 1.6259, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.25342857837677, + "rewards/margins": 0.13826411962509155, + "rewards/rejected": -1.3916927576065063, "step": 2190 }, { "epoch": 1.1747783910352902, - "grad_norm": 10.1996833053212, + "grad_norm": 9.041271982810615, "learning_rate": 7.625692901064573e-07, - "logits/chosen": 0.08653564751148224, - "logits/rejected": 0.18244849145412445, - "logps/chosen": -1.2636315822601318, - "logps/rejected": -1.5912013053894043, - "loss": 1.9884, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2636315822601318, - "rewards/margins": 0.3275696933269501, - "rewards/rejected": -1.5912013053894043, - "semantic_entropy": 0.7970950603485107, + "logits/chosen": -0.04829593747854233, + "logits/rejected": 0.029843684285879135, + "logps/chosen": -1.2311931848526, + "logps/rejected": -1.4910755157470703, + "loss": 1.5839, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2311931848526, + "rewards/margins": 0.2598823010921478, + "rewards/rejected": -1.4910755157470703, "step": 2195 }, { "epoch": 1.1774544238166917, - "grad_norm": 7.299173024972554, + "grad_norm": 7.524567421829956, "learning_rate": 7.61242642805975e-07, - "logits/chosen": -0.008318186737596989, - "logits/rejected": -0.021589037030935287, - "logps/chosen": -1.2870399951934814, - "logps/rejected": -1.5103387832641602, - "loss": 2.0217, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2870399951934814, - "rewards/margins": 0.22329886257648468, - "rewards/rejected": -1.5103387832641602, - "semantic_entropy": 0.7910685539245605, + "logits/chosen": -0.12873823940753937, + "logits/rejected": -0.1393408328294754, + "logps/chosen": -1.2622649669647217, + "logps/rejected": -1.4139279127120972, + "loss": 1.6225, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2622649669647217, + "rewards/margins": 0.15166299045085907, + "rewards/rejected": -1.4139279127120972, "step": 2200 }, { "epoch": 1.1801304565980932, - "grad_norm": 7.213949372344186, + "grad_norm": 7.098693824228783, "learning_rate": 7.599134609282266e-07, - "logits/chosen": -0.04548101872205734, - "logits/rejected": 0.1449245810508728, - "logps/chosen": -1.2256078720092773, - "logps/rejected": -1.4781060218811035, - "loss": 1.9978, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2256078720092773, - "rewards/margins": 0.25249817967414856, - "rewards/rejected": -1.4781060218811035, - "semantic_entropy": 0.8226034045219421, + "logits/chosen": -0.1603354513645172, + "logits/rejected": -0.010439865291118622, + "logps/chosen": -1.1993751525878906, + "logps/rejected": -1.374045729637146, + "loss": 1.5809, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1993751525878906, + "rewards/margins": 0.17467060685157776, + "rewards/rejected": -1.374045729637146, "step": 2205 }, { "epoch": 1.182806489379495, - "grad_norm": 9.18705823903751, + "grad_norm": 8.549821414081535, "learning_rate": 7.585817573689402e-07, - "logits/chosen": -0.10582272708415985, - "logits/rejected": 0.008244603872299194, - "logps/chosen": -1.1650890111923218, - "logps/rejected": -1.5030286312103271, - "loss": 1.9383, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1650890111923218, - "rewards/margins": 0.33793967962265015, - "rewards/rejected": -1.5030286312103271, - "semantic_entropy": 0.8308181762695312, + "logits/chosen": -0.21130235493183136, + "logits/rejected": -0.11883778870105743, + "logps/chosen": -1.1435682773590088, + "logps/rejected": -1.3947386741638184, + "loss": 1.5242, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1435682773590088, + "rewards/margins": 0.25117045640945435, + "rewards/rejected": -1.3947386741638184, "step": 2210 }, { "epoch": 1.1854825221608964, - "grad_norm": 8.865593844658033, + "grad_norm": 7.835406150352905, "learning_rate": 7.572475450483098e-07, - "logits/chosen": -0.0699770599603653, - "logits/rejected": 0.01554514653980732, - "logps/chosen": -1.3917657136917114, - "logps/rejected": -1.6706657409667969, - "loss": 2.0636, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3917657136917114, - "rewards/margins": 0.278900146484375, - "rewards/rejected": -1.6706657409667969, - "semantic_entropy": 0.7539466619491577, + "logits/chosen": -0.1915084421634674, + "logits/rejected": -0.1323508620262146, + "logps/chosen": -1.364108681678772, + "logps/rejected": -1.531037449836731, + "loss": 1.6856, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.364108681678772, + "rewards/margins": 0.16692869365215302, + "rewards/rejected": -1.531037449836731, "step": 2215 }, { "epoch": 1.188158554942298, - "grad_norm": 11.056070419744794, + "grad_norm": 6.2139292317312265, "learning_rate": 7.559108369108689e-07, - "logits/chosen": -0.1278703808784485, - "logits/rejected": -0.0024462491273880005, - "logps/chosen": -1.2356823682785034, - "logps/rejected": -1.4762585163116455, - "loss": 2.0046, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2356823682785034, - "rewards/margins": 0.24057602882385254, - "rewards/rejected": -1.4762585163116455, - "semantic_entropy": 0.814600944519043, + "logits/chosen": -0.2246476113796234, + "logits/rejected": -0.120921291410923, + "logps/chosen": -1.2061656713485718, + "logps/rejected": -1.3511766195297241, + "loss": 1.5949, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2061656713485718, + "rewards/margins": 0.14501085877418518, + "rewards/rejected": -1.3511766195297241, "step": 2220 }, { "epoch": 1.1908345877236997, - "grad_norm": 8.271989533397697, + "grad_norm": 8.07090751462236, "learning_rate": 7.54571645925366e-07, - "logits/chosen": -0.09756255894899368, - "logits/rejected": 0.10917206108570099, - "logps/chosen": -1.2540271282196045, - "logps/rejected": -1.5666847229003906, - "loss": 1.9733, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2540271282196045, - "rewards/margins": 0.3126576244831085, - "rewards/rejected": -1.5666847229003906, - "semantic_entropy": 0.8026211857795715, + "logits/chosen": -0.15275296568870544, + "logits/rejected": 0.017976239323616028, + "logps/chosen": -1.2264677286148071, + "logps/rejected": -1.4621691703796387, + "loss": 1.5633, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2264677286148071, + "rewards/margins": 0.23570141196250916, + "rewards/rejected": -1.4621691703796387, "step": 2225 }, { "epoch": 1.1935106205051011, - "grad_norm": 14.450927596316225, + "grad_norm": 12.851901330750579, "learning_rate": 7.532299850846378e-07, - "logits/chosen": -0.11772173643112183, - "logits/rejected": 0.016855159774422646, - "logps/chosen": -1.3069515228271484, - "logps/rejected": -1.6907488107681274, - "loss": 2.0293, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3069515228271484, - "rewards/margins": 0.3837973475456238, - "rewards/rejected": -1.6907488107681274, - "semantic_entropy": 0.783217191696167, + "logits/chosen": -0.2065228968858719, + "logits/rejected": -0.09520624577999115, + "logps/chosen": -1.2863290309906006, + "logps/rejected": -1.5747162103652954, + "loss": 1.635, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2863290309906006, + "rewards/margins": 0.2883870601654053, + "rewards/rejected": -1.5747162103652954, "step": 2230 }, { "epoch": 1.1961866532865026, - "grad_norm": 9.066081790085091, + "grad_norm": 9.135773391171075, "learning_rate": 7.518858674054838e-07, - "logits/chosen": -0.060464583337306976, - "logits/rejected": 0.12888970971107483, - "logps/chosen": -1.2510316371917725, - "logps/rejected": -1.5621397495269775, - "loss": 1.994, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2510316371917725, - "rewards/margins": 0.3111079931259155, - "rewards/rejected": -1.5621397495269775, - "semantic_entropy": 0.8048974275588989, + "logits/chosen": -0.176313579082489, + "logits/rejected": -0.02505047246813774, + "logps/chosen": -1.2322592735290527, + "logps/rejected": -1.4805638790130615, + "loss": 1.5927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2322592735290527, + "rewards/margins": 0.24830465018749237, + "rewards/rejected": -1.4805638790130615, "step": 2235 }, { "epoch": 1.1988626860679044, - "grad_norm": 9.594387405303804, + "grad_norm": 8.979940076666123, "learning_rate": 7.505393059285394e-07, - "logits/chosen": -0.046233952045440674, - "logits/rejected": 0.10925587266683578, - "logps/chosen": -1.2652842998504639, - "logps/rejected": -1.5169308185577393, - "loss": 1.984, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2652842998504639, - "rewards/margins": 0.2516464591026306, - "rewards/rejected": -1.5169308185577393, - "semantic_entropy": 0.7933940887451172, + "logits/chosen": -0.1995842158794403, + "logits/rejected": -0.08602572232484818, + "logps/chosen": -1.2478491067886353, + "logps/rejected": -1.4512939453125, + "loss": 1.5858, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2478491067886353, + "rewards/margins": 0.20344488322734833, + "rewards/rejected": -1.4512939453125, "step": 2240 }, { "epoch": 1.2015387188493059, - "grad_norm": 9.062658843804321, + "grad_norm": 9.39158424428714, "learning_rate": 7.491903137181501e-07, - "logits/chosen": -0.012463415041565895, - "logits/rejected": 0.03613627701997757, - "logps/chosen": -1.2368113994598389, - "logps/rejected": -1.4751683473587036, - "loss": 2.0087, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2368113994598389, - "rewards/margins": 0.2383570671081543, - "rewards/rejected": -1.4751683473587036, - "semantic_entropy": 0.8038736581802368, + "logits/chosen": -0.15345005691051483, + "logits/rejected": -0.11476775258779526, + "logps/chosen": -1.2204474210739136, + "logps/rejected": -1.4159668684005737, + "loss": 1.6053, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2204474210739136, + "rewards/margins": 0.1955193430185318, + "rewards/rejected": -1.4159668684005737, "step": 2245 }, { "epoch": 1.2042147516307076, - "grad_norm": 8.361895591161364, + "grad_norm": 7.494636699449011, "learning_rate": 7.478389038622441e-07, - "logits/chosen": 0.04597688838839531, - "logits/rejected": 0.06491976976394653, - "logps/chosen": -1.2090892791748047, - "logps/rejected": -1.5203975439071655, - "loss": 1.9498, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2090892791748047, - "rewards/margins": 0.311308354139328, - "rewards/rejected": -1.5203975439071655, - "semantic_entropy": 0.8206877708435059, + "logits/chosen": -0.1034126877784729, + "logits/rejected": -0.09635962545871735, + "logps/chosen": -1.1871516704559326, + "logps/rejected": -1.4469674825668335, + "loss": 1.5351, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1871516704559326, + "rewards/margins": 0.25981590151786804, + "rewards/rejected": -1.4469674825668335, "step": 2250 }, { "epoch": 1.206890784412109, - "grad_norm": 8.36617213318591, + "grad_norm": 6.954594962262396, "learning_rate": 7.46485089472206e-07, - "logits/chosen": -0.02347579412162304, - "logits/rejected": 0.08216370642185211, - "logps/chosen": -1.3362557888031006, - "logps/rejected": -1.4455738067626953, - "loss": 2.0886, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3362557888031006, - "rewards/margins": 0.10931817442178726, - "rewards/rejected": -1.4455738067626953, - "semantic_entropy": 0.7876633405685425, + "logits/chosen": -0.17379239201545715, + "logits/rejected": -0.08324754983186722, + "logps/chosen": -1.3096932172775269, + "logps/rejected": -1.3674468994140625, + "loss": 1.684, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3096932172775269, + "rewards/margins": 0.05775368958711624, + "rewards/rejected": -1.3674468994140625, "step": 2255 }, { "epoch": 1.2095668171935106, - "grad_norm": 7.947935461378185, + "grad_norm": 7.200104189312471, "learning_rate": 7.451288836827487e-07, - "logits/chosen": 0.022869199514389038, - "logits/rejected": 0.00841585360467434, - "logps/chosen": -1.2855030298233032, - "logps/rejected": -1.4511579275131226, - "loss": 2.0189, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2855030298233032, - "rewards/margins": 0.16565480828285217, - "rewards/rejected": -1.4511579275131226, - "semantic_entropy": 0.8032156825065613, + "logits/chosen": -0.10099692642688751, + "logits/rejected": -0.11718853563070297, + "logps/chosen": -1.2691152095794678, + "logps/rejected": -1.392392873764038, + "loss": 1.6195, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2691152095794678, + "rewards/margins": 0.1232776790857315, + "rewards/rejected": -1.392392873764038, "step": 2260 }, { "epoch": 1.2122428499749123, - "grad_norm": 8.103829288787791, + "grad_norm": 8.410740178588743, "learning_rate": 7.437702996517869e-07, - "logits/chosen": -0.07720647007226944, - "logits/rejected": 0.021549483761191368, - "logps/chosen": -1.3225862979888916, - "logps/rejected": -1.4839069843292236, - "loss": 2.0504, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3225862979888916, - "rewards/margins": 0.16132058203220367, - "rewards/rejected": -1.4839069843292236, - "semantic_entropy": 0.7962474822998047, + "logits/chosen": -0.18823234736919403, + "logits/rejected": -0.10828492790460587, + "logps/chosen": -1.300615668296814, + "logps/rejected": -1.4223439693450928, + "loss": 1.6481, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.300615668296814, + "rewards/margins": 0.12172845751047134, + "rewards/rejected": -1.4223439693450928, "step": 2265 }, { "epoch": 1.2149188827563138, - "grad_norm": 8.813740593042496, + "grad_norm": 8.488143673141622, "learning_rate": 7.424093505603087e-07, - "logits/chosen": -0.16498062014579773, - "logits/rejected": 0.010989139787852764, - "logps/chosen": -1.264560580253601, - "logps/rejected": -1.5233685970306396, - "loss": 1.9983, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.264560580253601, - "rewards/margins": 0.2588079273700714, - "rewards/rejected": -1.5233685970306396, - "semantic_entropy": 0.7983893156051636, + "logits/chosen": -0.2915375530719757, + "logits/rejected": -0.16539961099624634, + "logps/chosen": -1.239471673965454, + "logps/rejected": -1.4392166137695312, + "loss": 1.5973, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.239471673965454, + "rewards/margins": 0.1997450292110443, + "rewards/rejected": -1.4392166137695312, "step": 2270 }, { "epoch": 1.2175949155377153, - "grad_norm": 8.927905431133137, + "grad_norm": 8.296572864657373, "learning_rate": 7.410460496122482e-07, - "logits/chosen": -0.031638335436582565, - "logits/rejected": 0.09178587049245834, - "logps/chosen": -1.2444086074829102, - "logps/rejected": -1.5470324754714966, - "loss": 1.9644, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2444086074829102, - "rewards/margins": 0.30262380838394165, - "rewards/rejected": -1.5470324754714966, - "semantic_entropy": 0.787211537361145, + "logits/chosen": -0.1600394994020462, + "logits/rejected": -0.07500159740447998, + "logps/chosen": -1.2279202938079834, + "logps/rejected": -1.467357873916626, + "loss": 1.5723, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2279202938079834, + "rewards/margins": 0.2394375503063202, + "rewards/rejected": -1.467357873916626, "step": 2275 }, { "epoch": 1.220270948319117, - "grad_norm": 10.498951525504093, + "grad_norm": 10.477747918745342, "learning_rate": 7.396804100343572e-07, - "logits/chosen": -0.09466014802455902, - "logits/rejected": 0.07289015501737595, - "logps/chosen": -1.1898757219314575, - "logps/rejected": -1.4143764972686768, - "loss": 1.9808, + "logits/chosen": -0.19462502002716064, + "logits/rejected": -0.07571206986904144, + "logps/chosen": -1.1689397096633911, + "logps/rejected": -1.347392201423645, + "loss": 1.5665, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.1898757219314575, - "rewards/margins": 0.22450082004070282, - "rewards/rejected": -1.4143764972686768, - "semantic_entropy": 0.8181502223014832, + "rewards/chosen": -1.1689397096633911, + "rewards/margins": 0.17845246195793152, + "rewards/rejected": -1.347392201423645, "step": 2280 }, { "epoch": 1.2229469811005185, - "grad_norm": 8.619007313844211, + "grad_norm": 7.736285254184248, "learning_rate": 7.383124450760768e-07, - "logits/chosen": -0.02043074555695057, - "logits/rejected": 0.18445360660552979, - "logps/chosen": -1.3112406730651855, - "logps/rejected": -1.5405431985855103, - "loss": 2.0467, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3112406730651855, - "rewards/margins": 0.22930267453193665, - "rewards/rejected": -1.5405431985855103, - "semantic_entropy": 0.7737176418304443, + "logits/chosen": -0.13334044814109802, + "logits/rejected": 0.028622111305594444, + "logps/chosen": -1.2916946411132812, + "logps/rejected": -1.444271445274353, + "loss": 1.6559, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2916946411132812, + "rewards/margins": 0.15257684886455536, + "rewards/rejected": -1.444271445274353, "step": 2285 }, { "epoch": 1.22562301388192, - "grad_norm": 11.166453971795224, + "grad_norm": 9.641761863507444, "learning_rate": 7.369421680094091e-07, - "logits/chosen": -0.11235109716653824, - "logits/rejected": 0.04711022973060608, - "logps/chosen": -1.177172303199768, - "logps/rejected": -1.4215004444122314, - "loss": 1.9917, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.177172303199768, - "rewards/margins": 0.24432817101478577, - "rewards/rejected": -1.4215004444122314, - "semantic_entropy": 0.8288639783859253, + "logits/chosen": -0.22626297175884247, + "logits/rejected": -0.10098767280578613, + "logps/chosen": -1.1568272113800049, + "logps/rejected": -1.3377034664154053, + "loss": 1.5632, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1568272113800049, + "rewards/margins": 0.180876225233078, + "rewards/rejected": -1.3377034664154053, "step": 2290 }, { "epoch": 1.2282990466633217, - "grad_norm": 8.551694391398613, + "grad_norm": 7.916149713153111, "learning_rate": 7.355695921287881e-07, - "logits/chosen": -0.08187909424304962, - "logits/rejected": 0.006174634210765362, - "logps/chosen": -1.265074372291565, - "logps/rejected": -1.4966572523117065, - "loss": 2.0164, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.265074372291565, - "rewards/margins": 0.23158295452594757, - "rewards/rejected": -1.4966572523117065, - "semantic_entropy": 0.8085952997207642, + "logits/chosen": -0.20858672261238098, + "logits/rejected": -0.1384880095720291, + "logps/chosen": -1.2436957359313965, + "logps/rejected": -1.4017614126205444, + "loss": 1.6109, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2436957359313965, + "rewards/margins": 0.15806587040424347, + "rewards/rejected": -1.4017614126205444, "step": 2295 }, { "epoch": 1.2309750794447232, - "grad_norm": 10.584942593444998, + "grad_norm": 8.863796337651957, "learning_rate": 7.341947307509513e-07, - "logits/chosen": -0.03479890897870064, - "logits/rejected": 0.09125839173793793, - "logps/chosen": -1.2871692180633545, - "logps/rejected": -1.4339960813522339, - "loss": 2.0749, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2871692180633545, - "rewards/margins": 0.14682689309120178, - "rewards/rejected": -1.4339960813522339, - "semantic_entropy": 0.8087296485900879, + "logits/chosen": -0.14621509611606598, + "logits/rejected": -0.04588788002729416, + "logps/chosen": -1.2604069709777832, + "logps/rejected": -1.3629649877548218, + "loss": 1.6573, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2604069709777832, + "rewards/margins": 0.10255799442529678, + "rewards/rejected": -1.3629649877548218, "step": 2300 }, { "epoch": 1.233651112226125, - "grad_norm": 7.3984945064866965, + "grad_norm": 7.524654604515273, "learning_rate": 7.328175972148094e-07, - "logits/chosen": -0.08773527294397354, - "logits/rejected": 0.05295741558074951, - "logps/chosen": -1.3983827829360962, - "logps/rejected": -1.5909889936447144, - "loss": 2.0939, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3983827829360962, - "rewards/margins": 0.1926061362028122, - "rewards/rejected": -1.5909889936447144, - "semantic_entropy": 0.7503756284713745, + "logits/chosen": -0.19042053818702698, + "logits/rejected": -0.07803567498922348, + "logps/chosen": -1.3758207559585571, + "logps/rejected": -1.4862630367279053, + "loss": 1.7169, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3758207559585571, + "rewards/margins": 0.11044234037399292, + "rewards/rejected": -1.4862630367279053, "step": 2305 }, { "epoch": 1.2363271450075264, - "grad_norm": 13.241352238082227, + "grad_norm": 11.70027581573188, "learning_rate": 7.314382048813185e-07, - "logits/chosen": -0.05741016939282417, - "logits/rejected": 0.21026751399040222, - "logps/chosen": -1.3185993432998657, - "logps/rejected": -1.5880451202392578, - "loss": 2.0274, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3185993432998657, - "rewards/margins": 0.2694457173347473, - "rewards/rejected": -1.5880451202392578, - "semantic_entropy": 0.7889279127120972, + "logits/chosen": -0.13621467351913452, + "logits/rejected": 0.08509569615125656, + "logps/chosen": -1.2939293384552002, + "logps/rejected": -1.4757057428359985, + "loss": 1.627, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2939293384552002, + "rewards/margins": 0.18177632987499237, + "rewards/rejected": -1.4757057428359985, "step": 2310 }, { "epoch": 1.2390031777889279, - "grad_norm": 9.867053081624858, + "grad_norm": 9.092757887640735, "learning_rate": 7.300565671333486e-07, - "logits/chosen": -0.03601957857608795, - "logits/rejected": 0.1427186280488968, - "logps/chosen": -1.3128204345703125, - "logps/rejected": -1.576196551322937, - "loss": 2.0158, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3128204345703125, - "rewards/margins": 0.26337605714797974, - "rewards/rejected": -1.576196551322937, - "semantic_entropy": 0.7808772325515747, + "logits/chosen": -0.12994270026683807, + "logits/rejected": 0.010222077369689941, + "logps/chosen": -1.271327018737793, + "logps/rejected": -1.4542458057403564, + "loss": 1.6104, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.271327018737793, + "rewards/margins": 0.18291863799095154, + "rewards/rejected": -1.4542458057403564, "step": 2315 }, { "epoch": 1.2416792105703296, - "grad_norm": 7.953399629652333, + "grad_norm": 7.451524709977769, "learning_rate": 7.286726973755554e-07, - "logits/chosen": 0.061737217009067535, - "logits/rejected": 0.08541327714920044, - "logps/chosen": -1.3010971546173096, - "logps/rejected": -1.557395577430725, - "loss": 2.0297, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3010971546173096, - "rewards/margins": 0.2562984526157379, - "rewards/rejected": -1.557395577430725, - "semantic_entropy": 0.7937830090522766, + "logits/chosen": -0.04890859127044678, + "logits/rejected": -0.03615367412567139, + "logps/chosen": -1.278269648551941, + "logps/rejected": -1.4786007404327393, + "loss": 1.629, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.278269648551941, + "rewards/margins": 0.20033085346221924, + "rewards/rejected": -1.4786007404327393, "step": 2320 }, { "epoch": 1.244355243351731, - "grad_norm": 10.194369450500247, + "grad_norm": 10.219394443903381, "learning_rate": 7.272866090342493e-07, - "logits/chosen": 0.08444453775882721, - "logits/rejected": 0.16419485211372375, - "logps/chosen": -1.3324655294418335, - "logps/rejected": -1.6092456579208374, - "loss": 2.0442, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3324655294418335, - "rewards/margins": 0.27678006887435913, - "rewards/rejected": -1.6092456579208374, - "semantic_entropy": 0.7854612469673157, + "logits/chosen": -0.008174806833267212, + "logits/rejected": 0.049251943826675415, + "logps/chosen": -1.3031771183013916, + "logps/rejected": -1.532463788986206, + "loss": 1.6425, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3031771183013916, + "rewards/margins": 0.22928662598133087, + "rewards/rejected": -1.532463788986206, "step": 2325 }, { "epoch": 1.2470312761331326, - "grad_norm": 8.188144233030942, + "grad_norm": 7.739260291455012, "learning_rate": 7.258983155572656e-07, - "logits/chosen": -0.10631588846445084, - "logits/rejected": 0.006649285554885864, - "logps/chosen": -1.2845680713653564, - "logps/rejected": -1.5136401653289795, - "loss": 2.027, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2845680713653564, - "rewards/margins": 0.22907209396362305, - "rewards/rejected": -1.5136401653289795, - "semantic_entropy": 0.7916483879089355, + "logits/chosen": -0.21783366799354553, + "logits/rejected": -0.13086874783039093, + "logps/chosen": -1.2585595846176147, + "logps/rejected": -1.446861982345581, + "loss": 1.6193, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2585595846176147, + "rewards/margins": 0.18830236792564392, + "rewards/rejected": -1.446861982345581, "step": 2330 }, { "epoch": 1.2497073089145343, - "grad_norm": 9.980220667454514, + "grad_norm": 8.686763534911524, "learning_rate": 7.245078304138335e-07, - "logits/chosen": 0.05152348801493645, - "logits/rejected": 0.11100976169109344, - "logps/chosen": -1.278947114944458, - "logps/rejected": -1.57237708568573, - "loss": 1.9967, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.278947114944458, - "rewards/margins": 0.2934300899505615, - "rewards/rejected": -1.57237708568573, - "semantic_entropy": 0.7966108918190002, + "logits/chosen": -0.0693262368440628, + "logits/rejected": -0.02944047376513481, + "logps/chosen": -1.2556920051574707, + "logps/rejected": -1.493545651435852, + "loss": 1.5915, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2556920051574707, + "rewards/margins": 0.23785361647605896, + "rewards/rejected": -1.493545651435852, "step": 2335 }, { "epoch": 1.2523833416959358, - "grad_norm": 6.364400964164751, + "grad_norm": 5.993060290872711, "learning_rate": 7.231151670944462e-07, - "logits/chosen": -0.12852008640766144, - "logits/rejected": 0.05516228824853897, - "logps/chosen": -1.2988489866256714, - "logps/rejected": -1.5035803318023682, - "loss": 2.059, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2988489866256714, - "rewards/margins": 0.20473137497901917, - "rewards/rejected": -1.5035803318023682, - "semantic_entropy": 0.8019092679023743, + "logits/chosen": -0.23667342960834503, + "logits/rejected": -0.09508855640888214, + "logps/chosen": -1.270794153213501, + "logps/rejected": -1.4260084629058838, + "loss": 1.6538, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.270794153213501, + "rewards/margins": 0.155214324593544, + "rewards/rejected": -1.4260084629058838, "step": 2340 }, { "epoch": 1.2550593744773373, - "grad_norm": 8.247743814735342, + "grad_norm": 7.783007918313454, "learning_rate": 7.217203391107291e-07, - "logits/chosen": -0.03583495691418648, - "logits/rejected": 0.12947800755500793, - "logps/chosen": -1.2738090753555298, - "logps/rejected": -1.526583194732666, - "loss": 2.0255, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2738090753555298, - "rewards/margins": 0.25277405977249146, - "rewards/rejected": -1.526583194732666, - "semantic_entropy": 0.7844155430793762, + "logits/chosen": -0.15866756439208984, + "logits/rejected": -0.02681003138422966, + "logps/chosen": -1.2622493505477905, + "logps/rejected": -1.4406099319458008, + "loss": 1.6396, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2622493505477905, + "rewards/margins": 0.1783604919910431, + "rewards/rejected": -1.4406099319458008, "step": 2345 }, { "epoch": 1.257735407258739, - "grad_norm": 7.081772579883173, + "grad_norm": 6.8802055144839045, "learning_rate": 7.203233599953096e-07, - "logits/chosen": -0.035393621772527695, - "logits/rejected": 0.10859780013561249, - "logps/chosen": -1.3218568563461304, - "logps/rejected": -1.488811731338501, - "loss": 2.0513, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3218568563461304, - "rewards/margins": 0.16695484519004822, - "rewards/rejected": -1.488811731338501, - "semantic_entropy": 0.7921909689903259, + "logits/chosen": -0.1418730765581131, + "logits/rejected": -0.03416634723544121, + "logps/chosen": -1.2981500625610352, + "logps/rejected": -1.4062778949737549, + "loss": 1.6544, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2981500625610352, + "rewards/margins": 0.10812785476446152, + "rewards/rejected": -1.4062778949737549, "step": 2350 }, { "epoch": 1.2604114400401405, - "grad_norm": 10.049995364195077, + "grad_norm": 8.495832340611686, "learning_rate": 7.189242433016852e-07, - "logits/chosen": 0.032818764448165894, - "logits/rejected": 0.16986875236034393, - "logps/chosen": -1.2119081020355225, - "logps/rejected": -1.5190856456756592, - "loss": 1.9815, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2119081020355225, - "rewards/margins": 0.3071775436401367, - "rewards/rejected": -1.5190856456756592, - "semantic_entropy": 0.8008241653442383, + "logits/chosen": -0.08535777032375336, + "logits/rejected": 0.027291741222143173, + "logps/chosen": -1.1947400569915771, + "logps/rejected": -1.429766058921814, + "loss": 1.5805, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1947400569915771, + "rewards/margins": 0.23502619564533234, + "rewards/rejected": -1.429766058921814, "step": 2355 }, { "epoch": 1.263087472821542, - "grad_norm": 9.041411333486012, + "grad_norm": 7.007217118241574, "learning_rate": 7.17523002604092e-07, - "logits/chosen": 0.006859609391540289, - "logits/rejected": 0.13708624243736267, - "logps/chosen": -1.2518107891082764, - "logps/rejected": -1.6344480514526367, - "loss": 1.9757, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2518107891082764, - "rewards/margins": 0.38263726234436035, - "rewards/rejected": -1.6344480514526367, - "semantic_entropy": 0.7938586473464966, + "logits/chosen": -0.0938306525349617, + "logits/rejected": 0.003519988153129816, + "logps/chosen": -1.2325066328048706, + "logps/rejected": -1.562382698059082, + "loss": 1.5775, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2325066328048706, + "rewards/margins": 0.32987624406814575, + "rewards/rejected": -1.562382698059082, "step": 2360 }, { "epoch": 1.2657635056029437, - "grad_norm": 6.068539137807391, + "grad_norm": 5.8155125097634786, "learning_rate": 7.161196514973734e-07, - "logits/chosen": 0.029105842113494873, - "logits/rejected": 0.17803865671157837, - "logps/chosen": -1.2924420833587646, - "logps/rejected": -1.551058053970337, - "loss": 1.9978, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2924420833587646, - "rewards/margins": 0.25861606001853943, - "rewards/rejected": -1.551058053970337, - "semantic_entropy": 0.7914843559265137, + "logits/chosen": -0.09181183576583862, + "logits/rejected": 0.010623258538544178, + "logps/chosen": -1.2614425420761108, + "logps/rejected": -1.4549132585525513, + "loss": 1.5924, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2614425420761108, + "rewards/margins": 0.1934705525636673, + "rewards/rejected": -1.4549132585525513, "step": 2365 }, { "epoch": 1.2684395383843452, - "grad_norm": 8.797661180013133, + "grad_norm": 8.061942671596224, "learning_rate": 7.147142035968483e-07, - "logits/chosen": 0.05001775175333023, - "logits/rejected": 0.1784544289112091, - "logps/chosen": -1.2478030920028687, - "logps/rejected": -1.5050370693206787, - "loss": 1.9815, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2478030920028687, - "rewards/margins": 0.2572340667247772, - "rewards/rejected": -1.5050370693206787, - "semantic_entropy": 0.8060957193374634, + "logits/chosen": -0.06603260338306427, + "logits/rejected": 0.021892230957746506, + "logps/chosen": -1.222551941871643, + "logps/rejected": -1.429960012435913, + "loss": 1.5705, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.222551941871643, + "rewards/margins": 0.20740799605846405, + "rewards/rejected": -1.429960012435913, "step": 2370 }, { "epoch": 1.2711155711657467, - "grad_norm": 8.101720923817023, + "grad_norm": 8.431180545335108, "learning_rate": 7.133066725381781e-07, - "logits/chosen": -0.1124950498342514, - "logits/rejected": 0.06909557431936264, - "logps/chosen": -1.1976344585418701, - "logps/rejected": -1.3552278280258179, - "loss": 1.9959, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.1976344585418701, - "rewards/margins": 0.1575932800769806, - "rewards/rejected": -1.3552278280258179, - "semantic_entropy": 0.8371988534927368, + "logits/chosen": -0.2085665911436081, + "logits/rejected": -0.06773025542497635, + "logps/chosen": -1.1748874187469482, + "logps/rejected": -1.2752532958984375, + "loss": 1.5719, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.1748874187469482, + "rewards/margins": 0.1003657728433609, + "rewards/rejected": -1.2752532958984375, "step": 2375 }, { "epoch": 1.2737916039471484, - "grad_norm": 9.301816034600291, + "grad_norm": 8.787806946795875, "learning_rate": 7.118970719772354e-07, - "logits/chosen": -0.08418088406324387, - "logits/rejected": 0.13591249287128448, - "logps/chosen": -1.3185670375823975, - "logps/rejected": -1.5692827701568604, - "loss": 2.0367, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3185670375823975, - "rewards/margins": 0.2507156431674957, - "rewards/rejected": -1.5692827701568604, - "semantic_entropy": 0.7790293097496033, + "logits/chosen": -0.20111064612865448, + "logits/rejected": -0.016483021900057793, + "logps/chosen": -1.2925128936767578, + "logps/rejected": -1.4721014499664307, + "loss": 1.646, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2925128936767578, + "rewards/margins": 0.17958858609199524, + "rewards/rejected": -1.4721014499664307, "step": 2380 }, { "epoch": 1.27646763672855, - "grad_norm": 9.894290297080861, + "grad_norm": 8.15008584285262, "learning_rate": 7.104854155899711e-07, - "logits/chosen": 0.022958554327487946, - "logits/rejected": 0.12819671630859375, - "logps/chosen": -1.2903987169265747, - "logps/rejected": -1.5150518417358398, - "loss": 2.0494, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2903987169265747, - "rewards/margins": 0.2246532440185547, - "rewards/rejected": -1.5150518417358398, - "semantic_entropy": 0.7933465838432312, + "logits/chosen": -0.08273342996835709, + "logits/rejected": -0.0012397721875458956, + "logps/chosen": -1.259852409362793, + "logps/rejected": -1.4208247661590576, + "loss": 1.6395, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.259852409362793, + "rewards/margins": 0.16097232699394226, + "rewards/rejected": -1.4208247661590576, "step": 2385 }, { "epoch": 1.2791436695099514, - "grad_norm": 7.120856691958988, + "grad_norm": 6.717432726032404, "learning_rate": 7.090717170722817e-07, - "logits/chosen": 0.02720743417739868, - "logits/rejected": 0.09357274323701859, - "logps/chosen": -1.2580147981643677, - "logps/rejected": -1.5381114482879639, - "loss": 1.9848, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2580147981643677, - "rewards/margins": 0.28009670972824097, - "rewards/rejected": -1.5381114482879639, - "semantic_entropy": 0.8013747930526733, + "logits/chosen": -0.08920027315616608, + "logits/rejected": -0.04481660574674606, + "logps/chosen": -1.2388746738433838, + "logps/rejected": -1.4416940212249756, + "loss": 1.5898, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2388746738433838, + "rewards/margins": 0.20281946659088135, + "rewards/rejected": -1.4416940212249756, "step": 2390 }, { "epoch": 1.2818197022913531, - "grad_norm": 11.88828279536148, + "grad_norm": 10.284664281846965, "learning_rate": 7.076559901398762e-07, - "logits/chosen": -0.19155053794384003, - "logits/rejected": -0.06659739464521408, - "logps/chosen": -1.1978116035461426, - "logps/rejected": -1.4624278545379639, - "loss": 1.9853, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.1978116035461426, - "rewards/margins": 0.2646161615848541, - "rewards/rejected": -1.4624278545379639, - "semantic_entropy": 0.8260299563407898, + "logits/chosen": -0.25946319103240967, + "logits/rejected": -0.1676512062549591, + "logps/chosen": -1.1732776165008545, + "logps/rejected": -1.3788777589797974, + "loss": 1.5704, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.1732776165008545, + "rewards/margins": 0.20560026168823242, + "rewards/rejected": -1.3788777589797974, "step": 2395 }, { "epoch": 1.2844957350727546, - "grad_norm": 9.950995582560179, + "grad_norm": 9.357298837621496, "learning_rate": 7.062382485281436e-07, - "logits/chosen": -0.035932350903749466, - "logits/rejected": 0.08602927625179291, - "logps/chosen": -1.2562742233276367, - "logps/rejected": -1.5017836093902588, - "loss": 2.0207, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2562742233276367, - "rewards/margins": 0.24550941586494446, - "rewards/rejected": -1.5017836093902588, - "semantic_entropy": 0.8130423426628113, + "logits/chosen": -0.12120018154382706, + "logits/rejected": -0.031773291528224945, + "logps/chosen": -1.232176661491394, + "logps/rejected": -1.4104655981063843, + "loss": 1.6123, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.232176661491394, + "rewards/margins": 0.1782890111207962, + "rewards/rejected": -1.4104655981063843, "step": 2400 }, { "epoch": 1.2844957350727546, - "eval_logits/chosen": 0.2933657169342041, - "eval_logits/rejected": 0.3859518766403198, - "eval_logps/chosen": -1.330057144165039, - "eval_logps/rejected": -1.577328085899353, - "eval_loss": 2.0472466945648193, - "eval_rewards/accuracies": 0.5875371098518372, - "eval_rewards/chosen": -1.330057144165039, - "eval_rewards/margins": 0.24727098643779755, - "eval_rewards/rejected": -1.577328085899353, - "eval_runtime": 34.6857, - "eval_samples_per_second": 38.777, - "eval_semantic_entropy": 0.7809399366378784, - "eval_steps_per_second": 9.716, + "eval_logits/chosen": 0.09563587605953217, + "eval_logits/rejected": 0.16495996713638306, + "eval_logps/chosen": -1.3029249906539917, + "eval_logps/rejected": -1.4742597341537476, + "eval_loss": 1.652089238166809, + "eval_rewards/accuracies": 0.5667656064033508, + "eval_rewards/chosen": -1.3029249906539917, + "eval_rewards/margins": 0.17133480310440063, + "eval_rewards/rejected": -1.4742597341537476, + "eval_runtime": 40.4637, + "eval_samples_per_second": 33.24, + "eval_steps_per_second": 8.328, "step": 2400 }, { "epoch": 1.287171767854156, - "grad_norm": 8.736922052582077, + "grad_norm": 7.866399941057585, "learning_rate": 7.048185059920193e-07, - "logits/chosen": -0.0388624481856823, - "logits/rejected": 0.1136031299829483, - "logps/chosen": -1.3021931648254395, - "logps/rejected": -1.598894715309143, - "loss": 2.0115, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3021931648254395, - "rewards/margins": 0.29670166969299316, - "rewards/rejected": -1.598894715309143, - "semantic_entropy": 0.7815759181976318, + "logits/chosen": -0.13615265488624573, + "logits/rejected": -0.005280242767184973, + "logps/chosen": -1.266261100769043, + "logps/rejected": -1.4741570949554443, + "loss": 1.6092, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.266261100769043, + "rewards/margins": 0.20789583027362823, + "rewards/rejected": -1.4741570949554443, "step": 2405 }, { "epoch": 1.2898478006355578, - "grad_norm": 5.844097600671448, + "grad_norm": 5.782974933673425, "learning_rate": 7.033967763058516e-07, - "logits/chosen": -0.18287523090839386, - "logits/rejected": 0.016366440802812576, - "logps/chosen": -1.2722828388214111, - "logps/rejected": -1.3984174728393555, - "loss": 2.064, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2722828388214111, - "rewards/margins": 0.12613460421562195, - "rewards/rejected": -1.3984174728393555, - "semantic_entropy": 0.8081968426704407, + "logits/chosen": -0.26628702878952026, + "logits/rejected": -0.09593107551336288, + "logps/chosen": -1.2359964847564697, + "logps/rejected": -1.3293280601501465, + "loss": 1.6401, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2359964847564697, + "rewards/margins": 0.09333159029483795, + "rewards/rejected": -1.3293280601501465, "step": 2410 }, { "epoch": 1.2925238334169593, - "grad_norm": 6.860452427864468, + "grad_norm": 6.547931969592147, "learning_rate": 7.019730732632681e-07, - "logits/chosen": 0.014550815336406231, - "logits/rejected": 0.08596150577068329, - "logps/chosen": -1.1932735443115234, - "logps/rejected": -1.5706613063812256, - "loss": 1.9274, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1932735443115234, - "rewards/margins": 0.3773877024650574, - "rewards/rejected": -1.5706613063812256, - "semantic_entropy": 0.8176706433296204, + "logits/chosen": -0.10351938009262085, + "logits/rejected": -0.055590152740478516, + "logps/chosen": -1.1684571504592896, + "logps/rejected": -1.4438257217407227, + "loss": 1.5218, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1684571504592896, + "rewards/margins": 0.2753686010837555, + "rewards/rejected": -1.4438257217407227, "step": 2415 }, { "epoch": 1.2951998661983608, - "grad_norm": 8.814610765856207, + "grad_norm": 8.18176580142601, "learning_rate": 7.005474106770418e-07, - "logits/chosen": -0.1114654541015625, - "logits/rejected": 0.013631993904709816, - "logps/chosen": -1.3067487478256226, - "logps/rejected": -1.6183964014053345, - "loss": 1.9872, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3067487478256226, - "rewards/margins": 0.3116476535797119, - "rewards/rejected": -1.6183964014053345, - "semantic_entropy": 0.768617570400238, + "logits/chosen": -0.24395263195037842, + "logits/rejected": -0.1428661048412323, + "logps/chosen": -1.278551697731018, + "logps/rejected": -1.5387744903564453, + "loss": 1.5966, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.278551697731018, + "rewards/margins": 0.26022282242774963, + "rewards/rejected": -1.5387744903564453, "step": 2420 }, { "epoch": 1.2978758989797625, - "grad_norm": 8.996235476343262, + "grad_norm": 8.163975822722538, "learning_rate": 6.991198023789577e-07, - "logits/chosen": -0.0221896730363369, - "logits/rejected": 0.056027401238679886, - "logps/chosen": -1.2366410493850708, - "logps/rejected": -1.4527819156646729, - "loss": 1.9974, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2366410493850708, - "rewards/margins": 0.21614094078540802, - "rewards/rejected": -1.4527819156646729, - "semantic_entropy": 0.8054319620132446, + "logits/chosen": -0.11933907121419907, + "logits/rejected": -0.05643367022275925, + "logps/chosen": -1.2212616205215454, + "logps/rejected": -1.398843765258789, + "loss": 1.5925, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2212616205215454, + "rewards/margins": 0.1775822937488556, + "rewards/rejected": -1.398843765258789, "step": 2425 }, { "epoch": 1.300551931761164, - "grad_norm": 9.971777847894783, + "grad_norm": 8.50468106485477, "learning_rate": 6.976902622196776e-07, - "logits/chosen": 0.007852977141737938, - "logits/rejected": 0.058755289763212204, - "logps/chosen": -1.3803184032440186, - "logps/rejected": -1.5980432033538818, - "loss": 2.0738, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3803184032440186, - "rewards/margins": 0.2177247703075409, - "rewards/rejected": -1.5980432033538818, - "semantic_entropy": 0.7618513107299805, + "logits/chosen": -0.1156187430024147, + "logits/rejected": -0.08730246126651764, + "logps/chosen": -1.3546292781829834, + "logps/rejected": -1.5157809257507324, + "loss": 1.6826, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3546292781829834, + "rewards/margins": 0.16115155816078186, + "rewards/rejected": -1.5157809257507324, "step": 2430 }, { "epoch": 1.3032279645425655, - "grad_norm": 5.783046917505834, + "grad_norm": 4.972232516102699, "learning_rate": 6.962588040686064e-07, - "logits/chosen": 0.024611715227365494, - "logits/rejected": 0.15348206460475922, - "logps/chosen": -1.2611327171325684, - "logps/rejected": -1.4416377544403076, - "loss": 2.0195, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2611327171325684, - "rewards/margins": 0.18050506711006165, - "rewards/rejected": -1.4416377544403076, - "semantic_entropy": 0.8049419522285461, + "logits/chosen": -0.08850985765457153, + "logits/rejected": 0.003945094998925924, + "logps/chosen": -1.2295047044754028, + "logps/rejected": -1.3775475025177002, + "loss": 1.6045, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2295047044754028, + "rewards/margins": 0.1480426788330078, + "rewards/rejected": -1.3775475025177002, "step": 2435 }, { "epoch": 1.3059039973239672, - "grad_norm": 8.973601983735593, + "grad_norm": 9.197101107744537, "learning_rate": 6.948254418137573e-07, - "logits/chosen": -0.10572667419910431, - "logits/rejected": 0.0066367872059345245, - "logps/chosen": -1.2377347946166992, - "logps/rejected": -1.4910590648651123, - "loss": 1.998, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2377347946166992, - "rewards/margins": 0.2533242404460907, - "rewards/rejected": -1.4910590648651123, - "semantic_entropy": 0.8126150369644165, + "logits/chosen": -0.23168103396892548, + "logits/rejected": -0.14563602209091187, + "logps/chosen": -1.2089626789093018, + "logps/rejected": -1.4014352560043335, + "loss": 1.5829, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2089626789093018, + "rewards/margins": 0.19247262179851532, + "rewards/rejected": -1.4014352560043335, "step": 2440 }, { "epoch": 1.3085800301053687, - "grad_norm": 7.804619647636399, + "grad_norm": 8.679503255748509, "learning_rate": 6.933901893616174e-07, - "logits/chosen": -0.07127559185028076, - "logits/rejected": 0.07549577951431274, - "logps/chosen": -1.2812305688858032, - "logps/rejected": -1.4737753868103027, - "loss": 2.0333, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2812305688858032, - "rewards/margins": 0.19254475831985474, - "rewards/rejected": -1.4737753868103027, - "semantic_entropy": 0.8040180206298828, + "logits/chosen": -0.1771184206008911, + "logits/rejected": -0.0606854073703289, + "logps/chosen": -1.2561490535736084, + "logps/rejected": -1.3943471908569336, + "loss": 1.6229, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2561490535736084, + "rewards/margins": 0.13819798827171326, + "rewards/rejected": -1.3943471908569336, "step": 2445 }, { "epoch": 1.3112560628867704, - "grad_norm": 6.889523714270502, + "grad_norm": 6.545905205306638, "learning_rate": 6.919530606370121e-07, - "logits/chosen": -0.027874654158949852, - "logits/rejected": 0.13589277863502502, - "logps/chosen": -1.247702956199646, - "logps/rejected": -1.584751009941101, - "loss": 1.9647, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.247702956199646, - "rewards/margins": 0.33704808354377747, - "rewards/rejected": -1.584751009941101, - "semantic_entropy": 0.8122938275337219, + "logits/chosen": -0.17317122220993042, + "logits/rejected": -0.04910287261009216, + "logps/chosen": -1.217040777206421, + "logps/rejected": -1.4837815761566162, + "loss": 1.5456, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.217040777206421, + "rewards/margins": 0.2667407989501953, + "rewards/rejected": -1.4837815761566162, "step": 2450 }, { "epoch": 1.313932095668172, - "grad_norm": 5.118254464470323, + "grad_norm": 5.185633419013426, "learning_rate": 6.905140695829706e-07, - "logits/chosen": -0.06709776073694229, - "logits/rejected": 0.17308706045150757, - "logps/chosen": -1.3292291164398193, - "logps/rejected": -1.5290602445602417, - "loss": 2.0415, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3292291164398193, - "rewards/margins": 0.19983109831809998, - "rewards/rejected": -1.5290602445602417, - "semantic_entropy": 0.7836787700653076, + "logits/chosen": -0.17966656386852264, + "logits/rejected": -0.0012095480924472213, + "logps/chosen": -1.3041753768920898, + "logps/rejected": -1.4204387664794922, + "loss": 1.6485, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3041753768920898, + "rewards/margins": 0.11626337468624115, + "rewards/rejected": -1.4204387664794922, "step": 2455 }, { "epoch": 1.3166081284495736, - "grad_norm": 8.789989155313604, + "grad_norm": 8.749038985182217, "learning_rate": 6.890732301605904e-07, - "logits/chosen": -0.00417876522988081, - "logits/rejected": 0.1012817770242691, - "logps/chosen": -1.3258728981018066, - "logps/rejected": -1.4803338050842285, - "loss": 2.0531, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3258728981018066, - "rewards/margins": 0.15446093678474426, - "rewards/rejected": -1.4803338050842285, - "semantic_entropy": 0.7863596081733704, + "logits/chosen": -0.150712251663208, + "logits/rejected": -0.07498790323734283, + "logps/chosen": -1.2950248718261719, + "logps/rejected": -1.3897525072097778, + "loss": 1.6523, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2950248718261719, + "rewards/margins": 0.09472791105508804, + "rewards/rejected": -1.3897525072097778, "step": 2460 }, { "epoch": 1.3192841612309751, - "grad_norm": 7.69060213765338, + "grad_norm": 6.837164978630255, "learning_rate": 6.876305563489021e-07, - "logits/chosen": 0.011214707978069782, - "logits/rejected": 0.09302522987127304, - "logps/chosen": -1.2799160480499268, - "logps/rejected": -1.5444409847259521, - "loss": 2.0133, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2799160480499268, - "rewards/margins": 0.26452499628067017, - "rewards/rejected": -1.5444409847259521, - "semantic_entropy": 0.8035866022109985, + "logits/chosen": -0.12322545051574707, + "logits/rejected": -0.07410383224487305, + "logps/chosen": -1.2550203800201416, + "logps/rejected": -1.4570932388305664, + "loss": 1.6043, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2550203800201416, + "rewards/margins": 0.2020729035139084, + "rewards/rejected": -1.4570932388305664, "step": 2465 }, { "epoch": 1.3219601940123766, - "grad_norm": 10.928279384945997, + "grad_norm": 10.434805459545437, "learning_rate": 6.861860621447331e-07, - "logits/chosen": -0.13337582349777222, - "logits/rejected": -0.002061316277831793, - "logps/chosen": -1.2584311962127686, - "logps/rejected": -1.4112979173660278, - "loss": 2.0326, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2584311962127686, - "rewards/margins": 0.15286675095558167, - "rewards/rejected": -1.4112979173660278, - "semantic_entropy": 0.8194764256477356, + "logits/chosen": -0.24980542063713074, + "logits/rejected": -0.14923252165317535, + "logps/chosen": -1.2360928058624268, + "logps/rejected": -1.3302528858184814, + "loss": 1.6215, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2360928058624268, + "rewards/margins": 0.0941600427031517, + "rewards/rejected": -1.3302528858184814, "step": 2470 }, { "epoch": 1.3246362267937783, - "grad_norm": 7.148688930285466, + "grad_norm": 6.649978989330003, "learning_rate": 6.847397615625725e-07, - "logits/chosen": -0.013584035448729992, - "logits/rejected": 0.05738170072436333, - "logps/chosen": -1.2745800018310547, - "logps/rejected": -1.5176098346710205, - "loss": 2.0152, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2745800018310547, - "rewards/margins": 0.24302978813648224, - "rewards/rejected": -1.5176098346710205, - "semantic_entropy": 0.7967642545700073, + "logits/chosen": -0.16329626739025116, + "logits/rejected": -0.10834117233753204, + "logps/chosen": -1.2520506381988525, + "logps/rejected": -1.458328366279602, + "loss": 1.6092, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2520506381988525, + "rewards/margins": 0.20627769827842712, + "rewards/rejected": -1.458328366279602, "step": 2475 }, { "epoch": 1.3273122595751798, - "grad_norm": 6.877033441682886, + "grad_norm": 6.3096691733673715, "learning_rate": 6.83291668634435e-07, - "logits/chosen": -0.1592644453048706, - "logits/rejected": 0.020404372364282608, - "logps/chosen": -1.302016019821167, - "logps/rejected": -1.5877892971038818, - "loss": 1.994, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.302016019821167, - "rewards/margins": 0.2857731580734253, - "rewards/rejected": -1.5877892971038818, - "semantic_entropy": 0.7693041563034058, + "logits/chosen": -0.28249967098236084, + "logits/rejected": -0.14977525174617767, + "logps/chosen": -1.2829852104187012, + "logps/rejected": -1.4930446147918701, + "loss": 1.6114, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2829852104187012, + "rewards/margins": 0.21005935966968536, + "rewards/rejected": -1.4930446147918701, "step": 2480 }, { "epoch": 1.3299882923565813, - "grad_norm": 7.9456489134187205, + "grad_norm": 8.077535905450087, "learning_rate": 6.818417974097246e-07, - "logits/chosen": 0.0417151153087616, - "logits/rejected": 0.22702746093273163, - "logps/chosen": -1.3054397106170654, - "logps/rejected": -1.6041944026947021, - "loss": 2.0443, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3054397106170654, - "rewards/margins": 0.2987547218799591, - "rewards/rejected": -1.6041944026947021, - "semantic_entropy": 0.787520706653595, + "logits/chosen": -0.10574954748153687, + "logits/rejected": 0.047827187925577164, + "logps/chosen": -1.283026099205017, + "logps/rejected": -1.4941781759262085, + "loss": 1.646, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.283026099205017, + "rewards/margins": 0.21115219593048096, + "rewards/rejected": -1.4941781759262085, "step": 2485 }, { "epoch": 1.332664325137983, - "grad_norm": 7.924827416846528, + "grad_norm": 7.356301919669881, "learning_rate": 6.803901619550981e-07, - "logits/chosen": -0.08295613527297974, - "logits/rejected": -0.02383790723979473, - "logps/chosen": -1.302221655845642, - "logps/rejected": -1.5922086238861084, - "loss": 2.0096, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.302221655845642, - "rewards/margins": 0.289986789226532, - "rewards/rejected": -1.5922086238861084, - "semantic_entropy": 0.7813442945480347, + "logits/chosen": -0.22362974286079407, + "logits/rejected": -0.1858159899711609, + "logps/chosen": -1.2722280025482178, + "logps/rejected": -1.5098580121994019, + "loss": 1.6113, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2722280025482178, + "rewards/margins": 0.2376299649477005, + "rewards/rejected": -1.5098580121994019, "step": 2490 }, { "epoch": 1.3353403579193845, - "grad_norm": 7.910158635170881, + "grad_norm": 7.104468059666298, "learning_rate": 6.789367763543292e-07, - "logits/chosen": 0.026933297514915466, - "logits/rejected": 0.03556499630212784, - "logps/chosen": -1.2895691394805908, - "logps/rejected": -1.5162808895111084, - "loss": 2.021, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2895691394805908, - "rewards/margins": 0.2267116755247116, - "rewards/rejected": -1.5162808895111084, - "semantic_entropy": 0.7934345602989197, + "logits/chosen": -0.10563309490680695, + "logits/rejected": -0.1071210652589798, + "logps/chosen": -1.2554916143417358, + "logps/rejected": -1.4397131204605103, + "loss": 1.61, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2554916143417358, + "rewards/margins": 0.18422135710716248, + "rewards/rejected": -1.4397131204605103, "step": 2495 }, { "epoch": 1.338016390700786, - "grad_norm": 8.084202427838166, + "grad_norm": 7.991481488996529, "learning_rate": 6.774816547081714e-07, - "logits/chosen": 0.05614136531949043, - "logits/rejected": 0.22107501327991486, - "logps/chosen": -1.2359117269515991, - "logps/rejected": -1.4719440937042236, - "loss": 1.9882, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2359117269515991, - "rewards/margins": 0.23603227734565735, - "rewards/rejected": -1.4719440937042236, - "semantic_entropy": 0.8171242475509644, + "logits/chosen": -0.11559624969959259, + "logits/rejected": 0.007312471512705088, + "logps/chosen": -1.2053483724594116, + "logps/rejected": -1.389601469039917, + "loss": 1.569, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2053483724594116, + "rewards/margins": 0.18425318598747253, + "rewards/rejected": -1.389601469039917, "step": 2500 }, { "epoch": 1.3406924234821878, - "grad_norm": 7.735130816689047, + "grad_norm": 7.161788696656108, "learning_rate": 6.760248111342211e-07, - "logits/chosen": 0.009106556884944439, - "logits/rejected": 0.1891101449728012, - "logps/chosen": -1.24410080909729, - "logps/rejected": -1.4906928539276123, - "loss": 1.9864, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.24410080909729, - "rewards/margins": 0.24659200012683868, - "rewards/rejected": -1.4906928539276123, - "semantic_entropy": 0.8127717971801758, + "logits/chosen": -0.12719660997390747, + "logits/rejected": 0.011344531551003456, + "logps/chosen": -1.214221715927124, + "logps/rejected": -1.3893407583236694, + "loss": 1.5752, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.214221715927124, + "rewards/margins": 0.17511887848377228, + "rewards/rejected": -1.3893407583236694, "step": 2505 }, { "epoch": 1.3433684562635893, - "grad_norm": 12.384562731005605, + "grad_norm": 10.254805100878995, "learning_rate": 6.745662597667813e-07, - "logits/chosen": -0.08548152446746826, - "logits/rejected": 0.04962456226348877, - "logps/chosen": -1.232684850692749, - "logps/rejected": -1.4977141618728638, - "loss": 1.9834, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.232684850692749, - "rewards/margins": 0.26502925157546997, - "rewards/rejected": -1.4977141618728638, - "semantic_entropy": 0.8126821517944336, + "logits/chosen": -0.20160675048828125, + "logits/rejected": -0.0989512950181961, + "logps/chosen": -1.2082228660583496, + "logps/rejected": -1.4115922451019287, + "loss": 1.5709, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2082228660583496, + "rewards/margins": 0.20336949825286865, + "rewards/rejected": -1.4115922451019287, "step": 2510 }, { "epoch": 1.3460444890449907, - "grad_norm": 6.177445891852524, + "grad_norm": 6.201294311291015, "learning_rate": 6.731060147567236e-07, - "logits/chosen": 0.008504378609359264, - "logits/rejected": 0.1291893571615219, - "logps/chosen": -1.3171823024749756, - "logps/rejected": -1.4955008029937744, - "loss": 2.0666, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3171823024749756, - "rewards/margins": 0.17831860482692719, - "rewards/rejected": -1.4955008029937744, - "semantic_entropy": 0.7949143052101135, + "logits/chosen": -0.11442514508962631, + "logits/rejected": -0.02388397417962551, + "logps/chosen": -1.2926146984100342, + "logps/rejected": -1.401564359664917, + "loss": 1.668, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2926146984100342, + "rewards/margins": 0.1089496836066246, + "rewards/rejected": -1.401564359664917, "step": 2515 }, { "epoch": 1.3487205218263925, - "grad_norm": 5.895468706608909, + "grad_norm": 5.807785907494708, "learning_rate": 6.716440902713515e-07, - "logits/chosen": -0.06746038049459457, - "logits/rejected": 0.014505298808217049, - "logps/chosen": -1.2996203899383545, - "logps/rejected": -1.5103718042373657, - "loss": 2.0256, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2996203899383545, - "rewards/margins": 0.2107514888048172, - "rewards/rejected": -1.5103718042373657, - "semantic_entropy": 0.7952625155448914, + "logits/chosen": -0.2186061143875122, + "logits/rejected": -0.15785038471221924, + "logps/chosen": -1.2709650993347168, + "logps/rejected": -1.4271172285079956, + "loss": 1.6172, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2709650993347168, + "rewards/margins": 0.1561521738767624, + "rewards/rejected": -1.4271172285079956, "step": 2520 }, { "epoch": 1.351396554607794, - "grad_norm": 12.24033172449453, + "grad_norm": 10.614893624777126, "learning_rate": 6.701805004942627e-07, - "logits/chosen": -0.06568801403045654, - "logits/rejected": 0.014113761484622955, - "logps/chosen": -1.324389934539795, - "logps/rejected": -1.5580135583877563, - "loss": 2.0244, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.324389934539795, - "rewards/margins": 0.23362377285957336, - "rewards/rejected": -1.5580135583877563, - "semantic_entropy": 0.7667808532714844, + "logits/chosen": -0.1689203381538391, + "logits/rejected": -0.1105555072426796, + "logps/chosen": -1.2965948581695557, + "logps/rejected": -1.4499021768569946, + "loss": 1.6348, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2965948581695557, + "rewards/margins": 0.15330716967582703, + "rewards/rejected": -1.4499021768569946, "step": 2525 }, { "epoch": 1.3540725873891954, - "grad_norm": 9.655691096098971, + "grad_norm": 8.62582624798431, "learning_rate": 6.687152596252119e-07, - "logits/chosen": -0.08653044700622559, - "logits/rejected": -0.03837307170033455, - "logps/chosen": -1.2778962850570679, - "logps/rejected": -1.4817698001861572, - "loss": 2.0308, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2778962850570679, - "rewards/margins": 0.20387335121631622, - "rewards/rejected": -1.4817698001861572, - "semantic_entropy": 0.8072389364242554, + "logits/chosen": -0.20537002384662628, + "logits/rejected": -0.17075181007385254, + "logps/chosen": -1.2418001890182495, + "logps/rejected": -1.410526990890503, + "loss": 1.6153, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2418001890182495, + "rewards/margins": 0.16872690618038177, + "rewards/rejected": -1.410526990890503, "step": 2530 }, { "epoch": 1.3567486201705972, - "grad_norm": 6.229786698881941, + "grad_norm": 6.14248446253142, "learning_rate": 6.672483818799722e-07, - "logits/chosen": -0.10884450376033783, - "logits/rejected": 0.031116357073187828, - "logps/chosen": -1.2840421199798584, - "logps/rejected": -1.5486438274383545, - "loss": 2.0033, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2840421199798584, - "rewards/margins": 0.2646019756793976, - "rewards/rejected": -1.5486438274383545, - "semantic_entropy": 0.7965590357780457, + "logits/chosen": -0.24151122570037842, + "logits/rejected": -0.13936847448349, + "logps/chosen": -1.26100754737854, + "logps/rejected": -1.454439401626587, + "loss": 1.607, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.26100754737854, + "rewards/margins": 0.1934318244457245, + "rewards/rejected": -1.454439401626587, "step": 2535 }, { "epoch": 1.3594246529519987, - "grad_norm": 11.08324125453285, + "grad_norm": 9.504606958292078, "learning_rate": 6.657798814901978e-07, - "logits/chosen": -0.030958428978919983, - "logits/rejected": 0.1495594084262848, - "logps/chosen": -1.3801937103271484, - "logps/rejected": -1.5122677087783813, - "loss": 2.1004, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3801937103271484, - "rewards/margins": 0.1320740431547165, - "rewards/rejected": -1.5122677087783813, - "semantic_entropy": 0.7679113149642944, + "logits/chosen": -0.14703956246376038, + "logits/rejected": -0.01648893393576145, + "logps/chosen": -1.3492311239242554, + "logps/rejected": -1.4347513914108276, + "loss": 1.6997, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3492311239242554, + "rewards/margins": 0.08552038669586182, + "rewards/rejected": -1.4347513914108276, "step": 2540 }, { "epoch": 1.3621006857334002, - "grad_norm": 7.463242215270924, + "grad_norm": 6.612189818543011, "learning_rate": 6.643097727032863e-07, - "logits/chosen": -0.011565339751541615, - "logits/rejected": 0.16919642686843872, - "logps/chosen": -1.2788037061691284, - "logps/rejected": -1.5690239667892456, - "loss": 2.0136, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2788037061691284, - "rewards/margins": 0.2902204394340515, - "rewards/rejected": -1.5690239667892456, - "semantic_entropy": 0.7988485097885132, + "logits/chosen": -0.13871613144874573, + "logits/rejected": -0.015263721346855164, + "logps/chosen": -1.256090760231018, + "logps/rejected": -1.455414056777954, + "loss": 1.6162, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.256090760231018, + "rewards/margins": 0.19932350516319275, + "rewards/rejected": -1.455414056777954, "step": 2545 }, { "epoch": 1.3647767185148019, - "grad_norm": 8.401641521017426, + "grad_norm": 7.058132572253126, "learning_rate": 6.628380697822392e-07, - "logits/chosen": -0.03879554197192192, - "logits/rejected": 0.1336180716753006, - "logps/chosen": -1.2727123498916626, - "logps/rejected": -1.4142420291900635, - "loss": 2.0508, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2727123498916626, - "rewards/margins": 0.1415296494960785, - "rewards/rejected": -1.4142420291900635, - "semantic_entropy": 0.814642071723938, + "logits/chosen": -0.15726891160011292, + "logits/rejected": -0.03172614425420761, + "logps/chosen": -1.2419919967651367, + "logps/rejected": -1.326724648475647, + "loss": 1.6355, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2419919967651367, + "rewards/margins": 0.08473268151283264, + "rewards/rejected": -1.326724648475647, "step": 2550 }, { "epoch": 1.3674527512962034, - "grad_norm": 10.17566266604566, + "grad_norm": 9.209693166256022, "learning_rate": 6.61364787005525e-07, - "logits/chosen": 0.024192985147237778, - "logits/rejected": 0.11562930047512054, - "logps/chosen": -1.212947130203247, - "logps/rejected": -1.5666682720184326, - "loss": 1.9549, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.212947130203247, - "rewards/margins": 0.3537212312221527, - "rewards/rejected": -1.5666682720184326, - "semantic_entropy": 0.8101435899734497, + "logits/chosen": -0.10544419288635254, + "logits/rejected": -0.04484523460268974, + "logps/chosen": -1.1819980144500732, + "logps/rejected": -1.4745718240737915, + "loss": 1.5391, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1819980144500732, + "rewards/margins": 0.2925736606121063, + "rewards/rejected": -1.4745718240737915, "step": 2555 }, { "epoch": 1.3701287840776049, - "grad_norm": 8.236825612826049, + "grad_norm": 7.432396154450075, "learning_rate": 6.598899386669395e-07, - "logits/chosen": 0.03629554063081741, - "logits/rejected": 0.16760225594043732, - "logps/chosen": -1.2631809711456299, - "logps/rejected": -1.5157405138015747, - "loss": 1.998, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2631809711456299, - "rewards/margins": 0.2525593638420105, - "rewards/rejected": -1.5157405138015747, - "semantic_entropy": 0.8036211133003235, + "logits/chosen": -0.0917210653424263, + "logits/rejected": 0.01057431660592556, + "logps/chosen": -1.229265570640564, + "logps/rejected": -1.4219061136245728, + "loss": 1.5893, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.229265570640564, + "rewards/margins": 0.19264045357704163, + "rewards/rejected": -1.4219061136245728, "step": 2560 }, { "epoch": 1.3728048168590066, - "grad_norm": 10.644311265208762, + "grad_norm": 8.895934890344009, "learning_rate": 6.584135390754679e-07, - "logits/chosen": 0.02121199481189251, - "logits/rejected": 0.15904763340950012, - "logps/chosen": -1.2449887990951538, - "logps/rejected": -1.528660774230957, - "loss": 1.9903, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2449887990951538, - "rewards/margins": 0.28367194533348083, - "rewards/rejected": -1.528660774230957, - "semantic_entropy": 0.8081823587417603, + "logits/chosen": -0.12629587948322296, + "logits/rejected": -0.03258613497018814, + "logps/chosen": -1.2157530784606934, + "logps/rejected": -1.4207779169082642, + "loss": 1.5777, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2157530784606934, + "rewards/margins": 0.2050248682498932, + "rewards/rejected": -1.4207779169082642, "step": 2565 }, { "epoch": 1.375480849640408, - "grad_norm": 5.671198047003181, + "grad_norm": 5.284246111498499, "learning_rate": 6.569356025551454e-07, - "logits/chosen": 0.08652433753013611, - "logits/rejected": 0.15835759043693542, - "logps/chosen": -1.2589101791381836, - "logps/rejected": -1.5290107727050781, - "loss": 1.9852, + "logits/chosen": -0.06085330992937088, + "logits/rejected": -0.015537110157310963, + "logps/chosen": -1.2201460599899292, + "logps/rejected": -1.4362637996673584, + "loss": 1.5669, "rewards/accuracies": 0.625, - "rewards/chosen": -1.2589101791381836, - "rewards/margins": 0.27010056376457214, - "rewards/rejected": -1.5290107727050781, - "semantic_entropy": 0.8052641153335571, + "rewards/chosen": -1.2201460599899292, + "rewards/margins": 0.21611778438091278, + "rewards/rejected": -1.4362637996673584, "step": 2570 }, { "epoch": 1.3781568824218096, - "grad_norm": 10.644019148144604, + "grad_norm": 9.453518891765544, "learning_rate": 6.554561434449186e-07, - "logits/chosen": -0.07601086795330048, - "logits/rejected": 0.0795358270406723, - "logps/chosen": -1.2429871559143066, - "logps/rejected": -1.529396414756775, - "loss": 1.9813, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2429871559143066, - "rewards/margins": 0.28640928864479065, - "rewards/rejected": -1.529396414756775, - "semantic_entropy": 0.8111133575439453, + "logits/chosen": -0.21074800193309784, + "logits/rejected": -0.0934746041893959, + "logps/chosen": -1.2131701707839966, + "logps/rejected": -1.4448621273040771, + "loss": 1.5703, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2131701707839966, + "rewards/margins": 0.23169195652008057, + "rewards/rejected": -1.4448621273040771, "step": 2575 }, { "epoch": 1.3808329152032113, - "grad_norm": 11.586334506338712, + "grad_norm": 10.55445943981361, "learning_rate": 6.539751760985063e-07, - "logits/chosen": -0.006502258591353893, - "logits/rejected": 0.07593898475170135, - "logps/chosen": -1.338326096534729, - "logps/rejected": -1.5027631521224976, - "loss": 2.0704, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.338326096534729, - "rewards/margins": 0.1644371747970581, - "rewards/rejected": -1.5027631521224976, - "semantic_entropy": 0.7761968374252319, + "logits/chosen": -0.12727180123329163, + "logits/rejected": -0.07430537790060043, + "logps/chosen": -1.3147636651992798, + "logps/rejected": -1.4463951587677002, + "loss": 1.6785, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3147636651992798, + "rewards/margins": 0.13163141906261444, + "rewards/rejected": -1.4463951587677002, "step": 2580 }, { "epoch": 1.3835089479846128, - "grad_norm": 7.7658484961464085, + "grad_norm": 7.260128046427542, "learning_rate": 6.524927148842602e-07, - "logits/chosen": 0.09149026870727539, - "logits/rejected": 0.2646264433860779, - "logps/chosen": -1.1860164403915405, - "logps/rejected": -1.5092562437057495, - "loss": 1.9556, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1860164403915405, - "rewards/margins": 0.32323986291885376, - "rewards/rejected": -1.5092562437057495, - "semantic_entropy": 0.8078700304031372, + "logits/chosen": -0.05735338479280472, + "logits/rejected": 0.07477830350399017, + "logps/chosen": -1.1611049175262451, + "logps/rejected": -1.4185049533843994, + "loss": 1.5439, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1611049175262451, + "rewards/margins": 0.2574000358581543, + "rewards/rejected": -1.4185049533843994, "step": 2585 }, { "epoch": 1.3861849807660143, - "grad_norm": 9.208964534441007, + "grad_norm": 8.563324012040841, "learning_rate": 6.510087741850254e-07, - "logits/chosen": -0.021234478801488876, - "logits/rejected": 0.12869387865066528, - "logps/chosen": -1.2381582260131836, - "logps/rejected": -1.4897607564926147, - "loss": 2.0092, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2381582260131836, - "rewards/margins": 0.2516025900840759, - "rewards/rejected": -1.4897607564926147, - "semantic_entropy": 0.8143725395202637, + "logits/chosen": -0.1607932448387146, + "logits/rejected": -0.03989001363515854, + "logps/chosen": -1.208744764328003, + "logps/rejected": -1.4180536270141602, + "loss": 1.5911, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.208744764328003, + "rewards/margins": 0.2093089520931244, + "rewards/rejected": -1.4180536270141602, "step": 2590 }, { "epoch": 1.388861013547416, - "grad_norm": 10.626958785316473, + "grad_norm": 8.132441147071921, "learning_rate": 6.495233683980012e-07, - "logits/chosen": 0.013047700747847557, - "logits/rejected": 0.05873823165893555, - "logps/chosen": -1.249180555343628, - "logps/rejected": -1.4837150573730469, - "loss": 1.9945, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.249180555343628, - "rewards/margins": 0.23453423380851746, - "rewards/rejected": -1.4837150573730469, - "semantic_entropy": 0.8165580034255981, + "logits/chosen": -0.10679218918085098, + "logits/rejected": -0.08082714676856995, + "logps/chosen": -1.2230453491210938, + "logps/rejected": -1.407715082168579, + "loss": 1.5794, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2230453491210938, + "rewards/margins": 0.18466970324516296, + "rewards/rejected": -1.407715082168579, "step": 2595 }, { "epoch": 1.3915370463288175, - "grad_norm": 10.315240324352786, + "grad_norm": 9.724335849310258, "learning_rate": 6.480365119346011e-07, - "logits/chosen": 0.07690244913101196, - "logits/rejected": 0.2094108760356903, - "logps/chosen": -1.2769521474838257, - "logps/rejected": -1.4337047338485718, - "loss": 2.0389, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2769521474838257, - "rewards/margins": 0.15675252676010132, - "rewards/rejected": -1.4337047338485718, - "semantic_entropy": 0.803144633769989, + "logits/chosen": -0.05054439231753349, + "logits/rejected": 0.049748364835977554, + "logps/chosen": -1.249579668045044, + "logps/rejected": -1.3751084804534912, + "loss": 1.6245, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.249579668045044, + "rewards/margins": 0.1255287230014801, + "rewards/rejected": -1.3751084804534912, "step": 2600 }, { "epoch": 1.394213079110219, - "grad_norm": 8.152851427219868, + "grad_norm": 8.232190498117061, "learning_rate": 6.465482192203129e-07, - "logits/chosen": 0.06097614765167236, - "logits/rejected": 0.10995031893253326, - "logps/chosen": -1.2642452716827393, - "logps/rejected": -1.460809350013733, - "loss": 2.0252, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2642452716827393, - "rewards/margins": 0.19656404852867126, - "rewards/rejected": -1.460809350013733, - "semantic_entropy": 0.8018202781677246, + "logits/chosen": -0.07768498361110687, + "logits/rejected": -0.050233401358127594, + "logps/chosen": -1.238800048828125, + "logps/rejected": -1.3910675048828125, + "loss": 1.6121, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.238800048828125, + "rewards/margins": 0.1522674858570099, + "rewards/rejected": -1.3910675048828125, "step": 2605 }, { "epoch": 1.3968891118916207, - "grad_norm": 9.626880922808345, + "grad_norm": 9.200695719936968, "learning_rate": 6.45058504694559e-07, - "logits/chosen": 0.07766013592481613, - "logits/rejected": 0.15449103713035583, - "logps/chosen": -1.3124924898147583, - "logps/rejected": -1.5030378103256226, - "loss": 2.0732, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3124924898147583, - "rewards/margins": 0.1905454695224762, - "rewards/rejected": -1.5030378103256226, - "semantic_entropy": 0.7874075770378113, + "logits/chosen": -0.03262275084853172, + "logits/rejected": 0.03302247077226639, + "logps/chosen": -1.2927391529083252, + "logps/rejected": -1.4022797346115112, + "loss": 1.684, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2927391529083252, + "rewards/margins": 0.1095406785607338, + "rewards/rejected": -1.4022797346115112, "step": 2610 }, { "epoch": 1.3995651446730222, - "grad_norm": 12.498925680101337, + "grad_norm": 11.089373494205253, "learning_rate": 6.435673828105564e-07, - "logits/chosen": -0.04191456735134125, - "logits/rejected": 0.10613715648651123, - "logps/chosen": -1.2324085235595703, - "logps/rejected": -1.5605417490005493, - "loss": 1.9814, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2324085235595703, - "rewards/margins": 0.3281332850456238, - "rewards/rejected": -1.5605417490005493, - "semantic_entropy": 0.8089901804924011, + "logits/chosen": -0.15007483959197998, + "logits/rejected": -0.035728149116039276, + "logps/chosen": -1.2085886001586914, + "logps/rejected": -1.47746741771698, + "loss": 1.5697, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2085886001586914, + "rewards/margins": 0.2688790261745453, + "rewards/rejected": -1.47746741771698, "step": 2615 }, { "epoch": 1.402241177454424, - "grad_norm": 9.860457798037306, + "grad_norm": 9.176711063517878, "learning_rate": 6.420748680351763e-07, - "logits/chosen": -0.04338609799742699, - "logits/rejected": -0.04294615983963013, - "logps/chosen": -1.3530676364898682, - "logps/rejected": -1.4674022197723389, - "loss": 2.1339, - "rewards/accuracies": 0.46875, - "rewards/chosen": -1.3530676364898682, - "rewards/margins": 0.11433436721563339, - "rewards/rejected": -1.4674022197723389, - "semantic_entropy": 0.7847703695297241, + "logits/chosen": -0.14987041056156158, + "logits/rejected": -0.15985018014907837, + "logps/chosen": -1.3277456760406494, + "logps/rejected": -1.4078071117401123, + "loss": 1.7256, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.3277456760406494, + "rewards/margins": 0.08006126433610916, + "rewards/rejected": -1.4078071117401123, "step": 2620 }, { "epoch": 1.4049172102358254, - "grad_norm": 13.825014034037942, + "grad_norm": 8.617162785119755, "learning_rate": 6.405809748488032e-07, - "logits/chosen": 0.0195016972720623, - "logits/rejected": 0.1728542298078537, - "logps/chosen": -1.2841010093688965, - "logps/rejected": -1.5404409170150757, - "loss": 2.0065, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2841010093688965, - "rewards/margins": 0.25633999705314636, - "rewards/rejected": -1.5404409170150757, - "semantic_entropy": 0.8087183237075806, + "logits/chosen": -0.0892399325966835, + "logits/rejected": 0.03166574984788895, + "logps/chosen": -1.248976230621338, + "logps/rejected": -1.438613772392273, + "loss": 1.5903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.248976230621338, + "rewards/margins": 0.189637690782547, + "rewards/rejected": -1.438613772392273, "step": 2625 }, { "epoch": 1.4075932430172269, - "grad_norm": 10.399756704030894, + "grad_norm": 8.36874910160355, "learning_rate": 6.390857177451956e-07, - "logits/chosen": -0.1649092584848404, - "logits/rejected": 0.03741155192255974, - "logps/chosen": -1.3439829349517822, - "logps/rejected": -1.505408525466919, - "loss": 2.0719, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3439829349517822, - "rewards/margins": 0.16142557561397552, - "rewards/rejected": -1.505408525466919, - "semantic_entropy": 0.7817584276199341, + "logits/chosen": -0.25129103660583496, + "logits/rejected": -0.08070940524339676, + "logps/chosen": -1.3176677227020264, + "logps/rejected": -1.404175043106079, + "loss": 1.6778, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3176677227020264, + "rewards/margins": 0.08650705963373184, + "rewards/rejected": -1.404175043106079, "step": 2630 }, { "epoch": 1.4102692757986286, - "grad_norm": 10.735155487069681, + "grad_norm": 10.09644986033379, "learning_rate": 6.375891112313445e-07, - "logits/chosen": -0.05074986815452576, - "logits/rejected": 0.03829338029026985, - "logps/chosen": -1.2775318622589111, - "logps/rejected": -1.5022495985031128, - "loss": 1.9985, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2775318622589111, - "rewards/margins": 0.22471757233142853, - "rewards/rejected": -1.5022495985031128, - "semantic_entropy": 0.8041666746139526, + "logits/chosen": -0.16776664555072784, + "logits/rejected": -0.10792151838541031, + "logps/chosen": -1.255562424659729, + "logps/rejected": -1.420966386795044, + "loss": 1.5925, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.255562424659729, + "rewards/margins": 0.1654038280248642, + "rewards/rejected": -1.420966386795044, "step": 2635 }, { "epoch": 1.41294530858003, - "grad_norm": 7.290901946889319, + "grad_norm": 6.624218741585395, "learning_rate": 6.360911698273326e-07, - "logits/chosen": 0.020272737368941307, - "logits/rejected": 0.10198897123336792, - "logps/chosen": -1.337398886680603, - "logps/rejected": -1.5660853385925293, - "loss": 2.0534, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.337398886680603, - "rewards/margins": 0.22868652641773224, - "rewards/rejected": -1.5660853385925293, - "semantic_entropy": 0.7864670157432556, + "logits/chosen": -0.1129215806722641, + "logits/rejected": -0.06237906217575073, + "logps/chosen": -1.3118427991867065, + "logps/rejected": -1.47782301902771, + "loss": 1.6551, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3118427991867065, + "rewards/margins": 0.1659802496433258, + "rewards/rejected": -1.47782301902771, "step": 2640 }, { "epoch": 1.4156213413614318, - "grad_norm": 9.046840482319494, + "grad_norm": 8.11624284407403, "learning_rate": 6.345919080661944e-07, - "logits/chosen": -0.037472423166036606, - "logits/rejected": 0.04499778151512146, - "logps/chosen": -1.266202688217163, - "logps/rejected": -1.5562307834625244, - "loss": 1.9872, + "logits/chosen": -0.13897685706615448, + "logits/rejected": -0.07714378088712692, + "logps/chosen": -1.2451449632644653, + "logps/rejected": -1.4978704452514648, + "loss": 1.5768, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.266202688217163, - "rewards/margins": 0.2900282144546509, - "rewards/rejected": -1.5562307834625244, - "semantic_entropy": 0.8044629096984863, + "rewards/chosen": -1.2451449632644653, + "rewards/margins": 0.2527254819869995, + "rewards/rejected": -1.4978704452514648, "step": 2645 }, { "epoch": 1.4182973741428333, - "grad_norm": 7.03261181708054, + "grad_norm": 6.767373925947111, "learning_rate": 6.330913404937737e-07, - "logits/chosen": -0.08614523708820343, - "logits/rejected": 0.07208752632141113, - "logps/chosen": -1.2745702266693115, - "logps/rejected": -1.69574773311615, - "loss": 2.01, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2745702266693115, - "rewards/margins": 0.42117738723754883, - "rewards/rejected": -1.69574773311615, - "semantic_entropy": 0.7874399423599243, + "logits/chosen": -0.20330551266670227, + "logits/rejected": -0.08086180686950684, + "logps/chosen": -1.253254771232605, + "logps/rejected": -1.6146825551986694, + "loss": 1.6038, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.253254771232605, + "rewards/margins": 0.36142784357070923, + "rewards/rejected": -1.6146825551986694, "step": 2650 }, { "epoch": 1.4209734069242348, - "grad_norm": 9.051777151633472, + "grad_norm": 8.782776045832264, "learning_rate": 6.315894816685838e-07, - "logits/chosen": -0.006546747870743275, - "logits/rejected": 0.14669081568717957, - "logps/chosen": -1.173689365386963, - "logps/rejected": -1.4159897565841675, - "loss": 1.9562, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.173689365386963, - "rewards/margins": 0.24230046570301056, - "rewards/rejected": -1.4159897565841675, - "semantic_entropy": 0.8332114219665527, + "logits/chosen": -0.12377236038446426, + "logits/rejected": -0.0035462796222418547, + "logps/chosen": -1.157189130783081, + "logps/rejected": -1.339942216873169, + "loss": 1.5445, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.157189130783081, + "rewards/margins": 0.18275293707847595, + "rewards/rejected": -1.339942216873169, "step": 2655 }, { "epoch": 1.4236494397056365, - "grad_norm": 8.990035974128704, + "grad_norm": 8.528001606755288, "learning_rate": 6.300863461616657e-07, - "logits/chosen": 0.06433503329753876, - "logits/rejected": 0.11419510841369629, - "logps/chosen": -1.165921926498413, - "logps/rejected": -1.4911032915115356, - "loss": 1.9433, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.165921926498413, - "rewards/margins": 0.32518142461776733, - "rewards/rejected": -1.4911032915115356, - "semantic_entropy": 0.8362231254577637, + "logits/chosen": -0.054256655275821686, + "logits/rejected": -0.02772495709359646, + "logps/chosen": -1.1307682991027832, + "logps/rejected": -1.3774828910827637, + "loss": 1.5133, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1307682991027832, + "rewards/margins": 0.24671444296836853, + "rewards/rejected": -1.3774828910827637, "step": 2660 }, { "epoch": 1.426325472487038, - "grad_norm": 8.014043391058564, + "grad_norm": 7.568061035346011, "learning_rate": 6.285819485564465e-07, - "logits/chosen": -0.10654006153345108, - "logits/rejected": 0.01912633329629898, - "logps/chosen": -1.3031725883483887, - "logps/rejected": -1.5577466487884521, - "loss": 2.0142, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3031725883483887, - "rewards/margins": 0.2545742392539978, - "rewards/rejected": -1.5577466487884521, - "semantic_entropy": 0.7930922508239746, + "logits/chosen": -0.20463016629219055, + "logits/rejected": -0.11927781254053116, + "logps/chosen": -1.2784268856048584, + "logps/rejected": -1.4733545780181885, + "loss": 1.616, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2784268856048584, + "rewards/margins": 0.1949276179075241, + "rewards/rejected": -1.4733545780181885, "step": 2665 }, { "epoch": 1.4290015052684395, - "grad_norm": 9.160732444578377, + "grad_norm": 8.754933977946239, "learning_rate": 6.270763034485986e-07, - "logits/chosen": 0.029206525534391403, - "logits/rejected": 0.1290557086467743, - "logps/chosen": -1.3868186473846436, - "logps/rejected": -1.5575685501098633, - "loss": 2.1021, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3868186473846436, - "rewards/margins": 0.1707497388124466, - "rewards/rejected": -1.5575685501098633, - "semantic_entropy": 0.7730140686035156, + "logits/chosen": -0.08934472501277924, + "logits/rejected": -0.02049086056649685, + "logps/chosen": -1.3698203563690186, + "logps/rejected": -1.4895521402359009, + "loss": 1.7197, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3698203563690186, + "rewards/margins": 0.11973158270120621, + "rewards/rejected": -1.4895521402359009, "step": 2670 }, { "epoch": 1.4316775380498412, - "grad_norm": 11.463702138070392, + "grad_norm": 10.10100058158381, "learning_rate": 6.255694254458972e-07, - "logits/chosen": -0.03042779304087162, - "logits/rejected": 0.13861264288425446, - "logps/chosen": -1.3073450326919556, - "logps/rejected": -1.4852049350738525, - "loss": 2.0481, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3073450326919556, - "rewards/margins": 0.17785976827144623, - "rewards/rejected": -1.4852049350738525, - "semantic_entropy": 0.8076550364494324, + "logits/chosen": -0.1388912945985794, + "logits/rejected": -0.010842189192771912, + "logps/chosen": -1.2867250442504883, + "logps/rejected": -1.4099212884902954, + "loss": 1.6438, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2867250442504883, + "rewards/margins": 0.12319610267877579, + "rewards/rejected": -1.4099212884902954, "step": 2675 }, { "epoch": 1.4343535708312427, - "grad_norm": 9.484512890852951, + "grad_norm": 9.231846695551004, "learning_rate": 6.240613291680795e-07, - "logits/chosen": -0.04733417183160782, - "logits/rejected": 0.13355064392089844, - "logps/chosen": -1.2979787588119507, - "logps/rejected": -1.4991651773452759, - "loss": 2.0435, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2979787588119507, - "rewards/margins": 0.20118634402751923, - "rewards/rejected": -1.4991651773452759, - "semantic_entropy": 0.7871995568275452, + "logits/chosen": -0.18062396347522736, + "logits/rejected": -0.0430278442800045, + "logps/chosen": -1.2710715532302856, + "logps/rejected": -1.4060547351837158, + "loss": 1.6416, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2710715532302856, + "rewards/margins": 0.13498328626155853, + "rewards/rejected": -1.4060547351837158, "step": 2680 }, { "epoch": 1.4370296036126442, - "grad_norm": 9.553528285797984, + "grad_norm": 9.237884271223397, "learning_rate": 6.225520292467021e-07, - "logits/chosen": -0.0540904700756073, - "logits/rejected": 0.16049322485923767, - "logps/chosen": -1.2885100841522217, - "logps/rejected": -1.4493513107299805, - "loss": 2.0401, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2885100841522217, - "rewards/margins": 0.16084131598472595, - "rewards/rejected": -1.4493513107299805, - "semantic_entropy": 0.8091254234313965, + "logits/chosen": -0.1564517766237259, + "logits/rejected": 0.003753349184989929, + "logps/chosen": -1.2695815563201904, + "logps/rejected": -1.3745739459991455, + "loss": 1.6364, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2695815563201904, + "rewards/margins": 0.10499223321676254, + "rewards/rejected": -1.3745739459991455, "step": 2685 }, { "epoch": 1.439705636394046, - "grad_norm": 13.887500536632503, + "grad_norm": 10.139955829157355, "learning_rate": 6.210415403249993e-07, - "logits/chosen": -0.18344011902809143, - "logits/rejected": 0.06428229063749313, - "logps/chosen": -1.2974615097045898, - "logps/rejected": -1.6270719766616821, - "loss": 2.0105, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2974615097045898, - "rewards/margins": 0.32961034774780273, - "rewards/rejected": -1.6270719766616821, - "semantic_entropy": 0.7873474359512329, + "logits/chosen": -0.2812516987323761, + "logits/rejected": -0.09271320700645447, + "logps/chosen": -1.2651293277740479, + "logps/rejected": -1.5354411602020264, + "loss": 1.6048, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2651293277740479, + "rewards/margins": 0.2703118920326233, + "rewards/rejected": -1.5354411602020264, "step": 2690 }, { "epoch": 1.4423816691754474, - "grad_norm": 6.378348148603819, + "grad_norm": 6.426845661488, "learning_rate": 6.195298770577415e-07, - "logits/chosen": 0.02969251200556755, - "logits/rejected": 0.05600305646657944, - "logps/chosen": -1.2695014476776123, - "logps/rejected": -1.5602524280548096, - "loss": 1.9927, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2695014476776123, - "rewards/margins": 0.2907510995864868, - "rewards/rejected": -1.5602524280548096, - "semantic_entropy": 0.8114160299301147, + "logits/chosen": -0.13508456945419312, + "logits/rejected": -0.11751226335763931, + "logps/chosen": -1.2416387796401978, + "logps/rejected": -1.4536396265029907, + "loss": 1.5797, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2416387796401978, + "rewards/margins": 0.21200060844421387, + "rewards/rejected": -1.4536396265029907, "step": 2695 }, { "epoch": 1.445057701956849, - "grad_norm": 8.270961707183284, + "grad_norm": 7.714598930066917, "learning_rate": 6.180170541110923e-07, - "logits/chosen": -0.0491093285381794, - "logits/rejected": 0.14849188923835754, - "logps/chosen": -1.313826560974121, - "logps/rejected": -1.5336029529571533, - "loss": 2.0368, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.313826560974121, - "rewards/margins": 0.21977631747722626, - "rewards/rejected": -1.5336029529571533, - "semantic_entropy": 0.7848860025405884, + "logits/chosen": -0.17228879034519196, + "logits/rejected": -0.01403956301510334, + "logps/chosen": -1.290063738822937, + "logps/rejected": -1.4569785594940186, + "loss": 1.6395, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.290063738822937, + "rewards/margins": 0.16691474616527557, + "rewards/rejected": -1.4569785594940186, "step": 2700 }, { "epoch": 1.4477337347382506, - "grad_norm": 6.020138414515257, + "grad_norm": 5.708198226308769, "learning_rate": 6.165030861624663e-07, - "logits/chosen": -0.08611325919628143, - "logits/rejected": 0.14223533868789673, - "logps/chosen": -1.1841992139816284, - "logps/rejected": -1.5576660633087158, - "loss": 1.9085, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1841992139816284, - "rewards/margins": 0.3734667897224426, - "rewards/rejected": -1.5576660633087158, - "semantic_entropy": 0.821341872215271, + "logits/chosen": -0.19277127087116241, + "logits/rejected": -0.028677979484200478, + "logps/chosen": -1.1680980920791626, + "logps/rejected": -1.4506738185882568, + "loss": 1.5045, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1680980920791626, + "rewards/margins": 0.2825758755207062, + "rewards/rejected": -1.4506738185882568, "step": 2705 }, { "epoch": 1.4504097675196521, - "grad_norm": 7.926721115653813, + "grad_norm": 7.689429641279572, "learning_rate": 6.149879879003876e-07, - "logits/chosen": 0.040991488844156265, - "logits/rejected": 0.0697018951177597, - "logps/chosen": -1.2791852951049805, - "logps/rejected": -1.5475891828536987, - "loss": 1.9896, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2791852951049805, - "rewards/margins": 0.26840391755104065, - "rewards/rejected": -1.5475891828536987, - "semantic_entropy": 0.8002132177352905, + "logits/chosen": -0.10672245919704437, + "logits/rejected": -0.08293677121400833, + "logps/chosen": -1.2571884393692017, + "logps/rejected": -1.4730494022369385, + "loss": 1.5866, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2571884393692017, + "rewards/margins": 0.21586088836193085, + "rewards/rejected": -1.4730494022369385, "step": 2710 }, { "epoch": 1.4530858003010536, - "grad_norm": 8.1304948439871, + "grad_norm": 7.730130806525485, "learning_rate": 6.13471774024346e-07, - "logits/chosen": -0.09874364733695984, - "logits/rejected": 0.00951166171580553, - "logps/chosen": -1.2177903652191162, - "logps/rejected": -1.4703280925750732, - "loss": 1.9766, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2177903652191162, - "rewards/margins": 0.2525377869606018, - "rewards/rejected": -1.4703280925750732, - "semantic_entropy": 0.8077287673950195, + "logits/chosen": -0.23001877963542938, + "logits/rejected": -0.15539811551570892, + "logps/chosen": -1.198460340499878, + "logps/rejected": -1.3851935863494873, + "loss": 1.5722, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.198460340499878, + "rewards/margins": 0.186733216047287, + "rewards/rejected": -1.3851935863494873, "step": 2715 }, { "epoch": 1.4557618330824553, - "grad_norm": 8.550253666468736, + "grad_norm": 7.45875436423318, "learning_rate": 6.119544592446551e-07, - "logits/chosen": -0.06126248836517334, - "logits/rejected": 0.05804147571325302, - "logps/chosen": -1.2580182552337646, - "logps/rejected": -1.4090564250946045, - "loss": 2.0226, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2580182552337646, - "rewards/margins": 0.1510380655527115, - "rewards/rejected": -1.4090564250946045, - "semantic_entropy": 0.8237816095352173, + "logits/chosen": -0.1821044236421585, + "logits/rejected": -0.08938132971525192, + "logps/chosen": -1.2271735668182373, + "logps/rejected": -1.3389554023742676, + "loss": 1.6007, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2271735668182373, + "rewards/margins": 0.11178169399499893, + "rewards/rejected": -1.3389554023742676, "step": 2720 }, { "epoch": 1.4584378658638568, - "grad_norm": 9.360312673012285, + "grad_norm": 9.274979791690361, "learning_rate": 6.104360582823096e-07, - "logits/chosen": -0.02586626447737217, - "logits/rejected": 0.09422776848077774, - "logps/chosen": -1.2580927610397339, - "logps/rejected": -1.5287189483642578, - "loss": 2.0074, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2580927610397339, - "rewards/margins": 0.2706260681152344, - "rewards/rejected": -1.5287189483642578, - "semantic_entropy": 0.7973588705062866, + "logits/chosen": -0.14727376401424408, + "logits/rejected": -0.05907534435391426, + "logps/chosen": -1.23906672000885, + "logps/rejected": -1.4582659006118774, + "loss": 1.6037, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.23906672000885, + "rewards/margins": 0.21919913589954376, + "rewards/rejected": -1.4582659006118774, "step": 2725 }, { "epoch": 1.4611138986452583, - "grad_norm": 9.839600679806647, + "grad_norm": 8.308768679842965, "learning_rate": 6.089165858688423e-07, - "logits/chosen": -0.05331653356552124, - "logits/rejected": 0.13488610088825226, - "logps/chosen": -1.267230749130249, - "logps/rejected": -1.5607540607452393, - "loss": 2.0204, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.267230749130249, - "rewards/margins": 0.2935234308242798, - "rewards/rejected": -1.5607540607452393, - "semantic_entropy": 0.7877390384674072, + "logits/chosen": -0.19906631112098694, + "logits/rejected": -0.06506966054439545, + "logps/chosen": -1.2406883239746094, + "logps/rejected": -1.4651925563812256, + "loss": 1.6225, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2406883239746094, + "rewards/margins": 0.22450414299964905, + "rewards/rejected": -1.4651925563812256, "step": 2730 }, { "epoch": 1.46378993142666, - "grad_norm": 8.101147304031956, + "grad_norm": 7.296154052157939, "learning_rate": 6.073960567461811e-07, - "logits/chosen": -0.040810726583004, - "logits/rejected": 0.16771776974201202, - "logps/chosen": -1.1728408336639404, - "logps/rejected": -1.5638388395309448, - "loss": 1.9287, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1728408336639404, - "rewards/margins": 0.39099812507629395, - "rewards/rejected": -1.5638388395309448, - "semantic_entropy": 0.8042163848876953, + "logits/chosen": -0.1607706993818283, + "logits/rejected": 0.003165569854900241, + "logps/chosen": -1.1542766094207764, + "logps/rejected": -1.4799726009368896, + "loss": 1.5236, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1542766094207764, + "rewards/margins": 0.32569605112075806, + "rewards/rejected": -1.4799726009368896, "step": 2735 }, { "epoch": 1.4664659642080615, - "grad_norm": 10.135341944238796, + "grad_norm": 9.399177789493598, "learning_rate": 6.058744856665065e-07, - "logits/chosen": -0.0530366376042366, - "logits/rejected": 0.06937140226364136, - "logps/chosen": -1.2185879945755005, - "logps/rejected": -1.5651237964630127, - "loss": 1.9801, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2185879945755005, - "rewards/margins": 0.3465357720851898, - "rewards/rejected": -1.5651237964630127, - "semantic_entropy": 0.8120279312133789, + "logits/chosen": -0.19197958707809448, + "logits/rejected": -0.10159718990325928, + "logps/chosen": -1.19269597530365, + "logps/rejected": -1.4547951221466064, + "loss": 1.5679, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.19269597530365, + "rewards/margins": 0.2620992064476013, + "rewards/rejected": -1.4547951221466064, "step": 2740 }, { "epoch": 1.469141996989463, - "grad_norm": 8.081130306261906, + "grad_norm": 7.426364054634275, "learning_rate": 6.043518873921074e-07, - "logits/chosen": -0.0669885128736496, - "logits/rejected": 0.07204239070415497, - "logps/chosen": -1.2283408641815186, - "logps/rejected": -1.4145268201828003, - "loss": 1.9954, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2283408641815186, - "rewards/margins": 0.18618597090244293, - "rewards/rejected": -1.4145268201828003, - "semantic_entropy": 0.8127212524414062, + "logits/chosen": -0.17664211988449097, + "logits/rejected": -0.08954844623804092, + "logps/chosen": -1.2093679904937744, + "logps/rejected": -1.3380529880523682, + "loss": 1.5911, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2093679904937744, + "rewards/margins": 0.12868481874465942, + "rewards/rejected": -1.3380529880523682, "step": 2745 }, { "epoch": 1.4718180297708647, - "grad_norm": 8.6742003855211, + "grad_norm": 8.03926829429509, "learning_rate": 6.028282766952393e-07, - "logits/chosen": -0.0030053220689296722, - "logits/rejected": 0.10681009292602539, - "logps/chosen": -1.3068584203720093, - "logps/rejected": -1.609853744506836, - "loss": 1.992, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3068584203720093, - "rewards/margins": 0.3029954433441162, - "rewards/rejected": -1.609853744506836, - "semantic_entropy": 0.7740689516067505, + "logits/chosen": -0.15437455475330353, + "logits/rejected": -0.08024605363607407, + "logps/chosen": -1.2849972248077393, + "logps/rejected": -1.5242011547088623, + "loss": 1.6053, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2849972248077393, + "rewards/margins": 0.23920373618602753, + "rewards/rejected": -1.5242011547088623, "step": 2750 }, { "epoch": 1.4744940625522662, - "grad_norm": 12.341995821257413, + "grad_norm": 9.826266540452712, "learning_rate": 6.013036683579798e-07, - "logits/chosen": 0.025272101163864136, - "logits/rejected": 0.17669710516929626, - "logps/chosen": -1.2535477876663208, - "logps/rejected": -1.443696141242981, - "loss": 2.0065, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2535477876663208, - "rewards/margins": 0.19014835357666016, - "rewards/rejected": -1.443696141242981, - "semantic_entropy": 0.8169199228286743, + "logits/chosen": -0.10797605663537979, + "logits/rejected": 0.014654259197413921, + "logps/chosen": -1.2318819761276245, + "logps/rejected": -1.3689765930175781, + "loss": 1.5971, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2318819761276245, + "rewards/margins": 0.13709449768066406, + "rewards/rejected": -1.3689765930175781, "step": 2755 }, { "epoch": 1.4771700953336677, - "grad_norm": 9.236585928461313, + "grad_norm": 7.879073302523249, "learning_rate": 5.997780771720854e-07, - "logits/chosen": -0.1045481339097023, - "logits/rejected": 0.08253008872270584, - "logps/chosen": -1.2917675971984863, - "logps/rejected": -1.6060972213745117, - "loss": 2.018, + "logits/chosen": -0.22682805359363556, + "logits/rejected": -0.08655048161745071, + "logps/chosen": -1.267865538597107, + "logps/rejected": -1.5178544521331787, + "loss": 1.6205, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2917675971984863, - "rewards/margins": 0.3143296539783478, - "rewards/rejected": -1.6060972213745117, - "semantic_entropy": 0.7867814898490906, + "rewards/chosen": -1.267865538597107, + "rewards/margins": 0.24998879432678223, + "rewards/rejected": -1.5178544521331787, "step": 2760 }, { "epoch": 1.4798461281150694, - "grad_norm": 9.887674753689055, + "grad_norm": 8.292600778846104, "learning_rate": 5.982515179388486e-07, - "logits/chosen": 0.012802052311599255, - "logits/rejected": 0.15946759283542633, - "logps/chosen": -1.214952826499939, - "logps/rejected": -1.472954511642456, - "loss": 1.9618, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.214952826499939, - "rewards/margins": 0.25800177454948425, - "rewards/rejected": -1.472954511642456, - "semantic_entropy": 0.8072951436042786, + "logits/chosen": -0.11081697791814804, + "logits/rejected": 0.00440225237980485, + "logps/chosen": -1.1883069276809692, + "logps/rejected": -1.3887240886688232, + "loss": 1.5564, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1883069276809692, + "rewards/margins": 0.200417160987854, + "rewards/rejected": -1.3887240886688232, "step": 2765 }, { "epoch": 1.482522160896471, - "grad_norm": 9.546052846550923, + "grad_norm": 7.9152983573075115, "learning_rate": 5.967240054689541e-07, - "logits/chosen": -0.06388188898563385, - "logits/rejected": 0.01827801950275898, - "logps/chosen": -1.2447795867919922, - "logps/rejected": -1.3526091575622559, - "loss": 2.0449, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2447795867919922, - "rewards/margins": 0.1078295111656189, - "rewards/rejected": -1.3526091575622559, - "semantic_entropy": 0.8271406292915344, + "logits/chosen": -0.17233093082904816, + "logits/rejected": -0.1128607988357544, + "logps/chosen": -1.2181812524795532, + "logps/rejected": -1.2844047546386719, + "loss": 1.6288, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2181812524795532, + "rewards/margins": 0.06622340530157089, + "rewards/rejected": -1.2844047546386719, "step": 2770 }, { "epoch": 1.4851981936778724, - "grad_norm": 6.197965780271343, + "grad_norm": 5.556966087112487, "learning_rate": 5.951955545823342e-07, - "logits/chosen": -0.01683109626173973, - "logits/rejected": 0.030049163848161697, - "logps/chosen": -1.2154908180236816, - "logps/rejected": -1.4938910007476807, - "loss": 1.9702, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2154908180236816, - "rewards/margins": 0.2784002125263214, - "rewards/rejected": -1.4938910007476807, - "semantic_entropy": 0.8202184438705444, + "logits/chosen": -0.13642926514148712, + "logits/rejected": -0.12106633186340332, + "logps/chosen": -1.1797887086868286, + "logps/rejected": -1.3879332542419434, + "loss": 1.5525, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.1797887086868286, + "rewards/margins": 0.20814457535743713, + "rewards/rejected": -1.3879332542419434, "step": 2775 }, { "epoch": 1.4878742264592741, - "grad_norm": 5.786231493222208, + "grad_norm": 5.392301224487717, "learning_rate": 5.936661801080263e-07, - "logits/chosen": -0.02936570905148983, - "logits/rejected": 0.07508943974971771, - "logps/chosen": -1.3776317834854126, - "logps/rejected": -1.574289321899414, - "loss": 2.0838, + "logits/chosen": -0.14420035481452942, + "logits/rejected": -0.061324309557676315, + "logps/chosen": -1.3488701581954956, + "logps/rejected": -1.4870027303695679, + "loss": 1.6864, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3776317834854126, - "rewards/margins": 0.1966576725244522, - "rewards/rejected": -1.574289321899414, - "semantic_entropy": 0.7650582194328308, + "rewards/chosen": -1.3488701581954956, + "rewards/margins": 0.13813255727291107, + "rewards/rejected": -1.4870027303695679, "step": 2780 }, { "epoch": 1.4905502592406756, - "grad_norm": 8.22099606926176, + "grad_norm": 7.535224374184795, "learning_rate": 5.92135896884028e-07, - "logits/chosen": -0.0665474608540535, - "logits/rejected": 0.07018626481294632, - "logps/chosen": -1.3456532955169678, - "logps/rejected": -1.592674732208252, - "loss": 2.0659, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3456532955169678, - "rewards/margins": 0.24702143669128418, - "rewards/rejected": -1.592674732208252, - "semantic_entropy": 0.7766121625900269, + "logits/chosen": -0.18038392066955566, + "logits/rejected": -0.07577495276927948, + "logps/chosen": -1.3093490600585938, + "logps/rejected": -1.4596061706542969, + "loss": 1.6714, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3093490600585938, + "rewards/margins": 0.15025700628757477, + "rewards/rejected": -1.4596061706542969, "step": 2785 }, { "epoch": 1.4932262920220774, - "grad_norm": 13.009909043873694, + "grad_norm": 11.729284249643793, "learning_rate": 5.906047197571541e-07, - "logits/chosen": -0.002771927509456873, - "logits/rejected": -0.02358252741396427, - "logps/chosen": -1.2346235513687134, - "logps/rejected": -1.4999468326568604, - "loss": 1.9989, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2346235513687134, - "rewards/margins": 0.2653234004974365, - "rewards/rejected": -1.4999468326568604, - "semantic_entropy": 0.8137930631637573, + "logits/chosen": -0.132780522108078, + "logits/rejected": -0.14752009510993958, + "logps/chosen": -1.2028001546859741, + "logps/rejected": -1.418414831161499, + "loss": 1.5775, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2028001546859741, + "rewards/margins": 0.21561458706855774, + "rewards/rejected": -1.418414831161499, "step": 2790 }, { "epoch": 1.4959023248034788, - "grad_norm": 7.600751860354241, + "grad_norm": 7.13278055336521, "learning_rate": 5.890726635828919e-07, - "logits/chosen": 0.08247774839401245, - "logits/rejected": 0.09755951166152954, - "logps/chosen": -1.2009843587875366, - "logps/rejected": -1.4116266965866089, - "loss": 2.0229, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2009843587875366, - "rewards/margins": 0.21064254641532898, - "rewards/rejected": -1.4116266965866089, - "semantic_entropy": 0.8201521039009094, + "logits/chosen": -0.021932054311037064, + "logits/rejected": -0.00037776678800582886, + "logps/chosen": -1.1753807067871094, + "logps/rejected": -1.311991810798645, + "loss": 1.6075, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.1753807067871094, + "rewards/margins": 0.13661102950572968, + "rewards/rejected": -1.311991810798645, "step": 2795 }, { "epoch": 1.4985783575848803, - "grad_norm": 7.9667357221109905, + "grad_norm": 7.341481609459083, "learning_rate": 5.875397432252569e-07, - "logits/chosen": -0.10117968171834946, - "logits/rejected": -0.018336813896894455, - "logps/chosen": -1.3454875946044922, - "logps/rejected": -1.6285938024520874, - "loss": 2.0557, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3454875946044922, - "rewards/margins": 0.2831062972545624, - "rewards/rejected": -1.6285938024520874, - "semantic_entropy": 0.7585874199867249, + "logits/chosen": -0.20246192812919617, + "logits/rejected": -0.13079217076301575, + "logps/chosen": -1.31998610496521, + "logps/rejected": -1.535980463027954, + "loss": 1.6688, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.31998610496521, + "rewards/margins": 0.21599426865577698, + "rewards/rejected": -1.535980463027954, "step": 2800 }, { "epoch": 1.4985783575848803, - "eval_logits/chosen": 0.3579806685447693, - "eval_logits/rejected": 0.4562504291534424, - "eval_logps/chosen": -1.3279922008514404, - "eval_logps/rejected": -1.587229609489441, - "eval_loss": 2.043422222137451, - "eval_rewards/accuracies": 0.5845697522163391, - "eval_rewards/chosen": -1.3279922008514404, - "eval_rewards/margins": 0.2592373490333557, - "eval_rewards/rejected": -1.587229609489441, - "eval_runtime": 34.5745, - "eval_samples_per_second": 38.902, - "eval_semantic_entropy": 0.7788424491882324, - "eval_steps_per_second": 9.747, + "eval_logits/chosen": 0.10498671978712082, + "eval_logits/rejected": 0.17505963146686554, + "eval_logps/chosen": -1.2999731302261353, + "eval_logps/rejected": -1.4728928804397583, + "eval_loss": 1.6485788822174072, + "eval_rewards/accuracies": 0.5689911246299744, + "eval_rewards/chosen": -1.2999731302261353, + "eval_rewards/margins": 0.17291992902755737, + "eval_rewards/rejected": -1.4728928804397583, + "eval_runtime": 40.3876, + "eval_samples_per_second": 33.302, + "eval_steps_per_second": 8.344, "step": 2800 }, { "epoch": 1.5012543903662818, - "grad_norm": 7.132846494980536, + "grad_norm": 4.849294698705585, "learning_rate": 5.860059735566491e-07, - "logits/chosen": -0.17691846191883087, - "logits/rejected": -0.016455356031656265, - "logps/chosen": -1.176946997642517, - "logps/rejected": -1.488738775253296, - "loss": 1.946, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.176946997642517, - "rewards/margins": 0.3117918372154236, - "rewards/rejected": -1.488738775253296, - "semantic_entropy": 0.8184489011764526, + "logits/chosen": -0.2729756236076355, + "logits/rejected": -0.15374641120433807, + "logps/chosen": -1.1556975841522217, + "logps/rejected": -1.393615961074829, + "loss": 1.5309, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1556975841522217, + "rewards/margins": 0.23791857063770294, + "rewards/rejected": -1.393615961074829, "step": 2805 }, { "epoch": 1.5039304231476835, - "grad_norm": 17.289937745543508, + "grad_norm": 11.891979762914524, "learning_rate": 5.844713694577087e-07, - "logits/chosen": -0.042862921953201294, - "logits/rejected": 0.0031647428404539824, - "logps/chosen": -1.3153235912322998, - "logps/rejected": -1.5819628238677979, - "loss": 2.0377, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3153235912322998, - "rewards/margins": 0.2666395306587219, - "rewards/rejected": -1.5819628238677979, - "semantic_entropy": 0.7687281966209412, + "logits/chosen": -0.1642301380634308, + "logits/rejected": -0.14914503693580627, + "logps/chosen": -1.286602258682251, + "logps/rejected": -1.4862585067749023, + "loss": 1.6405, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.286602258682251, + "rewards/margins": 0.19965621829032898, + "rewards/rejected": -1.4862585067749023, "step": 2810 }, { "epoch": 1.5066064559290853, - "grad_norm": 7.73698652084708, + "grad_norm": 7.780853040126767, "learning_rate": 5.829359458171714e-07, - "logits/chosen": 0.019528161734342575, - "logits/rejected": 0.14965340495109558, - "logps/chosen": -1.308516502380371, - "logps/rejected": -1.5800927877426147, - "loss": 2.0036, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.308516502380371, - "rewards/margins": 0.2715762257575989, - "rewards/rejected": -1.5800927877426147, - "semantic_entropy": 0.7879072427749634, + "logits/chosen": -0.10971619933843613, + "logits/rejected": -0.003022894263267517, + "logps/chosen": -1.2809616327285767, + "logps/rejected": -1.463622808456421, + "loss": 1.6122, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2809616327285767, + "rewards/margins": 0.18266119062900543, + "rewards/rejected": -1.463622808456421, "step": 2815 }, { "epoch": 1.5092824887104868, - "grad_norm": 6.855133463366414, + "grad_norm": 6.591814679920951, "learning_rate": 5.81399717531724e-07, - "logits/chosen": 0.004591824021190405, - "logits/rejected": 0.19138944149017334, - "logps/chosen": -1.275131344795227, - "logps/rejected": -1.4118839502334595, - "loss": 2.0545, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.275131344795227, - "rewards/margins": 0.13675281405448914, - "rewards/rejected": -1.4118839502334595, - "semantic_entropy": 0.8109269142150879, + "logits/chosen": -0.12527456879615784, + "logits/rejected": 0.0023119777906686068, + "logps/chosen": -1.2496436834335327, + "logps/rejected": -1.3035128116607666, + "loss": 1.6462, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2496436834335327, + "rewards/margins": 0.05386912822723389, + "rewards/rejected": -1.3035128116607666, "step": 2820 }, { "epoch": 1.5119585214918883, - "grad_norm": 8.422960613579917, + "grad_norm": 7.044574604780478, "learning_rate": 5.798626995058602e-07, - "logits/chosen": -0.07312478125095367, - "logits/rejected": 0.10833685100078583, - "logps/chosen": -1.3415075540542603, - "logps/rejected": -1.6823097467422485, - "loss": 2.0298, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3415075540542603, - "rewards/margins": 0.3408019542694092, - "rewards/rejected": -1.6823097467422485, - "semantic_entropy": 0.7743151783943176, + "logits/chosen": -0.19462482631206512, + "logits/rejected": -0.05798368528485298, + "logps/chosen": -1.309741497039795, + "logps/rejected": -1.5621037483215332, + "loss": 1.6324, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.309741497039795, + "rewards/margins": 0.25236231088638306, + "rewards/rejected": -1.5621037483215332, "step": 2825 }, { "epoch": 1.51463455427329, - "grad_norm": 7.278580407920762, + "grad_norm": 7.108608702314739, "learning_rate": 5.783249066517354e-07, - "logits/chosen": 0.008635302074253559, - "logits/rejected": 0.1758573055267334, - "logps/chosen": -1.3390541076660156, - "logps/rejected": -1.4160137176513672, - "loss": 2.1078, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3390541076660156, - "rewards/margins": 0.07695959508419037, - "rewards/rejected": -1.4160137176513672, - "semantic_entropy": 0.7865692377090454, + "logits/chosen": -0.16074320673942566, + "logits/rejected": -0.033432383090257645, + "logps/chosen": -1.3094022274017334, + "logps/rejected": -1.3287417888641357, + "loss": 1.7062, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.3094022274017334, + "rewards/margins": 0.019339632242918015, + "rewards/rejected": -1.3287417888641357, "step": 2830 }, { "epoch": 1.5173105870546915, - "grad_norm": 10.345320269143327, + "grad_norm": 9.514791959704727, "learning_rate": 5.767863538890228e-07, - "logits/chosen": -0.028357133269309998, - "logits/rejected": 0.14543204009532928, - "logps/chosen": -1.264595627784729, - "logps/rejected": -1.498169183731079, - "loss": 2.0073, + "logits/chosen": -0.1489587277173996, + "logits/rejected": -0.008739927783608437, + "logps/chosen": -1.2328494787216187, + "logps/rejected": -1.3843698501586914, + "loss": 1.6031, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.264595627784729, - "rewards/margins": 0.23357343673706055, - "rewards/rejected": -1.498169183731079, - "semantic_entropy": 0.8044542074203491, + "rewards/chosen": -1.2328494787216187, + "rewards/margins": 0.1515202671289444, + "rewards/rejected": -1.3843698501586914, "step": 2835 }, { "epoch": 1.519986619836093, - "grad_norm": 9.129057194395378, + "grad_norm": 8.805937871805284, "learning_rate": 5.75247056144768e-07, - "logits/chosen": 0.0071360827423632145, - "logits/rejected": 0.11982695013284683, - "logps/chosen": -1.2875782251358032, - "logps/rejected": -1.480312705039978, - "loss": 2.0147, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2875782251358032, - "rewards/margins": 0.19273433089256287, - "rewards/rejected": -1.480312705039978, - "semantic_entropy": 0.7945183515548706, + "logits/chosen": -0.1267159879207611, + "logits/rejected": -0.03693726286292076, + "logps/chosen": -1.262799620628357, + "logps/rejected": -1.4053882360458374, + "loss": 1.6117, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.262799620628357, + "rewards/margins": 0.14258867502212524, + "rewards/rejected": -1.4053882360458374, "step": 2840 }, { "epoch": 1.5226626526174947, - "grad_norm": 9.43707371227767, + "grad_norm": 7.9077575887848655, "learning_rate": 5.737070283532444e-07, - "logits/chosen": -0.01443692110478878, - "logits/rejected": 0.07831753045320511, - "logps/chosen": -1.2341654300689697, - "logps/rejected": -1.5503387451171875, - "loss": 1.9832, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2341654300689697, - "rewards/margins": 0.31617334485054016, - "rewards/rejected": -1.5503387451171875, - "semantic_entropy": 0.8021982908248901, + "logits/chosen": -0.1251341700553894, + "logits/rejected": -0.06009829789400101, + "logps/chosen": -1.1997259855270386, + "logps/rejected": -1.4353101253509521, + "loss": 1.5689, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.1997259855270386, + "rewards/margins": 0.2355840504169464, + "rewards/rejected": -1.4353101253509521, "step": 2845 }, { "epoch": 1.5253386853988962, - "grad_norm": 13.105256203412493, + "grad_norm": 9.469293451184276, "learning_rate": 5.721662854558084e-07, - "logits/chosen": -0.058693576604127884, - "logits/rejected": 0.02043284848332405, - "logps/chosen": -1.3020961284637451, - "logps/rejected": -1.544008493423462, - "loss": 2.0443, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3020961284637451, - "rewards/margins": 0.24191224575042725, - "rewards/rejected": -1.544008493423462, - "semantic_entropy": 0.8002818822860718, + "logits/chosen": -0.17621280252933502, + "logits/rejected": -0.1186450719833374, + "logps/chosen": -1.2835023403167725, + "logps/rejected": -1.4212285280227661, + "loss": 1.6484, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.2835023403167725, + "rewards/margins": 0.13772614300251007, + "rewards/rejected": -1.4212285280227661, "step": 2850 }, { "epoch": 1.5280147181802977, - "grad_norm": 7.099331223706971, + "grad_norm": 6.5897812059697385, "learning_rate": 5.706248424007545e-07, - "logits/chosen": -0.04099093750119209, - "logits/rejected": 0.13332320749759674, - "logps/chosen": -1.361289620399475, - "logps/rejected": -1.6211292743682861, - "loss": 2.0504, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.361289620399475, - "rewards/margins": 0.2598397433757782, - "rewards/rejected": -1.6211292743682861, - "semantic_entropy": 0.7661951184272766, + "logits/chosen": -0.18296119570732117, + "logits/rejected": -0.05216040462255478, + "logps/chosen": -1.3307311534881592, + "logps/rejected": -1.527421236038208, + "loss": 1.656, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3307311534881592, + "rewards/margins": 0.19669008255004883, + "rewards/rejected": -1.527421236038208, "step": 2855 }, { "epoch": 1.5306907509616994, - "grad_norm": 10.367612289081793, + "grad_norm": 8.831262505099328, "learning_rate": 5.690827141431699e-07, - "logits/chosen": -0.1250082552433014, - "logits/rejected": 0.06297969073057175, - "logps/chosen": -1.253442406654358, - "logps/rejected": -1.446865439414978, - "loss": 2.0052, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.253442406654358, - "rewards/margins": 0.19342297315597534, - "rewards/rejected": -1.446865439414978, - "semantic_entropy": 0.8168342709541321, + "logits/chosen": -0.23127059638500214, + "logits/rejected": -0.08917646110057831, + "logps/chosen": -1.2180769443511963, + "logps/rejected": -1.3507416248321533, + "loss": 1.5876, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2180769443511963, + "rewards/margins": 0.132664754986763, + "rewards/rejected": -1.3507416248321533, "step": 2860 }, { "epoch": 1.5333667837431009, - "grad_norm": 8.184876215714546, + "grad_norm": 7.992509411678571, "learning_rate": 5.675399156447897e-07, - "logits/chosen": -0.18241655826568604, - "logits/rejected": -0.04299313947558403, - "logps/chosen": -1.258354902267456, - "logps/rejected": -1.5374661684036255, - "loss": 2.0191, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.258354902267456, - "rewards/margins": 0.279111385345459, - "rewards/rejected": -1.5374661684036255, - "semantic_entropy": 0.8082086443901062, + "logits/chosen": -0.3154450058937073, + "logits/rejected": -0.2124137133359909, + "logps/chosen": -1.2226747274398804, + "logps/rejected": -1.422944188117981, + "loss": 1.604, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2226747274398804, + "rewards/margins": 0.20026938617229462, + "rewards/rejected": -1.422944188117981, "step": 2865 }, { "epoch": 1.5360428165245024, - "grad_norm": 10.230428213029905, + "grad_norm": 9.366797403404275, "learning_rate": 5.659964618738515e-07, - "logits/chosen": -0.06168074160814285, - "logits/rejected": 0.06839561462402344, - "logps/chosen": -1.3128324747085571, - "logps/rejected": -1.4371354579925537, - "loss": 2.0895, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.3128324747085571, - "rewards/margins": 0.12430305778980255, - "rewards/rejected": -1.4371354579925537, - "semantic_entropy": 0.7939896583557129, + "logits/chosen": -0.20468780398368835, + "logits/rejected": -0.11390724033117294, + "logps/chosen": -1.2798616886138916, + "logps/rejected": -1.3370516300201416, + "loss": 1.6791, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.2798616886138916, + "rewards/margins": 0.05719008296728134, + "rewards/rejected": -1.3370516300201416, "step": 2870 }, { "epoch": 1.538718849305904, - "grad_norm": 8.389267495481054, + "grad_norm": 8.178835145850876, "learning_rate": 5.644523678049509e-07, - "logits/chosen": -0.04819143936038017, - "logits/rejected": 0.055794280022382736, - "logps/chosen": -1.2973073720932007, - "logps/rejected": -1.4822477102279663, - "loss": 2.0293, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2973073720932007, - "rewards/margins": 0.184940367937088, - "rewards/rejected": -1.4822477102279663, - "semantic_entropy": 0.7976966500282288, + "logits/chosen": -0.20676104724407196, + "logits/rejected": -0.14471128582954407, + "logps/chosen": -1.278641939163208, + "logps/rejected": -1.393040657043457, + "loss": 1.6344, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.278641939163208, + "rewards/margins": 0.11439867317676544, + "rewards/rejected": -1.393040657043457, "step": 2875 }, { "epoch": 1.5413948820873056, - "grad_norm": 10.857288352616921, + "grad_norm": 9.263124693885866, "learning_rate": 5.629076484188952e-07, - "logits/chosen": 0.04812438413500786, - "logits/rejected": 0.16079124808311462, - "logps/chosen": -1.2496439218521118, - "logps/rejected": -1.5407159328460693, - "loss": 2.0209, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2496439218521118, - "rewards/margins": 0.2910721004009247, - "rewards/rejected": -1.5407159328460693, - "semantic_entropy": 0.8148852586746216, + "logits/chosen": -0.0644320473074913, + "logits/rejected": 0.02961653098464012, + "logps/chosen": -1.2251948118209839, + "logps/rejected": -1.4403178691864014, + "loss": 1.6105, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2251948118209839, + "rewards/margins": 0.21512313187122345, + "rewards/rejected": -1.4403178691864014, "step": 2880 }, { "epoch": 1.544070914868707, - "grad_norm": 8.342141881463323, + "grad_norm": 8.159770572119198, "learning_rate": 5.613623187025587e-07, - "logits/chosen": -0.038243114948272705, - "logits/rejected": 0.08904510736465454, - "logps/chosen": -1.2702000141143799, - "logps/rejected": -1.5126235485076904, - "loss": 2.0171, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2702000141143799, - "rewards/margins": 0.242423415184021, - "rewards/rejected": -1.5126235485076904, - "semantic_entropy": 0.8042590022087097, + "logits/chosen": -0.14082811772823334, + "logits/rejected": -0.039442818611860275, + "logps/chosen": -1.241360068321228, + "logps/rejected": -1.4022153615951538, + "loss": 1.6075, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.241360068321228, + "rewards/margins": 0.1608552634716034, + "rewards/rejected": -1.4022153615951538, "step": 2885 }, { "epoch": 1.5467469476501088, - "grad_norm": 9.564009717517427, + "grad_norm": 7.228196201868465, "learning_rate": 5.598163936487369e-07, - "logits/chosen": -0.1139940619468689, - "logits/rejected": 0.07968755066394806, - "logps/chosen": -1.2677125930786133, - "logps/rejected": -1.4977737665176392, - "loss": 2.0176, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2677125930786133, - "rewards/margins": 0.23006126284599304, - "rewards/rejected": -1.4977737665176392, - "semantic_entropy": 0.8163504600524902, + "logits/chosen": -0.24351325631141663, + "logits/rejected": -0.08822022378444672, + "logps/chosen": -1.2423522472381592, + "logps/rejected": -1.4015181064605713, + "loss": 1.6044, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2423522472381592, + "rewards/margins": 0.15916575491428375, + "rewards/rejected": -1.4015181064605713, "step": 2890 }, { "epoch": 1.5494229804315103, - "grad_norm": 8.897773500157538, + "grad_norm": 7.735085289730952, "learning_rate": 5.582698882560017e-07, - "logits/chosen": -0.08605276048183441, - "logits/rejected": 0.06514769792556763, - "logps/chosen": -1.2082021236419678, - "logps/rejected": -1.4648030996322632, - "loss": 1.9763, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2082021236419678, - "rewards/margins": 0.2566010057926178, - "rewards/rejected": -1.4648030996322632, - "semantic_entropy": 0.8213735818862915, + "logits/chosen": -0.21803371608257294, + "logits/rejected": -0.11068868637084961, + "logps/chosen": -1.1797256469726562, + "logps/rejected": -1.351781964302063, + "loss": 1.5622, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1797256469726562, + "rewards/margins": 0.17205628752708435, + "rewards/rejected": -1.351781964302063, "step": 2895 }, { "epoch": 1.5520990132129118, - "grad_norm": 9.072552859671193, + "grad_norm": 7.755538928282204, "learning_rate": 5.567228175285549e-07, - "logits/chosen": -0.020847894251346588, - "logits/rejected": 0.07970386743545532, - "logps/chosen": -1.2829740047454834, - "logps/rejected": -1.522769570350647, - "loss": 2.0128, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2829740047454834, - "rewards/margins": 0.2397955358028412, - "rewards/rejected": -1.522769570350647, - "semantic_entropy": 0.7907953262329102, + "logits/chosen": -0.11511901766061783, + "logits/rejected": -0.031004998832941055, + "logps/chosen": -1.263131022453308, + "logps/rejected": -1.4148228168487549, + "loss": 1.6173, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.263131022453308, + "rewards/margins": 0.15169157087802887, + "rewards/rejected": -1.4148228168487549, "step": 2900 }, { "epoch": 1.5547750459943135, - "grad_norm": 7.7152752664459605, + "grad_norm": 6.08725319260436, "learning_rate": 5.551751964760838e-07, - "logits/chosen": 0.05560041591525078, - "logits/rejected": 0.08189483731985092, - "logps/chosen": -1.2490696907043457, - "logps/rejected": -1.4978667497634888, - "loss": 1.9936, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2490696907043457, - "rewards/margins": 0.24879701435565948, - "rewards/rejected": -1.4978667497634888, - "semantic_entropy": 0.8143652081489563, + "logits/chosen": -0.09188179671764374, + "logits/rejected": -0.066791832447052, + "logps/chosen": -1.2208307981491089, + "logps/rejected": -1.412955641746521, + "loss": 1.5818, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2208307981491089, + "rewards/margins": 0.19212493300437927, + "rewards/rejected": -1.412955641746521, "step": 2905 }, { "epoch": 1.557451078775715, - "grad_norm": 8.452751247504208, + "grad_norm": 7.969419534273052, "learning_rate": 5.536270401136145e-07, - "logits/chosen": -0.009603270329535007, - "logits/rejected": 0.08335626870393753, - "logps/chosen": -1.22438645362854, - "logps/rejected": -1.4349185228347778, - "loss": 1.9937, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.22438645362854, - "rewards/margins": 0.2105320245027542, - "rewards/rejected": -1.4349185228347778, - "semantic_entropy": 0.8151274919509888, + "logits/chosen": -0.13259169459342957, + "logits/rejected": -0.07302357256412506, + "logps/chosen": -1.204056739807129, + "logps/rejected": -1.3601787090301514, + "loss": 1.5882, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.204056739807129, + "rewards/margins": 0.1561218798160553, + "rewards/rejected": -1.3601787090301514, "step": 2910 }, { "epoch": 1.5601271115571165, - "grad_norm": 13.291563098455539, + "grad_norm": 9.477438173704936, "learning_rate": 5.520783634613667e-07, - "logits/chosen": 0.028323788195848465, - "logits/rejected": 0.19542863965034485, - "logps/chosen": -1.3303496837615967, - "logps/rejected": -1.5417304039001465, - "loss": 2.0507, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3303496837615967, - "rewards/margins": 0.21138079464435577, - "rewards/rejected": -1.5417304039001465, - "semantic_entropy": 0.7754170298576355, + "logits/chosen": -0.10372304916381836, + "logits/rejected": 0.026906440034508705, + "logps/chosen": -1.2964290380477905, + "logps/rejected": -1.4188811779022217, + "loss": 1.6533, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2964290380477905, + "rewards/margins": 0.12245219945907593, + "rewards/rejected": -1.4188811779022217, "step": 2915 }, { "epoch": 1.5628031443385182, - "grad_norm": 8.197490163591677, + "grad_norm": 5.790667370836652, "learning_rate": 5.505291815446082e-07, - "logits/chosen": 0.02938341535627842, - "logits/rejected": 0.15413501858711243, - "logps/chosen": -1.3015943765640259, - "logps/rejected": -1.5449639558792114, - "loss": 2.0232, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3015943765640259, - "rewards/margins": 0.24336960911750793, - "rewards/rejected": -1.5449639558792114, - "semantic_entropy": 0.7976529002189636, + "logits/chosen": -0.09391071647405624, + "logits/rejected": 0.0030964971520006657, + "logps/chosen": -1.2686607837677002, + "logps/rejected": -1.4566603899002075, + "loss": 1.6123, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2686607837677002, + "rewards/margins": 0.18799959123134613, + "rewards/rejected": -1.4566603899002075, "step": 2920 }, { "epoch": 1.5654791771199197, - "grad_norm": 10.372056783410427, + "grad_norm": 8.774622196913327, "learning_rate": 5.489795093935089e-07, - "logits/chosen": 0.04805382713675499, - "logits/rejected": 0.11252939701080322, - "logps/chosen": -1.22806978225708, - "logps/rejected": -1.5528632402420044, - "loss": 1.9853, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.22806978225708, - "rewards/margins": 0.3247934877872467, - "rewards/rejected": -1.5528632402420044, - "semantic_entropy": 0.8175485730171204, + "logits/chosen": -0.09754832088947296, + "logits/rejected": -0.06331709772348404, + "logps/chosen": -1.1873222589492798, + "logps/rejected": -1.4646153450012207, + "loss": 1.5576, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1873222589492798, + "rewards/margins": 0.2772930860519409, + "rewards/rejected": -1.4646153450012207, "step": 2925 }, { "epoch": 1.5681552099013212, - "grad_norm": 10.419268390877221, + "grad_norm": 10.30320939433974, "learning_rate": 5.474293620429946e-07, - "logits/chosen": -0.13174846768379211, - "logits/rejected": 0.04244610667228699, - "logps/chosen": -1.2425198554992676, - "logps/rejected": -1.6323487758636475, - "loss": 1.9664, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2425198554992676, - "rewards/margins": 0.3898290991783142, - "rewards/rejected": -1.6323487758636475, - "semantic_entropy": 0.798608660697937, + "logits/chosen": -0.2510526478290558, + "logits/rejected": -0.12636008858680725, + "logps/chosen": -1.2209627628326416, + "logps/rejected": -1.4968620538711548, + "loss": 1.5691, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2209627628326416, + "rewards/margins": 0.27589935064315796, + "rewards/rejected": -1.4968620538711548, "step": 2930 }, { "epoch": 1.570831242682723, - "grad_norm": 7.620399450088161, + "grad_norm": 7.653860879146269, "learning_rate": 5.458787545326018e-07, - "logits/chosen": -0.0838308110833168, - "logits/rejected": 0.06031849980354309, - "logps/chosen": -1.288436770439148, - "logps/rejected": -1.513409972190857, - "loss": 2.0247, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.288436770439148, - "rewards/margins": 0.2249731719493866, - "rewards/rejected": -1.513409972190857, - "semantic_entropy": 0.7873957753181458, + "logits/chosen": -0.1960235983133316, + "logits/rejected": -0.07919275760650635, + "logps/chosen": -1.2599356174468994, + "logps/rejected": -1.4395039081573486, + "loss": 1.6216, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.2599356174468994, + "rewards/margins": 0.1795683354139328, + "rewards/rejected": -1.4395039081573486, "step": 2935 }, { "epoch": 1.5735072754641244, - "grad_norm": 6.330710597157233, + "grad_norm": 6.058531376640787, "learning_rate": 5.443277019063311e-07, - "logits/chosen": -0.06933264434337616, - "logits/rejected": 0.10066770017147064, - "logps/chosen": -1.277112603187561, - "logps/rejected": -1.6146923303604126, - "loss": 2.0084, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.277112603187561, - "rewards/margins": 0.3375798761844635, - "rewards/rejected": -1.6146923303604126, - "semantic_entropy": 0.790891706943512, + "logits/chosen": -0.19647802412509918, + "logits/rejected": -0.060090743005275726, + "logps/chosen": -1.2536137104034424, + "logps/rejected": -1.4841291904449463, + "loss": 1.6131, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2536137104034424, + "rewards/margins": 0.2305155098438263, + "rewards/rejected": -1.4841291904449463, "step": 2940 }, { "epoch": 1.5761833082455259, - "grad_norm": 10.827020128865453, + "grad_norm": 10.050325184853422, "learning_rate": 5.427762192125023e-07, - "logits/chosen": -0.08359242975711823, - "logits/rejected": 0.06046230345964432, - "logps/chosen": -1.2609506845474243, - "logps/rejected": -1.4343267679214478, - "loss": 2.0267, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2609506845474243, - "rewards/margins": 0.1733759641647339, - "rewards/rejected": -1.4343267679214478, - "semantic_entropy": 0.8085500597953796, + "logits/chosen": -0.18975773453712463, + "logits/rejected": -0.07312886416912079, + "logps/chosen": -1.2278273105621338, + "logps/rejected": -1.348670244216919, + "loss": 1.6151, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2278273105621338, + "rewards/margins": 0.12084273993968964, + "rewards/rejected": -1.348670244216919, "step": 2945 }, { "epoch": 1.5788593410269276, - "grad_norm": 11.942780852463356, + "grad_norm": 10.247961150436891, "learning_rate": 5.41224321503607e-07, - "logits/chosen": 0.03151168301701546, - "logits/rejected": 0.28194573521614075, - "logps/chosen": -1.2235701084136963, - "logps/rejected": -1.5240552425384521, - "loss": 1.9511, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2235701084136963, - "rewards/margins": 0.3004850745201111, - "rewards/rejected": -1.5240552425384521, - "semantic_entropy": 0.8099506497383118, + "logits/chosen": -0.08409741520881653, + "logits/rejected": 0.10858605802059174, + "logps/chosen": -1.1982523202896118, + "logps/rejected": -1.4111332893371582, + "loss": 1.5483, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1982523202896118, + "rewards/margins": 0.21288099884986877, + "rewards/rejected": -1.4111332893371582, "step": 2950 }, { "epoch": 1.5815353738083293, - "grad_norm": 8.354754182965555, + "grad_norm": 7.392703100739131, "learning_rate": 5.396720238361637e-07, - "logits/chosen": 0.023025449365377426, - "logits/rejected": 0.12840992212295532, - "logps/chosen": -1.242348313331604, - "logps/rejected": -1.498448133468628, - "loss": 1.9863, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.242348313331604, - "rewards/margins": 0.2560999393463135, - "rewards/rejected": -1.498448133468628, - "semantic_entropy": 0.8004533052444458, + "logits/chosen": -0.10309597104787827, + "logits/rejected": -0.02585100196301937, + "logps/chosen": -1.2173881530761719, + "logps/rejected": -1.434526801109314, + "loss": 1.5753, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2173881530761719, + "rewards/margins": 0.21713873744010925, + "rewards/rejected": -1.434526801109314, "step": 2955 }, { "epoch": 1.5842114065897306, - "grad_norm": 9.470837718257423, + "grad_norm": 8.731910409734624, "learning_rate": 5.381193412705711e-07, - "logits/chosen": -0.07888027280569077, - "logits/rejected": 0.05634929984807968, - "logps/chosen": -1.2643178701400757, - "logps/rejected": -1.4858763217926025, - "loss": 1.9969, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2643178701400757, - "rewards/margins": 0.2215585708618164, - "rewards/rejected": -1.4858763217926025, - "semantic_entropy": 0.8052790760993958, + "logits/chosen": -0.20448771119117737, + "logits/rejected": -0.10863231122493744, + "logps/chosen": -1.2373604774475098, + "logps/rejected": -1.401271104812622, + "loss": 1.5899, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2373604774475098, + "rewards/margins": 0.1639106571674347, + "rewards/rejected": -1.401271104812622, "step": 2960 }, { "epoch": 1.5868874393711323, - "grad_norm": 11.719959752917669, + "grad_norm": 11.54676399891047, "learning_rate": 5.365662888709622e-07, - "logits/chosen": -0.02363435924053192, - "logits/rejected": 0.08364065736532211, - "logps/chosen": -1.2288060188293457, - "logps/rejected": -1.4702298641204834, - "loss": 1.9937, + "logits/chosen": -0.1474340260028839, + "logits/rejected": -0.07390202581882477, + "logps/chosen": -1.202203392982483, + "logps/rejected": -1.3635931015014648, + "loss": 1.5809, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2288060188293457, - "rewards/margins": 0.24142387509346008, - "rewards/rejected": -1.4702298641204834, - "semantic_entropy": 0.8168516159057617, + "rewards/chosen": -1.202203392982483, + "rewards/margins": 0.16138988733291626, + "rewards/rejected": -1.3635931015014648, "step": 2965 }, { "epoch": 1.589563472152534, - "grad_norm": 13.767804991891948, + "grad_norm": 11.368510077058957, "learning_rate": 5.350128817050585e-07, - "logits/chosen": -0.04800247400999069, - "logits/rejected": 0.14096921682357788, - "logps/chosen": -1.3195701837539673, - "logps/rejected": -1.5222375392913818, - "loss": 2.0751, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3195701837539673, - "rewards/margins": 0.20266716182231903, - "rewards/rejected": -1.5222375392913818, - "semantic_entropy": 0.7949842810630798, + "logits/chosen": -0.15914659202098846, + "logits/rejected": -0.021402398124337196, + "logps/chosen": -1.2980326414108276, + "logps/rejected": -1.4061661958694458, + "loss": 1.6791, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2980326414108276, + "rewards/margins": 0.10813357681035995, + "rewards/rejected": -1.4061661958694458, "step": 2970 }, { "epoch": 1.5922395049339353, - "grad_norm": 9.10072268997601, + "grad_norm": 8.928421716896088, "learning_rate": 5.334591348440229e-07, - "logits/chosen": 0.010277917608618736, - "logits/rejected": 0.16392071545124054, - "logps/chosen": -1.2994472980499268, - "logps/rejected": -1.6308130025863647, - "loss": 2.0031, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2994472980499268, - "rewards/margins": 0.3313658535480499, - "rewards/rejected": -1.6308130025863647, - "semantic_entropy": 0.7773244380950928, + "logits/chosen": -0.1347757875919342, + "logits/rejected": -0.026164641603827477, + "logps/chosen": -1.2644984722137451, + "logps/rejected": -1.5294973850250244, + "loss": 1.5981, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2644984722137451, + "rewards/margins": 0.2649989724159241, + "rewards/rejected": -1.5294973850250244, "step": 2975 }, { "epoch": 1.594915537715337, - "grad_norm": 11.464691228439937, + "grad_norm": 10.035546717884044, "learning_rate": 5.319050633623141e-07, - "logits/chosen": -0.10767938196659088, - "logits/rejected": 0.06060751527547836, - "logps/chosen": -1.339794397354126, - "logps/rejected": -1.5700126886367798, - "loss": 2.0697, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.339794397354126, - "rewards/margins": 0.2302185595035553, - "rewards/rejected": -1.5700126886367798, - "semantic_entropy": 0.7720788717269897, + "logits/chosen": -0.224522203207016, + "logits/rejected": -0.1060357317328453, + "logps/chosen": -1.299433708190918, + "logps/rejected": -1.4766066074371338, + "loss": 1.6595, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.299433708190918, + "rewards/margins": 0.17717306315898895, + "rewards/rejected": -1.4766066074371338, "step": 2980 }, { "epoch": 1.5975915704967387, - "grad_norm": 6.312446403147627, + "grad_norm": 5.906593506571086, "learning_rate": 5.303506823375409e-07, - "logits/chosen": -0.07872191816568375, - "logits/rejected": 0.11133910715579987, - "logps/chosen": -1.3662869930267334, - "logps/rejected": -1.5414142608642578, - "loss": 2.079, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3662869930267334, - "rewards/margins": 0.17512717843055725, - "rewards/rejected": -1.5414142608642578, - "semantic_entropy": 0.7808175086975098, + "logits/chosen": -0.18225395679473877, + "logits/rejected": -0.036761581897735596, + "logps/chosen": -1.3251721858978271, + "logps/rejected": -1.4256501197814941, + "loss": 1.6713, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3251721858978271, + "rewards/margins": 0.10047806799411774, + "rewards/rejected": -1.4256501197814941, "step": 2985 }, { "epoch": 1.60026760327814, - "grad_norm": 8.842807601753613, + "grad_norm": 8.425429840500003, "learning_rate": 5.287960068503143e-07, - "logits/chosen": -0.05272887274622917, - "logits/rejected": 0.13576534390449524, - "logps/chosen": -1.2263929843902588, - "logps/rejected": -1.5741432905197144, - "loss": 1.9769, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2263929843902588, - "rewards/margins": 0.34775030612945557, - "rewards/rejected": -1.5741432905197144, - "semantic_entropy": 0.8109342455863953, + "logits/chosen": -0.16185665130615234, + "logits/rejected": -0.023658882826566696, + "logps/chosen": -1.2019795179367065, + "logps/rejected": -1.4564263820648193, + "loss": 1.5652, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2019795179367065, + "rewards/margins": 0.25444674491882324, + "rewards/rejected": -1.4564263820648193, "step": 2990 }, { "epoch": 1.6029436360595417, - "grad_norm": 9.090297915482498, + "grad_norm": 8.605090024836523, "learning_rate": 5.272410519841032e-07, - "logits/chosen": -0.02721552550792694, - "logits/rejected": 0.0816565752029419, - "logps/chosen": -1.3226561546325684, - "logps/rejected": -1.6543006896972656, - "loss": 2.0125, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3226561546325684, - "rewards/margins": 0.33164435625076294, - "rewards/rejected": -1.6543006896972656, - "semantic_entropy": 0.7670749425888062, + "logits/chosen": -0.14876993000507355, + "logits/rejected": -0.0722462460398674, + "logps/chosen": -1.310435175895691, + "logps/rejected": -1.545121192932129, + "loss": 1.635, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.310435175895691, + "rewards/margins": 0.23468592762947083, + "rewards/rejected": -1.545121192932129, "step": 2995 }, { "epoch": 1.6056196688409434, - "grad_norm": 7.089363880096743, + "grad_norm": 7.129441612993683, "learning_rate": 5.256858328250861e-07, - "logits/chosen": -0.07572749257087708, - "logits/rejected": 0.0714503601193428, - "logps/chosen": -1.3386659622192383, - "logps/rejected": -1.5658314228057861, - "loss": 2.0894, + "logits/chosen": -0.173610657453537, + "logits/rejected": -0.06454495340585709, + "logps/chosen": -1.3077783584594727, + "logps/rejected": -1.4751222133636475, + "loss": 1.6835, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3386659622192383, - "rewards/margins": 0.22716538608074188, - "rewards/rejected": -1.5658314228057861, - "semantic_entropy": 0.7847882509231567, + "rewards/chosen": -1.3077783584594727, + "rewards/margins": 0.16734392940998077, + "rewards/rejected": -1.4751222133636475, "step": 3000 }, { "epoch": 1.608295701622345, - "grad_norm": 8.59220488525354, + "grad_norm": 7.845254283018713, "learning_rate": 5.241303644620063e-07, - "logits/chosen": -0.14671023190021515, - "logits/rejected": -0.00781182711943984, - "logps/chosen": -1.2366340160369873, - "logps/rejected": -1.4559341669082642, - "loss": 2.0106, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2366340160369873, - "rewards/margins": 0.21930010616779327, - "rewards/rejected": -1.4559341669082642, - "semantic_entropy": 0.8152686357498169, + "logits/chosen": -0.22596955299377441, + "logits/rejected": -0.10917635262012482, + "logps/chosen": -1.2063977718353271, + "logps/rejected": -1.3889039754867554, + "loss": 1.5891, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2063977718353271, + "rewards/margins": 0.18250641226768494, + "rewards/rejected": -1.3889039754867554, "step": 3005 }, { "epoch": 1.6109717344037464, - "grad_norm": 10.880437653964645, + "grad_norm": 8.734234001949082, "learning_rate": 5.225746619860248e-07, - "logits/chosen": -0.09534215927124023, - "logits/rejected": 0.04673149809241295, - "logps/chosen": -1.2736543416976929, - "logps/rejected": -1.4805123805999756, - "loss": 2.0545, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2736543416976929, - "rewards/margins": 0.20685787498950958, - "rewards/rejected": -1.4805123805999756, - "semantic_entropy": 0.8125940561294556, + "logits/chosen": -0.19577258825302124, + "logits/rejected": -0.08821289241313934, + "logps/chosen": -1.2466051578521729, + "logps/rejected": -1.4116572141647339, + "loss": 1.6361, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2466051578521729, + "rewards/margins": 0.16505205631256104, + "rewards/rejected": -1.4116572141647339, "step": 3010 }, { "epoch": 1.6136477671851481, - "grad_norm": 9.259488437555573, + "grad_norm": 8.585114257305527, "learning_rate": 5.210187404905735e-07, - "logits/chosen": 0.06914304196834564, - "logits/rejected": 0.14941935241222382, - "logps/chosen": -1.2673017978668213, - "logps/rejected": -1.542967438697815, - "loss": 1.9873, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2673017978668213, - "rewards/margins": 0.2756657004356384, - "rewards/rejected": -1.542967438697815, - "semantic_entropy": 0.7979117631912231, + "logits/chosen": -0.055124759674072266, + "logits/rejected": 0.012016276828944683, + "logps/chosen": -1.240079641342163, + "logps/rejected": -1.4648487567901611, + "loss": 1.5826, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.240079641342163, + "rewards/margins": 0.22476892173290253, + "rewards/rejected": -1.4648487567901611, "step": 3015 }, { "epoch": 1.6163237999665496, - "grad_norm": 8.235738095463073, + "grad_norm": 7.978827577989167, "learning_rate": 5.194626150712098e-07, - "logits/chosen": -0.13079389929771423, - "logits/rejected": 0.02636866271495819, - "logps/chosen": -1.2667205333709717, - "logps/rejected": -1.4473745822906494, - "loss": 2.0273, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2667205333709717, - "rewards/margins": 0.18065424263477325, - "rewards/rejected": -1.4473745822906494, - "semantic_entropy": 0.8098528981208801, + "logits/chosen": -0.20655544102191925, + "logits/rejected": -0.07737629115581512, + "logps/chosen": -1.2412638664245605, + "logps/rejected": -1.3628274202346802, + "loss": 1.6176, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2412638664245605, + "rewards/margins": 0.12156347930431366, + "rewards/rejected": -1.3628274202346802, "step": 3020 }, { "epoch": 1.6189998327479511, - "grad_norm": 8.377388523055298, + "grad_norm": 8.170321883840396, "learning_rate": 5.179063008254695e-07, - "logits/chosen": -0.06490382552146912, - "logits/rejected": 0.08870285749435425, - "logps/chosen": -1.232604742050171, - "logps/rejected": -1.4257268905639648, - "loss": 2.011, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.232604742050171, - "rewards/margins": 0.19312207400798798, - "rewards/rejected": -1.4257268905639648, - "semantic_entropy": 0.8080936670303345, + "logits/chosen": -0.16517595946788788, + "logits/rejected": -0.052291691303253174, + "logps/chosen": -1.2107970714569092, + "logps/rejected": -1.359795331954956, + "loss": 1.5975, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2107970714569092, + "rewards/margins": 0.1489982157945633, + "rewards/rejected": -1.359795331954956, "step": 3025 }, { "epoch": 1.6216758655293528, - "grad_norm": 6.092592887944284, + "grad_norm": 5.630556685338138, "learning_rate": 5.163498128527199e-07, - "logits/chosen": -0.02501394972205162, - "logits/rejected": 0.12650498747825623, - "logps/chosen": -1.321905493736267, - "logps/rejected": -1.5000231266021729, - "loss": 2.0363, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.321905493736267, - "rewards/margins": 0.17811742424964905, - "rewards/rejected": -1.5000231266021729, - "semantic_entropy": 0.7947280406951904, + "logits/chosen": -0.1525728702545166, + "logits/rejected": -0.04186255484819412, + "logps/chosen": -1.2931994199752808, + "logps/rejected": -1.3988301753997803, + "loss": 1.6377, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2931994199752808, + "rewards/margins": 0.10563075542449951, + "rewards/rejected": -1.3988301753997803, "step": 3030 }, { "epoch": 1.6243518983107543, - "grad_norm": 8.418555595272252, + "grad_norm": 8.157753203282361, "learning_rate": 5.147931662540144e-07, - "logits/chosen": 0.05391792207956314, - "logits/rejected": 0.17616690695285797, - "logps/chosen": -1.2905899286270142, - "logps/rejected": -1.4090840816497803, - "loss": 2.0586, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2905899286270142, - "rewards/margins": 0.11849413812160492, - "rewards/rejected": -1.4090840816497803, - "semantic_entropy": 0.8115051984786987, + "logits/chosen": -0.0260242260992527, + "logits/rejected": 0.060523390769958496, + "logps/chosen": -1.2663099765777588, + "logps/rejected": -1.3480726480484009, + "loss": 1.6493, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2663099765777588, + "rewards/margins": 0.08176268637180328, + "rewards/rejected": -1.3480726480484009, "step": 3035 }, { "epoch": 1.6270279310921558, - "grad_norm": 12.220354986532154, + "grad_norm": 11.972062026400824, "learning_rate": 5.132363761319449e-07, - "logits/chosen": -0.05254555493593216, - "logits/rejected": 0.010517707094550133, - "logps/chosen": -1.2007555961608887, - "logps/rejected": -1.5174579620361328, - "loss": 1.9429, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2007555961608887, - "rewards/margins": 0.31670236587524414, - "rewards/rejected": -1.5174579620361328, - "semantic_entropy": 0.809045135974884, + "logits/chosen": -0.1755906492471695, + "logits/rejected": -0.13223165273666382, + "logps/chosen": -1.1845462322235107, + "logps/rejected": -1.4218249320983887, + "loss": 1.5402, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1845462322235107, + "rewards/margins": 0.23727861046791077, + "rewards/rejected": -1.4218249320983887, "step": 3040 }, { "epoch": 1.6297039638735575, - "grad_norm": 15.788139287566645, + "grad_norm": 13.119864768265927, "learning_rate": 5.116794575904962e-07, - "logits/chosen": -0.06810857355594635, - "logits/rejected": 0.0314619354903698, - "logps/chosen": -1.2489879131317139, - "logps/rejected": -1.3966195583343506, - "loss": 2.037, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2489879131317139, - "rewards/margins": 0.14763149619102478, - "rewards/rejected": -1.3966195583343506, - "semantic_entropy": 0.8127719163894653, + "logits/chosen": -0.14955314993858337, + "logits/rejected": -0.06605812162160873, + "logps/chosen": -1.2177717685699463, + "logps/rejected": -1.3286962509155273, + "loss": 1.6158, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2177717685699463, + "rewards/margins": 0.11092434823513031, + "rewards/rejected": -1.3286962509155273, "step": 3045 }, { "epoch": 1.632379996654959, - "grad_norm": 9.279317596842828, + "grad_norm": 9.106121612831492, "learning_rate": 5.101224257348987e-07, - "logits/chosen": -0.10826786607503891, - "logits/rejected": 0.04793906211853027, - "logps/chosen": -1.3108704090118408, - "logps/rejected": -1.6074644327163696, - "loss": 2.0063, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3108704090118408, - "rewards/margins": 0.2965940833091736, - "rewards/rejected": -1.6074644327163696, - "semantic_entropy": 0.7802410125732422, + "logits/chosen": -0.1662093698978424, + "logits/rejected": -0.02916570007801056, + "logps/chosen": -1.2815699577331543, + "logps/rejected": -1.5031042098999023, + "loss": 1.6074, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2815699577331543, + "rewards/margins": 0.22153429687023163, + "rewards/rejected": -1.5031042098999023, "step": 3050 }, { "epoch": 1.6350560294363605, - "grad_norm": 7.285636339025404, + "grad_norm": 6.146930557344138, "learning_rate": 5.085652956714823e-07, - "logits/chosen": -0.11437921226024628, - "logits/rejected": 0.03067811205983162, - "logps/chosen": -1.2466702461242676, - "logps/rejected": -1.534493327140808, - "loss": 1.9999, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2466702461242676, - "rewards/margins": 0.28782323002815247, - "rewards/rejected": -1.534493327140808, - "semantic_entropy": 0.8040468096733093, + "logits/chosen": -0.17067714035511017, + "logits/rejected": -0.057190440595149994, + "logps/chosen": -1.2174100875854492, + "logps/rejected": -1.4384522438049316, + "loss": 1.5877, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2174100875854492, + "rewards/margins": 0.22104212641716003, + "rewards/rejected": -1.4384522438049316, "step": 3055 }, { "epoch": 1.6377320622177622, - "grad_norm": 7.061985397071907, + "grad_norm": 6.794366427686016, "learning_rate": 5.070080825075298e-07, - "logits/chosen": -0.11119749397039413, - "logits/rejected": 0.06831619143486023, - "logps/chosen": -1.2897229194641113, - "logps/rejected": -1.5156347751617432, - "loss": 2.0244, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2897229194641113, - "rewards/margins": 0.22591181099414825, - "rewards/rejected": -1.5156347751617432, - "semantic_entropy": 0.7982439398765564, + "logits/chosen": -0.19617754220962524, + "logits/rejected": -0.05101104825735092, + "logps/chosen": -1.2647570371627808, + "logps/rejected": -1.4480626583099365, + "loss": 1.6192, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2647570371627808, + "rewards/margins": 0.18330557644367218, + "rewards/rejected": -1.4480626583099365, "step": 3060 }, { "epoch": 1.6404080949991637, - "grad_norm": 8.674746275808152, + "grad_norm": 7.890439130584057, "learning_rate": 5.0545080135113e-07, - "logits/chosen": -0.006783929653465748, - "logits/rejected": 0.03824920952320099, - "logps/chosen": -1.2787463665008545, - "logps/rejected": -1.598751187324524, - "loss": 1.9926, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2787463665008545, - "rewards/margins": 0.3200048804283142, - "rewards/rejected": -1.598751187324524, - "semantic_entropy": 0.7831565737724304, + "logits/chosen": -0.0842519998550415, + "logits/rejected": -0.06073132902383804, + "logps/chosen": -1.2535126209259033, + "logps/rejected": -1.5172220468521118, + "loss": 1.5936, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2535126209259033, + "rewards/margins": 0.2637094557285309, + "rewards/rejected": -1.5172220468521118, "step": 3065 }, { "epoch": 1.6430841277805652, - "grad_norm": 7.231939797662285, + "grad_norm": 6.932068669261237, "learning_rate": 5.038934673110316e-07, - "logits/chosen": -0.12250302731990814, - "logits/rejected": -0.00720958411693573, - "logps/chosen": -1.2766475677490234, - "logps/rejected": -1.5290186405181885, - "loss": 2.0108, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2766475677490234, - "rewards/margins": 0.2523711621761322, - "rewards/rejected": -1.5290186405181885, - "semantic_entropy": 0.7957003116607666, + "logits/chosen": -0.22326147556304932, + "logits/rejected": -0.1301649957895279, + "logps/chosen": -1.2490341663360596, + "logps/rejected": -1.4320869445800781, + "loss": 1.6044, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2490341663360596, + "rewards/margins": 0.18305273354053497, + "rewards/rejected": -1.4320869445800781, "step": 3070 }, { "epoch": 1.645760160561967, - "grad_norm": 7.122051956828495, + "grad_norm": 6.534333873273087, "learning_rate": 5.023360954964963e-07, - "logits/chosen": -0.13707543909549713, - "logits/rejected": -0.06735747307538986, - "logps/chosen": -1.2285687923431396, - "logps/rejected": -1.4890555143356323, - "loss": 1.9769, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2285687923431396, - "rewards/margins": 0.2604869306087494, - "rewards/rejected": -1.4890555143356323, - "semantic_entropy": 0.8113986253738403, + "logits/chosen": -0.23500683903694153, + "logits/rejected": -0.18605561554431915, + "logps/chosen": -1.1966211795806885, + "logps/rejected": -1.4017795324325562, + "loss": 1.5608, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1966211795806885, + "rewards/margins": 0.2051583081483841, + "rewards/rejected": -1.4017795324325562, "step": 3075 }, { "epoch": 1.6484361933433684, - "grad_norm": 9.025536229209312, + "grad_norm": 8.222345073859744, "learning_rate": 5.007787010171524e-07, - "logits/chosen": -0.1931755542755127, - "logits/rejected": -0.0045111337676644325, - "logps/chosen": -1.18930983543396, - "logps/rejected": -1.4476261138916016, - "loss": 1.9691, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.18930983543396, - "rewards/margins": 0.25831639766693115, - "rewards/rejected": -1.4476261138916016, - "semantic_entropy": 0.8228625059127808, + "logits/chosen": -0.28006216883659363, + "logits/rejected": -0.12380047887563705, + "logps/chosen": -1.1686115264892578, + "logps/rejected": -1.3616727590560913, + "loss": 1.5561, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1686115264892578, + "rewards/margins": 0.19306130707263947, + "rewards/rejected": -1.3616727590560913, "step": 3080 }, { "epoch": 1.65111222612477, - "grad_norm": 9.40683918898441, + "grad_norm": 7.848292425252936, "learning_rate": 4.992212989828477e-07, - "logits/chosen": 0.007282339967787266, - "logits/rejected": 0.015585768036544323, - "logps/chosen": -1.2093725204467773, - "logps/rejected": -1.4898974895477295, - "loss": 1.9669, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2093725204467773, - "rewards/margins": 0.2805247902870178, - "rewards/rejected": -1.4898974895477295, - "semantic_entropy": 0.8125426173210144, + "logits/chosen": -0.08855435997247696, + "logits/rejected": -0.08716576546430588, + "logps/chosen": -1.1851686239242554, + "logps/rejected": -1.3858120441436768, + "loss": 1.5523, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1851686239242554, + "rewards/margins": 0.20064334571361542, + "rewards/rejected": -1.3858120441436768, "step": 3085 }, { "epoch": 1.6537882589061716, - "grad_norm": 8.620227561484512, + "grad_norm": 7.992472739060857, "learning_rate": 4.976639045035036e-07, - "logits/chosen": 0.028462231159210205, - "logits/rejected": 0.10271923243999481, - "logps/chosen": -1.2567675113677979, - "logps/rejected": -1.4431809186935425, - "loss": 2.0372, + "logits/chosen": -0.07760701328516006, + "logits/rejected": -0.022303396835923195, + "logps/chosen": -1.2271113395690918, + "logps/rejected": -1.363673210144043, + "loss": 1.6226, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2567675113677979, - "rewards/margins": 0.18641360104084015, - "rewards/rejected": -1.4431809186935425, - "semantic_entropy": 0.8090243339538574, + "rewards/chosen": -1.2271113395690918, + "rewards/margins": 0.13656170666217804, + "rewards/rejected": -1.363673210144043, "step": 3090 }, { "epoch": 1.6564642916875731, - "grad_norm": 9.715546359495004, + "grad_norm": 9.041074439845941, "learning_rate": 4.961065326889683e-07, - "logits/chosen": -0.03923254460096359, - "logits/rejected": 0.10990728437900543, - "logps/chosen": -1.282456398010254, - "logps/rejected": -1.5208022594451904, - "loss": 2.0179, + "logits/chosen": -0.12331873178482056, + "logits/rejected": -0.003659465117380023, + "logps/chosen": -1.2456037998199463, + "logps/rejected": -1.4136433601379395, + "loss": 1.6064, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.282456398010254, - "rewards/margins": 0.23834581673145294, - "rewards/rejected": -1.5208022594451904, - "semantic_entropy": 0.8012547492980957, + "rewards/chosen": -1.2456037998199463, + "rewards/margins": 0.16803942620754242, + "rewards/rejected": -1.4136433601379395, "step": 3095 }, { "epoch": 1.6591403244689746, - "grad_norm": 10.009918519263168, + "grad_norm": 9.30614609280546, "learning_rate": 4.9454919864887e-07, - "logits/chosen": -0.16790278255939484, - "logits/rejected": -0.028138387948274612, - "logps/chosen": -1.3211452960968018, - "logps/rejected": -1.5278136730194092, - "loss": 2.0624, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3211452960968018, - "rewards/margins": 0.20666833221912384, - "rewards/rejected": -1.5278136730194092, - "semantic_entropy": 0.7871449589729309, + "logits/chosen": -0.28149527311325073, + "logits/rejected": -0.16837576031684875, + "logps/chosen": -1.289886474609375, + "logps/rejected": -1.4208576679229736, + "loss": 1.6559, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.289886474609375, + "rewards/margins": 0.13097111880779266, + "rewards/rejected": -1.4208576679229736, "step": 3100 }, { "epoch": 1.6618163572503764, - "grad_norm": 10.855910976334592, + "grad_norm": 9.840047778991377, "learning_rate": 4.929919174924701e-07, - "logits/chosen": -0.10906453430652618, - "logits/rejected": 0.08404166996479034, - "logps/chosen": -1.3063828945159912, - "logps/rejected": -1.53128182888031, - "loss": 2.0223, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3063828945159912, - "rewards/margins": 0.2248990535736084, - "rewards/rejected": -1.53128182888031, - "semantic_entropy": 0.7838009595870972, + "logits/chosen": -0.20624008774757385, + "logits/rejected": -0.058651864528656006, + "logps/chosen": -1.2796785831451416, + "logps/rejected": -1.4359012842178345, + "loss": 1.6297, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2796785831451416, + "rewards/margins": 0.15622270107269287, + "rewards/rejected": -1.4359012842178345, "step": 3105 }, { "epoch": 1.6644923900317778, - "grad_norm": 7.404272070267876, + "grad_norm": 7.1401470055451535, "learning_rate": 4.914347043285177e-07, - "logits/chosen": -0.04257003217935562, - "logits/rejected": 0.07210078090429306, - "logps/chosen": -1.2872350215911865, - "logps/rejected": -1.5439434051513672, - "loss": 2.0154, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2872350215911865, - "rewards/margins": 0.2567083239555359, - "rewards/rejected": -1.5439434051513672, - "semantic_entropy": 0.7958934903144836, + "logits/chosen": -0.17281001806259155, + "logits/rejected": -0.08889992535114288, + "logps/chosen": -1.254786729812622, + "logps/rejected": -1.459333896636963, + "loss": 1.605, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.254786729812622, + "rewards/margins": 0.2045472413301468, + "rewards/rejected": -1.459333896636963, "step": 3110 }, { "epoch": 1.6671684228131793, - "grad_norm": 7.985100484489514, + "grad_norm": 5.658621347401571, "learning_rate": 4.898775742651013e-07, - "logits/chosen": 0.02354573830962181, - "logits/rejected": 0.1122712641954422, - "logps/chosen": -1.291019082069397, - "logps/rejected": -1.5914843082427979, - "loss": 1.9917, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.291019082069397, - "rewards/margins": 0.30046507716178894, - "rewards/rejected": -1.5914843082427979, - "semantic_entropy": 0.777329683303833, + "logits/chosen": -0.09652390331029892, + "logits/rejected": -0.0367254912853241, + "logps/chosen": -1.2596657276153564, + "logps/rejected": -1.4990681409835815, + "loss": 1.5933, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2596657276153564, + "rewards/margins": 0.23940233886241913, + "rewards/rejected": -1.4990681409835815, "step": 3115 }, { "epoch": 1.669844455594581, - "grad_norm": 6.377919696230422, + "grad_norm": 5.6994139371594486, "learning_rate": 4.883205424095037e-07, - "logits/chosen": -0.10408266633749008, - "logits/rejected": 0.05661571025848389, - "logps/chosen": -1.3499923944473267, - "logps/rejected": -1.600716233253479, - "loss": 2.0779, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3499923944473267, - "rewards/margins": 0.25072377920150757, - "rewards/rejected": -1.600716233253479, - "semantic_entropy": 0.7723182439804077, + "logits/chosen": -0.19167150557041168, + "logits/rejected": -0.07216285914182663, + "logps/chosen": -1.3158442974090576, + "logps/rejected": -1.4828596115112305, + "loss": 1.6714, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3158442974090576, + "rewards/margins": 0.16701537370681763, + "rewards/rejected": -1.4828596115112305, "step": 3120 }, { "epoch": 1.6725204883759828, - "grad_norm": 7.489392193604876, + "grad_norm": 7.29893228737393, "learning_rate": 4.86763623868055e-07, - "logits/chosen": 0.002272368874400854, - "logits/rejected": 0.12083474546670914, - "logps/chosen": -1.3575466871261597, - "logps/rejected": -1.654810905456543, - "loss": 2.0625, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3575466871261597, - "rewards/margins": 0.297264039516449, - "rewards/rejected": -1.654810905456543, - "semantic_entropy": 0.7716215252876282, + "logits/chosen": -0.11124508082866669, + "logits/rejected": -0.028446123003959656, + "logps/chosen": -1.3150824308395386, + "logps/rejected": -1.5301481485366821, + "loss": 1.6543, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3150824308395386, + "rewards/margins": 0.2150656282901764, + "rewards/rejected": -1.5301481485366821, "step": 3125 }, { "epoch": 1.675196521157384, - "grad_norm": 7.096207569657256, + "grad_norm": 7.02963922142275, "learning_rate": 4.852068337459856e-07, - "logits/chosen": 0.01183260791003704, - "logits/rejected": 0.16132783889770508, - "logps/chosen": -1.3407800197601318, - "logps/rejected": -1.5702488422393799, - "loss": 2.0401, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3407800197601318, - "rewards/margins": 0.22946885228157043, - "rewards/rejected": -1.5702488422393799, - "semantic_entropy": 0.7798231244087219, + "logits/chosen": -0.1324525773525238, + "logits/rejected": -0.03445501625537872, + "logps/chosen": -1.3168666362762451, + "logps/rejected": -1.4846765995025635, + "loss": 1.6482, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3168666362762451, + "rewards/margins": 0.16780975461006165, + "rewards/rejected": -1.4846765995025635, "step": 3130 }, { "epoch": 1.6778725539387858, - "grad_norm": 7.565280374118657, + "grad_norm": 7.430759587558647, "learning_rate": 4.8365018714728e-07, - "logits/chosen": 0.05541212111711502, - "logits/rejected": 0.11548095941543579, - "logps/chosen": -1.3500267267227173, - "logps/rejected": -1.5410051345825195, - "loss": 2.0785, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3500267267227173, - "rewards/margins": 0.19097848236560822, - "rewards/rejected": -1.5410051345825195, - "semantic_entropy": 0.776996910572052, + "logits/chosen": -0.0802285447716713, + "logits/rejected": -0.044229067862033844, + "logps/chosen": -1.3205969333648682, + "logps/rejected": -1.460759162902832, + "loss": 1.6794, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3205969333648682, + "rewards/margins": 0.14016228914260864, + "rewards/rejected": -1.460759162902832, "step": 3135 }, { "epoch": 1.6805485867201875, - "grad_norm": 6.038134474525186, + "grad_norm": 5.776051063063486, "learning_rate": 4.820936991745304e-07, - "logits/chosen": -0.20805136859416962, - "logits/rejected": -0.044590163975954056, - "logps/chosen": -1.2141269445419312, - "logps/rejected": -1.3900539875030518, - "loss": 1.9992, + "logits/chosen": -0.33011573553085327, + "logits/rejected": -0.193569153547287, + "logps/chosen": -1.1992324590682983, + "logps/rejected": -1.3035260438919067, + "loss": 1.5946, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2141269445419312, - "rewards/margins": 0.17592690885066986, - "rewards/rejected": -1.3900539875030518, - "semantic_entropy": 0.8216248750686646, + "rewards/chosen": -1.1992324590682983, + "rewards/margins": 0.10429352521896362, + "rewards/rejected": -1.3035260438919067, "step": 3140 }, { "epoch": 1.6832246195015887, - "grad_norm": 8.710741282482623, + "grad_norm": 8.391908438600513, "learning_rate": 4.8053738492879e-07, - "logits/chosen": 0.0013846077490597963, - "logits/rejected": 0.147234708070755, - "logps/chosen": -1.2627509832382202, - "logps/rejected": -1.4728825092315674, - "loss": 2.0147, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2627509832382202, - "rewards/margins": 0.2101314812898636, - "rewards/rejected": -1.4728825092315674, - "semantic_entropy": 0.8021093606948853, + "logits/chosen": -0.15375883877277374, + "logits/rejected": -0.05089033395051956, + "logps/chosen": -1.243142008781433, + "logps/rejected": -1.3542028665542603, + "loss": 1.6207, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.243142008781433, + "rewards/margins": 0.1110607385635376, + "rewards/rejected": -1.3542028665542603, "step": 3145 }, { "epoch": 1.6859006522829905, - "grad_norm": 8.204856396695087, + "grad_norm": 7.90373749299904, "learning_rate": 4.789812595094265e-07, - "logits/chosen": -0.14415964484214783, - "logits/rejected": -0.010510803200304508, - "logps/chosen": -1.3389784097671509, - "logps/rejected": -1.546367883682251, - "loss": 2.0382, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3389784097671509, - "rewards/margins": 0.20738950371742249, - "rewards/rejected": -1.546367883682251, - "semantic_entropy": 0.7847117781639099, + "logits/chosen": -0.28623443841934204, + "logits/rejected": -0.1853335201740265, + "logps/chosen": -1.3113467693328857, + "logps/rejected": -1.4720947742462158, + "loss": 1.6392, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3113467693328857, + "rewards/margins": 0.16074810922145844, + "rewards/rejected": -1.4720947742462158, "step": 3150 }, { "epoch": 1.6885766850643922, - "grad_norm": 14.18752375431332, + "grad_norm": 11.50635801645452, "learning_rate": 4.774253380139752e-07, - "logits/chosen": -0.1542137712240219, - "logits/rejected": -0.03455689921975136, - "logps/chosen": -1.2210431098937988, - "logps/rejected": -1.4889132976531982, - "loss": 1.977, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2210431098937988, - "rewards/margins": 0.2678702473640442, - "rewards/rejected": -1.4889132976531982, - "semantic_entropy": 0.8089672923088074, + "logits/chosen": -0.27749574184417725, + "logits/rejected": -0.1822170913219452, + "logps/chosen": -1.1950091123580933, + "logps/rejected": -1.4036287069320679, + "loss": 1.5649, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1950091123580933, + "rewards/margins": 0.2086195945739746, + "rewards/rejected": -1.4036287069320679, "step": 3155 }, { "epoch": 1.6912527178457935, - "grad_norm": 8.584676492997804, + "grad_norm": 8.313965644208702, "learning_rate": 4.758696355379936e-07, - "logits/chosen": -0.11684201657772064, - "logits/rejected": -0.09880466759204865, - "logps/chosen": -1.2682487964630127, - "logps/rejected": -1.5760184526443481, - "loss": 2.0007, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2682487964630127, - "rewards/margins": 0.3077697455883026, - "rewards/rejected": -1.5760184526443481, - "semantic_entropy": 0.797630250453949, + "logits/chosen": -0.23794877529144287, + "logits/rejected": -0.22094373404979706, + "logps/chosen": -1.2401232719421387, + "logps/rejected": -1.4726377725601196, + "loss": 1.5962, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2401232719421387, + "rewards/margins": 0.23251450061798096, + "rewards/rejected": -1.4726377725601196, "step": 3160 }, { "epoch": 1.6939287506271952, - "grad_norm": 6.252793796460734, + "grad_norm": 6.279641958228806, "learning_rate": 4.743141671749138e-07, - "logits/chosen": -0.18824736773967743, - "logits/rejected": -0.098078653216362, - "logps/chosen": -1.3107153177261353, - "logps/rejected": -1.4777319431304932, - "loss": 2.0477, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3107153177261353, - "rewards/margins": 0.16701671481132507, - "rewards/rejected": -1.4777319431304932, - "semantic_entropy": 0.7871649861335754, + "logits/chosen": -0.2521572411060333, + "logits/rejected": -0.18462175130844116, + "logps/chosen": -1.2821708917617798, + "logps/rejected": -1.4105151891708374, + "loss": 1.6424, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2821708917617798, + "rewards/margins": 0.12834443151950836, + "rewards/rejected": -1.4105151891708374, "step": 3165 }, { "epoch": 1.6966047834085969, - "grad_norm": 6.388763474026948, + "grad_norm": 6.145885295128102, "learning_rate": 4.727589480158968e-07, - "logits/chosen": -0.13319739699363708, - "logits/rejected": -0.029325807467103004, - "logps/chosen": -1.2875155210494995, - "logps/rejected": -1.5287894010543823, - "loss": 2.0377, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2875155210494995, - "rewards/margins": 0.24127385020256042, - "rewards/rejected": -1.5287894010543823, - "semantic_entropy": 0.7841957807540894, + "logits/chosen": -0.23114600777626038, + "logits/rejected": -0.15453112125396729, + "logps/chosen": -1.2547972202301025, + "logps/rejected": -1.4210622310638428, + "loss": 1.6316, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2547972202301025, + "rewards/margins": 0.16626504063606262, + "rewards/rejected": -1.4210622310638428, "step": 3170 }, { "epoch": 1.6992808161899984, - "grad_norm": 11.07062376146642, + "grad_norm": 9.283537736527547, "learning_rate": 4.712039931496855e-07, - "logits/chosen": -0.15786674618721008, - "logits/rejected": -0.05507850646972656, - "logps/chosen": -1.2572047710418701, - "logps/rejected": -1.4527876377105713, - "loss": 2.0407, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2572047710418701, - "rewards/margins": 0.19558288156986237, - "rewards/rejected": -1.4527876377105713, - "semantic_entropy": 0.804765522480011, + "logits/chosen": -0.2676324248313904, + "logits/rejected": -0.19834928214550018, + "logps/chosen": -1.2339831590652466, + "logps/rejected": -1.3864284753799438, + "loss": 1.6215, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2339831590652466, + "rewards/margins": 0.1524452269077301, + "rewards/rejected": -1.3864284753799438, "step": 3175 }, { "epoch": 1.7019568489713999, - "grad_norm": 5.6188414262791495, + "grad_norm": 5.54712738975142, "learning_rate": 4.6964931766245905e-07, - "logits/chosen": -0.01330545824021101, - "logits/rejected": 0.041144389659166336, - "logps/chosen": -1.3052301406860352, - "logps/rejected": -1.6139024496078491, - "loss": 2.0025, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3052301406860352, - "rewards/margins": 0.3086722791194916, - "rewards/rejected": -1.6139024496078491, - "semantic_entropy": 0.7841047048568726, + "logits/chosen": -0.12336407601833344, + "logits/rejected": -0.08209587633609772, + "logps/chosen": -1.2824407815933228, + "logps/rejected": -1.4989856481552124, + "loss": 1.613, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2824407815933228, + "rewards/margins": 0.21654491126537323, + "rewards/rejected": -1.4989856481552124, "step": 3180 }, { "epoch": 1.7046328817528016, - "grad_norm": 8.862736744649089, + "grad_norm": 8.07909200577735, "learning_rate": 4.6809493663768575e-07, - "logits/chosen": -0.06787719577550888, - "logits/rejected": -0.045422784984111786, - "logps/chosen": -1.2365736961364746, - "logps/rejected": -1.436856985092163, - "loss": 1.9958, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2365736961364746, - "rewards/margins": 0.2002832591533661, - "rewards/rejected": -1.436856985092163, - "semantic_entropy": 0.8167963027954102, + "logits/chosen": -0.17828579246997833, + "logits/rejected": -0.17043253779411316, + "logps/chosen": -1.200535535812378, + "logps/rejected": -1.3539785146713257, + "loss": 1.5748, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.200535535812378, + "rewards/margins": 0.15344305336475372, + "rewards/rejected": -1.3539785146713257, "step": 3185 }, { "epoch": 1.707308914534203, - "grad_norm": 9.181454482123595, + "grad_norm": 8.134623664630519, "learning_rate": 4.6654086515597716e-07, - "logits/chosen": -0.1357351839542389, - "logits/rejected": 0.02635277807712555, - "logps/chosen": -1.2460029125213623, - "logps/rejected": -1.5762498378753662, - "loss": 1.9446, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2460029125213623, - "rewards/margins": 0.3302469253540039, - "rewards/rejected": -1.5762498378753662, - "semantic_entropy": 0.8032658696174622, + "logits/chosen": -0.23891082406044006, + "logits/rejected": -0.12342718988656998, + "logps/chosen": -1.220595359802246, + "logps/rejected": -1.4715100526809692, + "loss": 1.5438, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.220595359802246, + "rewards/margins": 0.25091463327407837, + "rewards/rejected": -1.4715100526809692, "step": 3190 }, { "epoch": 1.7099849473156046, - "grad_norm": 6.31497143037071, + "grad_norm": 5.636674650063026, "learning_rate": 4.6498711829494154e-07, - "logits/chosen": -0.1393432319164276, - "logits/rejected": -0.032652225345373154, - "logps/chosen": -1.2492889165878296, - "logps/rejected": -1.5665004253387451, - "loss": 1.9693, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2492889165878296, - "rewards/margins": 0.31721144914627075, - "rewards/rejected": -1.5665004253387451, - "semantic_entropy": 0.8023662567138672, + "logits/chosen": -0.24777567386627197, + "logits/rejected": -0.16002780199050903, + "logps/chosen": -1.227882981300354, + "logps/rejected": -1.4638681411743164, + "loss": 1.5698, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.227882981300354, + "rewards/margins": 0.2359851896762848, + "rewards/rejected": -1.4638681411743164, "step": 3195 }, { "epoch": 1.7126609800970063, - "grad_norm": 8.943664344540371, + "grad_norm": 8.26481230362609, "learning_rate": 4.6343371112903777e-07, - "logits/chosen": -0.03442186489701271, - "logits/rejected": 0.11945761740207672, - "logps/chosen": -1.3083088397979736, - "logps/rejected": -1.6861575841903687, - "loss": 2.0057, + "logits/chosen": -0.14667882025241852, + "logits/rejected": -0.01680718921124935, + "logps/chosen": -1.275852918624878, + "logps/rejected": -1.575873613357544, + "loss": 1.6012, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3083088397979736, - "rewards/margins": 0.3778485655784607, - "rewards/rejected": -1.6861575841903687, - "semantic_entropy": 0.7777668833732605, + "rewards/chosen": -1.275852918624878, + "rewards/margins": 0.30002063512802124, + "rewards/rejected": -1.575873613357544, "step": 3200 }, { "epoch": 1.7126609800970063, - "eval_logits/chosen": 0.2875590920448303, - "eval_logits/rejected": 0.3803773522377014, - "eval_logps/chosen": -1.3269991874694824, - "eval_logps/rejected": -1.575361967086792, - "eval_loss": 2.0459630489349365, - "eval_rewards/accuracies": 0.5882789492607117, - "eval_rewards/chosen": -1.3269991874694824, - "eval_rewards/margins": 0.24836291372776031, - "eval_rewards/rejected": -1.575361967086792, - "eval_runtime": 34.5538, - "eval_samples_per_second": 38.925, - "eval_semantic_entropy": 0.7827669978141785, - "eval_steps_per_second": 9.753, + "eval_logits/chosen": 0.14010000228881836, + "eval_logits/rejected": 0.2138935625553131, + "eval_logps/chosen": -1.300903558731079, + "eval_logps/rejected": -1.4722107648849487, + "eval_loss": 1.6495161056518555, + "eval_rewards/accuracies": 0.5667656064033508, + "eval_rewards/chosen": -1.300903558731079, + "eval_rewards/margins": 0.1713072508573532, + "eval_rewards/rejected": -1.4722107648849487, + "eval_runtime": 40.5718, + "eval_samples_per_second": 33.151, + "eval_steps_per_second": 8.306, "step": 3200 }, { "epoch": 1.7153370128784078, - "grad_norm": 6.181562002484526, + "grad_norm": 5.5146507889561525, "learning_rate": 4.618806587294291e-07, - "logits/chosen": -0.19150307774543762, - "logits/rejected": -0.07083725929260254, - "logps/chosen": -1.3303239345550537, - "logps/rejected": -1.5726451873779297, - "loss": 2.0342, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3303239345550537, - "rewards/margins": 0.24232113361358643, - "rewards/rejected": -1.5726451873779297, - "semantic_entropy": 0.7856124639511108, + "logits/chosen": -0.2771919369697571, + "logits/rejected": -0.18472592532634735, + "logps/chosen": -1.2956626415252686, + "logps/rejected": -1.4821714162826538, + "loss": 1.6298, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2956626415252686, + "rewards/margins": 0.18650877475738525, + "rewards/rejected": -1.4821714162826538, "step": 3205 }, { "epoch": 1.7180130456598093, - "grad_norm": 12.92251198732787, + "grad_norm": 9.231050864247887, "learning_rate": 4.603279761638365e-07, - "logits/chosen": -0.15394911170005798, - "logits/rejected": -0.04621434956789017, - "logps/chosen": -1.293766975402832, - "logps/rejected": -1.5395978689193726, - "loss": 2.0374, + "logits/chosen": -0.2625105381011963, + "logits/rejected": -0.18162298202514648, + "logps/chosen": -1.2624567747116089, + "logps/rejected": -1.4393675327301025, + "loss": 1.6263, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.293766975402832, - "rewards/margins": 0.24583086371421814, - "rewards/rejected": -1.5395978689193726, - "semantic_entropy": 0.7910820245742798, + "rewards/chosen": -1.2624567747116089, + "rewards/margins": 0.17691072821617126, + "rewards/rejected": -1.4393675327301025, "step": 3210 }, { "epoch": 1.720689078441211, - "grad_norm": 10.95795391128249, + "grad_norm": 9.688812251430225, "learning_rate": 4.5877567849639315e-07, - "logits/chosen": -0.14600083231925964, - "logits/rejected": -0.021480122581124306, - "logps/chosen": -1.2595031261444092, - "logps/rejected": -1.4939130544662476, - "loss": 2.0118, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2595031261444092, - "rewards/margins": 0.23441000282764435, - "rewards/rejected": -1.4939130544662476, - "semantic_entropy": 0.8064897656440735, + "logits/chosen": -0.22895050048828125, + "logits/rejected": -0.12594011425971985, + "logps/chosen": -1.2300055027008057, + "logps/rejected": -1.3816862106323242, + "loss": 1.6023, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2300055027008057, + "rewards/margins": 0.15168072283267975, + "rewards/rejected": -1.3816862106323242, "step": 3215 }, { "epoch": 1.7233651112226125, - "grad_norm": 8.336084511981387, + "grad_norm": 6.722526720784602, "learning_rate": 4.572237807874979e-07, - "logits/chosen": -0.1490456759929657, - "logits/rejected": 0.06070408970117569, - "logps/chosen": -1.3603156805038452, - "logps/rejected": -1.6003338098526, - "loss": 2.0626, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3603156805038452, - "rewards/margins": 0.24001812934875488, - "rewards/rejected": -1.6003338098526, - "semantic_entropy": 0.7771097421646118, + "logits/chosen": -0.23030118644237518, + "logits/rejected": -0.07641883939504623, + "logps/chosen": -1.3148319721221924, + "logps/rejected": -1.4781345129013062, + "loss": 1.653, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3148319721221924, + "rewards/margins": 0.16330257058143616, + "rewards/rejected": -1.4781345129013062, "step": 3220 }, { "epoch": 1.726041144004014, - "grad_norm": 10.327496360033788, + "grad_norm": 9.475403044592714, "learning_rate": 4.5567229809366895e-07, - "logits/chosen": -0.1425078958272934, - "logits/rejected": -0.012681936845183372, - "logps/chosen": -1.2058963775634766, - "logps/rejected": -1.4731707572937012, - "loss": 1.9638, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2058963775634766, - "rewards/margins": 0.2672743499279022, - "rewards/rejected": -1.4731707572937012, - "semantic_entropy": 0.8253902196884155, + "logits/chosen": -0.23579053580760956, + "logits/rejected": -0.12721948325634003, + "logps/chosen": -1.1815276145935059, + "logps/rejected": -1.3895444869995117, + "loss": 1.5467, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1815276145935059, + "rewards/margins": 0.20801691710948944, + "rewards/rejected": -1.3895444869995117, "step": 3225 }, { "epoch": 1.7287171767854157, - "grad_norm": 7.275516801530044, + "grad_norm": 6.696982142265946, "learning_rate": 4.541212454673984e-07, - "logits/chosen": -0.14230267703533173, - "logits/rejected": 0.01516336016356945, - "logps/chosen": -1.2601486444473267, - "logps/rejected": -1.6092818975448608, - "loss": 1.9666, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2601486444473267, - "rewards/margins": 0.34913307428359985, - "rewards/rejected": -1.6092818975448608, - "semantic_entropy": 0.7881345152854919, + "logits/chosen": -0.2501373291015625, + "logits/rejected": -0.13391202688217163, + "logps/chosen": -1.2293736934661865, + "logps/rejected": -1.4656680822372437, + "loss": 1.5685, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2293736934661865, + "rewards/margins": 0.23629438877105713, + "rewards/rejected": -1.4656680822372437, "step": 3230 }, { "epoch": 1.7313932095668172, - "grad_norm": 10.07712270276737, + "grad_norm": 8.490345110087015, "learning_rate": 4.525706379570055e-07, - "logits/chosen": -0.09800378978252411, - "logits/rejected": -0.029662657529115677, - "logps/chosen": -1.2746299505233765, - "logps/rejected": -1.5383646488189697, - "loss": 2.0011, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2746299505233765, - "rewards/margins": 0.26373451948165894, - "rewards/rejected": -1.5383646488189697, - "semantic_entropy": 0.8014398813247681, + "logits/chosen": -0.204094797372818, + "logits/rejected": -0.16735957562923431, + "logps/chosen": -1.2532507181167603, + "logps/rejected": -1.437369704246521, + "loss": 1.602, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2532507181167603, + "rewards/margins": 0.18411897122859955, + "rewards/rejected": -1.437369704246521, "step": 3235 }, { "epoch": 1.7340692423482187, - "grad_norm": 6.246134723181422, + "grad_norm": 6.271553325835849, "learning_rate": 4.510204906064911e-07, - "logits/chosen": 0.0009855165844783187, - "logits/rejected": 0.11134722083806992, - "logps/chosen": -1.2328336238861084, - "logps/rejected": -1.5697522163391113, - "loss": 1.9354, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2328336238861084, - "rewards/margins": 0.33691850304603577, - "rewards/rejected": -1.5697522163391113, - "semantic_entropy": 0.8106444478034973, + "logits/chosen": -0.11776401847600937, + "logits/rejected": -0.04645920917391777, + "logps/chosen": -1.199644923210144, + "logps/rejected": -1.4558780193328857, + "loss": 1.5254, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.199644923210144, + "rewards/margins": 0.2562331557273865, + "rewards/rejected": -1.4558780193328857, "step": 3240 }, { "epoch": 1.7367452751296204, - "grad_norm": 9.727248624352367, + "grad_norm": 8.823179932392053, "learning_rate": 4.4947081845539177e-07, - "logits/chosen": -0.20650625228881836, - "logits/rejected": -0.06619389355182648, - "logps/chosen": -1.2484396696090698, - "logps/rejected": -1.5200622081756592, - "loss": 2.0096, + "logits/chosen": -0.3170090317726135, + "logits/rejected": -0.20921523869037628, + "logps/chosen": -1.2052419185638428, + "logps/rejected": -1.4000091552734375, + "loss": 1.585, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2484396696090698, - "rewards/margins": 0.2716224789619446, - "rewards/rejected": -1.5200622081756592, - "semantic_entropy": 0.8025262951850891, + "rewards/chosen": -1.2052419185638428, + "rewards/margins": 0.19476726651191711, + "rewards/rejected": -1.4000091552734375, "step": 3245 }, { "epoch": 1.739421307911022, - "grad_norm": 9.681377075377828, + "grad_norm": 7.087829064434588, "learning_rate": 4.479216365386333e-07, - "logits/chosen": 0.02583617903292179, - "logits/rejected": 0.17587433755397797, - "logps/chosen": -1.28328537940979, - "logps/rejected": -1.5489881038665771, - "loss": 2.0047, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.28328537940979, - "rewards/margins": 0.26570266485214233, - "rewards/rejected": -1.5489881038665771, - "semantic_entropy": 0.7944211959838867, + "logits/chosen": -0.09932031482458115, + "logits/rejected": -0.006480866577476263, + "logps/chosen": -1.2517788410186768, + "logps/rejected": -1.4760605096817017, + "loss": 1.5947, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2517788410186768, + "rewards/margins": 0.2242816686630249, + "rewards/rejected": -1.4760605096817017, "step": 3250 }, { "epoch": 1.7420973406924234, - "grad_norm": 7.1162691309907995, + "grad_norm": 7.305535342674533, "learning_rate": 4.4637295988638555e-07, - "logits/chosen": -0.0334712490439415, - "logits/rejected": 0.05123140290379524, - "logps/chosen": -1.3655494451522827, - "logps/rejected": -1.4881891012191772, - "loss": 2.0721, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3655494451522827, - "rewards/margins": 0.12263967096805573, - "rewards/rejected": -1.4881891012191772, - "semantic_entropy": 0.7760681509971619, + "logits/chosen": -0.12802617251873016, + "logits/rejected": -0.06714338064193726, + "logps/chosen": -1.3415563106536865, + "logps/rejected": -1.4062931537628174, + "loss": 1.6824, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3415563106536865, + "rewards/margins": 0.06473670899868011, + "rewards/rejected": -1.4062931537628174, "step": 3255 }, { "epoch": 1.744773373473825, - "grad_norm": 8.40556859712584, + "grad_norm": 8.118621526135135, "learning_rate": 4.4482480352391623e-07, - "logits/chosen": -0.1488790214061737, - "logits/rejected": -0.009104812517762184, - "logps/chosen": -1.3160731792449951, - "logps/rejected": -1.4561564922332764, - "loss": 2.0384, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3160731792449951, - "rewards/margins": 0.1400834321975708, - "rewards/rejected": -1.4561564922332764, - "semantic_entropy": 0.7991342544555664, + "logits/chosen": -0.27072957158088684, + "logits/rejected": -0.15636329352855682, + "logps/chosen": -1.2973716259002686, + "logps/rejected": -1.3826886415481567, + "loss": 1.6414, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2973716259002686, + "rewards/margins": 0.08531701564788818, + "rewards/rejected": -1.3826886415481567, "step": 3260 }, { "epoch": 1.7474494062552266, - "grad_norm": 11.769905710707198, + "grad_norm": 11.27222860909617, "learning_rate": 4.4327718247144507e-07, - "logits/chosen": -0.0454300232231617, - "logits/rejected": 0.06899180263280869, - "logps/chosen": -1.2196002006530762, - "logps/rejected": -1.5009334087371826, - "loss": 1.9635, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2196002006530762, - "rewards/margins": 0.28133314847946167, - "rewards/rejected": -1.5009334087371826, - "semantic_entropy": 0.8047575950622559, + "logits/chosen": -0.15749691426753998, + "logits/rejected": -0.06950239837169647, + "logps/chosen": -1.1970851421356201, + "logps/rejected": -1.4039987325668335, + "loss": 1.5624, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1970851421356201, + "rewards/margins": 0.20691350102424622, + "rewards/rejected": -1.4039987325668335, "step": 3265 }, { "epoch": 1.750125439036628, - "grad_norm": 9.860790687033859, + "grad_norm": 8.279971215164508, "learning_rate": 4.417301117439984e-07, - "logits/chosen": -0.04798585921525955, - "logits/rejected": 0.09852831065654755, - "logps/chosen": -1.1797542572021484, - "logps/rejected": -1.489864706993103, - "loss": 1.9504, + "logits/chosen": -0.1439046859741211, + "logits/rejected": -0.010311020538210869, + "logps/chosen": -1.1577627658843994, + "logps/rejected": -1.4001916646957397, + "loss": 1.5366, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1797542572021484, - "rewards/margins": 0.31011033058166504, - "rewards/rejected": -1.489864706993103, - "semantic_entropy": 0.8194267153739929, + "rewards/chosen": -1.1577627658843994, + "rewards/margins": 0.2424289882183075, + "rewards/rejected": -1.4001916646957397, "step": 3270 }, { "epoch": 1.7528014718180298, - "grad_norm": 11.278419301684329, + "grad_norm": 8.997782212793032, "learning_rate": 4.401836063512631e-07, - "logits/chosen": -0.09553112089633942, - "logits/rejected": 0.22362038493156433, - "logps/chosen": -1.2884795665740967, - "logps/rejected": -1.4595869779586792, - "loss": 2.0391, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2884795665740967, - "rewards/margins": 0.17110735177993774, - "rewards/rejected": -1.4595869779586792, - "semantic_entropy": 0.8058779835700989, + "logits/chosen": -0.2029833048582077, + "logits/rejected": 0.04279404133558273, + "logps/chosen": -1.2708823680877686, + "logps/rejected": -1.3666971921920776, + "loss": 1.6451, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2708823680877686, + "rewards/margins": 0.0958150252699852, + "rewards/rejected": -1.3666971921920776, "step": 3275 }, { "epoch": 1.7554775045994313, - "grad_norm": 9.495267927421752, + "grad_norm": 8.28610441032188, "learning_rate": 4.386376812974413e-07, - "logits/chosen": -0.09709614515304565, - "logits/rejected": -0.005060785915702581, - "logps/chosen": -1.235475778579712, - "logps/rejected": -1.4756667613983154, - "loss": 1.998, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.235475778579712, - "rewards/margins": 0.24019083380699158, - "rewards/rejected": -1.4756667613983154, - "semantic_entropy": 0.8163919448852539, + "logits/chosen": -0.17824730277061462, + "logits/rejected": -0.10273455083370209, + "logps/chosen": -1.2131166458129883, + "logps/rejected": -1.3982055187225342, + "loss": 1.5873, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2131166458129883, + "rewards/margins": 0.18508893251419067, + "rewards/rejected": -1.3982055187225342, "step": 3280 }, { "epoch": 1.7581535373808328, - "grad_norm": 7.859136954214772, + "grad_norm": 7.547073763786053, "learning_rate": 4.370923515811048e-07, - "logits/chosen": -0.1316256821155548, - "logits/rejected": 0.08077608048915863, - "logps/chosen": -1.2407208681106567, - "logps/rejected": -1.5216323137283325, - "loss": 2.0041, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2407208681106567, - "rewards/margins": 0.28091129660606384, - "rewards/rejected": -1.5216323137283325, - "semantic_entropy": 0.8013314008712769, + "logits/chosen": -0.20342381298542023, + "logits/rejected": -0.030245179310441017, + "logps/chosen": -1.2119803428649902, + "logps/rejected": -1.4356105327606201, + "loss": 1.6003, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2119803428649902, + "rewards/margins": 0.22363007068634033, + "rewards/rejected": -1.4356105327606201, "step": 3285 }, { "epoch": 1.7608295701622345, - "grad_norm": 8.930092324257826, + "grad_norm": 8.180883361802714, "learning_rate": 4.35547632195049e-07, - "logits/chosen": -0.04723476618528366, - "logits/rejected": 0.06438259780406952, - "logps/chosen": -1.2634159326553345, - "logps/rejected": -1.4355380535125732, - "loss": 2.0441, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.2634159326553345, - "rewards/margins": 0.17212224006652832, - "rewards/rejected": -1.4355380535125732, - "semantic_entropy": 0.8089887499809265, + "logits/chosen": -0.14093777537345886, + "logits/rejected": -0.05257188156247139, + "logps/chosen": -1.2363379001617432, + "logps/rejected": -1.3310226202011108, + "loss": 1.6366, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2363379001617432, + "rewards/margins": 0.09468485414981842, + "rewards/rejected": -1.3310226202011108, "step": 3290 }, { "epoch": 1.763505602943636, - "grad_norm": 10.272633292110191, + "grad_norm": 9.046488593280937, "learning_rate": 4.340035381261484e-07, - "logits/chosen": -0.08970440179109573, - "logits/rejected": -0.01908710040152073, - "logps/chosen": -1.3528639078140259, - "logps/rejected": -1.507359266281128, - "loss": 2.1138, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3528639078140259, - "rewards/margins": 0.15449512004852295, - "rewards/rejected": -1.507359266281128, - "semantic_entropy": 0.7817720174789429, + "logits/chosen": -0.17947307229042053, + "logits/rejected": -0.1344245821237564, + "logps/chosen": -1.3134772777557373, + "logps/rejected": -1.4176721572875977, + "loss": 1.7044, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3134772777557373, + "rewards/margins": 0.10419495403766632, + "rewards/rejected": -1.4176721572875977, "step": 3295 }, { "epoch": 1.7661816357250375, - "grad_norm": 8.125587964992828, + "grad_norm": 7.928396648025896, "learning_rate": 4.324600843552104e-07, - "logits/chosen": -0.18519294261932373, - "logits/rejected": -0.04560722038149834, - "logps/chosen": -1.34574294090271, - "logps/rejected": -1.5829145908355713, - "loss": 2.0564, + "logits/chosen": -0.27724844217300415, + "logits/rejected": -0.1727016419172287, + "logps/chosen": -1.313591718673706, + "logps/rejected": -1.463928461074829, + "loss": 1.6647, "rewards/accuracies": 0.5625, - "rewards/chosen": -1.34574294090271, - "rewards/margins": 0.2371715009212494, - "rewards/rejected": -1.5829145908355713, - "semantic_entropy": 0.7710360288619995, + "rewards/chosen": -1.313591718673706, + "rewards/margins": 0.1503368318080902, + "rewards/rejected": -1.463928461074829, "step": 3300 }, { "epoch": 1.7688576685064392, - "grad_norm": 9.374436536799694, + "grad_norm": 9.244841943663591, "learning_rate": 4.309172858568302e-07, - "logits/chosen": -0.165949285030365, - "logits/rejected": -0.02660273388028145, - "logps/chosen": -1.3073817491531372, - "logps/rejected": -1.4832111597061157, - "loss": 2.0696, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3073817491531372, - "rewards/margins": 0.1758294552564621, - "rewards/rejected": -1.4832111597061157, - "semantic_entropy": 0.7953299283981323, + "logits/chosen": -0.2549338936805725, + "logits/rejected": -0.15062884986400604, + "logps/chosen": -1.2812715768814087, + "logps/rejected": -1.3981187343597412, + "loss": 1.6565, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2812715768814087, + "rewards/margins": 0.11684717237949371, + "rewards/rejected": -1.3981187343597412, "step": 3305 }, { "epoch": 1.771533701287841, - "grad_norm": 8.087464752020232, + "grad_norm": 7.687183836493162, "learning_rate": 4.293751575992455e-07, - "logits/chosen": 0.013452662155032158, - "logits/rejected": 0.05870788171887398, - "logps/chosen": -1.2773231267929077, - "logps/rejected": -1.4914847612380981, - "loss": 2.0228, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2773231267929077, - "rewards/margins": 0.2141617238521576, - "rewards/rejected": -1.4914847612380981, - "semantic_entropy": 0.8066979646682739, + "logits/chosen": -0.1065024733543396, + "logits/rejected": -0.06856070458889008, + "logps/chosen": -1.2517259120941162, + "logps/rejected": -1.4108588695526123, + "loss": 1.6143, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2517259120941162, + "rewards/margins": 0.1591329425573349, + "rewards/rejected": -1.4108588695526123, "step": 3310 }, { "epoch": 1.7742097340692422, - "grad_norm": 10.725463234406757, + "grad_norm": 8.894348041746323, "learning_rate": 4.278337145441916e-07, - "logits/chosen": -0.1867462694644928, - "logits/rejected": -0.037748754024505615, - "logps/chosen": -1.255563497543335, - "logps/rejected": -1.5021106004714966, - "loss": 2.0126, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.255563497543335, - "rewards/margins": 0.2465471774339676, - "rewards/rejected": -1.5021106004714966, - "semantic_entropy": 0.8064797520637512, + "logits/chosen": -0.27567869424819946, + "logits/rejected": -0.16353575885295868, + "logps/chosen": -1.2159628868103027, + "logps/rejected": -1.397537112236023, + "loss": 1.5918, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2159628868103027, + "rewards/margins": 0.18157419562339783, + "rewards/rejected": -1.397537112236023, "step": 3315 }, { "epoch": 1.776885766850644, - "grad_norm": 7.58508784876293, + "grad_norm": 7.090683543079151, "learning_rate": 4.262929716467556e-07, - "logits/chosen": -0.11277034133672714, - "logits/rejected": 0.07743777334690094, - "logps/chosen": -1.2668403387069702, - "logps/rejected": -1.6261193752288818, - "loss": 1.9878, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2668403387069702, - "rewards/margins": 0.35927897691726685, - "rewards/rejected": -1.6261193752288818, - "semantic_entropy": 0.8014122247695923, + "logits/chosen": -0.17891091108322144, + "logits/rejected": -0.028943505138158798, + "logps/chosen": -1.2455543279647827, + "logps/rejected": -1.4666962623596191, + "loss": 1.5921, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2455543279647827, + "rewards/margins": 0.22114193439483643, + "rewards/rejected": -1.4666962623596191, "step": 3320 }, { "epoch": 1.7795617996320456, - "grad_norm": 17.453866294688677, + "grad_norm": 8.038774574999868, "learning_rate": 4.247529438552321e-07, - "logits/chosen": -0.19220510125160217, - "logits/rejected": -0.0137777179479599, - "logps/chosen": -1.2939108610153198, - "logps/rejected": -1.5628669261932373, - "loss": 2.0125, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2939108610153198, - "rewards/margins": 0.2689562141895294, - "rewards/rejected": -1.5628669261932373, - "semantic_entropy": 0.8005807995796204, + "logits/chosen": -0.2750813364982605, + "logits/rejected": -0.12457992881536484, + "logps/chosen": -1.2607461214065552, + "logps/rejected": -1.451475739479065, + "loss": 1.6106, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2607461214065552, + "rewards/margins": 0.19072948396205902, + "rewards/rejected": -1.451475739479065, "step": 3325 }, { "epoch": 1.782237832413447, - "grad_norm": 12.090886939609968, + "grad_norm": 9.664541781553764, "learning_rate": 4.232136461109773e-07, - "logits/chosen": -0.06670709699392319, - "logits/rejected": 0.040992237627506256, - "logps/chosen": -1.1921557188034058, - "logps/rejected": -1.5382206439971924, - "loss": 1.9347, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1921557188034058, - "rewards/margins": 0.346064954996109, - "rewards/rejected": -1.5382206439971924, - "semantic_entropy": 0.8233155012130737, + "logits/chosen": -0.1424180418252945, + "logits/rejected": -0.05181293562054634, + "logps/chosen": -1.1679651737213135, + "logps/rejected": -1.4122867584228516, + "loss": 1.5243, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1679651737213135, + "rewards/margins": 0.2443215399980545, + "rewards/rejected": -1.4122867584228516, "step": 3330 }, { "epoch": 1.7849138651948486, - "grad_norm": 12.17353452249854, + "grad_norm": 11.392142095404724, "learning_rate": 4.216750933482646e-07, - "logits/chosen": -0.11370841413736343, - "logits/rejected": 0.046696338802576065, - "logps/chosen": -1.3182452917099, - "logps/rejected": -1.5295355319976807, - "loss": 2.029, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3182452917099, - "rewards/margins": 0.2112903892993927, - "rewards/rejected": -1.5295355319976807, - "semantic_entropy": 0.7953765392303467, + "logits/chosen": -0.21398165822029114, + "logits/rejected": -0.0784340351819992, + "logps/chosen": -1.283003807067871, + "logps/rejected": -1.4440919160842896, + "loss": 1.6162, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.283003807067871, + "rewards/margins": 0.16108819842338562, + "rewards/rejected": -1.4440919160842896, "step": 3335 }, { "epoch": 1.7875898979762503, - "grad_norm": 7.707961560103677, + "grad_norm": 6.20814263774446, "learning_rate": 4.2013730049413986e-07, - "logits/chosen": -0.0673111230134964, - "logits/rejected": 0.0833623856306076, - "logps/chosen": -1.2442004680633545, - "logps/rejected": -1.5510399341583252, - "loss": 1.9881, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2442004680633545, - "rewards/margins": 0.3068394660949707, - "rewards/rejected": -1.5510399341583252, - "semantic_entropy": 0.8056608438491821, + "logits/chosen": -0.13394159078598022, + "logits/rejected": -0.01691107079386711, + "logps/chosen": -1.2252600193023682, + "logps/rejected": -1.4495265483856201, + "loss": 1.5877, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2252600193023682, + "rewards/margins": 0.22426645457744598, + "rewards/rejected": -1.4495265483856201, "step": 3340 }, { "epoch": 1.7902659307576518, - "grad_norm": 9.860843976130422, + "grad_norm": 8.96712169638507, "learning_rate": 4.1860028246827594e-07, - "logits/chosen": -0.10094896703958511, - "logits/rejected": 0.0693550854921341, - "logps/chosen": -1.1767338514328003, - "logps/rejected": -1.4444658756256104, - "loss": 1.9528, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1767338514328003, - "rewards/margins": 0.26773205399513245, - "rewards/rejected": -1.4444658756256104, - "semantic_entropy": 0.8237002491950989, + "logits/chosen": -0.15664049983024597, + "logits/rejected": -0.018710583448410034, + "logps/chosen": -1.156507134437561, + "logps/rejected": -1.364248275756836, + "loss": 1.5416, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.156507134437561, + "rewards/margins": 0.2077411711215973, + "rewards/rejected": -1.364248275756836, "step": 3345 }, { "epoch": 1.7929419635390533, - "grad_norm": 8.425056726555647, + "grad_norm": 8.42976352514539, "learning_rate": 4.170640541828285e-07, - "logits/chosen": -0.1958092898130417, - "logits/rejected": -0.04710087925195694, - "logps/chosen": -1.3505299091339111, - "logps/rejected": -1.549831748008728, - "loss": 2.068, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3505299091339111, - "rewards/margins": 0.19930198788642883, - "rewards/rejected": -1.549831748008728, - "semantic_entropy": 0.782744824886322, + "logits/chosen": -0.2600031793117523, + "logits/rejected": -0.13997478783130646, + "logps/chosen": -1.3221666812896729, + "logps/rejected": -1.4479409456253052, + "loss": 1.6767, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3221666812896729, + "rewards/margins": 0.12577416002750397, + "rewards/rejected": -1.4479409456253052, "step": 3350 }, { "epoch": 1.795617996320455, - "grad_norm": 11.576807594386622, + "grad_norm": 10.628946026491922, "learning_rate": 4.1552863054229116e-07, - "logits/chosen": 0.04093014448881149, - "logits/rejected": 0.08852342516183853, - "logps/chosen": -1.3552569150924683, - "logps/rejected": -1.504966139793396, - "loss": 2.1187, + "logits/chosen": -0.022637512534856796, + "logits/rejected": 0.009353891015052795, + "logps/chosen": -1.3117666244506836, + "logps/rejected": -1.397631287574768, + "loss": 1.7081, "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.3552569150924683, - "rewards/margins": 0.14970925450325012, - "rewards/rejected": -1.504966139793396, - "semantic_entropy": 0.7735754251480103, + "rewards/chosen": -1.3117666244506836, + "rewards/margins": 0.0858646109700203, + "rewards/rejected": -1.397631287574768, "step": 3355 }, { "epoch": 1.7982940291018565, - "grad_norm": 8.288746454534513, + "grad_norm": 8.161709625334764, "learning_rate": 4.139940264433508e-07, - "logits/chosen": -0.09469692409038544, - "logits/rejected": 0.12611272931098938, - "logps/chosen": -1.2306644916534424, - "logps/rejected": -1.4716196060180664, - "loss": 1.9888, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2306644916534424, - "rewards/margins": 0.24095502495765686, - "rewards/rejected": -1.4716196060180664, - "semantic_entropy": 0.8139133453369141, + "logits/chosen": -0.19507454335689545, + "logits/rejected": -0.01754070445895195, + "logps/chosen": -1.2055964469909668, + "logps/rejected": -1.3833786249160767, + "loss": 1.5788, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2055964469909668, + "rewards/margins": 0.17778228223323822, + "rewards/rejected": -1.3833786249160767, "step": 3360 }, { "epoch": 1.800970061883258, - "grad_norm": 7.1777727486770955, + "grad_norm": 5.789284712069662, "learning_rate": 4.1246025677474303e-07, - "logits/chosen": -0.1426037847995758, - "logits/rejected": 0.014146551489830017, - "logps/chosen": -1.2692500352859497, - "logps/rejected": -1.5440677404403687, - "loss": 1.9848, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2692500352859497, - "rewards/margins": 0.2748177647590637, - "rewards/rejected": -1.5440677404403687, - "semantic_entropy": 0.7960511445999146, + "logits/chosen": -0.2251238375902176, + "logits/rejected": -0.10471514612436295, + "logps/chosen": -1.242795705795288, + "logps/rejected": -1.439692735671997, + "loss": 1.5858, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.242795705795288, + "rewards/margins": 0.19689705967903137, + "rewards/rejected": -1.439692735671997, "step": 3365 }, { "epoch": 1.8036460946646597, - "grad_norm": 7.206340297679535, + "grad_norm": 6.4576897334290715, "learning_rate": 4.10927336417108e-07, - "logits/chosen": -0.11211264133453369, - "logits/rejected": 0.04343719407916069, - "logps/chosen": -1.2716386318206787, - "logps/rejected": -1.4695169925689697, - "loss": 2.0223, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2716386318206787, - "rewards/margins": 0.19787819683551788, - "rewards/rejected": -1.4695169925689697, - "semantic_entropy": 0.8089770078659058, + "logits/chosen": -0.18072129786014557, + "logits/rejected": -0.05849956348538399, + "logps/chosen": -1.2341316938400269, + "logps/rejected": -1.3826676607131958, + "loss": 1.6024, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2341316938400269, + "rewards/margins": 0.14853589236736298, + "rewards/rejected": -1.3826676607131958, "step": 3370 }, { "epoch": 1.8063221274460612, - "grad_norm": 8.240207415115314, + "grad_norm": 7.8585008339633164, "learning_rate": 4.093952802428457e-07, - "logits/chosen": 0.055115751922130585, - "logits/rejected": 0.10022290796041489, - "logps/chosen": -1.3147608041763306, - "logps/rejected": -1.4589980840682983, - "loss": 2.0785, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3147608041763306, - "rewards/margins": 0.1442374736070633, - "rewards/rejected": -1.4589980840682983, - "semantic_entropy": 0.8038409948348999, + "logits/chosen": -0.060932356864213943, + "logits/rejected": -0.03307708352804184, + "logps/chosen": -1.2687528133392334, + "logps/rejected": -1.3632913827896118, + "loss": 1.6541, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2687528133392334, + "rewards/margins": 0.0945383757352829, + "rewards/rejected": -1.3632913827896118, "step": 3375 }, { "epoch": 1.8089981602274627, - "grad_norm": 6.219293393123405, + "grad_norm": 5.779868997107154, "learning_rate": 4.0786410311597184e-07, - "logits/chosen": -0.14277827739715576, - "logits/rejected": 0.0073592751286923885, - "logps/chosen": -1.2585190534591675, - "logps/rejected": -1.5473378896713257, - "loss": 1.9828, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2585190534591675, - "rewards/margins": 0.28881901502609253, - "rewards/rejected": -1.5473378896713257, - "semantic_entropy": 0.8017240762710571, + "logits/chosen": -0.24139253795146942, + "logits/rejected": -0.1293010115623474, + "logps/chosen": -1.2347960472106934, + "logps/rejected": -1.4479663372039795, + "loss": 1.5791, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2347960472106934, + "rewards/margins": 0.2131703644990921, + "rewards/rejected": -1.4479663372039795, "step": 3380 }, { "epoch": 1.8116741930088645, - "grad_norm": 8.042866914467831, + "grad_norm": 7.1835051661861895, "learning_rate": 4.063338198919737e-07, - "logits/chosen": -0.12222157418727875, - "logits/rejected": -0.0931001529097557, - "logps/chosen": -1.3203542232513428, - "logps/rejected": -1.5036855936050415, - "loss": 2.0601, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3203542232513428, - "rewards/margins": 0.18333129584789276, - "rewards/rejected": -1.5036855936050415, - "semantic_entropy": 0.7957580089569092, + "logits/chosen": -0.20090754330158234, + "logits/rejected": -0.17768360674381256, + "logps/chosen": -1.295978307723999, + "logps/rejected": -1.444831371307373, + "loss": 1.6513, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.295978307723999, + "rewards/margins": 0.14885297417640686, + "rewards/rejected": -1.444831371307373, "step": 3385 }, { "epoch": 1.814350225790266, - "grad_norm": 11.085977240555279, + "grad_norm": 9.757139822711501, "learning_rate": 4.0480444541766575e-07, - "logits/chosen": -0.09060386568307877, - "logits/rejected": 0.03485582396388054, - "logps/chosen": -1.3461965322494507, - "logps/rejected": -1.4985482692718506, - "loss": 2.0926, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3461965322494507, - "rewards/margins": 0.1523514986038208, - "rewards/rejected": -1.4985482692718506, - "semantic_entropy": 0.788882851600647, + "logits/chosen": -0.15369240939617157, + "logits/rejected": -0.059753142297267914, + "logps/chosen": -1.3139399290084839, + "logps/rejected": -1.4136239290237427, + "loss": 1.6832, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3139399290084839, + "rewards/margins": 0.09968401491641998, + "rewards/rejected": -1.4136239290237427, "step": 3390 }, { "epoch": 1.8170262585716674, - "grad_norm": 10.154637330551065, + "grad_norm": 8.8837194965421, "learning_rate": 4.0327599453104606e-07, - "logits/chosen": -0.13672223687171936, - "logits/rejected": -0.03219519183039665, - "logps/chosen": -1.21634840965271, - "logps/rejected": -1.5037835836410522, - "loss": 1.9777, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.21634840965271, - "rewards/margins": 0.28743523359298706, - "rewards/rejected": -1.5037835836410522, - "semantic_entropy": 0.8166402578353882, + "logits/chosen": -0.19835281372070312, + "logits/rejected": -0.12649798393249512, + "logps/chosen": -1.191025972366333, + "logps/rejected": -1.4131062030792236, + "loss": 1.5628, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.191025972366333, + "rewards/margins": 0.22208015620708466, + "rewards/rejected": -1.4131062030792236, "step": 3395 }, { "epoch": 1.8197022913530692, - "grad_norm": 9.548085302685983, + "grad_norm": 8.13866966024635, "learning_rate": 4.017484820611514e-07, - "logits/chosen": -0.09091535955667496, - "logits/rejected": 0.03177911415696144, - "logps/chosen": -1.287781000137329, - "logps/rejected": -1.5104598999023438, - "loss": 2.016, + "logits/chosen": -0.19858194887638092, + "logits/rejected": -0.10243697464466095, + "logps/chosen": -1.258873462677002, + "logps/rejected": -1.40668785572052, + "loss": 1.6142, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.287781000137329, - "rewards/margins": 0.2226787507534027, - "rewards/rejected": -1.5104598999023438, - "semantic_entropy": 0.7931040525436401, + "rewards/chosen": -1.258873462677002, + "rewards/margins": 0.1478143036365509, + "rewards/rejected": -1.40668785572052, "step": 3400 }, { "epoch": 1.8223783241344707, - "grad_norm": 12.039861832870322, + "grad_norm": 11.355187720081398, "learning_rate": 4.002219228279148e-07, - "logits/chosen": -0.09477577358484268, - "logits/rejected": 0.061077456921339035, - "logps/chosen": -1.275453805923462, - "logps/rejected": -1.4921751022338867, - "loss": 2.0209, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.275453805923462, - "rewards/margins": 0.2167212963104248, - "rewards/rejected": -1.4921751022338867, - "semantic_entropy": 0.808013916015625, + "logits/chosen": -0.17992979288101196, + "logits/rejected": -0.05091270059347153, + "logps/chosen": -1.2505369186401367, + "logps/rejected": -1.427901029586792, + "loss": 1.6086, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2505369186401367, + "rewards/margins": 0.1773640364408493, + "rewards/rejected": -1.427901029586792, "step": 3405 }, { "epoch": 1.8250543569158721, - "grad_norm": 10.043505087001625, + "grad_norm": 8.244314426249161, "learning_rate": 3.9869633164202045e-07, - "logits/chosen": -0.10172738879919052, - "logits/rejected": 0.12015529721975327, - "logps/chosen": -1.4054529666900635, - "logps/rejected": -1.5626072883605957, - "loss": 2.1194, + "logits/chosen": -0.18372592329978943, + "logits/rejected": -0.01030859723687172, + "logps/chosen": -1.3831340074539185, + "logps/rejected": -1.489128589630127, + "loss": 1.7304, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.4054529666900635, - "rewards/margins": 0.15715420246124268, - "rewards/rejected": -1.5626072883605957, - "semantic_entropy": 0.7611129879951477, + "rewards/chosen": -1.3831340074539185, + "rewards/margins": 0.10599465668201447, + "rewards/rejected": -1.489128589630127, "step": 3410 }, { "epoch": 1.8277303896972739, - "grad_norm": 9.554825491563795, + "grad_norm": 7.525238246945747, "learning_rate": 3.9717172330476077e-07, - "logits/chosen": -0.1088135689496994, - "logits/rejected": -0.0048567550256848335, - "logps/chosen": -1.2670540809631348, - "logps/rejected": -1.5354701280593872, - "loss": 2.0084, + "logits/chosen": -0.18597820401191711, + "logits/rejected": -0.10546676069498062, + "logps/chosen": -1.2357432842254639, + "logps/rejected": -1.4461013078689575, + "loss": 1.5916, "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2670540809631348, - "rewards/margins": 0.2684159576892853, - "rewards/rejected": -1.5354701280593872, - "semantic_entropy": 0.7980339527130127, + "rewards/chosen": -1.2357432842254639, + "rewards/margins": 0.21035806834697723, + "rewards/rejected": -1.4461013078689575, "step": 3415 }, { "epoch": 1.8304064224786754, - "grad_norm": 10.833270444734117, + "grad_norm": 9.698197774122443, "learning_rate": 3.956481126078927e-07, - "logits/chosen": -0.07312284409999847, - "logits/rejected": 0.03952528536319733, - "logps/chosen": -1.3103262186050415, - "logps/rejected": -1.6255826950073242, - "loss": 2.0335, + "logits/chosen": -0.1279938519001007, + "logits/rejected": -0.02889108657836914, + "logps/chosen": -1.2791216373443604, + "logps/rejected": -1.517954707145691, + "loss": 1.6296, "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3103262186050415, - "rewards/margins": 0.3152565062046051, - "rewards/rejected": -1.6255826950073242, - "semantic_entropy": 0.779475212097168, + "rewards/chosen": -1.2791216373443604, + "rewards/margins": 0.2388329952955246, + "rewards/rejected": -1.517954707145691, "step": 3420 }, { "epoch": 1.8330824552600768, - "grad_norm": 6.387688251221902, + "grad_norm": 6.172841283742514, "learning_rate": 3.941255143334937e-07, - "logits/chosen": -0.143729105591774, - "logits/rejected": -0.09878051280975342, - "logps/chosen": -1.2660369873046875, - "logps/rejected": -1.5195882320404053, - "loss": 2.0111, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2660369873046875, - "rewards/margins": 0.2535511553287506, - "rewards/rejected": -1.5195882320404053, - "semantic_entropy": 0.8040241003036499, + "logits/chosen": -0.2117902934551239, + "logits/rejected": -0.17474015057086945, + "logps/chosen": -1.2423498630523682, + "logps/rejected": -1.4180864095687866, + "loss": 1.6009, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2423498630523682, + "rewards/margins": 0.1757366955280304, + "rewards/rejected": -1.4180864095687866, "step": 3425 }, { "epoch": 1.8357584880414786, - "grad_norm": 12.017230340151167, + "grad_norm": 10.448068708499129, "learning_rate": 3.9260394325381895e-07, - "logits/chosen": -0.1061253771185875, - "logits/rejected": 0.02162790857255459, - "logps/chosen": -1.3311865329742432, - "logps/rejected": -1.6028121709823608, - "loss": 2.0403, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3311865329742432, - "rewards/margins": 0.2716255187988281, - "rewards/rejected": -1.6028121709823608, - "semantic_entropy": 0.7831372022628784, + "logits/chosen": -0.2236020565032959, + "logits/rejected": -0.12878096103668213, + "logps/chosen": -1.3056252002716064, + "logps/rejected": -1.495781421661377, + "loss": 1.6369, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3056252002716064, + "rewards/margins": 0.19015632569789886, + "rewards/rejected": -1.495781421661377, "step": 3430 }, { "epoch": 1.83843452082288, - "grad_norm": 11.260166805719937, + "grad_norm": 10.046608805526557, "learning_rate": 3.9108341413115784e-07, - "logits/chosen": -0.12128078937530518, - "logits/rejected": -0.029097210615873337, - "logps/chosen": -1.274632453918457, - "logps/rejected": -1.5507726669311523, - "loss": 1.9776, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.274632453918457, - "rewards/margins": 0.2761402130126953, - "rewards/rejected": -1.5507726669311523, - "semantic_entropy": 0.790766716003418, + "logits/chosen": -0.20522567629814148, + "logits/rejected": -0.13971085846424103, + "logps/chosen": -1.2589514255523682, + "logps/rejected": -1.4753795862197876, + "loss": 1.5882, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2589514255523682, + "rewards/margins": 0.21642813086509705, + "rewards/rejected": -1.4753795862197876, "step": 3435 }, { "epoch": 1.8411105536042816, - "grad_norm": 11.838134033346803, + "grad_norm": 10.07150425461416, "learning_rate": 3.895639417176905e-07, - "logits/chosen": -0.1734875738620758, - "logits/rejected": -0.10030355304479599, - "logps/chosen": -1.2006644010543823, - "logps/rejected": -1.5158345699310303, - "loss": 1.9848, + "logits/chosen": -0.24803462624549866, + "logits/rejected": -0.19548973441123962, + "logps/chosen": -1.1807209253311157, + "logps/rejected": -1.4144724607467651, + "loss": 1.5658, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2006644010543823, - "rewards/margins": 0.31517019867897034, - "rewards/rejected": -1.5158345699310303, - "semantic_entropy": 0.8104988932609558, + "rewards/chosen": -1.1807209253311157, + "rewards/margins": 0.23375146090984344, + "rewards/rejected": -1.4144724607467651, "step": 3440 }, { "epoch": 1.8437865863856833, - "grad_norm": 9.095694299153367, + "grad_norm": 7.890741243298936, "learning_rate": 3.8804554075534497e-07, - "logits/chosen": -0.18821772933006287, - "logits/rejected": 0.03512702137231827, - "logps/chosen": -1.2557734251022339, - "logps/rejected": -1.4832450151443481, - "loss": 1.9862, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2557734251022339, - "rewards/margins": 0.2274715006351471, - "rewards/rejected": -1.4832450151443481, - "semantic_entropy": 0.8089550137519836, + "logits/chosen": -0.22131972014904022, + "logits/rejected": -0.01649406924843788, + "logps/chosen": -1.22940194606781, + "logps/rejected": -1.4079666137695312, + "loss": 1.5728, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.22940194606781, + "rewards/margins": 0.17856450378894806, + "rewards/rejected": -1.4079666137695312, "step": 3445 }, { "epoch": 1.8464626191670848, - "grad_norm": 13.087986143054922, + "grad_norm": 10.333119132736323, "learning_rate": 3.8652822597565403e-07, - "logits/chosen": -0.2668977975845337, - "logits/rejected": -0.08437497913837433, - "logps/chosen": -1.2757649421691895, - "logps/rejected": -1.5986865758895874, - "loss": 1.9793, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2757649421691895, - "rewards/margins": 0.3229215145111084, - "rewards/rejected": -1.5986865758895874, - "semantic_entropy": 0.7837361097335815, + "logits/chosen": -0.31308794021606445, + "logits/rejected": -0.15774603188037872, + "logps/chosen": -1.2503737211227417, + "logps/rejected": -1.5004576444625854, + "loss": 1.5846, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2503737211227417, + "rewards/margins": 0.2500839829444885, + "rewards/rejected": -1.5004576444625854, "step": 3450 }, { "epoch": 1.8491386519484863, - "grad_norm": 7.107161760470273, + "grad_norm": 6.919591848086438, "learning_rate": 3.850120120996123e-07, - "logits/chosen": -0.1137063130736351, - "logits/rejected": 0.04673473909497261, - "logps/chosen": -1.4318660497665405, - "logps/rejected": -1.6971549987792969, - "loss": 2.1086, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.4318660497665405, - "rewards/margins": 0.2652890682220459, - "rewards/rejected": -1.6971549987792969, - "semantic_entropy": 0.7388371229171753, + "logits/chosen": -0.16612792015075684, + "logits/rejected": -0.036644019186496735, + "logps/chosen": -1.411642074584961, + "logps/rejected": -1.6058638095855713, + "loss": 1.7367, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.411642074584961, + "rewards/margins": 0.19422176480293274, + "rewards/rejected": -1.6058638095855713, "step": 3455 }, { "epoch": 1.851814684729888, - "grad_norm": 11.203647624565647, + "grad_norm": 10.712034376207075, "learning_rate": 3.8349691383753356e-07, - "logits/chosen": -0.014518792741000652, - "logits/rejected": 0.11191823333501816, - "logps/chosen": -1.2621428966522217, - "logps/rejected": -1.5209171772003174, - "loss": 2.0493, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2621428966522217, - "rewards/margins": 0.258774071931839, - "rewards/rejected": -1.5209171772003174, - "semantic_entropy": 0.799616813659668, + "logits/chosen": -0.08675368130207062, + "logits/rejected": 0.01952032931149006, + "logps/chosen": -1.235244631767273, + "logps/rejected": -1.426180124282837, + "loss": 1.6395, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.235244631767273, + "rewards/margins": 0.19093546271324158, + "rewards/rejected": -1.426180124282837, "step": 3460 }, { "epoch": 1.8544907175112895, - "grad_norm": 6.988737589871721, + "grad_norm": 6.924033747890647, "learning_rate": 3.819829458889078e-07, - "logits/chosen": -0.16742148995399475, - "logits/rejected": -0.038034576922655106, - "logps/chosen": -1.2240421772003174, - "logps/rejected": -1.4272968769073486, - "loss": 1.9947, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2240421772003174, - "rewards/margins": 0.2032547891139984, - "rewards/rejected": -1.4272968769073486, - "semantic_entropy": 0.8272393345832825, + "logits/chosen": -0.244086354970932, + "logits/rejected": -0.13570265471935272, + "logps/chosen": -1.203270673751831, + "logps/rejected": -1.367319107055664, + "loss": 1.5766, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.203270673751831, + "rewards/margins": 0.16404837369918823, + "rewards/rejected": -1.367319107055664, "step": 3465 }, { "epoch": 1.857166750292691, - "grad_norm": 7.58424123902832, + "grad_norm": 7.464170816007541, "learning_rate": 3.804701229422585e-07, - "logits/chosen": -0.16845372319221497, - "logits/rejected": -0.07218648493289948, - "logps/chosen": -1.351628065109253, - "logps/rejected": -1.55996572971344, - "loss": 2.0535, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.351628065109253, - "rewards/margins": 0.20833781361579895, - "rewards/rejected": -1.55996572971344, - "semantic_entropy": 0.7803600430488586, + "logits/chosen": -0.22056007385253906, + "logits/rejected": -0.1388140469789505, + "logps/chosen": -1.3312097787857056, + "logps/rejected": -1.472596287727356, + "loss": 1.6641, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3312097787857056, + "rewards/margins": 0.1413867175579071, + "rewards/rejected": -1.472596287727356, "step": 3470 }, { "epoch": 1.8598427830740927, - "grad_norm": 9.02069349372374, + "grad_norm": 6.761094195749699, "learning_rate": 3.789584596750007e-07, - "logits/chosen": -0.1916225254535675, - "logits/rejected": -0.12627634406089783, - "logps/chosen": -1.2798500061035156, - "logps/rejected": -1.5317795276641846, - "loss": 2.0162, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2798500061035156, - "rewards/margins": 0.2519295811653137, - "rewards/rejected": -1.5317795276641846, - "semantic_entropy": 0.7949556112289429, + "logits/chosen": -0.29377657175064087, + "logits/rejected": -0.23835745453834534, + "logps/chosen": -1.256420373916626, + "logps/rejected": -1.452223777770996, + "loss": 1.6167, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.256420373916626, + "rewards/margins": 0.19580325484275818, + "rewards/rejected": -1.452223777770996, "step": 3475 }, { "epoch": 1.8625188158554944, - "grad_norm": 8.745520346577289, + "grad_norm": 7.9660825575787255, "learning_rate": 3.77447970753298e-07, - "logits/chosen": -0.054103873670101166, - "logits/rejected": -0.01997731253504753, - "logps/chosen": -1.3081482648849487, - "logps/rejected": -1.5802189111709595, - "loss": 2.0131, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3081482648849487, - "rewards/margins": 0.2720705270767212, - "rewards/rejected": -1.5802189111709595, - "semantic_entropy": 0.7743436098098755, + "logits/chosen": -0.15394994616508484, + "logits/rejected": -0.12967711687088013, + "logps/chosen": -1.2819709777832031, + "logps/rejected": -1.4983596801757812, + "loss": 1.6208, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2819709777832031, + "rewards/margins": 0.2163887768983841, + "rewards/rejected": -1.4983596801757812, "step": 3480 }, { "epoch": 1.8651948486368957, - "grad_norm": 10.422824586512155, + "grad_norm": 10.274470546661414, "learning_rate": 3.7593867083192057e-07, - "logits/chosen": -0.11146446317434311, - "logits/rejected": -0.003896909300237894, - "logps/chosen": -1.266123652458191, - "logps/rejected": -1.5006070137023926, - "loss": 2.0187, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.266123652458191, - "rewards/margins": 0.2344832867383957, - "rewards/rejected": -1.5006070137023926, - "semantic_entropy": 0.8031982183456421, + "logits/chosen": -0.2137545645236969, + "logits/rejected": -0.12820082902908325, + "logps/chosen": -1.2431753873825073, + "logps/rejected": -1.4213874340057373, + "loss": 1.6093, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2431753873825073, + "rewards/margins": 0.17821213603019714, + "rewards/rejected": -1.4213874340057373, "step": 3485 }, { "epoch": 1.8678708814182974, - "grad_norm": 7.17462834869493, + "grad_norm": 6.8007809735857245, "learning_rate": 3.7443057455410276e-07, - "logits/chosen": -0.09207798540592194, - "logits/rejected": 0.03373739868402481, - "logps/chosen": -1.3165172338485718, - "logps/rejected": -1.4651727676391602, - "loss": 2.0569, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3165172338485718, - "rewards/margins": 0.14865554869174957, - "rewards/rejected": -1.4651727676391602, - "semantic_entropy": 0.7978287935256958, + "logits/chosen": -0.15245218575000763, + "logits/rejected": -0.05015261098742485, + "logps/chosen": -1.301128625869751, + "logps/rejected": -1.3995161056518555, + "loss": 1.6607, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.301128625869751, + "rewards/margins": 0.09838749468326569, + "rewards/rejected": -1.3995161056518555, "step": 3490 }, { "epoch": 1.870546914199699, - "grad_norm": 8.608209174018716, + "grad_norm": 8.146538116029161, "learning_rate": 3.7292369655140145e-07, - "logits/chosen": -0.19836124777793884, - "logits/rejected": -0.036930233240127563, - "logps/chosen": -1.2759727239608765, - "logps/rejected": -1.4997532367706299, - "loss": 2.0038, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2759727239608765, - "rewards/margins": 0.223780557513237, - "rewards/rejected": -1.4997532367706299, - "semantic_entropy": 0.808540940284729, + "logits/chosen": -0.23573660850524902, + "logits/rejected": -0.0928931012749672, + "logps/chosen": -1.2535717487335205, + "logps/rejected": -1.4325979948043823, + "loss": 1.5981, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2535717487335205, + "rewards/margins": 0.17902621626853943, + "rewards/rejected": -1.4325979948043823, "step": 3495 }, { "epoch": 1.8732229469811004, - "grad_norm": 8.037145528269233, + "grad_norm": 8.154477081361472, "learning_rate": 3.714180514435534e-07, - "logits/chosen": -0.1019342690706253, - "logits/rejected": 0.04769862815737724, - "logps/chosen": -1.3215068578720093, - "logps/rejected": -1.6422048807144165, - "loss": 2.006, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3215068578720093, - "rewards/margins": 0.3206980526447296, - "rewards/rejected": -1.6422048807144165, - "semantic_entropy": 0.7793091535568237, + "logits/chosen": -0.18610471487045288, + "logits/rejected": -0.05407028645277023, + "logps/chosen": -1.2972004413604736, + "logps/rejected": -1.529928207397461, + "loss": 1.6143, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2972004413604736, + "rewards/margins": 0.23272785544395447, + "rewards/rejected": -1.529928207397461, "step": 3500 }, { "epoch": 1.875898979762502, - "grad_norm": 10.400013771141799, + "grad_norm": 9.159993057152748, "learning_rate": 3.6991365383833426e-07, - "logits/chosen": -0.10635554790496826, - "logits/rejected": 0.01562192477285862, - "logps/chosen": -1.321709156036377, - "logps/rejected": -1.6019134521484375, - "loss": 2.0371, + "logits/chosen": -0.17671632766723633, + "logits/rejected": -0.08549154549837112, + "logps/chosen": -1.296130657196045, + "logps/rejected": -1.4943058490753174, + "loss": 1.6465, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.321709156036377, - "rewards/margins": 0.2802041471004486, - "rewards/rejected": -1.6019134521484375, - "semantic_entropy": 0.7732099294662476, + "rewards/chosen": -1.296130657196045, + "rewards/margins": 0.19817538559436798, + "rewards/rejected": -1.4943058490753174, "step": 3505 }, { "epoch": 1.8785750125439038, - "grad_norm": 10.206269336294888, + "grad_norm": 9.571784503316689, "learning_rate": 3.684105183314162e-07, - "logits/chosen": -0.13000985980033875, - "logits/rejected": -0.04903121665120125, - "logps/chosen": -1.265295147895813, - "logps/rejected": -1.4725327491760254, - "loss": 1.9937, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.265295147895813, - "rewards/margins": 0.20723757147789001, - "rewards/rejected": -1.4725327491760254, - "semantic_entropy": 0.8022018671035767, + "logits/chosen": -0.20955529808998108, + "logits/rejected": -0.14858964085578918, + "logps/chosen": -1.242735743522644, + "logps/rejected": -1.3935405015945435, + "loss": 1.5927, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.242735743522644, + "rewards/margins": 0.15080469846725464, + "rewards/rejected": -1.3935405015945435, "step": 3510 }, { "epoch": 1.881251045325305, - "grad_norm": 11.736347149498757, + "grad_norm": 9.319053879866445, "learning_rate": 3.669086595062263e-07, - "logits/chosen": -0.13198086619377136, - "logits/rejected": 0.06340041011571884, - "logps/chosen": -1.3111042976379395, - "logps/rejected": -1.5591113567352295, - "loss": 2.0347, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3111042976379395, - "rewards/margins": 0.24800701439380646, - "rewards/rejected": -1.5591113567352295, - "semantic_entropy": 0.8016371726989746, + "logits/chosen": -0.19252166152000427, + "logits/rejected": -0.030325695872306824, + "logps/chosen": -1.2827403545379639, + "logps/rejected": -1.4876335859298706, + "loss": 1.6236, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2827403545379639, + "rewards/margins": 0.20489318668842316, + "rewards/rejected": -1.4876335859298706, "step": 3515 }, { "epoch": 1.8839270781067068, - "grad_norm": 10.291606681517248, + "grad_norm": 9.101855369858386, "learning_rate": 3.654080919338056e-07, - "logits/chosen": -0.17716601490974426, - "logits/rejected": -0.03891471400856972, - "logps/chosen": -1.317662000656128, - "logps/rejected": -1.593109130859375, - "loss": 2.0252, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.317662000656128, - "rewards/margins": 0.27544716000556946, - "rewards/rejected": -1.593109130859375, - "semantic_entropy": 0.7637777328491211, + "logits/chosen": -0.23145878314971924, + "logits/rejected": -0.12374116480350494, + "logps/chosen": -1.2911999225616455, + "logps/rejected": -1.4790095090866089, + "loss": 1.6347, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2911999225616455, + "rewards/margins": 0.1878097951412201, + "rewards/rejected": -1.4790095090866089, "step": 3520 }, { "epoch": 1.8866031108881085, - "grad_norm": 7.861563362786371, + "grad_norm": 7.144259581381725, "learning_rate": 3.639088301726673e-07, - "logits/chosen": -0.0826728343963623, - "logits/rejected": 0.11588224023580551, - "logps/chosen": -1.2835619449615479, - "logps/rejected": -1.5396344661712646, - "loss": 2.0151, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2835619449615479, - "rewards/margins": 0.2560725212097168, - "rewards/rejected": -1.5396344661712646, - "semantic_entropy": 0.795852541923523, + "logits/chosen": -0.18061567842960358, + "logits/rejected": -0.018749738112092018, + "logps/chosen": -1.2568756341934204, + "logps/rejected": -1.4504077434539795, + "loss": 1.6138, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2568756341934204, + "rewards/margins": 0.19353221356868744, + "rewards/rejected": -1.4504077434539795, "step": 3525 }, { "epoch": 1.88927914366951, - "grad_norm": 13.126218986628478, + "grad_norm": 11.728557151173204, "learning_rate": 3.624108887686556e-07, - "logits/chosen": -0.09712855517864227, - "logits/rejected": -0.02328089438378811, - "logps/chosen": -1.260948896408081, - "logps/rejected": -1.507131814956665, - "loss": 2.0035, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.260948896408081, - "rewards/margins": 0.24618300795555115, - "rewards/rejected": -1.507131814956665, - "semantic_entropy": 0.8005443811416626, + "logits/chosen": -0.18043796718120575, + "logits/rejected": -0.12324018776416779, + "logps/chosen": -1.2382079362869263, + "logps/rejected": -1.44778311252594, + "loss": 1.595, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2382079362869263, + "rewards/margins": 0.20957525074481964, + "rewards/rejected": -1.44778311252594, "step": 3530 }, { "epoch": 1.8919551764509115, - "grad_norm": 6.546292643866425, + "grad_norm": 6.5571486374089805, "learning_rate": 3.6091428225480433e-07, - "logits/chosen": -0.18229694664478302, - "logits/rejected": -0.04712440446019173, - "logps/chosen": -1.227616548538208, - "logps/rejected": -1.4500181674957275, - "loss": 2.0085, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.227616548538208, - "rewards/margins": 0.22240164875984192, - "rewards/rejected": -1.4500181674957275, - "semantic_entropy": 0.8142200708389282, + "logits/chosen": -0.26944318413734436, + "logits/rejected": -0.16007402539253235, + "logps/chosen": -1.2057348489761353, + "logps/rejected": -1.3465030193328857, + "loss": 1.6035, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2057348489761353, + "rewards/margins": 0.14076808094978333, + "rewards/rejected": -1.3465030193328857, "step": 3535 }, { "epoch": 1.8946312092323132, - "grad_norm": 14.05217500248357, + "grad_norm": 11.186457917160363, "learning_rate": 3.5941902515119674e-07, - "logits/chosen": -0.13271181285381317, - "logits/rejected": 0.09598705172538757, - "logps/chosen": -1.2617511749267578, - "logps/rejected": -1.4930734634399414, - "loss": 2.0113, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2617511749267578, - "rewards/margins": 0.23132216930389404, - "rewards/rejected": -1.4930734634399414, - "semantic_entropy": 0.8063961863517761, + "logits/chosen": -0.20591190457344055, + "logits/rejected": -0.006433853413909674, + "logps/chosen": -1.2343441247940063, + "logps/rejected": -1.433020830154419, + "loss": 1.5926, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2343441247940063, + "rewards/margins": 0.1986767053604126, + "rewards/rejected": -1.433020830154419, "step": 3540 }, { "epoch": 1.8973072420137147, - "grad_norm": 9.515261930521739, + "grad_norm": 9.125103890239405, "learning_rate": 3.5792513196482373e-07, - "logits/chosen": -0.2581722140312195, - "logits/rejected": 0.015956703573465347, - "logps/chosen": -1.2583338022232056, - "logps/rejected": -1.442116141319275, - "loss": 2.0185, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2583338022232056, - "rewards/margins": 0.18378230929374695, - "rewards/rejected": -1.442116141319275, - "semantic_entropy": 0.8180877566337585, + "logits/chosen": -0.3186398446559906, + "logits/rejected": -0.0966661125421524, + "logps/chosen": -1.2360858917236328, + "logps/rejected": -1.352618932723999, + "loss": 1.6067, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2360858917236328, + "rewards/margins": 0.11653308570384979, + "rewards/rejected": -1.352618932723999, "step": 3545 }, { "epoch": 1.8999832747951162, - "grad_norm": 7.148565276722444, + "grad_norm": 7.048961384226068, "learning_rate": 3.5643261718944346e-07, - "logits/chosen": -0.08364517986774445, - "logits/rejected": 0.008080209605395794, - "logps/chosen": -1.2652015686035156, - "logps/rejected": -1.4693939685821533, - "loss": 2.0108, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2652015686035156, - "rewards/margins": 0.20419220626354218, - "rewards/rejected": -1.4693939685821533, - "semantic_entropy": 0.823981761932373, + "logits/chosen": -0.14295661449432373, + "logits/rejected": -0.06613682955503464, + "logps/chosen": -1.2295020818710327, + "logps/rejected": -1.374596357345581, + "loss": 1.5876, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2295020818710327, + "rewards/margins": 0.14509446918964386, + "rewards/rejected": -1.374596357345581, "step": 3550 }, { "epoch": 1.902659307576518, - "grad_norm": 6.059750779835962, + "grad_norm": 6.114270164065146, "learning_rate": 3.5494149530544087e-07, - "logits/chosen": -0.2066015899181366, - "logits/rejected": -0.067063108086586, - "logps/chosen": -1.2199000120162964, - "logps/rejected": -1.479786992073059, - "loss": 2.0179, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2199000120162964, - "rewards/margins": 0.2598869204521179, - "rewards/rejected": -1.479786992073059, - "semantic_entropy": 0.8159680366516113, + "logits/chosen": -0.26475760340690613, + "logits/rejected": -0.14583542943000793, + "logps/chosen": -1.1900657415390015, + "logps/rejected": -1.3886867761611938, + "loss": 1.5951, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1900657415390015, + "rewards/margins": 0.19862084090709686, + "rewards/rejected": -1.3886867761611938, "step": 3555 }, { "epoch": 1.9053353403579194, - "grad_norm": 12.349981712337039, + "grad_norm": 11.502442498112247, "learning_rate": 3.534517807796871e-07, - "logits/chosen": -0.10540161281824112, - "logits/rejected": -0.01878190226852894, - "logps/chosen": -1.3090038299560547, - "logps/rejected": -1.5431599617004395, - "loss": 2.0363, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3090038299560547, - "rewards/margins": 0.2341562807559967, - "rewards/rejected": -1.5431599617004395, - "semantic_entropy": 0.7972686290740967, + "logits/chosen": -0.1803629845380783, + "logits/rejected": -0.12026476860046387, + "logps/chosen": -1.281933069229126, + "logps/rejected": -1.466369867324829, + "loss": 1.6331, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.281933069229126, + "rewards/margins": 0.18443647027015686, + "rewards/rejected": -1.466369867324829, "step": 3560 }, { "epoch": 1.908011373139321, - "grad_norm": 6.601573179170184, + "grad_norm": 6.002821132575185, "learning_rate": 3.519634880653988e-07, - "logits/chosen": -0.12914696335792542, - "logits/rejected": -0.0617067813873291, - "logps/chosen": -1.2342662811279297, - "logps/rejected": -1.5450482368469238, - "loss": 1.9972, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2342662811279297, - "rewards/margins": 0.3107820749282837, - "rewards/rejected": -1.5450482368469238, - "semantic_entropy": 0.8117408752441406, + "logits/chosen": -0.18821127712726593, + "logits/rejected": -0.14258238673210144, + "logps/chosen": -1.2131202220916748, + "logps/rejected": -1.4328038692474365, + "loss": 1.5913, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2131202220916748, + "rewards/margins": 0.2196836918592453, + "rewards/rejected": -1.4328038692474365, "step": 3565 }, { "epoch": 1.9106874059207226, - "grad_norm": 7.578795754764497, + "grad_norm": 7.635269179529435, "learning_rate": 3.504766316019987e-07, - "logits/chosen": -0.15820898115634918, - "logits/rejected": -0.017529401928186417, - "logps/chosen": -1.262640118598938, - "logps/rejected": -1.5338207483291626, - "loss": 2.0033, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.262640118598938, - "rewards/margins": 0.27118054032325745, - "rewards/rejected": -1.5338207483291626, - "semantic_entropy": 0.8046427965164185, + "logits/chosen": -0.2189391553401947, + "logits/rejected": -0.10928545892238617, + "logps/chosen": -1.2418806552886963, + "logps/rejected": -1.4406083822250366, + "loss": 1.6055, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2418806552886963, + "rewards/margins": 0.1987278163433075, + "rewards/rejected": -1.4406083822250366, "step": 3570 }, { "epoch": 1.913363438702124, - "grad_norm": 8.152077198375881, + "grad_norm": 7.443797664782801, "learning_rate": 3.489912258149745e-07, - "logits/chosen": -0.05192985013127327, - "logits/rejected": 0.0704207569360733, - "logps/chosen": -1.2362616062164307, - "logps/rejected": -1.5028289556503296, - "loss": 2.0064, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2362616062164307, - "rewards/margins": 0.2665674090385437, - "rewards/rejected": -1.5028289556503296, - "semantic_entropy": 0.8162727355957031, + "logits/chosen": -0.12070286273956299, + "logits/rejected": -0.016779515892267227, + "logps/chosen": -1.2063409090042114, + "logps/rejected": -1.4125832319259644, + "loss": 1.5852, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2063409090042114, + "rewards/margins": 0.2062424123287201, + "rewards/rejected": -1.4125832319259644, "step": 3575 }, { "epoch": 1.9160394714835256, - "grad_norm": 10.272991612886464, + "grad_norm": 8.504628729714607, "learning_rate": 3.475072851157397e-07, - "logits/chosen": -0.11488697677850723, - "logits/rejected": -0.06389477103948593, - "logps/chosen": -1.2549583911895752, - "logps/rejected": -1.5735355615615845, - "loss": 1.9829, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2549583911895752, - "rewards/margins": 0.3185771107673645, - "rewards/rejected": -1.5735355615615845, - "semantic_entropy": 0.7964209914207458, + "logits/chosen": -0.19800689816474915, + "logits/rejected": -0.16064441204071045, + "logps/chosen": -1.2234773635864258, + "logps/rejected": -1.464333176612854, + "loss": 1.5766, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2234773635864258, + "rewards/margins": 0.24085572361946106, + "rewards/rejected": -1.464333176612854, "step": 3580 }, { "epoch": 1.9187155042649273, - "grad_norm": 8.11158263217016, + "grad_norm": 7.9677329475988605, "learning_rate": 3.460248239014936e-07, - "logits/chosen": -0.015757273882627487, - "logits/rejected": 0.03946710377931595, - "logps/chosen": -1.3438389301300049, - "logps/rejected": -1.534001350402832, - "loss": 2.0518, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3438389301300049, - "rewards/margins": 0.19016249477863312, - "rewards/rejected": -1.534001350402832, - "semantic_entropy": 0.7529127597808838, + "logits/chosen": -0.1262514889240265, + "logits/rejected": -0.08946479856967926, + "logps/chosen": -1.3143281936645508, + "logps/rejected": -1.4545438289642334, + "loss": 1.6649, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3143281936645508, + "rewards/margins": 0.14021556079387665, + "rewards/rejected": -1.4545438289642334, "step": 3585 }, { "epoch": 1.9213915370463288, - "grad_norm": 8.210026560238209, + "grad_norm": 7.952744211308689, "learning_rate": 3.4454385655508134e-07, - "logits/chosen": -0.04379934445023537, - "logits/rejected": 0.029793402180075645, - "logps/chosen": -1.3078745603561401, - "logps/rejected": -1.49537992477417, - "loss": 2.0368, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3078745603561401, - "rewards/margins": 0.18750540912151337, - "rewards/rejected": -1.49537992477417, - "semantic_entropy": 0.7906752824783325, + "logits/chosen": -0.13433387875556946, + "logits/rejected": -0.07007332146167755, + "logps/chosen": -1.277406930923462, + "logps/rejected": -1.4121787548065186, + "loss": 1.6319, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.277406930923462, + "rewards/margins": 0.13477174937725067, + "rewards/rejected": -1.4121787548065186, "step": 3590 }, { "epoch": 1.9240675698277303, - "grad_norm": 6.953433919929532, + "grad_norm": 6.781740184269981, "learning_rate": 3.4306439744485447e-07, - "logits/chosen": -0.21919319033622742, - "logits/rejected": -0.014783772639930248, - "logps/chosen": -1.2707698345184326, - "logps/rejected": -1.529934287071228, - "loss": 2.0018, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2707698345184326, - "rewards/margins": 0.25916433334350586, - "rewards/rejected": -1.529934287071228, - "semantic_entropy": 0.7992693185806274, + "logits/chosen": -0.2822973132133484, + "logits/rejected": -0.10233817994594574, + "logps/chosen": -1.2295691967010498, + "logps/rejected": -1.4294407367706299, + "loss": 1.5833, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2295691967010498, + "rewards/margins": 0.19987143576145172, + "rewards/rejected": -1.4294407367706299, "step": 3595 }, { "epoch": 1.926743602609132, - "grad_norm": 10.877825850627211, + "grad_norm": 9.943679947386732, "learning_rate": 3.415864609245322e-07, - "logits/chosen": -0.059394340962171555, - "logits/rejected": 0.12252438068389893, - "logps/chosen": -1.2442500591278076, - "logps/rejected": -1.5825135707855225, - "loss": 1.971, + "logits/chosen": -0.14189042150974274, + "logits/rejected": 0.013472884893417358, + "logps/chosen": -1.209618091583252, + "logps/rejected": -1.4569435119628906, + "loss": 1.5646, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2442500591278076, - "rewards/margins": 0.33826351165771484, - "rewards/rejected": -1.5825135707855225, - "semantic_entropy": 0.7945801019668579, + "rewards/chosen": -1.209618091583252, + "rewards/margins": 0.24732527136802673, + "rewards/rejected": -1.4569435119628906, "step": 3600 }, { "epoch": 1.926743602609132, - "eval_logits/chosen": 0.21609361469745636, - "eval_logits/rejected": 0.3034639358520508, - "eval_logps/chosen": -1.3219430446624756, - "eval_logps/rejected": -1.569968342781067, - "eval_loss": 2.04423189163208, - "eval_rewards/accuracies": 0.5860534310340881, - "eval_rewards/chosen": -1.3219430446624756, - "eval_rewards/margins": 0.24802519381046295, - "eval_rewards/rejected": -1.569968342781067, - "eval_runtime": 34.5858, - "eval_samples_per_second": 38.889, - "eval_semantic_entropy": 0.7832201719284058, - "eval_steps_per_second": 9.744, + "eval_logits/chosen": 0.10516196489334106, + "eval_logits/rejected": 0.17712825536727905, + "eval_logps/chosen": -1.2986959218978882, + "eval_logps/rejected": -1.477814793586731, + "eval_loss": 1.647760033607483, + "eval_rewards/accuracies": 0.5704748034477234, + "eval_rewards/chosen": -1.2986959218978882, + "eval_rewards/margins": 0.179118812084198, + "eval_rewards/rejected": -1.477814793586731, + "eval_runtime": 40.3628, + "eval_samples_per_second": 33.323, + "eval_steps_per_second": 8.349, "step": 3600 }, { "epoch": 1.9294196353905335, - "grad_norm": 8.144866054072718, + "grad_norm": 7.808770644621557, "learning_rate": 3.401100613330605e-07, - "logits/chosen": -0.158551424741745, - "logits/rejected": -0.11706791073083878, - "logps/chosen": -1.2888761758804321, - "logps/rejected": -1.4633435010910034, - "loss": 2.0513, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2888761758804321, - "rewards/margins": 0.17446734011173248, - "rewards/rejected": -1.4633435010910034, - "semantic_entropy": 0.8004593849182129, + "logits/chosen": -0.24109283089637756, + "logits/rejected": -0.2096950262784958, + "logps/chosen": -1.2588623762130737, + "logps/rejected": -1.3880422115325928, + "loss": 1.641, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2588623762130737, + "rewards/margins": 0.1291799396276474, + "rewards/rejected": -1.3880422115325928, "step": 3605 }, { "epoch": 1.932095668171935, - "grad_norm": 7.073985429506359, + "grad_norm": 6.415918600900964, "learning_rate": 3.3863521299447514e-07, - "logits/chosen": -0.08732198923826218, - "logits/rejected": 0.03606638312339783, - "logps/chosen": -1.2564102411270142, - "logps/rejected": -1.526855707168579, - "loss": 1.9798, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2564102411270142, - "rewards/margins": 0.2704453766345978, - "rewards/rejected": -1.526855707168579, - "semantic_entropy": 0.7902366518974304, + "logits/chosen": -0.14331869781017303, + "logits/rejected": -0.04207003861665726, + "logps/chosen": -1.2295633554458618, + "logps/rejected": -1.4343435764312744, + "loss": 1.5756, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2295633554458618, + "rewards/margins": 0.2047802209854126, + "rewards/rejected": -1.4343435764312744, "step": 3610 }, { "epoch": 1.9347717009533367, - "grad_norm": 6.3886970871943065, + "grad_norm": 6.2842172727300865, "learning_rate": 3.371619302177609e-07, - "logits/chosen": -0.023822737857699394, - "logits/rejected": 0.09075307846069336, - "logps/chosen": -1.3473241329193115, - "logps/rejected": -1.5236204862594604, - "loss": 2.0573, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3473241329193115, - "rewards/margins": 0.1762964427471161, - "rewards/rejected": -1.5236204862594604, - "semantic_entropy": 0.7837681770324707, + "logits/chosen": -0.0876409187912941, + "logits/rejected": 0.008129620924592018, + "logps/chosen": -1.3283607959747314, + "logps/rejected": -1.436718463897705, + "loss": 1.6673, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3283607959747314, + "rewards/margins": 0.10835757106542587, + "rewards/rejected": -1.436718463897705, "step": 3615 }, { "epoch": 1.9374477337347382, - "grad_norm": 10.305278907533184, + "grad_norm": 10.35555577271964, "learning_rate": 3.3569022729671393e-07, - "logits/chosen": -0.07354442030191422, - "logits/rejected": -0.0033059536945074797, - "logps/chosen": -1.299324631690979, - "logps/rejected": -1.4606642723083496, - "loss": 2.065, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.299324631690979, - "rewards/margins": 0.1613396406173706, - "rewards/rejected": -1.4606642723083496, - "semantic_entropy": 0.7894943356513977, + "logits/chosen": -0.15332219004631042, + "logits/rejected": -0.09550925344228745, + "logps/chosen": -1.2686882019042969, + "logps/rejected": -1.3833425045013428, + "loss": 1.6595, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2686882019042969, + "rewards/margins": 0.11465413868427277, + "rewards/rejected": -1.3833425045013428, "step": 3620 }, { "epoch": 1.9401237665161397, - "grad_norm": 8.565777965624173, + "grad_norm": 8.709612576533985, "learning_rate": 3.342201185098024e-07, - "logits/chosen": -0.0113115468993783, - "logits/rejected": 0.005684341304004192, - "logps/chosen": -1.2715879678726196, - "logps/rejected": -1.5271837711334229, - "loss": 2.0008, + "logits/chosen": -0.09398411959409714, + "logits/rejected": -0.07650090754032135, + "logps/chosen": -1.2532272338867188, + "logps/rejected": -1.4717903137207031, + "loss": 1.6012, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2715879678726196, - "rewards/margins": 0.25559574365615845, - "rewards/rejected": -1.5271837711334229, - "semantic_entropy": 0.7982123494148254, + "rewards/chosen": -1.2532272338867188, + "rewards/margins": 0.21856316924095154, + "rewards/rejected": -1.4717903137207031, "step": 3625 }, { "epoch": 1.9427997992975414, - "grad_norm": 8.985052902489157, + "grad_norm": 8.75858655836486, "learning_rate": 3.3275161812002807e-07, - "logits/chosen": -0.11439623683691025, - "logits/rejected": -0.057005755603313446, - "logps/chosen": -1.2966344356536865, - "logps/rejected": -1.54485285282135, - "loss": 2.0394, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2966344356536865, - "rewards/margins": 0.24821829795837402, - "rewards/rejected": -1.54485285282135, - "semantic_entropy": 0.7945786714553833, + "logits/chosen": -0.18788668513298035, + "logits/rejected": -0.14185675978660583, + "logps/chosen": -1.2734358310699463, + "logps/rejected": -1.43949556350708, + "loss": 1.6406, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2734358310699463, + "rewards/margins": 0.16605976223945618, + "rewards/rejected": -1.43949556350708, "step": 3630 }, { "epoch": 1.945475832078943, - "grad_norm": 10.35010463149898, + "grad_norm": 10.659669108581703, "learning_rate": 3.312847403747883e-07, - "logits/chosen": -0.15070085227489471, - "logits/rejected": -0.06268687546253204, - "logps/chosen": -1.310336709022522, - "logps/rejected": -1.533013105392456, - "loss": 2.0457, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.310336709022522, - "rewards/margins": 0.22267630696296692, - "rewards/rejected": -1.533013105392456, - "semantic_entropy": 0.7924118041992188, + "logits/chosen": -0.18720242381095886, + "logits/rejected": -0.11303949356079102, + "logps/chosen": -1.290158987045288, + "logps/rejected": -1.448259949684143, + "loss": 1.651, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.290158987045288, + "rewards/margins": 0.15810075402259827, + "rewards/rejected": -1.448259949684143, "step": 3635 }, { "epoch": 1.9481518648603444, - "grad_norm": 8.069338833575255, + "grad_norm": 7.846067267898016, "learning_rate": 3.2981949950573733e-07, - "logits/chosen": -0.07498464733362198, - "logits/rejected": 0.039922118186950684, - "logps/chosen": -1.3872663974761963, - "logps/rejected": -1.5021774768829346, - "loss": 2.0958, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3872663974761963, - "rewards/margins": 0.1149112731218338, - "rewards/rejected": -1.5021774768829346, - "semantic_entropy": 0.76435387134552, + "logits/chosen": -0.15360137820243835, + "logits/rejected": -0.04836711287498474, + "logps/chosen": -1.3664968013763428, + "logps/rejected": -1.4371174573898315, + "loss": 1.713, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3664968013763428, + "rewards/margins": 0.07062048465013504, + "rewards/rejected": -1.4371174573898315, "step": 3640 }, { "epoch": 1.9508278976417461, - "grad_norm": 7.908362354066705, + "grad_norm": 7.473076523289346, "learning_rate": 3.283559097286486e-07, - "logits/chosen": -0.11206521838903427, - "logits/rejected": 0.020204050466418266, - "logps/chosen": -1.3849326372146606, - "logps/rejected": -1.5381712913513184, - "loss": 2.0937, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3849326372146606, - "rewards/margins": 0.15323874354362488, - "rewards/rejected": -1.5381712913513184, - "semantic_entropy": 0.7717481255531311, + "logits/chosen": -0.18360236287117004, + "logits/rejected": -0.07247422635555267, + "logps/chosen": -1.3604953289031982, + "logps/rejected": -1.4847160577774048, + "loss": 1.6987, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3604953289031982, + "rewards/margins": 0.12422071397304535, + "rewards/rejected": -1.4847160577774048, "step": 3645 }, { "epoch": 1.9535039304231478, - "grad_norm": 9.712785475979707, + "grad_norm": 7.444897497210662, "learning_rate": 3.268939852432765e-07, - "logits/chosen": -0.13621452450752258, - "logits/rejected": -0.029971089214086533, - "logps/chosen": -1.2723300457000732, - "logps/rejected": -1.471142053604126, - "loss": 2.017, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2723300457000732, - "rewards/margins": 0.1988120973110199, - "rewards/rejected": -1.471142053604126, - "semantic_entropy": 0.8029723167419434, + "logits/chosen": -0.20776736736297607, + "logits/rejected": -0.11900806427001953, + "logps/chosen": -1.2479541301727295, + "logps/rejected": -1.385831356048584, + "loss": 1.6118, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2479541301727295, + "rewards/margins": 0.13787731528282166, + "rewards/rejected": -1.385831356048584, "step": 3650 }, { "epoch": 1.9561799632045491, - "grad_norm": 12.481648173594774, + "grad_norm": 11.393299359575368, "learning_rate": 3.254337402332187e-07, - "logits/chosen": -0.11881868541240692, - "logits/rejected": 0.017414908856153488, - "logps/chosen": -1.3414294719696045, - "logps/rejected": -1.5426610708236694, - "loss": 2.08, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3414294719696045, - "rewards/margins": 0.20123150944709778, - "rewards/rejected": -1.5426610708236694, - "semantic_entropy": 0.7877740859985352, + "logits/chosen": -0.1976751834154129, + "logits/rejected": -0.07935307919979095, + "logps/chosen": -1.3020820617675781, + "logps/rejected": -1.455503225326538, + "loss": 1.6683, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3020820617675781, + "rewards/margins": 0.1534210443496704, + "rewards/rejected": -1.455503225326538, "step": 3655 }, { "epoch": 1.9588559959859508, - "grad_norm": 11.346605430241318, + "grad_norm": 11.232389591855133, "learning_rate": 3.239751888657788e-07, - "logits/chosen": -0.14560244977474213, - "logits/rejected": -0.0107874795794487, - "logps/chosen": -1.2256336212158203, - "logps/rejected": -1.4671902656555176, - "loss": 2.0069, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2256336212158203, - "rewards/margins": 0.24155676364898682, - "rewards/rejected": -1.4671902656555176, - "semantic_entropy": 0.8126087188720703, + "logits/chosen": -0.20845703780651093, + "logits/rejected": -0.09822802245616913, + "logps/chosen": -1.2032948732376099, + "logps/rejected": -1.3660537004470825, + "loss": 1.5965, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2032948732376099, + "rewards/margins": 0.16275881230831146, + "rewards/rejected": -1.3660537004470825, "step": 3660 }, { "epoch": 1.9615320287673526, - "grad_norm": 10.395984398742891, + "grad_norm": 8.85819556697752, "learning_rate": 3.2251834529182856e-07, - "logits/chosen": -0.08570713549852371, - "logits/rejected": 0.031803928315639496, - "logps/chosen": -1.2469929456710815, - "logps/rejected": -1.4951390027999878, - "loss": 2.017, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2469929456710815, - "rewards/margins": 0.2481461763381958, - "rewards/rejected": -1.4951390027999878, - "semantic_entropy": 0.8090551495552063, + "logits/chosen": -0.1434224545955658, + "logits/rejected": -0.039260972291231155, + "logps/chosen": -1.2225271463394165, + "logps/rejected": -1.3938648700714111, + "loss": 1.6086, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2225271463394165, + "rewards/margins": 0.17133775353431702, + "rewards/rejected": -1.3938648700714111, "step": 3665 }, { "epoch": 1.9642080615487538, - "grad_norm": 6.370737640712571, + "grad_norm": 6.489617402223498, "learning_rate": 3.2106322364567075e-07, - "logits/chosen": -0.1544579565525055, - "logits/rejected": -0.005169686861336231, - "logps/chosen": -1.2779276371002197, - "logps/rejected": -1.5672134160995483, - "loss": 1.9949, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2779276371002197, - "rewards/margins": 0.2892858386039734, - "rewards/rejected": -1.5672134160995483, - "semantic_entropy": 0.7989880442619324, + "logits/chosen": -0.21803636848926544, + "logits/rejected": -0.09411749243736267, + "logps/chosen": -1.2584469318389893, + "logps/rejected": -1.4679569005966187, + "loss": 1.6013, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2584469318389893, + "rewards/margins": 0.20950999855995178, + "rewards/rejected": -1.4679569005966187, "step": 3670 }, { "epoch": 1.9668840943301555, - "grad_norm": 7.798494451995573, + "grad_norm": 7.405848196687554, "learning_rate": 3.1960983804490183e-07, - "logits/chosen": -0.11066894233226776, - "logits/rejected": 0.02928655780851841, - "logps/chosen": -1.312739372253418, - "logps/rejected": -1.6923391819000244, - "loss": 1.9945, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.312739372253418, - "rewards/margins": 0.3795998692512512, - "rewards/rejected": -1.6923391819000244, - "semantic_entropy": 0.7777242660522461, + "logits/chosen": -0.17621895670890808, + "logits/rejected": -0.05477041006088257, + "logps/chosen": -1.288193702697754, + "logps/rejected": -1.578798532485962, + "loss": 1.606, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.288193702697754, + "rewards/margins": 0.29060500860214233, + "rewards/rejected": -1.578798532485962, "step": 3675 }, { "epoch": 1.9695601271115573, - "grad_norm": 8.440153896776039, + "grad_norm": 8.007223927502881, "learning_rate": 3.1815820259027537e-07, - "logits/chosen": -0.11401332914829254, - "logits/rejected": 0.00045472680358216166, - "logps/chosen": -1.1583054065704346, - "logps/rejected": -1.4422087669372559, - "loss": 1.9335, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1583054065704346, - "rewards/margins": 0.28390341997146606, - "rewards/rejected": -1.4422087669372559, - "semantic_entropy": 0.8331707715988159, + "logits/chosen": -0.19255180656909943, + "logits/rejected": -0.0977533608675003, + "logps/chosen": -1.138106346130371, + "logps/rejected": -1.3500877618789673, + "loss": 1.5194, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.138106346130371, + "rewards/margins": 0.21198129653930664, + "rewards/rejected": -1.3500877618789673, "step": 3680 }, { "epoch": 1.9722361598929585, - "grad_norm": 10.959153779640832, + "grad_norm": 8.567693499874197, "learning_rate": 3.16708331365565e-07, - "logits/chosen": -0.11247573047876358, - "logits/rejected": -0.026667693629860878, - "logps/chosen": -1.268367052078247, - "logps/rejected": -1.5691543817520142, - "loss": 1.9707, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.268367052078247, - "rewards/margins": 0.3007873296737671, - "rewards/rejected": -1.5691543817520142, - "semantic_entropy": 0.7896307706832886, + "logits/chosen": -0.23554396629333496, + "logits/rejected": -0.1732751429080963, + "logps/chosen": -1.2374318838119507, + "logps/rejected": -1.468052864074707, + "loss": 1.5719, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2374318838119507, + "rewards/margins": 0.23062118887901306, + "rewards/rejected": -1.468052864074707, "step": 3685 }, { "epoch": 1.9749121926743602, - "grad_norm": 7.408577878714491, + "grad_norm": 7.500530526726536, "learning_rate": 3.152602384374275e-07, - "logits/chosen": -0.09716648608446121, - "logits/rejected": 0.06562429666519165, - "logps/chosen": -1.3082753419876099, - "logps/rejected": -1.5784547328948975, - "loss": 2.0316, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3082753419876099, - "rewards/margins": 0.2701794505119324, - "rewards/rejected": -1.5784547328948975, - "semantic_entropy": 0.7799612283706665, + "logits/chosen": -0.15899136662483215, + "logits/rejected": -0.027197346091270447, + "logps/chosen": -1.2746217250823975, + "logps/rejected": -1.4639803171157837, + "loss": 1.6315, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2746217250823975, + "rewards/margins": 0.18935871124267578, + "rewards/rejected": -1.4639803171157837, "step": 3690 }, { "epoch": 1.977588225455762, - "grad_norm": 8.179466642910503, + "grad_norm": 7.35502099895214, "learning_rate": 3.1381393785526697e-07, - "logits/chosen": -0.05400281026959419, - "logits/rejected": -0.006647360511124134, - "logps/chosen": -1.3509399890899658, - "logps/rejected": -1.6333929300308228, - "loss": 2.0255, + "logits/chosen": -0.11527419090270996, + "logits/rejected": -0.08784718811511993, + "logps/chosen": -1.3176900148391724, + "logps/rejected": -1.550249457359314, + "loss": 1.6318, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3509399890899658, - "rewards/margins": 0.28245288133621216, - "rewards/rejected": -1.6333929300308228, - "semantic_entropy": 0.7614681124687195, + "rewards/chosen": -1.3176900148391724, + "rewards/margins": 0.23255948722362518, + "rewards/rejected": -1.550249457359314, "step": 3695 }, { "epoch": 1.9802642582371635, - "grad_norm": 12.428902913590797, + "grad_norm": 8.661549654110603, "learning_rate": 3.123694436510979e-07, - "logits/chosen": -0.029851287603378296, - "logits/rejected": 0.08511605858802795, - "logps/chosen": -1.2690141201019287, - "logps/rejected": -1.543554425239563, - "loss": 2.0004, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2690141201019287, - "rewards/margins": 0.2745402455329895, - "rewards/rejected": -1.543554425239563, - "semantic_entropy": 0.7996218800544739, + "logits/chosen": -0.12260335683822632, + "logits/rejected": -0.030400067567825317, + "logps/chosen": -1.2405729293823242, + "logps/rejected": -1.4361417293548584, + "loss": 1.593, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2405729293823242, + "rewards/margins": 0.19556888937950134, + "rewards/rejected": -1.4361417293548584, "step": 3700 }, { "epoch": 1.982940291018565, - "grad_norm": 9.828271639098743, + "grad_norm": 8.768064878804184, "learning_rate": 3.1092676983940946e-07, - "logits/chosen": -0.11319668591022491, - "logits/rejected": -0.033414699137210846, - "logps/chosen": -1.3103493452072144, - "logps/rejected": -1.594153642654419, - "loss": 2.0304, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3103493452072144, - "rewards/margins": 0.2838039994239807, - "rewards/rejected": -1.594153642654419, - "semantic_entropy": 0.7914609909057617, + "logits/chosen": -0.16908925771713257, + "logits/rejected": -0.10685942322015762, + "logps/chosen": -1.2900021076202393, + "logps/rejected": -1.4903085231781006, + "loss": 1.6398, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2900021076202393, + "rewards/margins": 0.20030629634857178, + "rewards/rejected": -1.4903085231781006, "step": 3705 }, { "epoch": 1.9856163237999667, - "grad_norm": 9.075896180830405, + "grad_norm": 7.926221873843065, "learning_rate": 3.094859304170293e-07, - "logits/chosen": 0.06817227602005005, - "logits/rejected": 0.12981468439102173, - "logps/chosen": -1.3226096630096436, - "logps/rejected": -1.5663455724716187, - "loss": 2.0396, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3226096630096436, - "rewards/margins": 0.24373598396778107, - "rewards/rejected": -1.5663455724716187, - "semantic_entropy": 0.781419575214386, + "logits/chosen": -0.07368534058332443, + "logits/rejected": -0.03026638552546501, + "logps/chosen": -1.29747474193573, + "logps/rejected": -1.458830714225769, + "loss": 1.6498, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.29747474193573, + "rewards/margins": 0.16135601699352264, + "rewards/rejected": -1.458830714225769, "step": 3710 }, { "epoch": 1.9882923565813682, - "grad_norm": 5.926526406892064, + "grad_norm": 5.747132852592568, "learning_rate": 3.0804693936298795e-07, - "logits/chosen": -0.021849263459444046, - "logits/rejected": 0.05796981602907181, - "logps/chosen": -1.3069286346435547, - "logps/rejected": -1.5945860147476196, - "loss": 2.0158, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3069286346435547, - "rewards/margins": 0.2876574993133545, - "rewards/rejected": -1.5945860147476196, - "semantic_entropy": 0.7784181237220764, + "logits/chosen": -0.13629598915576935, + "logits/rejected": -0.06977993249893188, + "logps/chosen": -1.2730337381362915, + "logps/rejected": -1.475895881652832, + "loss": 1.6178, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2730337381362915, + "rewards/margins": 0.20286211371421814, + "rewards/rejected": -1.475895881652832, "step": 3715 }, { "epoch": 1.9909683893627697, - "grad_norm": 7.061494171242169, + "grad_norm": 6.970404484087072, "learning_rate": 3.066098106383826e-07, - "logits/chosen": -0.08047592639923096, - "logits/rejected": 0.008429741486907005, - "logps/chosen": -1.2842880487442017, - "logps/rejected": -1.4925073385238647, - "loss": 2.0469, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2842880487442017, - "rewards/margins": 0.20821920037269592, - "rewards/rejected": -1.4925073385238647, - "semantic_entropy": 0.7969603538513184, + "logits/chosen": -0.1679581254720688, + "logits/rejected": -0.10403795540332794, + "logps/chosen": -1.2511913776397705, + "logps/rejected": -1.4179065227508545, + "loss": 1.6271, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2511913776397705, + "rewards/margins": 0.16671538352966309, + "rewards/rejected": -1.4179065227508545, "step": 3720 }, { "epoch": 1.9936444221441714, - "grad_norm": 7.610782704184788, + "grad_norm": 6.81701280169232, "learning_rate": 3.0517455818624263e-07, - "logits/chosen": -0.13162103295326233, - "logits/rejected": -0.02476480044424534, - "logps/chosen": -1.282560110092163, - "logps/rejected": -1.5355018377304077, - "loss": 2.0228, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.282560110092163, - "rewards/margins": 0.25294169783592224, - "rewards/rejected": -1.5355018377304077, - "semantic_entropy": 0.8069452047348022, + "logits/chosen": -0.2270546406507492, + "logits/rejected": -0.14077839255332947, + "logps/chosen": -1.2551872730255127, + "logps/rejected": -1.422701120376587, + "loss": 1.6151, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2551872730255127, + "rewards/margins": 0.16751374304294586, + "rewards/rejected": -1.422701120376587, "step": 3725 }, { "epoch": 1.9963204549255729, - "grad_norm": 7.758490750103918, + "grad_norm": 7.559032307430267, "learning_rate": 3.037411959313936e-07, - "logits/chosen": -0.027306068688631058, - "logits/rejected": 0.10662344843149185, - "logps/chosen": -1.2372483015060425, - "logps/rejected": -1.51559579372406, - "loss": 1.9763, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2372483015060425, - "rewards/margins": 0.2783472537994385, - "rewards/rejected": -1.51559579372406, - "semantic_entropy": 0.8002141714096069, + "logits/chosen": -0.11934938281774521, + "logits/rejected": -0.01248091645538807, + "logps/chosen": -1.218796968460083, + "logps/rejected": -1.4315433502197266, + "loss": 1.5745, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.218796968460083, + "rewards/margins": 0.2127462923526764, + "rewards/rejected": -1.4315433502197266, "step": 3730 }, { "epoch": 1.9989964877069744, - "grad_norm": 9.839994365343232, + "grad_norm": 7.357540622090184, "learning_rate": 3.023097377803224e-07, - "logits/chosen": 0.0056734951213002205, - "logits/rejected": 0.08714814484119415, - "logps/chosen": -1.3742187023162842, - "logps/rejected": -1.5301988124847412, - "loss": 2.1023, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3742187023162842, - "rewards/margins": 0.155980184674263, - "rewards/rejected": -1.5301988124847412, - "semantic_entropy": 0.7796865105628967, + "logits/chosen": -0.09708665311336517, + "logits/rejected": -0.031124413013458252, + "logps/chosen": -1.3405152559280396, + "logps/rejected": -1.4379889965057373, + "loss": 1.6962, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3405152559280396, + "rewards/margins": 0.09747375547885895, + "rewards/rejected": -1.4379889965057373, "step": 3735 }, { "epoch": 2.001672520488376, - "grad_norm": 6.9179271221848095, + "grad_norm": 7.029834417254843, "learning_rate": 3.008801976210423e-07, - "logits/chosen": 0.021409938111901283, - "logits/rejected": 0.07372380793094635, - "logps/chosen": -1.3586982488632202, - "logps/rejected": -1.5227470397949219, - "loss": 2.0708, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3586982488632202, - "rewards/margins": 0.16404885053634644, - "rewards/rejected": -1.5227470397949219, - "semantic_entropy": 0.7766925692558289, + "logits/chosen": -0.10880253463983536, + "logits/rejected": -0.07140463590621948, + "logps/chosen": -1.327852725982666, + "logps/rejected": -1.4347646236419678, + "loss": 1.6715, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.327852725982666, + "rewards/margins": 0.10691193491220474, + "rewards/rejected": -1.4347646236419678, "step": 3740 }, { "epoch": 2.0043485532697773, - "grad_norm": 7.620126843705302, + "grad_norm": 6.6231405405746395, "learning_rate": 2.994525893229581e-07, - "logits/chosen": -0.058956682682037354, - "logits/rejected": 0.04316861927509308, - "logps/chosen": -1.3086483478546143, - "logps/rejected": -1.5185747146606445, - "loss": 2.0332, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3086483478546143, - "rewards/margins": 0.20992644131183624, - "rewards/rejected": -1.5185747146606445, - "semantic_entropy": 0.7986747622489929, + "logits/chosen": -0.16665935516357422, + "logits/rejected": -0.08508746325969696, + "logps/chosen": -1.2745071649551392, + "logps/rejected": -1.4427918195724487, + "loss": 1.6197, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2745071649551392, + "rewards/margins": 0.16828462481498718, + "rewards/rejected": -1.4427918195724487, "step": 3745 }, { "epoch": 2.007024586051179, - "grad_norm": 6.914423972752786, + "grad_norm": 6.744849193834629, "learning_rate": 2.98026926736732e-07, - "logits/chosen": -0.1123870238661766, - "logits/rejected": -0.023719770833849907, - "logps/chosen": -1.2111319303512573, - "logps/rejected": -1.542511224746704, - "loss": 1.9347, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2111319303512573, - "rewards/margins": 0.33137932419776917, - "rewards/rejected": -1.542511224746704, - "semantic_entropy": 0.8043048977851868, + "logits/chosen": -0.20406830310821533, + "logits/rejected": -0.13290412724018097, + "logps/chosen": -1.1823909282684326, + "logps/rejected": -1.4527281522750854, + "loss": 1.5256, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1823909282684326, + "rewards/margins": 0.27033716440200806, + "rewards/rejected": -1.4527281522750854, "step": 3750 }, { "epoch": 2.0097006188325808, - "grad_norm": 8.314845242126639, + "grad_norm": 7.446708091453874, "learning_rate": 2.9660322369414846e-07, - "logits/chosen": -0.06645806133747101, - "logits/rejected": 0.04517248272895813, - "logps/chosen": -1.222695231437683, - "logps/rejected": -1.6410176753997803, - "loss": 1.936, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.222695231437683, - "rewards/margins": 0.41832247376441956, - "rewards/rejected": -1.6410176753997803, - "semantic_entropy": 0.7972557544708252, + "logits/chosen": -0.1945587396621704, + "logits/rejected": -0.1164223775267601, + "logps/chosen": -1.2044284343719482, + "logps/rejected": -1.525867223739624, + "loss": 1.5394, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2044284343719482, + "rewards/margins": 0.32143890857696533, + "rewards/rejected": -1.525867223739624, "step": 3755 }, { "epoch": 2.0123766516139825, - "grad_norm": 7.978805802446787, + "grad_norm": 7.394113324521219, "learning_rate": 2.9518149400798063e-07, - "logits/chosen": -0.14016160368919373, - "logits/rejected": -0.11200448125600815, - "logps/chosen": -1.2655471563339233, - "logps/rejected": -1.6075023412704468, - "loss": 1.9572, + "logits/chosen": -0.25701871514320374, + "logits/rejected": -0.24626454710960388, + "logps/chosen": -1.243540644645691, + "logps/rejected": -1.4760215282440186, + "loss": 1.573, "rewards/accuracies": 0.625, - "rewards/chosen": -1.2655471563339233, - "rewards/margins": 0.34195518493652344, - "rewards/rejected": -1.6075023412704468, - "semantic_entropy": 0.7767778635025024, + "rewards/chosen": -1.243540644645691, + "rewards/margins": 0.23248091340065002, + "rewards/rejected": -1.4760215282440186, "step": 3760 }, { "epoch": 2.0150526843953838, - "grad_norm": 10.85433114680117, + "grad_norm": 8.634215422478055, "learning_rate": 2.9376175147185633e-07, - "logits/chosen": -0.005012214183807373, - "logits/rejected": 0.1798551231622696, - "logps/chosen": -1.259853720664978, - "logps/rejected": -1.5986053943634033, - "loss": 1.9709, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.259853720664978, - "rewards/margins": 0.3387514650821686, - "rewards/rejected": -1.5986053943634033, - "semantic_entropy": 0.8001511693000793, + "logits/chosen": -0.126735121011734, + "logits/rejected": 0.01685112714767456, + "logps/chosen": -1.2230195999145508, + "logps/rejected": -1.500044822692871, + "loss": 1.5565, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2230195999145508, + "rewards/margins": 0.27702516317367554, + "rewards/rejected": -1.500044822692871, "step": 3765 }, { "epoch": 2.0177287171767855, - "grad_norm": 10.519710493335946, + "grad_norm": 8.739521731135147, "learning_rate": 2.9234400986012376e-07, - "logits/chosen": -0.14667272567749023, - "logits/rejected": 0.024977799504995346, - "logps/chosen": -1.2061131000518799, - "logps/rejected": -1.6993818283081055, - "loss": 1.9044, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2061131000518799, - "rewards/margins": 0.4932686388492584, - "rewards/rejected": -1.6993818283081055, - "semantic_entropy": 0.7931141257286072, + "logits/chosen": -0.26435479521751404, + "logits/rejected": -0.13318143784999847, + "logps/chosen": -1.174684762954712, + "logps/rejected": -1.5840399265289307, + "loss": 1.5061, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.174684762954712, + "rewards/margins": 0.4093553125858307, + "rewards/rejected": -1.5840399265289307, "step": 3770 }, { "epoch": 2.020404749958187, - "grad_norm": 8.439243831534606, + "grad_norm": 7.887435915063563, "learning_rate": 2.9092828292771817e-07, - "logits/chosen": -0.08728514611721039, - "logits/rejected": -0.02273547649383545, - "logps/chosen": -1.2604572772979736, - "logps/rejected": -1.5512092113494873, - "loss": 1.9991, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.2604572772979736, - "rewards/margins": 0.2907518744468689, - "rewards/rejected": -1.5512092113494873, - "semantic_entropy": 0.8103083372116089, + "logits/chosen": -0.23848462104797363, + "logits/rejected": -0.19505062699317932, + "logps/chosen": -1.230447769165039, + "logps/rejected": -1.439680814743042, + "loss": 1.5973, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.230447769165039, + "rewards/margins": 0.20923320949077606, + "rewards/rejected": -1.439680814743042, "step": 3775 }, { "epoch": 2.0230807827395885, - "grad_norm": 7.982311356152901, + "grad_norm": 7.20935429995073, "learning_rate": 2.8951458441002875e-07, - "logits/chosen": -0.032814525067806244, - "logits/rejected": 0.005210143513977528, - "logps/chosen": -1.273353099822998, - "logps/rejected": -1.5685460567474365, - "loss": 2.0184, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.273353099822998, - "rewards/margins": 0.2951931059360504, - "rewards/rejected": -1.5685460567474365, - "semantic_entropy": 0.7983887195587158, + "logits/chosen": -0.11859314143657684, + "logits/rejected": -0.09189174324274063, + "logps/chosen": -1.2536604404449463, + "logps/rejected": -1.4843485355377197, + "loss": 1.6231, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2536604404449463, + "rewards/margins": 0.2306881844997406, + "rewards/rejected": -1.4843485355377197, "step": 3780 }, { "epoch": 2.02575681552099, - "grad_norm": 7.081907610905893, + "grad_norm": 6.542434525353967, "learning_rate": 2.881029280227643e-07, - "logits/chosen": -0.09091036021709442, - "logits/rejected": 0.035645000636577606, - "logps/chosen": -1.296726942062378, - "logps/rejected": -1.6826658248901367, - "loss": 1.9666, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.296726942062378, - "rewards/margins": 0.3859389126300812, - "rewards/rejected": -1.6826658248901367, - "semantic_entropy": 0.772459864616394, + "logits/chosen": -0.17586956918239594, + "logits/rejected": -0.07927899062633514, + "logps/chosen": -1.271405577659607, + "logps/rejected": -1.5665825605392456, + "loss": 1.5756, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.271405577659607, + "rewards/margins": 0.29517701268196106, + "rewards/rejected": -1.5665825605392456, "step": 3785 }, { "epoch": 2.028432848302392, - "grad_norm": 6.553036872618776, + "grad_norm": 6.2583841886826175, "learning_rate": 2.8669332746182177e-07, - "logits/chosen": -0.1433906853199005, - "logits/rejected": 0.03814217075705528, - "logps/chosen": -1.2487014532089233, - "logps/rejected": -1.595126748085022, - "loss": 1.9387, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2487014532089233, - "rewards/margins": 0.34642526507377625, - "rewards/rejected": -1.595126748085022, - "semantic_entropy": 0.7828398942947388, + "logits/chosen": -0.24330580234527588, + "logits/rejected": -0.0978088229894638, + "logps/chosen": -1.2277387380599976, + "logps/rejected": -1.4597686529159546, + "loss": 1.5563, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2277387380599976, + "rewards/margins": 0.23202982544898987, + "rewards/rejected": -1.4597686529159546, "step": 3790 }, { "epoch": 2.031108881083793, - "grad_norm": 6.515967009686399, + "grad_norm": 6.445583706030623, "learning_rate": 2.8528579640315156e-07, - "logits/chosen": -0.08073130995035172, - "logits/rejected": -0.052123237401247025, - "logps/chosen": -1.2313802242279053, - "logps/rejected": -1.4814410209655762, - "loss": 1.9817, + "logits/chosen": -0.17014442384243011, + "logits/rejected": -0.15202614665031433, + "logps/chosen": -1.2047111988067627, + "logps/rejected": -1.3954555988311768, + "loss": 1.5703, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2313802242279053, - "rewards/margins": 0.2500608563423157, - "rewards/rejected": -1.4814410209655762, - "semantic_entropy": 0.8185272216796875, + "rewards/chosen": -1.2047111988067627, + "rewards/margins": 0.1907445192337036, + "rewards/rejected": -1.3954555988311768, "step": 3795 }, { "epoch": 2.033784913865195, - "grad_norm": 9.658581177515622, + "grad_norm": 8.38716639275352, "learning_rate": 2.8388034850262646e-07, - "logits/chosen": -0.048771053552627563, - "logits/rejected": 0.08041323721408844, - "logps/chosen": -1.3159829378128052, - "logps/rejected": -1.6725927591323853, - "loss": 2.0205, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3159829378128052, - "rewards/margins": 0.35660985112190247, - "rewards/rejected": -1.6725927591323853, - "semantic_entropy": 0.7742539644241333, + "logits/chosen": -0.16343332827091217, + "logits/rejected": -0.06480084359645844, + "logps/chosen": -1.286795973777771, + "logps/rejected": -1.5521700382232666, + "loss": 1.6305, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.286795973777771, + "rewards/margins": 0.2653741240501404, + "rewards/rejected": -1.5521700382232666, "step": 3800 }, { "epoch": 2.0364609466465966, - "grad_norm": 13.080769815083926, + "grad_norm": 10.866345322456871, "learning_rate": 2.824769973959079e-07, - "logits/chosen": -0.02432123012840748, - "logits/rejected": 0.1006104126572609, - "logps/chosen": -1.2159243822097778, - "logps/rejected": -1.528954267501831, - "loss": 1.9372, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2159243822097778, - "rewards/margins": 0.31303003430366516, - "rewards/rejected": -1.528954267501831, - "semantic_entropy": 0.8056100010871887, + "logits/chosen": -0.14301295578479767, + "logits/rejected": -0.05214305594563484, + "logps/chosen": -1.190725564956665, + "logps/rejected": -1.431903600692749, + "loss": 1.534, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.190725564956665, + "rewards/margins": 0.24117796123027802, + "rewards/rejected": -1.431903600692749, "step": 3805 }, { "epoch": 2.039136979427998, - "grad_norm": 8.574320370782875, + "grad_norm": 7.7262644544689145, "learning_rate": 2.81075756698315e-07, - "logits/chosen": 0.03976669907569885, - "logits/rejected": 0.1289629340171814, - "logps/chosen": -1.2397609949111938, - "logps/rejected": -1.5856821537017822, - "loss": 1.9553, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2397609949111938, - "rewards/margins": 0.34592103958129883, - "rewards/rejected": -1.5856821537017822, - "semantic_entropy": 0.7904757261276245, + "logits/chosen": -0.07260935008525848, + "logits/rejected": 0.004268960561603308, + "logps/chosen": -1.2204227447509766, + "logps/rejected": -1.4730734825134277, + "loss": 1.5652, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2204227447509766, + "rewards/margins": 0.2526509761810303, + "rewards/rejected": -1.4730734825134277, "step": 3810 }, { "epoch": 2.0418130122093996, - "grad_norm": 7.864942736987948, + "grad_norm": 7.418071761914062, "learning_rate": 2.7967664000469035e-07, - "logits/chosen": -0.17100025713443756, - "logits/rejected": -0.03756319358944893, - "logps/chosen": -1.292011022567749, - "logps/rejected": -1.5119432210922241, - "loss": 2.0294, + "logits/chosen": -0.2659146189689636, + "logits/rejected": -0.15749691426753998, + "logps/chosen": -1.2592390775680542, + "logps/rejected": -1.4130852222442627, + "loss": 1.6185, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.292011022567749, - "rewards/margins": 0.21993222832679749, - "rewards/rejected": -1.5119432210922241, - "semantic_entropy": 0.7964569926261902, + "rewards/chosen": -1.2592390775680542, + "rewards/margins": 0.1538461297750473, + "rewards/rejected": -1.4130852222442627, "step": 3815 }, { "epoch": 2.0444890449908013, - "grad_norm": 8.272492234979593, + "grad_norm": 9.265432079063835, "learning_rate": 2.7827966088927095e-07, - "logits/chosen": -0.17397232353687286, - "logits/rejected": 0.03040078654885292, - "logps/chosen": -1.3632009029388428, - "logps/rejected": -1.5390888452529907, - "loss": 2.0824, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3632009029388428, - "rewards/margins": 0.1758880466222763, - "rewards/rejected": -1.5390888452529907, - "semantic_entropy": 0.7751861214637756, + "logits/chosen": -0.23816660046577454, + "logits/rejected": -0.06556002795696259, + "logps/chosen": -1.3202743530273438, + "logps/rejected": -1.4127553701400757, + "loss": 1.6821, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.3202743530273438, + "rewards/margins": 0.09248095750808716, + "rewards/rejected": -1.4127553701400757, "step": 3820 }, { "epoch": 2.0471650777722026, - "grad_norm": 9.290254425431332, + "grad_norm": 8.43824883139852, "learning_rate": 2.768848329055538e-07, - "logits/chosen": -0.12546177208423615, - "logits/rejected": -0.009615510702133179, - "logps/chosen": -1.2498613595962524, - "logps/rejected": -1.4855058193206787, - "loss": 1.9852, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2498613595962524, - "rewards/margins": 0.2356446087360382, - "rewards/rejected": -1.4855058193206787, - "semantic_entropy": 0.8016610145568848, + "logits/chosen": -0.19324809312820435, + "logits/rejected": -0.09612785279750824, + "logps/chosen": -1.2284154891967773, + "logps/rejected": -1.3744738101959229, + "loss": 1.5943, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2284154891967773, + "rewards/margins": 0.1460580974817276, + "rewards/rejected": -1.3744738101959229, "step": 3825 }, { "epoch": 2.0498411105536043, - "grad_norm": 8.589930136904933, + "grad_norm": 8.457060541067369, "learning_rate": 2.7549216958616657e-07, - "logits/chosen": -0.17075549066066742, - "logits/rejected": -0.013943374156951904, - "logps/chosen": -1.316281795501709, - "logps/rejected": -1.6756961345672607, - "loss": 1.9949, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.316281795501709, - "rewards/margins": 0.3594144880771637, - "rewards/rejected": -1.6756961345672607, - "semantic_entropy": 0.7636662721633911, + "logits/chosen": -0.25151169300079346, + "logits/rejected": -0.13039612770080566, + "logps/chosen": -1.2907540798187256, + "logps/rejected": -1.5371133089065552, + "loss": 1.6136, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2907540798187256, + "rewards/margins": 0.24635927379131317, + "rewards/rejected": -1.5371133089065552, "step": 3830 }, { "epoch": 2.052517143335006, - "grad_norm": 8.141804117372102, + "grad_norm": 7.365987381138891, "learning_rate": 2.741016844427344e-07, - "logits/chosen": -0.07137881219387054, - "logits/rejected": 0.0819416269659996, - "logps/chosen": -1.3067326545715332, - "logps/rejected": -1.5788406133651733, - "loss": 2.0157, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3067326545715332, - "rewards/margins": 0.27210795879364014, - "rewards/rejected": -1.5788406133651733, - "semantic_entropy": 0.7795847654342651, + "logits/chosen": -0.17692366242408752, + "logits/rejected": -0.0669875293970108, + "logps/chosen": -1.2781174182891846, + "logps/rejected": -1.4772151708602905, + "loss": 1.6226, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2781174182891846, + "rewards/margins": 0.1990976482629776, + "rewards/rejected": -1.4772151708602905, "step": 3835 }, { "epoch": 2.0551931761164073, - "grad_norm": 8.785325832990798, + "grad_norm": 8.056718284287626, "learning_rate": 2.7271339096575073e-07, - "logits/chosen": -0.03475064039230347, - "logits/rejected": 0.07072875648736954, - "logps/chosen": -1.2127859592437744, - "logps/rejected": -1.6222158670425415, - "loss": 1.9293, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2127859592437744, - "rewards/margins": 0.4094300866127014, - "rewards/rejected": -1.6222158670425415, - "semantic_entropy": 0.8095159530639648, + "logits/chosen": -0.12126553058624268, + "logits/rejected": -0.04391341656446457, + "logps/chosen": -1.1803662776947021, + "logps/rejected": -1.5080560445785522, + "loss": 1.5237, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1803662776947021, + "rewards/margins": 0.3276898264884949, + "rewards/rejected": -1.5080560445785522, "step": 3840 }, { "epoch": 2.057869208897809, - "grad_norm": 7.770262670893247, + "grad_norm": 7.159079293202307, "learning_rate": 2.713273026244446e-07, - "logits/chosen": -0.2322789430618286, - "logits/rejected": 0.011026580817997456, - "logps/chosen": -1.3297617435455322, - "logps/rejected": -1.628687858581543, - "loss": 2.0274, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3297617435455322, - "rewards/margins": 0.29892611503601074, - "rewards/rejected": -1.628687858581543, - "semantic_entropy": 0.7731846570968628, + "logits/chosen": -0.30405330657958984, + "logits/rejected": -0.1049649715423584, + "logps/chosen": -1.300267219543457, + "logps/rejected": -1.5288124084472656, + "loss": 1.634, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.300267219543457, + "rewards/margins": 0.22854533791542053, + "rewards/rejected": -1.5288124084472656, "step": 3845 }, { "epoch": 2.0605452416792107, - "grad_norm": 8.270676971050367, + "grad_norm": 7.744160416002481, "learning_rate": 2.6994343286665156e-07, - "logits/chosen": -0.08706966042518616, - "logits/rejected": 0.0894269123673439, - "logps/chosen": -1.3270047903060913, - "logps/rejected": -1.5941104888916016, - "loss": 2.0329, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3270047903060913, - "rewards/margins": 0.26710575819015503, - "rewards/rejected": -1.5941104888916016, - "semantic_entropy": 0.7839833498001099, + "logits/chosen": -0.18721036612987518, + "logits/rejected": -0.05016489699482918, + "logps/chosen": -1.2924706935882568, + "logps/rejected": -1.4859955310821533, + "loss": 1.6305, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2924706935882568, + "rewards/margins": 0.19352486729621887, + "rewards/rejected": -1.4859955310821533, "step": 3850 }, { "epoch": 2.063221274460612, - "grad_norm": 9.677987341117074, + "grad_norm": 7.293770790685207, "learning_rate": 2.6856179511868156e-07, - "logits/chosen": -0.05726304650306702, - "logits/rejected": 0.13342034816741943, - "logps/chosen": -1.2690484523773193, - "logps/rejected": -1.6873533725738525, - "loss": 1.9796, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2690484523773193, - "rewards/margins": 0.4183047413825989, - "rewards/rejected": -1.6873533725738525, - "semantic_entropy": 0.7959326505661011, + "logits/chosen": -0.15263861417770386, + "logits/rejected": -9.13009062060155e-05, + "logps/chosen": -1.2374870777130127, + "logps/rejected": -1.5334017276763916, + "loss": 1.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2374870777130127, + "rewards/margins": 0.29591476917266846, + "rewards/rejected": -1.5334017276763916, "step": 3855 }, { "epoch": 2.0658973072420137, - "grad_norm": 6.222363162895002, + "grad_norm": 6.212222071720489, "learning_rate": 2.6718240278519056e-07, - "logits/chosen": -0.08423185348510742, - "logits/rejected": 0.08383387327194214, - "logps/chosen": -1.2890822887420654, - "logps/rejected": -1.6258800029754639, - "loss": 2.0136, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2890822887420654, - "rewards/margins": 0.3367977738380432, - "rewards/rejected": -1.6258800029754639, - "semantic_entropy": 0.7936300039291382, + "logits/chosen": -0.18352673947811127, + "logits/rejected": -0.05127815529704094, + "logps/chosen": -1.2543615102767944, + "logps/rejected": -1.4569965600967407, + "loss": 1.6082, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2543615102767944, + "rewards/margins": 0.2026350051164627, + "rewards/rejected": -1.4569965600967407, "step": 3860 }, { "epoch": 2.0685733400234154, - "grad_norm": 11.215212145108538, + "grad_norm": 10.837827872231681, "learning_rate": 2.6580526924904866e-07, - "logits/chosen": -0.1955564320087433, - "logits/rejected": 0.0005769982817582786, - "logps/chosen": -1.325304388999939, - "logps/rejected": -1.5285425186157227, - "loss": 2.0362, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.325304388999939, - "rewards/margins": 0.2032381296157837, - "rewards/rejected": -1.5285425186157227, - "semantic_entropy": 0.7790963649749756, + "logits/chosen": -0.2902659475803375, + "logits/rejected": -0.13155676424503326, + "logps/chosen": -1.291379690170288, + "logps/rejected": -1.418097734451294, + "loss": 1.6395, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.291379690170288, + "rewards/margins": 0.12671804428100586, + "rewards/rejected": -1.418097734451294, "step": 3865 }, { "epoch": 2.0712493728048167, - "grad_norm": 9.725281583079685, + "grad_norm": 8.955970485936408, "learning_rate": 2.6443040787121186e-07, - "logits/chosen": -0.10238544642925262, - "logits/rejected": -0.04848577454686165, - "logps/chosen": -1.1635031700134277, - "logps/rejected": -1.4549726247787476, - "loss": 1.9292, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.1635031700134277, - "rewards/margins": 0.2914695143699646, - "rewards/rejected": -1.4549726247787476, - "semantic_entropy": 0.8310944437980652, + "logits/chosen": -0.2045224905014038, + "logits/rejected": -0.18195459246635437, + "logps/chosen": -1.1399279832839966, + "logps/rejected": -1.3327436447143555, + "loss": 1.5193, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1399279832839966, + "rewards/margins": 0.19281557202339172, + "rewards/rejected": -1.3327436447143555, "step": 3870 }, { "epoch": 2.0739254055862184, - "grad_norm": 7.543625625162968, + "grad_norm": 6.872285912032201, "learning_rate": 2.6305783199059084e-07, - "logits/chosen": -0.10998735576868057, - "logits/rejected": 0.00942229200154543, - "logps/chosen": -1.263076901435852, - "logps/rejected": -1.5884946584701538, - "loss": 1.9641, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.263076901435852, - "rewards/margins": 0.3254178464412689, - "rewards/rejected": -1.5884946584701538, - "semantic_entropy": 0.7829529047012329, + "logits/chosen": -0.23911531269550323, + "logits/rejected": -0.13421085476875305, + "logps/chosen": -1.2303617000579834, + "logps/rejected": -1.4737814664840698, + "loss": 1.569, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2303617000579834, + "rewards/margins": 0.24341964721679688, + "rewards/rejected": -1.4737814664840698, "step": 3875 }, { "epoch": 2.07660143836762, - "grad_norm": 7.455548456156887, + "grad_norm": 6.797546616721448, "learning_rate": 2.6168755492392324e-07, - "logits/chosen": -0.11983096599578857, - "logits/rejected": 0.05408283323049545, - "logps/chosen": -1.1768678426742554, - "logps/rejected": -1.528571367263794, - "loss": 1.911, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1768678426742554, - "rewards/margins": 0.3517035245895386, - "rewards/rejected": -1.528571367263794, - "semantic_entropy": 0.8298446536064148, + "logits/chosen": -0.23981383442878723, + "logits/rejected": -0.11860020458698273, + "logps/chosen": -1.1498663425445557, + "logps/rejected": -1.397800087928772, + "loss": 1.4988, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1498663425445557, + "rewards/margins": 0.24793359637260437, + "rewards/rejected": -1.397800087928772, "step": 3880 }, { "epoch": 2.0792774711490214, - "grad_norm": 8.413218220585827, + "grad_norm": 7.350912931287538, "learning_rate": 2.6031958996564274e-07, - "logits/chosen": -0.15878412127494812, - "logits/rejected": -0.00622421782463789, - "logps/chosen": -1.2211363315582275, - "logps/rejected": -1.6070476770401, - "loss": 1.9309, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2211363315582275, - "rewards/margins": 0.3859114646911621, - "rewards/rejected": -1.6070476770401, - "semantic_entropy": 0.8035030364990234, + "logits/chosen": -0.24004165828227997, + "logits/rejected": -0.11813943088054657, + "logps/chosen": -1.1958129405975342, + "logps/rejected": -1.4676146507263184, + "loss": 1.5325, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.1958129405975342, + "rewards/margins": 0.271801620721817, + "rewards/rejected": -1.4676146507263184, "step": 3885 }, { "epoch": 2.081953503930423, - "grad_norm": 11.64726487557045, + "grad_norm": 10.665938862076626, "learning_rate": 2.589539503877518e-07, - "logits/chosen": -0.03869933634996414, - "logits/rejected": 0.05498753860592842, - "logps/chosen": -1.2614524364471436, - "logps/rejected": -1.5560925006866455, - "loss": 2.0024, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2614524364471436, - "rewards/margins": 0.29464003443717957, - "rewards/rejected": -1.5560925006866455, - "semantic_entropy": 0.7954789400100708, + "logits/chosen": -0.16734269261360168, + "logits/rejected": -0.09549416601657867, + "logps/chosen": -1.236961841583252, + "logps/rejected": -1.4197896718978882, + "loss": 1.6023, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.236961841583252, + "rewards/margins": 0.1828276813030243, + "rewards/rejected": -1.4197896718978882, "step": 3890 }, { "epoch": 2.084629536711825, - "grad_norm": 10.412752082048966, + "grad_norm": 9.847712661459095, "learning_rate": 2.5759064943969125e-07, - "logits/chosen": -0.11886237561702728, - "logits/rejected": 0.12702760100364685, - "logps/chosen": -1.2422895431518555, - "logps/rejected": -1.584733247756958, - "loss": 1.9709, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2422895431518555, - "rewards/margins": 0.3424435257911682, - "rewards/rejected": -1.584733247756958, - "semantic_entropy": 0.8005902171134949, + "logits/chosen": -0.2093161791563034, + "logits/rejected": -0.008092102594673634, + "logps/chosen": -1.2099231481552124, + "logps/rejected": -1.436281681060791, + "loss": 1.5682, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2099231481552124, + "rewards/margins": 0.2263585776090622, + "rewards/rejected": -1.436281681060791, "step": 3895 }, { "epoch": 2.087305569493226, - "grad_norm": 8.46319246124346, + "grad_norm": 8.457076838996455, "learning_rate": 2.562297003482131e-07, - "logits/chosen": 0.023574665188789368, - "logits/rejected": 0.03584318235516548, - "logps/chosen": -1.2451503276824951, - "logps/rejected": -1.533006191253662, - "loss": 1.9799, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2451503276824951, - "rewards/margins": 0.28785592317581177, - "rewards/rejected": -1.533006191253662, - "semantic_entropy": 0.8111165165901184, + "logits/chosen": -0.08073611557483673, + "logits/rejected": -0.08362691849470139, + "logps/chosen": -1.2163944244384766, + "logps/rejected": -1.431199073791504, + "loss": 1.5725, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2163944244384766, + "rewards/margins": 0.21480469405651093, + "rewards/rejected": -1.431199073791504, "step": 3900 }, { "epoch": 2.089981602274628, - "grad_norm": 8.304832510053231, + "grad_norm": 7.488957696280318, "learning_rate": 2.548711163172512e-07, - "logits/chosen": -0.05365335941314697, - "logits/rejected": 0.048491910099983215, - "logps/chosen": -1.2720180749893188, - "logps/rejected": -1.5300616025924683, - "loss": 1.9934, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2720180749893188, - "rewards/margins": 0.2580435872077942, - "rewards/rejected": -1.5300616025924683, - "semantic_entropy": 0.7993375062942505, + "logits/chosen": -0.16372177004814148, + "logits/rejected": -0.07514110952615738, + "logps/chosen": -1.2316033840179443, + "logps/rejected": -1.400773525238037, + "loss": 1.5816, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2316033840179443, + "rewards/margins": 0.16917003691196442, + "rewards/rejected": -1.400773525238037, "step": 3905 }, { "epoch": 2.0926576350560295, - "grad_norm": 7.208604172092762, + "grad_norm": 6.794047364268938, "learning_rate": 2.53514910527794e-07, - "logits/chosen": -0.007045459933578968, - "logits/rejected": 0.12363393604755402, - "logps/chosen": -1.1774392127990723, - "logps/rejected": -1.5029500722885132, - "loss": 1.9469, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.1774392127990723, - "rewards/margins": 0.3255109488964081, - "rewards/rejected": -1.5029500722885132, - "semantic_entropy": 0.8183084726333618, + "logits/chosen": -0.1059347540140152, + "logits/rejected": -0.006711071822792292, + "logps/chosen": -1.1519787311553955, + "logps/rejected": -1.3930784463882446, + "loss": 1.525, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1519787311553955, + "rewards/margins": 0.24109964072704315, + "rewards/rejected": -1.3930784463882446, "step": 3910 }, { "epoch": 2.095333667837431, - "grad_norm": 6.02671830959697, + "grad_norm": 5.697122413335748, "learning_rate": 2.5216109613775573e-07, - "logits/chosen": -0.0878453403711319, - "logits/rejected": 0.0802905336022377, - "logps/chosen": -1.295917272567749, - "logps/rejected": -1.6467256546020508, - "loss": 1.9905, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.295917272567749, - "rewards/margins": 0.3508082628250122, - "rewards/rejected": -1.6467256546020508, - "semantic_entropy": 0.792289137840271, + "logits/chosen": -0.2032475471496582, + "logits/rejected": -0.06592769175767899, + "logps/chosen": -1.2571234703063965, + "logps/rejected": -1.537047266960144, + "loss": 1.5779, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2571234703063965, + "rewards/margins": 0.2799237072467804, + "rewards/rejected": -1.537047266960144, "step": 3915 }, { "epoch": 2.0980097006188325, - "grad_norm": 8.246722913305083, + "grad_norm": 6.725268181910343, "learning_rate": 2.5080968628184993e-07, - "logits/chosen": -0.08657468855381012, - "logits/rejected": 0.0812126100063324, - "logps/chosen": -1.300894856452942, - "logps/rejected": -1.7418267726898193, - "loss": 1.9738, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.300894856452942, - "rewards/margins": 0.4409319758415222, - "rewards/rejected": -1.7418267726898193, - "semantic_entropy": 0.7787320017814636, + "logits/chosen": -0.2132866084575653, + "logits/rejected": -0.07607977092266083, + "logps/chosen": -1.2650877237319946, + "logps/rejected": -1.5556974411010742, + "loss": 1.5755, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2650877237319946, + "rewards/margins": 0.2906096875667572, + "rewards/rejected": -1.5556974411010742, "step": 3920 }, { "epoch": 2.1006857334002342, - "grad_norm": 9.365923612794083, + "grad_norm": 7.726090458805358, "learning_rate": 2.494606940714605e-07, - "logits/chosen": -0.08720335364341736, - "logits/rejected": 0.03497013822197914, - "logps/chosen": -1.2213890552520752, - "logps/rejected": -1.5773426294326782, - "loss": 1.9608, + "logits/chosen": -0.20984609425067902, + "logits/rejected": -0.11176743358373642, + "logps/chosen": -1.1926872730255127, + "logps/rejected": -1.454128623008728, + "loss": 1.5672, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2213890552520752, - "rewards/margins": 0.3559534549713135, - "rewards/rejected": -1.5773426294326782, - "semantic_entropy": 0.787868320941925, + "rewards/chosen": -1.1926872730255127, + "rewards/margins": 0.2614414095878601, + "rewards/rejected": -1.454128623008728, "step": 3925 }, { "epoch": 2.103361766181636, - "grad_norm": 6.485820842635789, + "grad_norm": 6.319357466196435, "learning_rate": 2.4811413259451625e-07, - "logits/chosen": -0.1779341697692871, - "logits/rejected": -0.029109278693795204, - "logps/chosen": -1.3027786016464233, - "logps/rejected": -1.5649641752243042, - "loss": 2.0159, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3027786016464233, - "rewards/margins": 0.2621855139732361, - "rewards/rejected": -1.5649641752243042, - "semantic_entropy": 0.784981369972229, + "logits/chosen": -0.287931889295578, + "logits/rejected": -0.1848205327987671, + "logps/chosen": -1.278119444847107, + "logps/rejected": -1.4347612857818604, + "loss": 1.6255, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.278119444847107, + "rewards/margins": 0.15664172172546387, + "rewards/rejected": -1.4347612857818604, "step": 3930 }, { "epoch": 2.106037798963037, - "grad_norm": 7.57817047099227, + "grad_norm": 7.167238920674248, "learning_rate": 2.46770014915362e-07, - "logits/chosen": -0.023013921454548836, - "logits/rejected": 0.04001970216631889, - "logps/chosen": -1.2584831714630127, - "logps/rejected": -1.6443601846694946, - "loss": 1.9534, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2584831714630127, - "rewards/margins": 0.38587701320648193, - "rewards/rejected": -1.6443601846694946, - "semantic_entropy": 0.7898402810096741, + "logits/chosen": -0.12809686362743378, + "logits/rejected": -0.0829991027712822, + "logps/chosen": -1.2316675186157227, + "logps/rejected": -1.500192403793335, + "loss": 1.5574, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2316675186157227, + "rewards/margins": 0.26852482557296753, + "rewards/rejected": -1.500192403793335, "step": 3935 }, { "epoch": 2.108713831744439, - "grad_norm": 11.18516150542784, + "grad_norm": 9.086017638738277, "learning_rate": 2.45428354074634e-07, - "logits/chosen": -0.04282800853252411, - "logits/rejected": 0.014397243969142437, - "logps/chosen": -1.1761744022369385, - "logps/rejected": -1.6283648014068604, - "loss": 1.8839, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1761744022369385, - "rewards/margins": 0.45219022035598755, - "rewards/rejected": -1.6283648014068604, - "semantic_entropy": 0.8094109296798706, + "logits/chosen": -0.16330906748771667, + "logits/rejected": -0.1318950206041336, + "logps/chosen": -1.1459821462631226, + "logps/rejected": -1.5246264934539795, + "loss": 1.4668, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1459821462631226, + "rewards/margins": 0.3786444365978241, + "rewards/rejected": -1.5246264934539795, "step": 3940 }, { "epoch": 2.1113898645258407, - "grad_norm": 9.02202970388384, + "grad_norm": 7.908057122641015, "learning_rate": 2.4408916308913105e-07, - "logits/chosen": -0.06121363118290901, - "logits/rejected": 0.11865203082561493, - "logps/chosen": -1.308318018913269, - "logps/rejected": -1.5495134592056274, - "loss": 2.0296, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.308318018913269, - "rewards/margins": 0.2411954700946808, - "rewards/rejected": -1.5495134592056274, - "semantic_entropy": 0.7808352112770081, + "logits/chosen": -0.17196539044380188, + "logits/rejected": -0.023234616965055466, + "logps/chosen": -1.2766989469528198, + "logps/rejected": -1.4319767951965332, + "loss": 1.6312, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2766989469528198, + "rewards/margins": 0.15527768433094025, + "rewards/rejected": -1.4319767951965332, "step": 3945 }, { "epoch": 2.114065897307242, - "grad_norm": 10.43072144089605, + "grad_norm": 9.871413993500036, "learning_rate": 2.4275245495169025e-07, - "logits/chosen": 0.00885589700192213, - "logits/rejected": 0.17541439831256866, - "logps/chosen": -1.2490864992141724, - "logps/rejected": -1.4956276416778564, - "loss": 2.0013, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2490864992141724, - "rewards/margins": 0.2465410977602005, - "rewards/rejected": -1.4956276416778564, - "semantic_entropy": 0.806951642036438, + "logits/chosen": -0.11495999246835709, + "logits/rejected": 0.016498660668730736, + "logps/chosen": -1.2260228395462036, + "logps/rejected": -1.3812894821166992, + "loss": 1.6032, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2260228395462036, + "rewards/margins": 0.155266672372818, + "rewards/rejected": -1.3812894821166992, "step": 3950 }, { "epoch": 2.1167419300886436, - "grad_norm": 8.73901614305497, + "grad_norm": 7.83555448376319, "learning_rate": 2.414182426310597e-07, - "logits/chosen": -0.1357141137123108, - "logits/rejected": -0.04953531548380852, - "logps/chosen": -1.2262976169586182, - "logps/rejected": -1.5870901346206665, - "loss": 1.9462, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2262976169586182, - "rewards/margins": 0.36079269647598267, - "rewards/rejected": -1.5870901346206665, - "semantic_entropy": 0.7998301982879639, + "logits/chosen": -0.23881487548351288, + "logits/rejected": -0.16763314604759216, + "logps/chosen": -1.190349817276001, + "logps/rejected": -1.4223787784576416, + "loss": 1.5393, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.190349817276001, + "rewards/margins": 0.23202869296073914, + "rewards/rejected": -1.4223787784576416, "step": 3955 }, { "epoch": 2.1194179628700454, - "grad_norm": 13.96331969015264, + "grad_norm": 12.022333269502365, "learning_rate": 2.400865390717734e-07, - "logits/chosen": -0.04178817197680473, - "logits/rejected": 0.07974360138177872, - "logps/chosen": -1.2890770435333252, - "logps/rejected": -1.7201932668685913, - "loss": 1.9744, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2890770435333252, - "rewards/margins": 0.4311162531375885, - "rewards/rejected": -1.7201932668685913, - "semantic_entropy": 0.7699192762374878, + "logits/chosen": -0.16634608805179596, + "logits/rejected": -0.06972671300172806, + "logps/chosen": -1.260103702545166, + "logps/rejected": -1.561883807182312, + "loss": 1.5905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.260103702545166, + "rewards/margins": 0.3017801344394684, + "rewards/rejected": -1.561883807182312, "step": 3960 }, { "epoch": 2.1220939956514466, - "grad_norm": 8.690446841300258, + "grad_norm": 7.107186437787189, "learning_rate": 2.3875735719402475e-07, - "logits/chosen": 0.002413132693618536, - "logits/rejected": 0.11878538131713867, - "logps/chosen": -1.197887897491455, - "logps/rejected": -1.5999175310134888, - "loss": 1.9194, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.197887897491455, - "rewards/margins": 0.40202951431274414, - "rewards/rejected": -1.5999175310134888, - "semantic_entropy": 0.8078168630599976, + "logits/chosen": -0.11475582420825958, + "logits/rejected": -0.02629752829670906, + "logps/chosen": -1.1744776964187622, + "logps/rejected": -1.4737120866775513, + "loss": 1.5135, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1744776964187622, + "rewards/margins": 0.29923444986343384, + "rewards/rejected": -1.4737120866775513, "step": 3965 }, { "epoch": 2.1247700284328483, - "grad_norm": 7.546364801753625, + "grad_norm": 7.131379407199912, "learning_rate": 2.3743070989354258e-07, - "logits/chosen": -0.05478797107934952, - "logits/rejected": 0.040621694177389145, - "logps/chosen": -1.2546120882034302, - "logps/rejected": -1.6230491399765015, - "loss": 1.9688, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2546120882034302, - "rewards/margins": 0.3684369921684265, - "rewards/rejected": -1.6230491399765015, - "semantic_entropy": 0.7908979058265686, + "logits/chosen": -0.1745968759059906, + "logits/rejected": -0.09987455606460571, + "logps/chosen": -1.2226725816726685, + "logps/rejected": -1.507332444190979, + "loss": 1.5716, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2226725816726685, + "rewards/margins": 0.28465989232063293, + "rewards/rejected": -1.507332444190979, "step": 3970 }, { "epoch": 2.12744606121425, - "grad_norm": 7.943490277866662, + "grad_norm": 7.021912184076167, "learning_rate": 2.3610661004146454e-07, - "logits/chosen": -0.035158295184373856, - "logits/rejected": 0.0653868168592453, - "logps/chosen": -1.1693830490112305, - "logps/rejected": -1.4864637851715088, - "loss": 1.9245, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1693830490112305, - "rewards/margins": 0.3170807361602783, - "rewards/rejected": -1.4864637851715088, - "semantic_entropy": 0.8256049156188965, + "logits/chosen": -0.11084862053394318, + "logits/rejected": -0.042972978204488754, + "logps/chosen": -1.1444134712219238, + "logps/rejected": -1.4090802669525146, + "loss": 1.5116, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1444134712219238, + "rewards/margins": 0.26466673612594604, + "rewards/rejected": -1.4090802669525146, "step": 3975 }, { "epoch": 2.1301220939956513, - "grad_norm": 9.99264274060387, + "grad_norm": 8.622696824811587, "learning_rate": 2.3478507048421314e-07, - "logits/chosen": -0.11048316955566406, - "logits/rejected": -0.030838940292596817, - "logps/chosen": -1.1681926250457764, - "logps/rejected": -1.6157619953155518, - "loss": 1.892, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1681926250457764, - "rewards/margins": 0.4475693702697754, - "rewards/rejected": -1.6157619953155518, - "semantic_entropy": 0.8003697395324707, + "logits/chosen": -0.2279435098171234, + "logits/rejected": -0.17337724566459656, + "logps/chosen": -1.1432182788848877, + "logps/rejected": -1.488876223564148, + "loss": 1.4881, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1432182788848877, + "rewards/margins": 0.3456578254699707, + "rewards/rejected": -1.488876223564148, "step": 3980 }, { "epoch": 2.132798126777053, - "grad_norm": 14.451761814862564, + "grad_norm": 11.351056337593837, "learning_rate": 2.334661040433713e-07, - "logits/chosen": -0.1646009385585785, - "logits/rejected": -0.05796453356742859, - "logps/chosen": -1.2593486309051514, - "logps/rejected": -1.6181026697158813, - "loss": 1.9615, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2593486309051514, - "rewards/margins": 0.3587539792060852, - "rewards/rejected": -1.6181026697158813, - "semantic_entropy": 0.7864263653755188, + "logits/chosen": -0.26267361640930176, + "logits/rejected": -0.18066450953483582, + "logps/chosen": -1.2176510095596313, + "logps/rejected": -1.4880212545394897, + "loss": 1.5592, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2176510095596313, + "rewards/margins": 0.27037036418914795, + "rewards/rejected": -1.4880212545394897, "step": 3985 }, { "epoch": 2.1354741595584548, - "grad_norm": 8.209541807724527, + "grad_norm": 7.191875819198297, "learning_rate": 2.321497235155568e-07, - "logits/chosen": -0.18386468291282654, - "logits/rejected": -0.04220528155565262, - "logps/chosen": -1.205427885055542, - "logps/rejected": -1.552451252937317, - "loss": 1.9238, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.205427885055542, - "rewards/margins": 0.3470233678817749, - "rewards/rejected": -1.552451252937317, - "semantic_entropy": 0.8057225942611694, + "logits/chosen": -0.2620295584201813, + "logits/rejected": -0.1543290913105011, + "logps/chosen": -1.1899316310882568, + "logps/rejected": -1.4223182201385498, + "loss": 1.5356, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1899316310882568, + "rewards/margins": 0.23238661885261536, + "rewards/rejected": -1.4223182201385498, "step": 3990 }, { "epoch": 2.138150192339856, - "grad_norm": 13.990666954895628, + "grad_norm": 7.5889949335374345, "learning_rate": 2.3083594167229965e-07, - "logits/chosen": -0.22056451439857483, - "logits/rejected": 0.03705238923430443, - "logps/chosen": -1.2569921016693115, - "logps/rejected": -1.6521209478378296, - "loss": 1.9669, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2569921016693115, - "rewards/margins": 0.3951290249824524, - "rewards/rejected": -1.6521209478378296, - "semantic_entropy": 0.7817844152450562, + "logits/chosen": -0.30554115772247314, + "logits/rejected": -0.09454164654016495, + "logps/chosen": -1.2171602249145508, + "logps/rejected": -1.532593011856079, + "loss": 1.5557, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2171602249145508, + "rewards/margins": 0.3154327869415283, + "rewards/rejected": -1.532593011856079, "step": 3995 }, { "epoch": 2.1408262251212578, - "grad_norm": 15.435024760859893, + "grad_norm": 9.09363561535733, "learning_rate": 2.295247712599167e-07, - "logits/chosen": -0.0924561619758606, - "logits/rejected": 0.022307047620415688, - "logps/chosen": -1.2224881649017334, - "logps/rejected": -1.615614652633667, - "loss": 1.9276, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2224881649017334, - "rewards/margins": 0.39312633872032166, - "rewards/rejected": -1.615614652633667, - "semantic_entropy": 0.7890762090682983, + "logits/chosen": -0.2022174596786499, + "logits/rejected": -0.10738588869571686, + "logps/chosen": -1.196028709411621, + "logps/rejected": -1.4597480297088623, + "loss": 1.5351, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.196028709411621, + "rewards/margins": 0.2637191414833069, + "rewards/rejected": -1.4597480297088623, "step": 4000 }, { "epoch": 2.1408262251212578, - "eval_logits/chosen": 0.28233543038368225, - "eval_logits/rejected": 0.37708133459091187, - "eval_logps/chosen": -1.3312329053878784, - "eval_logps/rejected": -1.6023647785186768, - "eval_loss": 2.0445003509521484, - "eval_rewards/accuracies": 0.5905044674873352, - "eval_rewards/chosen": -1.3312329053878784, - "eval_rewards/margins": 0.2711319625377655, - "eval_rewards/rejected": -1.6023647785186768, - "eval_runtime": 34.171, - "eval_samples_per_second": 39.361, - "eval_semantic_entropy": 0.7784487009048462, - "eval_steps_per_second": 9.862, + "eval_logits/chosen": 0.054726891219615936, + "eval_logits/rejected": 0.12384755164384842, + "eval_logps/chosen": -1.3020442724227905, + "eval_logps/rejected": -1.495224118232727, + "eval_loss": 1.646976113319397, + "eval_rewards/accuracies": 0.5712166428565979, + "eval_rewards/chosen": -1.3020442724227905, + "eval_rewards/margins": 0.19317977130413055, + "eval_rewards/rejected": -1.495224118232727, + "eval_runtime": 40.3493, + "eval_samples_per_second": 33.334, + "eval_steps_per_second": 8.352, "step": 4000 }, { "epoch": 2.1435022579026595, - "grad_norm": 6.777658647158, + "grad_norm": 6.706146378331485, "learning_rate": 2.2821622499938948e-07, - "logits/chosen": -0.10082963854074478, - "logits/rejected": 0.12032053619623184, - "logps/chosen": -1.3868082761764526, - "logps/rejected": -1.6199979782104492, - "loss": 2.0674, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3868082761764526, - "rewards/margins": 0.23318979144096375, - "rewards/rejected": -1.6199979782104492, - "semantic_entropy": 0.7673584222793579, + "logits/chosen": -0.2207103967666626, + "logits/rejected": -0.04271166771650314, + "logps/chosen": -1.3544065952301025, + "logps/rejected": -1.512176752090454, + "loss": 1.6739, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3544065952301025, + "rewards/margins": 0.1577700972557068, + "rewards/rejected": -1.512176752090454, "step": 4005 }, { "epoch": 2.1461782906840607, - "grad_norm": 13.218879283641312, + "grad_norm": 6.468273724297947, "learning_rate": 2.269103155862391e-07, - "logits/chosen": -0.09654682874679565, - "logits/rejected": 0.003932853229343891, - "logps/chosen": -1.2881536483764648, - "logps/rejected": -1.5453517436981201, - "loss": 1.9957, + "logits/chosen": -0.20493578910827637, + "logits/rejected": -0.12602047622203827, + "logps/chosen": -1.2647501230239868, + "logps/rejected": -1.4369029998779297, + "loss": 1.6101, "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2881536483764648, - "rewards/margins": 0.2571980357170105, - "rewards/rejected": -1.5453517436981201, - "semantic_entropy": 0.7780659198760986, + "rewards/chosen": -1.2647501230239868, + "rewards/margins": 0.1721528172492981, + "rewards/rejected": -1.4369029998779297, "step": 4010 }, { "epoch": 2.1488543234654625, - "grad_norm": 8.059694189229388, + "grad_norm": 7.548959517776111, "learning_rate": 2.2560705569040483e-07, - "logits/chosen": -0.10584896802902222, - "logits/rejected": 0.1518443524837494, - "logps/chosen": -1.2734086513519287, - "logps/rejected": -1.5498818159103394, - "loss": 1.9992, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2734086513519287, - "rewards/margins": 0.27647334337234497, - "rewards/rejected": -1.5498818159103394, - "semantic_entropy": 0.7976066470146179, + "logits/chosen": -0.23293106257915497, + "logits/rejected": -0.026049653068184853, + "logps/chosen": -1.2496126890182495, + "logps/rejected": -1.4440643787384033, + "loss": 1.6015, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2496126890182495, + "rewards/margins": 0.19445185363292694, + "rewards/rejected": -1.4440643787384033, "step": 4015 }, { "epoch": 2.151530356246864, - "grad_norm": 7.686166913386505, + "grad_norm": 6.715284489428122, "learning_rate": 2.2430645795611963e-07, - "logits/chosen": -0.18073877692222595, - "logits/rejected": -0.03493395447731018, - "logps/chosen": -1.301017165184021, - "logps/rejected": -1.5369887351989746, - "loss": 2.0196, + "logits/chosen": -0.28804710507392883, + "logits/rejected": -0.17533376812934875, + "logps/chosen": -1.2737540006637573, + "logps/rejected": -1.4372507333755493, + "loss": 1.6193, "rewards/accuracies": 0.59375, - "rewards/chosen": -1.301017165184021, - "rewards/margins": 0.23597149550914764, - "rewards/rejected": -1.5369887351989746, - "semantic_entropy": 0.7954763174057007, + "rewards/chosen": -1.2737540006637573, + "rewards/margins": 0.1634966880083084, + "rewards/rejected": -1.4372507333755493, "step": 4020 }, { "epoch": 2.1542063890282654, - "grad_norm": 8.340165419877227, + "grad_norm": 8.172689510884089, "learning_rate": 2.230085350017884e-07, - "logits/chosen": -0.11597704887390137, - "logits/rejected": -0.012895789928734303, - "logps/chosen": -1.20939040184021, - "logps/rejected": -1.5225939750671387, - "loss": 1.9657, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.20939040184021, - "rewards/margins": 0.3132036328315735, - "rewards/rejected": -1.5225939750671387, - "semantic_entropy": 0.8167934417724609, + "logits/chosen": -0.21455779671669006, + "logits/rejected": -0.13440287113189697, + "logps/chosen": -1.175466775894165, + "logps/rejected": -1.4104111194610596, + "loss": 1.5519, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.175466775894165, + "rewards/margins": 0.2349444329738617, + "rewards/rejected": -1.4104111194610596, "step": 4025 }, { "epoch": 2.156882421809667, - "grad_norm": 10.629523832293694, + "grad_norm": 11.649277739380334, "learning_rate": 2.2171329941986554e-07, - "logits/chosen": -0.1574414223432541, - "logits/rejected": -0.07325611263513565, - "logps/chosen": -1.2151435613632202, - "logps/rejected": -1.5735080242156982, - "loss": 1.9421, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2151435613632202, - "rewards/margins": 0.358364462852478, - "rewards/rejected": -1.5735080242156982, - "semantic_entropy": 0.7974958419799805, + "logits/chosen": -0.2572100758552551, + "logits/rejected": -0.19653275609016418, + "logps/chosen": -1.193954348564148, + "logps/rejected": -1.463118553161621, + "loss": 1.54, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.193954348564148, + "rewards/margins": 0.26916417479515076, + "rewards/rejected": -1.463118553161621, "step": 4030 }, { "epoch": 2.159558454591069, - "grad_norm": 14.813281848954883, + "grad_norm": 13.066068577813956, "learning_rate": 2.2042076377673202e-07, - "logits/chosen": -0.10038616508245468, - "logits/rejected": -0.06362534314393997, - "logps/chosen": -1.2242885828018188, - "logps/rejected": -1.3778797388076782, - "loss": 2.0183, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2242885828018188, - "rewards/margins": 0.153591126203537, - "rewards/rejected": -1.3778797388076782, - "semantic_entropy": 0.8234738111495972, + "logits/chosen": -0.2382601499557495, + "logits/rejected": -0.20679374039173126, + "logps/chosen": -1.2078959941864014, + "logps/rejected": -1.3131279945373535, + "loss": 1.6089, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2078959941864014, + "rewards/margins": 0.10523198544979095, + "rewards/rejected": -1.3131279945373535, "step": 4035 }, { "epoch": 2.16223448737247, - "grad_norm": 7.133608092036389, + "grad_norm": 6.799592401932894, "learning_rate": 2.1913094061257476e-07, - "logits/chosen": -0.11537346988916397, - "logits/rejected": -0.07308430969715118, - "logps/chosen": -1.1973235607147217, - "logps/rejected": -1.4738342761993408, - "loss": 1.9631, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.1973235607147217, - "rewards/margins": 0.2765108346939087, - "rewards/rejected": -1.4738342761993408, - "semantic_entropy": 0.8326338529586792, + "logits/chosen": -0.2680966258049011, + "logits/rejected": -0.24163532257080078, + "logps/chosen": -1.177412509918213, + "logps/rejected": -1.3566945791244507, + "loss": 1.5494, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.177412509918213, + "rewards/margins": 0.17928209900856018, + "rewards/rejected": -1.3566945791244507, "step": 4040 }, { "epoch": 2.164910520153872, - "grad_norm": 9.65466716777848, + "grad_norm": 9.205016811310943, "learning_rate": 2.178438424412633e-07, - "logits/chosen": -0.07614657282829285, - "logits/rejected": 0.052105795592069626, - "logps/chosen": -1.2976658344268799, - "logps/rejected": -1.5549640655517578, - "loss": 2.0149, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2976658344268799, - "rewards/margins": 0.25729843974113464, - "rewards/rejected": -1.5549640655517578, - "semantic_entropy": 0.7852495312690735, + "logits/chosen": -0.1484549641609192, + "logits/rejected": -0.04238913580775261, + "logps/chosen": -1.2709609270095825, + "logps/rejected": -1.4526519775390625, + "loss": 1.6185, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2709609270095825, + "rewards/margins": 0.1816910207271576, + "rewards/rejected": -1.4526519775390625, "step": 4045 }, { "epoch": 2.1675865529352736, - "grad_norm": 5.3450360348739006, + "grad_norm": 4.990038669563438, "learning_rate": 2.165594817502302e-07, - "logits/chosen": -0.18720772862434387, - "logits/rejected": -0.08179843425750732, - "logps/chosen": -1.2786951065063477, - "logps/rejected": -1.4834058284759521, - "loss": 2.0237, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2786951065063477, - "rewards/margins": 0.2047106921672821, - "rewards/rejected": -1.4834058284759521, - "semantic_entropy": 0.7854777574539185, + "logits/chosen": -0.27551451325416565, + "logits/rejected": -0.19810011982917786, + "logps/chosen": -1.2552398443222046, + "logps/rejected": -1.4007903337478638, + "loss": 1.6253, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2552398443222046, + "rewards/margins": 0.14555031061172485, + "rewards/rejected": -1.4007903337478638, "step": 4050 }, { "epoch": 2.170262585716675, - "grad_norm": 7.106303803593254, + "grad_norm": 6.9657305893367125, "learning_rate": 2.1527787100034806e-07, - "logits/chosen": -0.05425887182354927, - "logits/rejected": 0.00020194947137497365, - "logps/chosen": -1.2666957378387451, - "logps/rejected": -1.4778165817260742, - "loss": 1.9981, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2666957378387451, - "rewards/margins": 0.21112075448036194, - "rewards/rejected": -1.4778165817260742, - "semantic_entropy": 0.808153510093689, + "logits/chosen": -0.19223785400390625, + "logits/rejected": -0.14967408776283264, + "logps/chosen": -1.2315384149551392, + "logps/rejected": -1.4193180799484253, + "loss": 1.5764, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2315384149551392, + "rewards/margins": 0.18777960538864136, + "rewards/rejected": -1.4193180799484253, "step": 4055 }, { "epoch": 2.1729386184980766, - "grad_norm": 11.034265016869863, + "grad_norm": 8.656762165165723, "learning_rate": 2.1399902262581037e-07, - "logits/chosen": 0.0023912787437438965, - "logits/rejected": 0.10449610650539398, - "logps/chosen": -1.1998052597045898, - "logps/rejected": -1.4737387895584106, - "loss": 1.9525, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1998052597045898, - "rewards/margins": 0.2739335000514984, - "rewards/rejected": -1.4737387895584106, - "semantic_entropy": 0.8287736773490906, + "logits/chosen": -0.07751107960939407, + "logits/rejected": -0.005736204795539379, + "logps/chosen": -1.1673355102539062, + "logps/rejected": -1.341541051864624, + "loss": 1.5375, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1673355102539062, + "rewards/margins": 0.17420557141304016, + "rewards/rejected": -1.341541051864624, "step": 4060 }, { "epoch": 2.1756146512794783, - "grad_norm": 8.768829635920307, + "grad_norm": 8.216756052327828, "learning_rate": 2.127229490340094e-07, - "logits/chosen": -0.1924888640642166, - "logits/rejected": -0.09673380851745605, - "logps/chosen": -1.2586601972579956, - "logps/rejected": -1.641990303993225, - "loss": 1.9554, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2586601972579956, - "rewards/margins": 0.38332998752593994, - "rewards/rejected": -1.641990303993225, - "semantic_entropy": 0.7948935627937317, + "logits/chosen": -0.28622791171073914, + "logits/rejected": -0.21848344802856445, + "logps/chosen": -1.234943151473999, + "logps/rejected": -1.4843318462371826, + "loss": 1.5576, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.234943151473999, + "rewards/margins": 0.2493886649608612, + "rewards/rejected": -1.4843318462371826, "step": 4065 }, { "epoch": 2.1782906840608796, - "grad_norm": 12.865765144912011, + "grad_norm": 11.15955657755358, "learning_rate": 2.1144966260541698e-07, - "logits/chosen": -0.09876732528209686, - "logits/rejected": 0.1042884811758995, - "logps/chosen": -1.2214984893798828, - "logps/rejected": -1.5891913175582886, - "loss": 1.9443, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2214984893798828, - "rewards/margins": 0.3676929175853729, - "rewards/rejected": -1.5891913175582886, - "semantic_entropy": 0.8093553781509399, + "logits/chosen": -0.17828765511512756, + "logits/rejected": -0.005265363492071629, + "logps/chosen": -1.1935495138168335, + "logps/rejected": -1.4459011554718018, + "loss": 1.5417, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1935495138168335, + "rewards/margins": 0.25235164165496826, + "rewards/rejected": -1.4459011554718018, "step": 4070 }, { "epoch": 2.1809667168422813, - "grad_norm": 10.135158048631018, + "grad_norm": 7.989961925106653, "learning_rate": 2.1017917569346332e-07, - "logits/chosen": -0.14737705886363983, - "logits/rejected": 0.036610908806324005, - "logps/chosen": -1.311643362045288, - "logps/rejected": -1.5534847974777222, - "loss": 2.0146, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.311643362045288, - "rewards/margins": 0.24184155464172363, - "rewards/rejected": -1.5534847974777222, - "semantic_entropy": 0.7842815518379211, + "logits/chosen": -0.24111512303352356, + "logits/rejected": -0.09941961616277695, + "logps/chosen": -1.2860196828842163, + "logps/rejected": -1.4652435779571533, + "loss": 1.62, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2860196828842163, + "rewards/margins": 0.17922398447990417, + "rewards/rejected": -1.4652435779571533, "step": 4075 }, { "epoch": 2.183642749623683, - "grad_norm": 5.280114652940724, + "grad_norm": 5.6274107718592115, "learning_rate": 2.0891150062441837e-07, - "logits/chosen": -0.14050161838531494, - "logits/rejected": -0.004181402735412121, - "logps/chosen": -1.2973473072052002, - "logps/rejected": -1.6200370788574219, - "loss": 1.9934, + "logits/chosen": -0.2330934703350067, + "logits/rejected": -0.13141149282455444, + "logps/chosen": -1.26702880859375, + "logps/rejected": -1.4907028675079346, + "loss": 1.6039, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2973473072052002, - "rewards/margins": 0.32268989086151123, - "rewards/rejected": -1.6200370788574219, - "semantic_entropy": 0.7707660794258118, + "rewards/chosen": -1.26702880859375, + "rewards/margins": 0.22367417812347412, + "rewards/rejected": -1.4907028675079346, "step": 4080 }, { "epoch": 2.1863187824050843, - "grad_norm": 7.507466722589746, + "grad_norm": 6.8913373247329535, "learning_rate": 2.0764664969727086e-07, - "logits/chosen": -0.10917153209447861, - "logits/rejected": -0.014076301828026772, - "logps/chosen": -1.2789490222930908, - "logps/rejected": -1.4958078861236572, - "loss": 2.0392, + "logits/chosen": -0.21193423867225647, + "logits/rejected": -0.1271488070487976, + "logps/chosen": -1.2517024278640747, + "logps/rejected": -1.4006750583648682, + "loss": 1.6414, "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2789490222930908, - "rewards/margins": 0.2168588638305664, - "rewards/rejected": -1.4958078861236572, - "semantic_entropy": 0.7913199067115784, + "rewards/chosen": -1.2517024278640747, + "rewards/margins": 0.14897270500659943, + "rewards/rejected": -1.4006750583648682, "step": 4085 }, { "epoch": 2.188994815186486, - "grad_norm": 8.443772130300719, + "grad_norm": 6.812174740485534, "learning_rate": 2.0638463518361033e-07, - "logits/chosen": -0.19746491312980652, - "logits/rejected": -0.0061895460821688175, - "logps/chosen": -1.2484588623046875, - "logps/rejected": -1.5168395042419434, - "loss": 1.9785, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2484588623046875, - "rewards/margins": 0.26838067173957825, - "rewards/rejected": -1.5168395042419434, - "semantic_entropy": 0.796512246131897, + "logits/chosen": -0.26948246359825134, + "logits/rejected": -0.12425342947244644, + "logps/chosen": -1.224528431892395, + "logps/rejected": -1.4087930917739868, + "loss": 1.5841, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.224528431892395, + "rewards/margins": 0.18426451086997986, + "rewards/rejected": -1.4087930917739868, "step": 4090 }, { "epoch": 2.1916708479678877, - "grad_norm": 10.320844957676552, + "grad_norm": 7.8071610053875045, "learning_rate": 2.0512546932750702e-07, - "logits/chosen": -0.15037508308887482, - "logits/rejected": -0.06879515200853348, - "logps/chosen": -1.3391551971435547, - "logps/rejected": -1.5366318225860596, - "loss": 2.0446, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3391551971435547, - "rewards/margins": 0.19747667014598846, - "rewards/rejected": -1.5366318225860596, - "semantic_entropy": 0.7697279453277588, + "logits/chosen": -0.23521646857261658, + "logits/rejected": -0.17201289534568787, + "logps/chosen": -1.3097665309906006, + "logps/rejected": -1.4396437406539917, + "loss": 1.6505, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3097665309906006, + "rewards/margins": 0.12987719476222992, + "rewards/rejected": -1.4396437406539917, "step": 4095 }, { "epoch": 2.194346880749289, - "grad_norm": 8.874663144842327, + "grad_norm": 9.759600570549564, "learning_rate": 2.0386916434539343e-07, - "logits/chosen": -0.10112161934375763, - "logits/rejected": 0.03384857252240181, - "logps/chosen": -1.1738734245300293, - "logps/rejected": -1.5382874011993408, - "loss": 1.925, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1738734245300293, - "rewards/margins": 0.36441394686698914, - "rewards/rejected": -1.5382874011993408, - "semantic_entropy": 0.8074756860733032, + "logits/chosen": -0.1733095496892929, + "logits/rejected": -0.06978510320186615, + "logps/chosen": -1.159136414527893, + "logps/rejected": -1.4183191061019897, + "loss": 1.5377, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.159136414527893, + "rewards/margins": 0.25918275117874146, + "rewards/rejected": -1.4183191061019897, "step": 4100 }, { "epoch": 2.1970229135306907, - "grad_norm": 11.524650233894052, + "grad_norm": 9.654762879413962, "learning_rate": 2.0261573242594627e-07, - "logits/chosen": -0.11960120499134064, - "logits/rejected": 0.07483614981174469, - "logps/chosen": -1.3214560747146606, - "logps/rejected": -1.556175947189331, - "loss": 2.0245, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3214560747146606, - "rewards/margins": 0.23472002148628235, - "rewards/rejected": -1.556175947189331, - "semantic_entropy": 0.783279299736023, + "logits/chosen": -0.23652561008930206, + "logits/rejected": -0.07516833394765854, + "logps/chosen": -1.2808523178100586, + "logps/rejected": -1.449417233467102, + "loss": 1.6213, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2808523178100586, + "rewards/margins": 0.16856491565704346, + "rewards/rejected": -1.449417233467102, "step": 4105 }, { "epoch": 2.1996989463120924, - "grad_norm": 10.565157270268786, + "grad_norm": 9.366010754246267, "learning_rate": 2.0136518572996724e-07, - "logits/chosen": -0.08820009231567383, - "logits/rejected": 0.07982397079467773, - "logps/chosen": -1.2397620677947998, - "logps/rejected": -1.518204689025879, - "loss": 1.9902, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2397620677947998, - "rewards/margins": 0.2784424424171448, - "rewards/rejected": -1.518204689025879, - "semantic_entropy": 0.8072455525398254, + "logits/chosen": -0.16323630511760712, + "logits/rejected": -0.02795899473130703, + "logps/chosen": -1.2069051265716553, + "logps/rejected": -1.3920739889144897, + "loss": 1.5824, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2069051265716553, + "rewards/margins": 0.18516886234283447, + "rewards/rejected": -1.3920739889144897, "step": 4110 }, { "epoch": 2.202374979093494, - "grad_norm": 8.282720482886361, + "grad_norm": 7.544247300712321, "learning_rate": 2.0011753639026617e-07, - "logits/chosen": -0.07058636844158173, - "logits/rejected": -0.027735024690628052, - "logps/chosen": -1.2462873458862305, - "logps/rejected": -1.6336066722869873, - "loss": 1.9538, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2462873458862305, - "rewards/margins": 0.38731926679611206, - "rewards/rejected": -1.6336066722869873, - "semantic_entropy": 0.7994504570960999, + "logits/chosen": -0.15321019291877747, + "logits/rejected": -0.12837058305740356, + "logps/chosen": -1.2154773473739624, + "logps/rejected": -1.4970439672470093, + "loss": 1.5524, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2154773473739624, + "rewards/margins": 0.28156667947769165, + "rewards/rejected": -1.4970439672470093, "step": 4115 }, { "epoch": 2.2050510118748954, - "grad_norm": 6.648714961392694, + "grad_norm": 5.510570855361945, "learning_rate": 1.988727965115421e-07, - "logits/chosen": -0.11287019401788712, - "logits/rejected": -0.035372935235500336, - "logps/chosen": -1.1937533617019653, - "logps/rejected": -1.5723878145217896, - "loss": 1.9056, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1937533617019653, - "rewards/margins": 0.37863463163375854, - "rewards/rejected": -1.5723878145217896, - "semantic_entropy": 0.7976509928703308, + "logits/chosen": -0.2098352015018463, + "logits/rejected": -0.1655178815126419, + "logps/chosen": -1.1754869222640991, + "logps/rejected": -1.4288277626037598, + "loss": 1.5211, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1754869222640991, + "rewards/margins": 0.25334078073501587, + "rewards/rejected": -1.4288277626037598, "step": 4120 }, { "epoch": 2.207727044656297, - "grad_norm": 11.00571287521211, + "grad_norm": 6.781154016442021, "learning_rate": 1.9763097817026713e-07, - "logits/chosen": -0.16754977405071259, - "logits/rejected": 0.03706652671098709, - "logps/chosen": -1.2285130023956299, - "logps/rejected": -1.5773698091506958, - "loss": 1.9546, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2285130023956299, - "rewards/margins": 0.3488568663597107, - "rewards/rejected": -1.5773698091506958, - "semantic_entropy": 0.8013304471969604, + "logits/chosen": -0.24368247389793396, + "logits/rejected": -0.08154761046171188, + "logps/chosen": -1.1996369361877441, + "logps/rejected": -1.4158093929290771, + "loss": 1.5593, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.1996369361877441, + "rewards/margins": 0.216172456741333, + "rewards/rejected": -1.4158093929290771, "step": 4125 }, { "epoch": 2.210403077437699, - "grad_norm": 8.80659775953107, + "grad_norm": 7.839360609152996, "learning_rate": 1.9639209341456796e-07, - "logits/chosen": -0.08111747354269028, - "logits/rejected": -0.0008040537941269577, - "logps/chosen": -1.2347103357315063, - "logps/rejected": -1.581979751586914, - "loss": 1.9478, + "logits/chosen": -0.19044779241085052, + "logits/rejected": -0.12619850039482117, + "logps/chosen": -1.2049930095672607, + "logps/rejected": -1.464005470275879, + "loss": 1.5452, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2347103357315063, - "rewards/margins": 0.3472694456577301, - "rewards/rejected": -1.581979751586914, - "semantic_entropy": 0.8024226427078247, + "rewards/chosen": -1.2049930095672607, + "rewards/margins": 0.2590124011039734, + "rewards/rejected": -1.464005470275879, "step": 4130 }, { "epoch": 2.2130791102191, - "grad_norm": 17.10761812582053, + "grad_norm": 14.741267901523582, "learning_rate": 1.951561542641102e-07, - "logits/chosen": -0.07774971425533295, - "logits/rejected": -0.08791448175907135, - "logps/chosen": -1.2992351055145264, - "logps/rejected": -1.6637672185897827, - "loss": 2.0077, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2992351055145264, - "rewards/margins": 0.36453211307525635, - "rewards/rejected": -1.6637672185897827, - "semantic_entropy": 0.7671725749969482, + "logits/chosen": -0.22283372282981873, + "logits/rejected": -0.2287733107805252, + "logps/chosen": -1.2565338611602783, + "logps/rejected": -1.4918019771575928, + "loss": 1.6078, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2565338611602783, + "rewards/margins": 0.23526807129383087, + "rewards/rejected": -1.4918019771575928, "step": 4135 }, { "epoch": 2.215755143000502, - "grad_norm": 9.270540567718726, + "grad_norm": 8.455044723284791, "learning_rate": 1.939231727099806e-07, - "logits/chosen": -0.25526660680770874, - "logits/rejected": -0.1716885268688202, - "logps/chosen": -1.2138149738311768, - "logps/rejected": -1.5514978170394897, - "loss": 1.9477, + "logits/chosen": -0.3040529787540436, + "logits/rejected": -0.23242278397083282, + "logps/chosen": -1.1781367063522339, + "logps/rejected": -1.4392648935317993, + "loss": 1.5309, "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2138149738311768, - "rewards/margins": 0.337682843208313, - "rewards/rejected": -1.5514978170394897, - "semantic_entropy": 0.8171249628067017, + "rewards/chosen": -1.1781367063522339, + "rewards/margins": 0.26112794876098633, + "rewards/rejected": -1.4392648935317993, "step": 4140 }, { "epoch": 2.2184311757819035, - "grad_norm": 8.253484382591838, + "grad_norm": 8.127198255996031, "learning_rate": 1.926931607145719e-07, - "logits/chosen": -0.06769241392612457, - "logits/rejected": 0.06590889394283295, - "logps/chosen": -1.3090394735336304, - "logps/rejected": -1.6794878244400024, - "loss": 2.002, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3090394735336304, - "rewards/margins": 0.3704483211040497, - "rewards/rejected": -1.6794878244400024, - "semantic_entropy": 0.7758277654647827, + "logits/chosen": -0.15838944911956787, + "logits/rejected": -0.04505031183362007, + "logps/chosen": -1.276071310043335, + "logps/rejected": -1.5455598831176758, + "loss": 1.6046, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.276071310043335, + "rewards/margins": 0.2694885730743408, + "rewards/rejected": -1.5455598831176758, "step": 4145 }, { "epoch": 2.221107208563305, - "grad_norm": 11.290111124403301, + "grad_norm": 8.103668914229091, "learning_rate": 1.9146613021146564e-07, - "logits/chosen": -0.0959344133734703, - "logits/rejected": -0.003268678905442357, - "logps/chosen": -1.203951358795166, - "logps/rejected": -1.5836807489395142, - "loss": 1.9386, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.203951358795166, - "rewards/margins": 0.37972959876060486, - "rewards/rejected": -1.5836807489395142, - "semantic_entropy": 0.7990487217903137, + "logits/chosen": -0.16998286545276642, + "logits/rejected": -0.10186684131622314, + "logps/chosen": -1.1686853170394897, + "logps/rejected": -1.4343395233154297, + "loss": 1.5359, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1686853170394897, + "rewards/margins": 0.26565423607826233, + "rewards/rejected": -1.4343395233154297, "step": 4150 }, { "epoch": 2.2237832413447065, - "grad_norm": 10.982154674202642, + "grad_norm": 7.996804928577198, "learning_rate": 1.9024209310531736e-07, - "logits/chosen": -0.03283507376909256, - "logits/rejected": -0.05256788060069084, - "logps/chosen": -1.2322564125061035, - "logps/rejected": -1.5404446125030518, - "loss": 1.9473, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2322564125061035, - "rewards/margins": 0.30818822979927063, - "rewards/rejected": -1.5404446125030518, - "semantic_entropy": 0.794259250164032, + "logits/chosen": -0.1501077264547348, + "logits/rejected": -0.17684204876422882, + "logps/chosen": -1.2057361602783203, + "logps/rejected": -1.426270604133606, + "loss": 1.5515, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2057361602783203, + "rewards/margins": 0.2205342799425125, + "rewards/rejected": -1.426270604133606, "step": 4155 }, { "epoch": 2.2264592741261082, - "grad_norm": 11.44996223685397, + "grad_norm": 9.335124344977482, "learning_rate": 1.890210612717401e-07, - "logits/chosen": -0.1292242854833603, - "logits/rejected": 0.018556052818894386, - "logps/chosen": -1.3012385368347168, - "logps/rejected": -1.582773208618164, - "loss": 2.01, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3012385368347168, - "rewards/margins": 0.28153473138809204, - "rewards/rejected": -1.582773208618164, - "semantic_entropy": 0.7758509516716003, + "logits/chosen": -0.21425195038318634, + "logits/rejected": -0.09165969491004944, + "logps/chosen": -1.2601537704467773, + "logps/rejected": -1.4697338342666626, + "loss": 1.6069, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2601537704467773, + "rewards/margins": 0.20958003401756287, + "rewards/rejected": -1.4697338342666626, "step": 4160 }, { "epoch": 2.2291353069075095, - "grad_norm": 7.439751836298014, + "grad_norm": 7.097111005955174, "learning_rate": 1.8780304655719054e-07, - "logits/chosen": -0.13371321558952332, - "logits/rejected": -0.025500833988189697, - "logps/chosen": -1.2886149883270264, - "logps/rejected": -1.6176551580429077, - "loss": 1.9978, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2886149883270264, - "rewards/margins": 0.329039990901947, - "rewards/rejected": -1.6176551580429077, - "semantic_entropy": 0.7811014652252197, + "logits/chosen": -0.21242213249206543, + "logits/rejected": -0.1380898356437683, + "logps/chosen": -1.2612653970718384, + "logps/rejected": -1.4888466596603394, + "loss": 1.608, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2612653970718384, + "rewards/margins": 0.2275814265012741, + "rewards/rejected": -1.4888466596603394, "step": 4165 }, { "epoch": 2.231811339688911, - "grad_norm": 9.912674788694538, + "grad_norm": 8.531276498829367, "learning_rate": 1.865880607788523e-07, - "logits/chosen": -0.012343516573309898, - "logits/rejected": 0.0449819378554821, - "logps/chosen": -1.273308277130127, - "logps/rejected": -1.610945701599121, - "loss": 1.971, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.273308277130127, - "rewards/margins": 0.33763742446899414, - "rewards/rejected": -1.610945701599121, - "semantic_entropy": 0.7837294340133667, + "logits/chosen": -0.12863633036613464, + "logits/rejected": -0.08679869771003723, + "logps/chosen": -1.239980936050415, + "logps/rejected": -1.4835368394851685, + "loss": 1.5748, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.239980936050415, + "rewards/margins": 0.2435559332370758, + "rewards/rejected": -1.4835368394851685, "step": 4170 }, { "epoch": 2.234487372470313, - "grad_norm": 8.108254067117462, + "grad_norm": 7.0822481308862395, "learning_rate": 1.8537611572452316e-07, - "logits/chosen": -0.14038607478141785, - "logits/rejected": -0.039006926119327545, - "logps/chosen": -1.2530543804168701, - "logps/rejected": -1.4794385433197021, - "loss": 1.9945, + "logits/chosen": -0.2195165604352951, + "logits/rejected": -0.1345038115978241, + "logps/chosen": -1.2192291021347046, + "logps/rejected": -1.3896890878677368, + "loss": 1.5897, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2530543804168701, - "rewards/margins": 0.22638408839702606, - "rewards/rejected": -1.4794385433197021, - "semantic_entropy": 0.8008133172988892, + "rewards/chosen": -1.2192291021347046, + "rewards/margins": 0.17045992612838745, + "rewards/rejected": -1.3896890878677368, "step": 4175 }, { "epoch": 2.237163405251714, - "grad_norm": 11.866315689855641, + "grad_norm": 10.335725680656012, "learning_rate": 1.84167223152499e-07, - "logits/chosen": -0.1662590205669403, - "logits/rejected": 0.048979468643665314, - "logps/chosen": -1.199005365371704, - "logps/rejected": -1.56072199344635, - "loss": 1.9336, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.199005365371704, - "rewards/margins": 0.36171668767929077, - "rewards/rejected": -1.56072199344635, - "semantic_entropy": 0.8317713737487793, + "logits/chosen": -0.23852384090423584, + "logits/rejected": -0.05886412411928177, + "logps/chosen": -1.1707592010498047, + "logps/rejected": -1.449096918106079, + "loss": 1.5156, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1707592010498047, + "rewards/margins": 0.2783377468585968, + "rewards/rejected": -1.449096918106079, "step": 4180 }, { "epoch": 2.239839438033116, - "grad_norm": 11.019023925593954, + "grad_norm": 8.753214305093897, "learning_rate": 1.8296139479146112e-07, - "logits/chosen": -0.19250182807445526, - "logits/rejected": -0.18602564930915833, - "logps/chosen": -1.1668365001678467, - "logps/rejected": -1.524772047996521, - "loss": 1.9172, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1668365001678467, - "rewards/margins": 0.3579355776309967, - "rewards/rejected": -1.524772047996521, - "semantic_entropy": 0.8143719434738159, + "logits/chosen": -0.26844969391822815, + "logits/rejected": -0.269203782081604, + "logps/chosen": -1.144785761833191, + "logps/rejected": -1.4154605865478516, + "loss": 1.5112, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.144785761833191, + "rewards/margins": 0.27067479491233826, + "rewards/rejected": -1.4154605865478516, "step": 4185 }, { "epoch": 2.2425154708145176, - "grad_norm": 11.552918601934442, + "grad_norm": 8.49101681170719, "learning_rate": 1.8175864234036132e-07, - "logits/chosen": -0.032421402633190155, - "logits/rejected": 0.07119415700435638, - "logps/chosen": -1.248134970664978, - "logps/rejected": -1.5825941562652588, - "loss": 1.9738, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.248134970664978, - "rewards/margins": 0.334459125995636, - "rewards/rejected": -1.5825941562652588, - "semantic_entropy": 0.8027989268302917, + "logits/chosen": -0.13564220070838928, + "logits/rejected": -0.04429326206445694, + "logps/chosen": -1.223623514175415, + "logps/rejected": -1.435447096824646, + "loss": 1.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.223623514175415, + "rewards/margins": 0.21182358264923096, + "rewards/rejected": -1.435447096824646, "step": 4190 }, { "epoch": 2.245191503595919, - "grad_norm": 5.834167395205214, + "grad_norm": 5.32479960697816, "learning_rate": 1.805589774683094e-07, - "logits/chosen": -0.243893101811409, - "logits/rejected": -0.08822005242109299, - "logps/chosen": -1.2915481328964233, - "logps/rejected": -1.5085653066635132, - "loss": 2.0228, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2915481328964233, - "rewards/margins": 0.2170170247554779, - "rewards/rejected": -1.5085653066635132, - "semantic_entropy": 0.7923386096954346, + "logits/chosen": -0.29307204484939575, + "logits/rejected": -0.1578172743320465, + "logps/chosen": -1.2666765451431274, + "logps/rejected": -1.4149402379989624, + "loss": 1.6249, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2666765451431274, + "rewards/margins": 0.14826364815235138, + "rewards/rejected": -1.4149402379989624, "step": 4195 }, { "epoch": 2.2478675363773206, - "grad_norm": 10.490816788351006, + "grad_norm": 8.502020695065282, "learning_rate": 1.79362411814459e-07, - "logits/chosen": 0.012433795258402824, - "logits/rejected": -0.003731709672138095, - "logps/chosen": -1.3018280267715454, - "logps/rejected": -1.6060158014297485, - "loss": 1.9941, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3018280267715454, - "rewards/margins": 0.3041878342628479, - "rewards/rejected": -1.6060158014297485, - "semantic_entropy": 0.7790161967277527, + "logits/chosen": -0.0959613025188446, + "logits/rejected": -0.11763976514339447, + "logps/chosen": -1.271305799484253, + "logps/rejected": -1.502671480178833, + "loss": 1.5997, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.271305799484253, + "rewards/margins": 0.23136551678180695, + "rewards/rejected": -1.502671480178833, "step": 4200 }, { "epoch": 2.2505435691587223, - "grad_norm": 10.451097828510381, + "grad_norm": 8.408928564528509, "learning_rate": 1.7816895698789552e-07, - "logits/chosen": -0.18953083455562592, - "logits/rejected": -0.11586049944162369, - "logps/chosen": -1.2320678234100342, - "logps/rejected": -1.557760238647461, - "loss": 1.9327, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2320678234100342, - "rewards/margins": 0.32569244503974915, - "rewards/rejected": -1.557760238647461, - "semantic_entropy": 0.801048755645752, + "logits/chosen": -0.26660507917404175, + "logits/rejected": -0.20372562110424042, + "logps/chosen": -1.2004365921020508, + "logps/rejected": -1.4662361145019531, + "loss": 1.5209, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2004365921020508, + "rewards/margins": 0.26579946279525757, + "rewards/rejected": -1.4662361145019531, "step": 4205 }, { "epoch": 2.2532196019401236, - "grad_norm": 9.394165279246357, + "grad_norm": 8.460265723960784, "learning_rate": 1.7697862456752271e-07, - "logits/chosen": -0.14001984894275665, - "logits/rejected": -0.02964564599096775, - "logps/chosen": -1.2370903491973877, - "logps/rejected": -1.7004258632659912, - "loss": 1.9495, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2370903491973877, - "rewards/margins": 0.4633353352546692, - "rewards/rejected": -1.7004258632659912, - "semantic_entropy": 0.7974084615707397, + "logits/chosen": -0.2261292189359665, + "logits/rejected": -0.1469893455505371, + "logps/chosen": -1.2134878635406494, + "logps/rejected": -1.5519218444824219, + "loss": 1.5517, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2134878635406494, + "rewards/margins": 0.33843404054641724, + "rewards/rejected": -1.5519218444824219, "step": 4210 }, { "epoch": 2.2558956347215253, - "grad_norm": 7.41481622790433, + "grad_norm": 6.811278118003559, "learning_rate": 1.7579142610195124e-07, - "logits/chosen": -0.11725671589374542, - "logits/rejected": 0.018145056441426277, - "logps/chosen": -1.265994906425476, - "logps/rejected": -1.5646344423294067, - "loss": 1.9885, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.265994906425476, - "rewards/margins": 0.2986395061016083, - "rewards/rejected": -1.5646344423294067, - "semantic_entropy": 0.787617027759552, + "logits/chosen": -0.24250388145446777, + "logits/rejected": -0.1430424600839615, + "logps/chosen": -1.2435731887817383, + "logps/rejected": -1.452593445777893, + "loss": 1.5939, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2435731887817383, + "rewards/margins": 0.20902028679847717, + "rewards/rejected": -1.452593445777893, "step": 4215 }, { "epoch": 2.258571667502927, - "grad_norm": 8.497384684424222, + "grad_norm": 7.76141628727321, "learning_rate": 1.7460737310938568e-07, - "logits/chosen": -0.1787947118282318, - "logits/rejected": 0.02110464498400688, - "logps/chosen": -1.2498914003372192, - "logps/rejected": -1.543668508529663, - "loss": 1.9893, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2498914003372192, - "rewards/margins": 0.2937771677970886, - "rewards/rejected": -1.543668508529663, - "semantic_entropy": 0.7971317768096924, + "logits/chosen": -0.2618526816368103, + "logits/rejected": -0.09927698969841003, + "logps/chosen": -1.2269976139068604, + "logps/rejected": -1.4207863807678223, + "loss": 1.5909, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2269976139068604, + "rewards/margins": 0.19378860294818878, + "rewards/rejected": -1.4207863807678223, "step": 4220 }, { "epoch": 2.2612477002843283, - "grad_norm": 10.470912497403106, + "grad_norm": 10.727996484376538, "learning_rate": 1.734264770775133e-07, - "logits/chosen": -0.16025003790855408, - "logits/rejected": 0.01915592886507511, - "logps/chosen": -1.2886840105056763, - "logps/rejected": -1.6545966863632202, - "loss": 1.9668, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2886840105056763, - "rewards/margins": 0.3659127950668335, - "rewards/rejected": -1.6545966863632202, - "semantic_entropy": 0.7657202482223511, + "logits/chosen": -0.2516542673110962, + "logits/rejected": -0.11354745924472809, + "logps/chosen": -1.2602417469024658, + "logps/rejected": -1.5199130773544312, + "loss": 1.5865, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2602417469024658, + "rewards/margins": 0.2596711814403534, + "rewards/rejected": -1.5199130773544312, "step": 4225 }, { "epoch": 2.26392373306573, - "grad_norm": 8.935706182234096, + "grad_norm": 7.809550665640668, "learning_rate": 1.7224874946339241e-07, - "logits/chosen": -0.16925734281539917, - "logits/rejected": -0.08762809634208679, - "logps/chosen": -1.322160005569458, - "logps/rejected": -1.6796150207519531, - "loss": 2.0271, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.322160005569458, - "rewards/margins": 0.35745516419410706, - "rewards/rejected": -1.6796150207519531, - "semantic_entropy": 0.781524121761322, + "logits/chosen": -0.2582088112831116, + "logits/rejected": -0.20305652916431427, + "logps/chosen": -1.2808090448379517, + "logps/rejected": -1.5809587240219116, + "loss": 1.6193, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2808090448379517, + "rewards/margins": 0.30014973878860474, + "rewards/rejected": -1.5809587240219116, "step": 4230 }, { "epoch": 2.2665997658471317, - "grad_norm": 8.343351194328045, + "grad_norm": 7.570062948249775, "learning_rate": 1.7107420169334186e-07, - "logits/chosen": -0.12380006164312363, - "logits/rejected": -0.005908069666475058, - "logps/chosen": -1.3306140899658203, - "logps/rejected": -1.4885650873184204, - "loss": 2.0753, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.3306140899658203, - "rewards/margins": 0.1579509973526001, - "rewards/rejected": -1.4885650873184204, - "semantic_entropy": 0.7841423749923706, + "logits/chosen": -0.20256300270557404, + "logits/rejected": -0.10187848657369614, + "logps/chosen": -1.2940313816070557, + "logps/rejected": -1.3799540996551514, + "loss": 1.6734, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.2940313816070557, + "rewards/margins": 0.08592281490564346, + "rewards/rejected": -1.3799540996551514, "step": 4235 }, { "epoch": 2.269275798628533, - "grad_norm": 9.747809456731876, + "grad_norm": 8.623862268653017, "learning_rate": 1.6990284516282893e-07, - "logits/chosen": -0.12111307680606842, - "logits/rejected": -0.0011890288442373276, - "logps/chosen": -1.257455825805664, - "logps/rejected": -1.4483282566070557, - "loss": 2.0125, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.257455825805664, - "rewards/margins": 0.1908724457025528, - "rewards/rejected": -1.4483282566070557, - "semantic_entropy": 0.8028818368911743, + "logits/chosen": -0.18112261593341827, + "logits/rejected": -0.08644866943359375, + "logps/chosen": -1.2399652004241943, + "logps/rejected": -1.3614873886108398, + "loss": 1.6195, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2399652004241943, + "rewards/margins": 0.12152229249477386, + "rewards/rejected": -1.3614873886108398, "step": 4240 }, { "epoch": 2.2719518314099347, - "grad_norm": 7.533871407979239, + "grad_norm": 7.106966197584022, "learning_rate": 1.687346912363602e-07, - "logits/chosen": -0.2015412300825119, - "logits/rejected": -0.07601834833621979, - "logps/chosen": -1.2797132730484009, - "logps/rejected": -1.4889013767242432, - "loss": 2.0075, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2797132730484009, - "rewards/margins": 0.20918801426887512, - "rewards/rejected": -1.4889013767242432, - "semantic_entropy": 0.8059288263320923, + "logits/chosen": -0.27653664350509644, + "logits/rejected": -0.1859886646270752, + "logps/chosen": -1.2457199096679688, + "logps/rejected": -1.371375322341919, + "loss": 1.6025, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2457199096679688, + "rewards/margins": 0.12565529346466064, + "rewards/rejected": -1.371375322341919, "step": 4245 }, { "epoch": 2.2746278641913364, - "grad_norm": 5.385013656266259, + "grad_norm": 5.110825815541059, "learning_rate": 1.675697512473697e-07, - "logits/chosen": -0.10228011757135391, - "logits/rejected": 0.07375967502593994, - "logps/chosen": -1.2664943933486938, - "logps/rejected": -1.5417674779891968, - "loss": 2.0087, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2664943933486938, - "rewards/margins": 0.2752731740474701, - "rewards/rejected": -1.5417674779891968, - "semantic_entropy": 0.7941025495529175, + "logits/chosen": -0.18636533617973328, + "logits/rejected": -0.04834170266985893, + "logps/chosen": -1.2373005151748657, + "logps/rejected": -1.4027382135391235, + "loss": 1.6122, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2373005151748657, + "rewards/margins": 0.16543760895729065, + "rewards/rejected": -1.4027382135391235, "step": 4250 }, { "epoch": 2.2773038969727377, - "grad_norm": 12.063702544187802, + "grad_norm": 10.93040123034802, "learning_rate": 1.6640803649811087e-07, - "logits/chosen": -0.1658763885498047, - "logits/rejected": 0.06614699214696884, - "logps/chosen": -1.286105990409851, - "logps/rejected": -1.6867536306381226, - "loss": 1.9867, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.286105990409851, - "rewards/margins": 0.4006476402282715, - "rewards/rejected": -1.6867536306381226, - "semantic_entropy": 0.7811282277107239, + "logits/chosen": -0.2494935244321823, + "logits/rejected": -0.05917506664991379, + "logps/chosen": -1.2552902698516846, + "logps/rejected": -1.499660611152649, + "loss": 1.5887, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2552902698516846, + "rewards/margins": 0.2443702518939972, + "rewards/rejected": -1.499660611152649, "step": 4255 }, { "epoch": 2.2799799297541394, - "grad_norm": 10.859671266641389, + "grad_norm": 8.524387736147663, "learning_rate": 1.6524955825954472e-07, - "logits/chosen": -0.14719471335411072, - "logits/rejected": -0.04040377587080002, - "logps/chosen": -1.2015544176101685, - "logps/rejected": -1.5611904859542847, - "loss": 1.9292, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2015544176101685, - "rewards/margins": 0.35963618755340576, - "rewards/rejected": -1.5611904859542847, - "semantic_entropy": 0.808398425579071, + "logits/chosen": -0.20921842753887177, + "logits/rejected": -0.10822540521621704, + "logps/chosen": -1.172518014907837, + "logps/rejected": -1.4659488201141357, + "loss": 1.5203, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.172518014907837, + "rewards/margins": 0.29343074560165405, + "rewards/rejected": -1.4659488201141357, "step": 4260 }, { "epoch": 2.282655962535541, - "grad_norm": 7.040659457503802, + "grad_norm": 6.648049567413863, "learning_rate": 1.6409432777123277e-07, - "logits/chosen": -0.25105080008506775, - "logits/rejected": -0.08539889752864838, - "logps/chosen": -1.2529833316802979, - "logps/rejected": -1.5797755718231201, - "loss": 1.9642, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2529833316802979, - "rewards/margins": 0.3267921507358551, - "rewards/rejected": -1.5797755718231201, - "semantic_entropy": 0.7997857332229614, + "logits/chosen": -0.2961587607860565, + "logits/rejected": -0.15879955887794495, + "logps/chosen": -1.222677230834961, + "logps/rejected": -1.4485093355178833, + "loss": 1.5618, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.222677230834961, + "rewards/margins": 0.22583195567131042, + "rewards/rejected": -1.4485093355178833, "step": 4265 }, { "epoch": 2.285331995316943, - "grad_norm": 6.433767783550345, + "grad_norm": 5.8745856311887374, "learning_rate": 1.6294235624122577e-07, - "logits/chosen": -0.055332403630018234, - "logits/rejected": 0.1894211620092392, - "logps/chosen": -1.2655553817749023, - "logps/rejected": -1.7325023412704468, - "loss": 1.9706, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2655553817749023, - "rewards/margins": 0.466947078704834, - "rewards/rejected": -1.7325023412704468, - "semantic_entropy": 0.7825048565864563, + "logits/chosen": -0.1480863094329834, + "logits/rejected": 0.050052572041749954, + "logps/chosen": -1.243753433227539, + "logps/rejected": -1.594008207321167, + "loss": 1.5783, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.243753433227539, + "rewards/margins": 0.3502548336982727, + "rewards/rejected": -1.594008207321167, "step": 4270 }, { "epoch": 2.288008028098344, - "grad_norm": 12.153539061712653, + "grad_norm": 8.907531457349748, "learning_rate": 1.6179365484595697e-07, - "logits/chosen": -0.09241259098052979, - "logits/rejected": -0.00836194772273302, - "logps/chosen": -1.3286699056625366, - "logps/rejected": -1.6374403238296509, - "loss": 2.0464, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3286699056625366, - "rewards/margins": 0.3087703585624695, - "rewards/rejected": -1.6374403238296509, - "semantic_entropy": 0.7748314142227173, + "logits/chosen": -0.19933749735355377, + "logits/rejected": -0.13979306817054749, + "logps/chosen": -1.297931432723999, + "logps/rejected": -1.497729778289795, + "loss": 1.6489, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.297931432723999, + "rewards/margins": 0.19979847967624664, + "rewards/rejected": -1.497729778289795, "step": 4275 }, { "epoch": 2.290684060879746, - "grad_norm": 7.697929758201799, + "grad_norm": 6.864862563404964, "learning_rate": 1.60648234730132e-07, - "logits/chosen": -0.13402453064918518, - "logits/rejected": -0.043066900223493576, - "logps/chosen": -1.2240922451019287, - "logps/rejected": -1.5100966691970825, - "loss": 1.9501, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2240922451019287, - "rewards/margins": 0.28600430488586426, - "rewards/rejected": -1.5100966691970825, - "semantic_entropy": 0.8081742525100708, + "logits/chosen": -0.21919536590576172, + "logits/rejected": -0.1593218892812729, + "logps/chosen": -1.1974241733551025, + "logps/rejected": -1.3887784481048584, + "loss": 1.5465, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1974241733551025, + "rewards/margins": 0.19135454297065735, + "rewards/rejected": -1.3887784481048584, "step": 4280 }, { "epoch": 2.293360093661147, - "grad_norm": 14.290716459143816, + "grad_norm": 10.299591143212481, "learning_rate": 1.595061070066222e-07, - "logits/chosen": -0.09124065935611725, - "logits/rejected": -0.09486423432826996, - "logps/chosen": -1.1951037645339966, - "logps/rejected": -1.5133121013641357, - "loss": 1.9142, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1951037645339966, - "rewards/margins": 0.318208247423172, - "rewards/rejected": -1.5133121013641357, - "semantic_entropy": 0.818766713142395, + "logits/chosen": -0.1530156284570694, + "logits/rejected": -0.15994104743003845, + "logps/chosen": -1.1667227745056152, + "logps/rejected": -1.4124925136566162, + "loss": 1.5057, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1667227745056152, + "rewards/margins": 0.24576973915100098, + "rewards/rejected": -1.4124925136566162, "step": 4285 }, { "epoch": 2.296036126442549, - "grad_norm": 15.187423626425694, + "grad_norm": 10.554924882674959, "learning_rate": 1.5836728275635542e-07, - "logits/chosen": -0.2084455043077469, - "logits/rejected": -0.03481137752532959, - "logps/chosen": -1.2978460788726807, - "logps/rejected": -1.568542242050171, - "loss": 2.0049, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2978460788726807, - "rewards/margins": 0.2706960141658783, - "rewards/rejected": -1.568542242050171, - "semantic_entropy": 0.7863389253616333, + "logits/chosen": -0.2538110017776489, + "logits/rejected": -0.10423139482736588, + "logps/chosen": -1.2699533700942993, + "logps/rejected": -1.4364902973175049, + "loss": 1.6115, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2699533700942993, + "rewards/margins": 0.16653692722320557, + "rewards/rejected": -1.4364902973175049, "step": 4290 }, { "epoch": 2.2987121592239506, - "grad_norm": 9.04534945918805, + "grad_norm": 8.924876754305254, "learning_rate": 1.5723177302820984e-07, - "logits/chosen": -0.18676696717739105, - "logits/rejected": -0.1372944712638855, - "logps/chosen": -1.2712112665176392, - "logps/rejected": -1.5649312734603882, - "loss": 1.9921, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2712112665176392, - "rewards/margins": 0.2937201261520386, - "rewards/rejected": -1.5649312734603882, - "semantic_entropy": 0.7963592410087585, + "logits/chosen": -0.2252012938261032, + "logits/rejected": -0.1906716227531433, + "logps/chosen": -1.2317861318588257, + "logps/rejected": -1.4582042694091797, + "loss": 1.5814, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2317861318588257, + "rewards/margins": 0.226418137550354, + "rewards/rejected": -1.4582042694091797, "step": 4295 }, { "epoch": 2.3013881920053523, - "grad_norm": 13.090168405149521, + "grad_norm": 10.591288986335153, "learning_rate": 1.5609958883890544e-07, - "logits/chosen": -0.13981503248214722, - "logits/rejected": -0.026316970586776733, - "logps/chosen": -1.2637109756469727, - "logps/rejected": -1.489816427230835, - "loss": 1.9939, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2637109756469727, - "rewards/margins": 0.22610552608966827, - "rewards/rejected": -1.489816427230835, - "semantic_entropy": 0.7971670031547546, + "logits/chosen": -0.21802549064159393, + "logits/rejected": -0.1272871047258377, + "logps/chosen": -1.2291576862335205, + "logps/rejected": -1.3705837726593018, + "loss": 1.5957, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2291576862335205, + "rewards/margins": 0.14142607152462006, + "rewards/rejected": -1.3705837726593018, "step": 4300 }, { "epoch": 2.3040642247867535, - "grad_norm": 33.64487463400989, + "grad_norm": 9.327629730160833, "learning_rate": 1.5497074117289865e-07, - "logits/chosen": -0.20040778815746307, - "logits/rejected": -0.0799594298005104, - "logps/chosen": -1.2583087682724, - "logps/rejected": -1.6361068487167358, - "loss": 1.9477, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2583087682724, - "rewards/margins": 0.3777979612350464, - "rewards/rejected": -1.6361068487167358, - "semantic_entropy": 0.7779967188835144, + "logits/chosen": -0.2663048207759857, + "logits/rejected": -0.16262254118919373, + "logps/chosen": -1.2088749408721924, + "logps/rejected": -1.4837599992752075, + "loss": 1.5425, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2088749408721924, + "rewards/margins": 0.27488499879837036, + "rewards/rejected": -1.4837599992752075, "step": 4305 }, { "epoch": 2.3067402575681553, - "grad_norm": 9.214329986138315, + "grad_norm": 8.388252875949682, "learning_rate": 1.5384524098227402e-07, - "logits/chosen": -0.15226741135120392, - "logits/rejected": 0.047097109258174896, - "logps/chosen": -1.249534010887146, - "logps/rejected": -1.632002830505371, - "loss": 1.9639, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.249534010887146, - "rewards/margins": 0.38246893882751465, - "rewards/rejected": -1.632002830505371, - "semantic_entropy": 0.796162486076355, + "logits/chosen": -0.2186436653137207, + "logits/rejected": -0.053496263921260834, + "logps/chosen": -1.2235801219940186, + "logps/rejected": -1.4754079580307007, + "loss": 1.5735, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2235801219940186, + "rewards/margins": 0.25182777643203735, + "rewards/rejected": -1.4754079580307007, "step": 4310 }, { "epoch": 2.3094162903495565, - "grad_norm": 9.427152395024008, + "grad_norm": 7.222944816268537, "learning_rate": 1.5272309918663974e-07, - "logits/chosen": -0.13251027464866638, - "logits/rejected": 0.018474791198968887, - "logps/chosen": -1.2655764818191528, - "logps/rejected": -1.520042061805725, - "loss": 2.0069, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2655764818191528, - "rewards/margins": 0.2544656991958618, - "rewards/rejected": -1.520042061805725, - "semantic_entropy": 0.7987126111984253, + "logits/chosen": -0.1922692358493805, + "logits/rejected": -0.07148855179548264, + "logps/chosen": -1.222347617149353, + "logps/rejected": -1.4011558294296265, + "loss": 1.5932, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.222347617149353, + "rewards/margins": 0.17880816757678986, + "rewards/rejected": -1.4011558294296265, "step": 4315 }, { "epoch": 2.3120923231309582, - "grad_norm": 9.13175447086419, + "grad_norm": 8.36618947670397, "learning_rate": 1.516043266730201e-07, - "logits/chosen": -0.16241349279880524, - "logits/rejected": -0.036643363535404205, - "logps/chosen": -1.30803382396698, - "logps/rejected": -1.5431315898895264, - "loss": 2.026, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.30803382396698, - "rewards/margins": 0.23509757220745087, - "rewards/rejected": -1.5431315898895264, - "semantic_entropy": 0.7765775918960571, + "logits/chosen": -0.22373394668102264, + "logits/rejected": -0.12639454007148743, + "logps/chosen": -1.2792359590530396, + "logps/rejected": -1.401538610458374, + "loss": 1.6406, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2792359590530396, + "rewards/margins": 0.12230267375707626, + "rewards/rejected": -1.401538610458374, "step": 4320 }, { "epoch": 2.31476835591236, - "grad_norm": 13.967831571275822, + "grad_norm": 11.456603790802449, "learning_rate": 1.504889342957512e-07, - "logits/chosen": -0.15671613812446594, - "logits/rejected": -0.0015728086000308394, - "logps/chosen": -1.2362313270568848, - "logps/rejected": -1.6587120294570923, - "loss": 1.957, + "logits/chosen": -0.23932285606861115, + "logits/rejected": -0.10981453955173492, + "logps/chosen": -1.2031185626983643, + "logps/rejected": -1.4842259883880615, + "loss": 1.5514, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2362313270568848, - "rewards/margins": 0.42248082160949707, - "rewards/rejected": -1.6587120294570923, - "semantic_entropy": 0.7952734231948853, + "rewards/chosen": -1.2031185626983643, + "rewards/margins": 0.28110748529434204, + "rewards/rejected": -1.4842259883880615, "step": 4325 }, { "epoch": 2.3174443886937617, - "grad_norm": 7.5115255304772806, + "grad_norm": 6.560458780045564, "learning_rate": 1.4937693287637453e-07, - "logits/chosen": -0.136986643075943, - "logits/rejected": -0.029413629323244095, - "logps/chosen": -1.346637487411499, - "logps/rejected": -1.5729660987854004, - "loss": 2.0729, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.346637487411499, - "rewards/margins": 0.22632858157157898, - "rewards/rejected": -1.5729660987854004, - "semantic_entropy": 0.7861993908882141, + "logits/chosen": -0.20181819796562195, + "logits/rejected": -0.12014786154031754, + "logps/chosen": -1.304861307144165, + "logps/rejected": -1.436704397201538, + "loss": 1.6624, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.304861307144165, + "rewards/margins": 0.13184314966201782, + "rewards/rejected": -1.436704397201538, "step": 4330 }, { "epoch": 2.320120421475163, - "grad_norm": 11.798958316728067, + "grad_norm": 10.365098773976653, "learning_rate": 1.4826833320353305e-07, - "logits/chosen": -0.10806053876876831, - "logits/rejected": -0.043422140181064606, - "logps/chosen": -1.3138458728790283, - "logps/rejected": -1.6097075939178467, - "loss": 2.042, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3138458728790283, - "rewards/margins": 0.29586172103881836, - "rewards/rejected": -1.6097075939178467, - "semantic_entropy": 0.774515688419342, + "logits/chosen": -0.18095768988132477, + "logits/rejected": -0.1348801553249359, + "logps/chosen": -1.2820155620574951, + "logps/rejected": -1.4945478439331055, + "loss": 1.6453, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2820155620574951, + "rewards/margins": 0.21253237128257751, + "rewards/rejected": -1.4945478439331055, "step": 4335 }, { "epoch": 2.3227964542565647, - "grad_norm": 9.40444803947687, + "grad_norm": 8.709818303845928, "learning_rate": 1.4716314603286528e-07, - "logits/chosen": -0.15934400260448456, - "logits/rejected": 0.005942508578300476, - "logps/chosen": -1.175724983215332, - "logps/rejected": -1.696840524673462, - "loss": 1.8892, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.175724983215332, - "rewards/margins": 0.5211154818534851, - "rewards/rejected": -1.696840524673462, - "semantic_entropy": 0.8058635592460632, + "logits/chosen": -0.2264154851436615, + "logits/rejected": -0.09404493868350983, + "logps/chosen": -1.1498174667358398, + "logps/rejected": -1.5352723598480225, + "loss": 1.492, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1498174667358398, + "rewards/margins": 0.3854547142982483, + "rewards/rejected": -1.5352723598480225, "step": 4340 }, { "epoch": 2.3254724870379664, - "grad_norm": 11.730118366986853, + "grad_norm": 8.88602868081274, "learning_rate": 1.4606138208690233e-07, - "logits/chosen": -0.19282224774360657, - "logits/rejected": -0.11361217498779297, - "logps/chosen": -1.3833080530166626, - "logps/rejected": -1.4851402044296265, - "loss": 2.1013, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3833080530166626, - "rewards/margins": 0.10183216631412506, - "rewards/rejected": -1.4851402044296265, - "semantic_entropy": 0.7746142745018005, + "logits/chosen": -0.2641579508781433, + "logits/rejected": -0.19507022202014923, + "logps/chosen": -1.3444536924362183, + "logps/rejected": -1.3795907497406006, + "loss": 1.7046, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3444536924362183, + "rewards/margins": 0.03513716533780098, + "rewards/rejected": -1.3795907497406006, "step": 4345 }, { "epoch": 2.3281485198193677, - "grad_norm": 8.00361879964917, + "grad_norm": 7.721172501362392, "learning_rate": 1.4496305205496251e-07, - "logits/chosen": -0.0834389254450798, - "logits/rejected": -0.018457237631082535, - "logps/chosen": -1.282725214958191, - "logps/rejected": -1.6338307857513428, - "loss": 1.9837, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.282725214958191, - "rewards/margins": 0.3511054515838623, - "rewards/rejected": -1.6338307857513428, - "semantic_entropy": 0.7893123030662537, + "logits/chosen": -0.16946734488010406, + "logits/rejected": -0.11795884370803833, + "logps/chosen": -1.2383652925491333, + "logps/rejected": -1.499885082244873, + "loss": 1.572, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2383652925491333, + "rewards/margins": 0.2615198493003845, + "rewards/rejected": -1.499885082244873, "step": 4350 }, { "epoch": 2.3308245526007694, - "grad_norm": 7.258530204816981, + "grad_norm": 6.637011020428752, "learning_rate": 1.4386816659304895e-07, - "logits/chosen": -0.20895186066627502, - "logits/rejected": -0.1091470941901207, - "logps/chosen": -1.3068081140518188, - "logps/rejected": -1.569459319114685, - "loss": 1.9947, + "logits/chosen": -0.25504356622695923, + "logits/rejected": -0.1839907467365265, + "logps/chosen": -1.274584174156189, + "logps/rejected": -1.4763041734695435, + "loss": 1.5997, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3068081140518188, - "rewards/margins": 0.2626512050628662, - "rewards/rejected": -1.569459319114685, - "semantic_entropy": 0.7754660844802856, + "rewards/chosen": -1.274584174156189, + "rewards/margins": 0.20172002911567688, + "rewards/rejected": -1.4763041734695435, "step": 4355 }, { "epoch": 2.333500585382171, - "grad_norm": 7.252764634339286, + "grad_norm": 7.410794123821557, "learning_rate": 1.4277673632374492e-07, - "logits/chosen": -0.216875821352005, - "logits/rejected": -0.004487229976803064, - "logps/chosen": -1.3538143634796143, - "logps/rejected": -1.5858434438705444, - "loss": 2.044, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3538143634796143, - "rewards/margins": 0.2320290058851242, - "rewards/rejected": -1.5858434438705444, - "semantic_entropy": 0.7725597620010376, + "logits/chosen": -0.31483370065689087, + "logits/rejected": -0.1500522792339325, + "logps/chosen": -1.319117546081543, + "logps/rejected": -1.4547805786132812, + "loss": 1.6529, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.319117546081543, + "rewards/margins": 0.13566307723522186, + "rewards/rejected": -1.4547805786132812, "step": 4360 }, { "epoch": 2.3361766181635724, - "grad_norm": 8.511889251040571, + "grad_norm": 7.819488623849646, "learning_rate": 1.416887718361119e-07, - "logits/chosen": -0.0578872449696064, - "logits/rejected": -0.058798693120479584, - "logps/chosen": -1.2196348905563354, - "logps/rejected": -1.5464431047439575, - "loss": 1.9545, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2196348905563354, - "rewards/margins": 0.32680806517601013, - "rewards/rejected": -1.5464431047439575, - "semantic_entropy": 0.8095118403434753, + "logits/chosen": -0.13828317821025848, + "logits/rejected": -0.14891335368156433, + "logps/chosen": -1.1876940727233887, + "logps/rejected": -1.4420838356018066, + "loss": 1.5432, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1876940727233887, + "rewards/margins": 0.25438982248306274, + "rewards/rejected": -1.4420838356018066, "step": 4365 }, { "epoch": 2.338852650944974, - "grad_norm": 10.827800542341361, + "grad_norm": 9.917352217947807, "learning_rate": 1.406042836855859e-07, - "logits/chosen": -0.12696470320224762, - "logits/rejected": 0.003375391010195017, - "logps/chosen": -1.1783816814422607, - "logps/rejected": -1.6317615509033203, - "loss": 1.8999, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1783816814422607, - "rewards/margins": 0.45337972044944763, - "rewards/rejected": -1.6317615509033203, - "semantic_entropy": 0.8124502301216125, + "logits/chosen": -0.18322905898094177, + "logits/rejected": -0.07606031000614166, + "logps/chosen": -1.1553547382354736, + "logps/rejected": -1.5348331928253174, + "loss": 1.4964, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1553547382354736, + "rewards/margins": 0.3794783651828766, + "rewards/rejected": -1.5348331928253174, "step": 4370 }, { "epoch": 2.341528683726376, - "grad_norm": 13.967101137336337, + "grad_norm": 10.565204282699165, "learning_rate": 1.3952328239387595e-07, - "logits/chosen": -0.23310494422912598, - "logits/rejected": -0.04387233406305313, - "logps/chosen": -1.295742392539978, - "logps/rejected": -1.6016197204589844, - "loss": 1.9967, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.295742392539978, - "rewards/margins": 0.305877149105072, - "rewards/rejected": -1.6016197204589844, - "semantic_entropy": 0.7674862146377563, + "logits/chosen": -0.32471174001693726, + "logits/rejected": -0.17733509838581085, + "logps/chosen": -1.2622826099395752, + "logps/rejected": -1.4821867942810059, + "loss": 1.6017, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2622826099395752, + "rewards/margins": 0.21990422904491425, + "rewards/rejected": -1.4821867942810059, "step": 4375 }, { "epoch": 2.344204716507777, - "grad_norm": 7.9268173721038035, + "grad_norm": 7.235371461274833, "learning_rate": 1.3844577844886109e-07, - "logits/chosen": -0.18139347434043884, - "logits/rejected": -0.004594838712364435, - "logps/chosen": -1.3157854080200195, - "logps/rejected": -1.582556962966919, - "loss": 2.0215, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3157854080200195, - "rewards/margins": 0.26677125692367554, - "rewards/rejected": -1.582556962966919, - "semantic_entropy": 0.7893223762512207, + "logits/chosen": -0.24023239314556122, + "logits/rejected": -0.0995052307844162, + "logps/chosen": -1.288403034210205, + "logps/rejected": -1.4486147165298462, + "loss": 1.6281, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.288403034210205, + "rewards/margins": 0.1602117270231247, + "rewards/rejected": -1.4486147165298462, "step": 4380 }, { "epoch": 2.346880749289179, - "grad_norm": 10.589401436703588, + "grad_norm": 9.729259628403861, "learning_rate": 1.3737178230448955e-07, - "logits/chosen": -0.20681917667388916, - "logits/rejected": -0.06286197155714035, - "logps/chosen": -1.2401567697525024, - "logps/rejected": -1.6419627666473389, - "loss": 1.9314, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2401567697525024, - "rewards/margins": 0.4018060266971588, - "rewards/rejected": -1.6419627666473389, - "semantic_entropy": 0.795062243938446, + "logits/chosen": -0.26643213629722595, + "logits/rejected": -0.1396816521883011, + "logps/chosen": -1.2085447311401367, + "logps/rejected": -1.5265811681747437, + "loss": 1.5236, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2085447311401367, + "rewards/margins": 0.31803658604621887, + "rewards/rejected": -1.5265811681747437, "step": 4385 }, { "epoch": 2.3495567820705805, - "grad_norm": 6.967609256646875, + "grad_norm": 6.5834768933781245, "learning_rate": 1.363013043806764e-07, - "logits/chosen": -0.17762355506420135, - "logits/rejected": -0.05149609595537186, - "logps/chosen": -1.2417309284210205, - "logps/rejected": -1.5034621953964233, - "loss": 1.9855, + "logits/chosen": -0.22660765051841736, + "logits/rejected": -0.12067173421382904, + "logps/chosen": -1.2112690210342407, + "logps/rejected": -1.3970801830291748, + "loss": 1.5741, "rewards/accuracies": 0.625, - "rewards/chosen": -1.2417309284210205, - "rewards/margins": 0.26173120737075806, - "rewards/rejected": -1.5034621953964233, - "semantic_entropy": 0.8084238171577454, + "rewards/chosen": -1.2112690210342407, + "rewards/margins": 0.1858113557100296, + "rewards/rejected": -1.3970801830291748, "step": 4390 }, { "epoch": 2.3522328148519818, - "grad_norm": 8.132486478881162, + "grad_norm": 6.820345187978874, "learning_rate": 1.352343550632034e-07, - "logits/chosen": -0.1426798701286316, - "logits/rejected": 0.0015723813557997346, - "logps/chosen": -1.280442476272583, - "logps/rejected": -1.7234470844268799, - "loss": 1.9787, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.280442476272583, - "rewards/margins": 0.44300445914268494, - "rewards/rejected": -1.7234470844268799, - "semantic_entropy": 0.7840474843978882, + "logits/chosen": -0.19299550354480743, + "logits/rejected": -0.07580564171075821, + "logps/chosen": -1.2356846332550049, + "logps/rejected": -1.5556024312973022, + "loss": 1.565, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2356846332550049, + "rewards/margins": 0.319917768239975, + "rewards/rejected": -1.5556024312973022, "step": 4395 }, { "epoch": 2.3549088476333835, - "grad_norm": 6.714940579293801, + "grad_norm": 6.141813056666793, "learning_rate": 1.3417094470361722e-07, - "logits/chosen": -0.2102196216583252, - "logits/rejected": -0.06310750544071198, - "logps/chosen": -1.2375186681747437, - "logps/rejected": -1.6087543964385986, - "loss": 1.9438, + "logits/chosen": -0.28624340891838074, + "logits/rejected": -0.1574813276529312, + "logps/chosen": -1.198730230331421, + "logps/rejected": -1.4840584993362427, + "loss": 1.5307, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2375186681747437, - "rewards/margins": 0.37123560905456543, - "rewards/rejected": -1.6087543964385986, - "semantic_entropy": 0.799569845199585, + "rewards/chosen": -1.198730230331421, + "rewards/margins": 0.28532809019088745, + "rewards/rejected": -1.4840584993362427, "step": 4400 }, { "epoch": 2.3549088476333835, - "eval_logits/chosen": 0.16329367458820343, - "eval_logits/rejected": 0.24929870665073395, - "eval_logps/chosen": -1.3380868434906006, - "eval_logps/rejected": -1.615173578262329, - "eval_loss": 2.044917583465576, - "eval_rewards/accuracies": 0.5942136645317078, - "eval_rewards/chosen": -1.3380868434906006, - "eval_rewards/margins": 0.27708685398101807, - "eval_rewards/rejected": -1.615173578262329, - "eval_runtime": 34.7176, - "eval_samples_per_second": 38.741, - "eval_semantic_entropy": 0.7753984332084656, - "eval_steps_per_second": 9.707, + "eval_logits/chosen": -0.006359766703099012, + "eval_logits/rejected": 0.05871352180838585, + "eval_logps/chosen": -1.3054404258728027, + "eval_logps/rejected": -1.5042866468429565, + "eval_loss": 1.6468573808670044, + "eval_rewards/accuracies": 0.5712166428565979, + "eval_rewards/chosen": -1.3054404258728027, + "eval_rewards/margins": 0.1988460123538971, + "eval_rewards/rejected": -1.5042866468429565, + "eval_runtime": 40.3603, + "eval_samples_per_second": 33.325, + "eval_steps_per_second": 8.35, "step": 4400 }, { "epoch": 2.357584880414785, - "grad_norm": 7.596164051086343, + "grad_norm": 7.648024547279085, "learning_rate": 1.3311108361913015e-07, - "logits/chosen": -0.21881377696990967, - "logits/rejected": -0.17788389325141907, - "logps/chosen": -1.2354278564453125, - "logps/rejected": -1.4550416469573975, - "loss": 1.9945, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2354278564453125, - "rewards/margins": 0.21961364150047302, - "rewards/rejected": -1.4550416469573975, - "semantic_entropy": 0.8208361864089966, + "logits/chosen": -0.2932616174221039, + "logits/rejected": -0.2517506182193756, + "logps/chosen": -1.2127629518508911, + "logps/rejected": -1.3618614673614502, + "loss": 1.5902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2127629518508911, + "rewards/margins": 0.1490984857082367, + "rewards/rejected": -1.3618614673614502, "step": 4405 }, { "epoch": 2.3602609131961865, - "grad_norm": 7.299831982798536, + "grad_norm": 6.568924993201521, "learning_rate": 1.3205478209251874e-07, - "logits/chosen": -0.17249411344528198, - "logits/rejected": -0.14244119822978973, - "logps/chosen": -1.3397963047027588, - "logps/rejected": -1.7014557123184204, - "loss": 2.0249, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3397963047027588, - "rewards/margins": 0.3616591989994049, - "rewards/rejected": -1.7014557123184204, - "semantic_entropy": 0.7657409906387329, + "logits/chosen": -0.26170510053634644, + "logits/rejected": -0.23782913386821747, + "logps/chosen": -1.3037235736846924, + "logps/rejected": -1.5700328350067139, + "loss": 1.6319, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3037235736846924, + "rewards/margins": 0.26630908250808716, + "rewards/rejected": -1.5700328350067139, "step": 4410 }, { "epoch": 2.362936945977588, - "grad_norm": 8.269555867593573, + "grad_norm": 7.537474141208627, "learning_rate": 1.310020503720254e-07, - "logits/chosen": -0.13108180463314056, - "logits/rejected": 0.027709677815437317, - "logps/chosen": -1.2893644571304321, - "logps/rejected": -1.5708500146865845, - "loss": 2.0016, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2893644571304321, - "rewards/margins": 0.2814854681491852, - "rewards/rejected": -1.5708500146865845, - "semantic_entropy": 0.7902665734291077, + "logits/chosen": -0.1868119090795517, + "logits/rejected": -0.05980759859085083, + "logps/chosen": -1.2489304542541504, + "logps/rejected": -1.447834849357605, + "loss": 1.5896, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2489304542541504, + "rewards/margins": 0.19890446960926056, + "rewards/rejected": -1.447834849357605, "step": 4415 }, { "epoch": 2.36561297875899, - "grad_norm": 10.519560075344469, + "grad_norm": 8.586632763586499, "learning_rate": 1.2995289867125752e-07, - "logits/chosen": -0.16770415008068085, - "logits/rejected": -0.08232822269201279, - "logps/chosen": -1.3121337890625, - "logps/rejected": -1.4900834560394287, - "loss": 2.0503, + "logits/chosen": -0.2403072863817215, + "logits/rejected": -0.17797724902629852, + "logps/chosen": -1.279308795928955, + "logps/rejected": -1.402352213859558, + "loss": 1.6493, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3121337890625, - "rewards/margins": 0.17794954776763916, - "rewards/rejected": -1.4900834560394287, - "semantic_entropy": 0.7878513932228088, + "rewards/chosen": -1.279308795928955, + "rewards/margins": 0.12304346263408661, + "rewards/rejected": -1.402352213859558, "step": 4420 }, { "epoch": 2.368289011540391, - "grad_norm": 9.567672683788386, + "grad_norm": 8.382363884634668, "learning_rate": 1.2890733716908986e-07, - "logits/chosen": -0.14897114038467407, - "logits/rejected": -0.04493894428014755, - "logps/chosen": -1.2099311351776123, - "logps/rejected": -1.5338737964630127, - "loss": 1.9236, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2099311351776123, - "rewards/margins": 0.3239426612854004, - "rewards/rejected": -1.5338737964630127, - "semantic_entropy": 0.8020665049552917, + "logits/chosen": -0.21785800158977509, + "logits/rejected": -0.12086410820484161, + "logps/chosen": -1.191899061203003, + "logps/rejected": -1.4459514617919922, + "loss": 1.5283, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.191899061203003, + "rewards/margins": 0.2540523111820221, + "rewards/rejected": -1.4459514617919922, "step": 4425 }, { "epoch": 2.370965044321793, - "grad_norm": 7.5503797512692135, + "grad_norm": 5.937326547288543, "learning_rate": 1.2786537600956454e-07, - "logits/chosen": -0.1726982593536377, - "logits/rejected": -0.00577501067891717, - "logps/chosen": -1.3117141723632812, - "logps/rejected": -1.6250488758087158, - "loss": 2.0053, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3117141723632812, - "rewards/margins": 0.3133348226547241, - "rewards/rejected": -1.6250488758087158, - "semantic_entropy": 0.7708010673522949, + "logits/chosen": -0.22209754586219788, + "logits/rejected": -0.09047858417034149, + "logps/chosen": -1.2848058938980103, + "logps/rejected": -1.501165747642517, + "loss": 1.6148, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2848058938980103, + "rewards/margins": 0.21635976433753967, + "rewards/rejected": -1.501165747642517, "step": 4430 }, { "epoch": 2.3736410771031946, - "grad_norm": 7.026259775073373, + "grad_norm": 6.600559161032941, "learning_rate": 1.268270253017933e-07, - "logits/chosen": -0.25417202711105347, - "logits/rejected": -0.04576032608747482, - "logps/chosen": -1.2097904682159424, - "logps/rejected": -1.5549266338348389, - "loss": 1.9324, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2097904682159424, - "rewards/margins": 0.34513604640960693, - "rewards/rejected": -1.5549266338348389, - "semantic_entropy": 0.8113786578178406, + "logits/chosen": -0.3405889868736267, + "logits/rejected": -0.16396290063858032, + "logps/chosen": -1.1788181066513062, + "logps/rejected": -1.4323985576629639, + "loss": 1.5256, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1788181066513062, + "rewards/margins": 0.2535802125930786, + "rewards/rejected": -1.4323985576629639, "step": 4435 }, { "epoch": 2.376317109884596, - "grad_norm": 11.649166736443838, + "grad_norm": 9.478762843203286, "learning_rate": 1.257922951198591e-07, - "logits/chosen": -0.24857386946678162, - "logits/rejected": -0.04840501770377159, - "logps/chosen": -1.3066952228546143, - "logps/rejected": -1.5375475883483887, - "loss": 2.0276, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3066952228546143, - "rewards/margins": 0.23085255920886993, - "rewards/rejected": -1.5375475883483887, - "semantic_entropy": 0.7833686470985413, + "logits/chosen": -0.29350167512893677, + "logits/rejected": -0.129825621843338, + "logps/chosen": -1.2769209146499634, + "logps/rejected": -1.4102004766464233, + "loss": 1.6316, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2769209146499634, + "rewards/margins": 0.1332794427871704, + "rewards/rejected": -1.4102004766464233, "step": 4440 }, { "epoch": 2.3789931426659976, - "grad_norm": 9.528891181841956, + "grad_norm": 8.174733232105966, "learning_rate": 1.24761195502719e-07, - "logits/chosen": -0.20785681903362274, - "logits/rejected": -0.026444310322403908, - "logps/chosen": -1.2633240222930908, - "logps/rejected": -1.555964708328247, - "loss": 2.0104, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2633240222930908, - "rewards/margins": 0.2926408350467682, - "rewards/rejected": -1.555964708328247, - "semantic_entropy": 0.7821134924888611, + "logits/chosen": -0.26454395055770874, + "logits/rejected": -0.11497775465250015, + "logps/chosen": -1.229234218597412, + "logps/rejected": -1.4280798435211182, + "loss": 1.6064, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.229234218597412, + "rewards/margins": 0.1988455057144165, + "rewards/rejected": -1.4280798435211182, "step": 4445 }, { "epoch": 2.3816691754473993, - "grad_norm": 10.98983209743162, + "grad_norm": 9.864670169770434, "learning_rate": 1.2373373645410573e-07, - "logits/chosen": -0.11539150774478912, - "logits/rejected": 0.014820380136370659, - "logps/chosen": -1.3207120895385742, - "logps/rejected": -1.648510217666626, - "loss": 2.0113, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3207120895385742, - "rewards/margins": 0.3277982473373413, - "rewards/rejected": -1.648510217666626, - "semantic_entropy": 0.7733095288276672, + "logits/chosen": -0.20456938445568085, + "logits/rejected": -0.09650013595819473, + "logps/chosen": -1.286020040512085, + "logps/rejected": -1.4996373653411865, + "loss": 1.6172, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.286020040512085, + "rewards/margins": 0.21361732482910156, + "rewards/rejected": -1.4996373653411865, "step": 4450 }, { "epoch": 2.384345208228801, - "grad_norm": 8.857866072317233, + "grad_norm": 8.026644075668933, "learning_rate": 1.2270992794243175e-07, - "logits/chosen": -0.22098317742347717, - "logits/rejected": -0.12461896240711212, - "logps/chosen": -1.3009580373764038, - "logps/rejected": -1.5917580127716064, - "loss": 2.0043, + "logits/chosen": -0.2971116900444031, + "logits/rejected": -0.22733533382415771, + "logps/chosen": -1.2687150239944458, + "logps/rejected": -1.4610542058944702, + "loss": 1.6074, "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3009580373764038, - "rewards/margins": 0.29079997539520264, - "rewards/rejected": -1.5917580127716064, - "semantic_entropy": 0.7838797569274902, + "rewards/chosen": -1.2687150239944458, + "rewards/margins": 0.1923392117023468, + "rewards/rejected": -1.4610542058944702, "step": 4455 }, { "epoch": 2.3870212410102023, - "grad_norm": 11.89718596474689, + "grad_norm": 7.4530620599680395, "learning_rate": 1.2168977990069147e-07, - "logits/chosen": -0.2289695292711258, - "logits/rejected": -0.0018982291221618652, - "logps/chosen": -1.2188758850097656, - "logps/rejected": -1.6825908422470093, - "loss": 1.9148, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2188758850097656, - "rewards/margins": 0.463715136051178, - "rewards/rejected": -1.6825908422470093, - "semantic_entropy": 0.802188515663147, + "logits/chosen": -0.29837164282798767, + "logits/rejected": -0.10467962175607681, + "logps/chosen": -1.1872581243515015, + "logps/rejected": -1.5156866312026978, + "loss": 1.5082, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1872581243515015, + "rewards/margins": 0.3284284472465515, + "rewards/rejected": -1.5156866312026978, "step": 4460 }, { "epoch": 2.389697273791604, - "grad_norm": 7.267727110713755, + "grad_norm": 6.948344264565798, "learning_rate": 1.206733022263659e-07, - "logits/chosen": -0.20777158439159393, - "logits/rejected": -0.022397834807634354, - "logps/chosen": -1.3242433071136475, - "logps/rejected": -1.5615819692611694, - "loss": 2.0477, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3242433071136475, - "rewards/margins": 0.23733873665332794, - "rewards/rejected": -1.5615819692611694, - "semantic_entropy": 0.7919968366622925, + "logits/chosen": -0.29605644941329956, + "logits/rejected": -0.14458665251731873, + "logps/chosen": -1.2869765758514404, + "logps/rejected": -1.433555245399475, + "loss": 1.6447, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2869765758514404, + "rewards/margins": 0.1465785950422287, + "rewards/rejected": -1.433555245399475, "step": 4465 }, { "epoch": 2.3923733065730053, - "grad_norm": 9.276616217142173, + "grad_norm": 8.034435333341417, "learning_rate": 1.1966050478132572e-07, - "logits/chosen": -0.08144637942314148, - "logits/rejected": -0.006868818309158087, - "logps/chosen": -1.1734609603881836, - "logps/rejected": -1.466123104095459, - "loss": 1.9461, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1734609603881836, - "rewards/margins": 0.29266220331192017, - "rewards/rejected": -1.466123104095459, - "semantic_entropy": 0.8322067260742188, + "logits/chosen": -0.16029222309589386, + "logits/rejected": -0.09777076542377472, + "logps/chosen": -1.1405402421951294, + "logps/rejected": -1.3531912565231323, + "loss": 1.526, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1405402421951294, + "rewards/margins": 0.21265116333961487, + "rewards/rejected": -1.3531912565231323, "step": 4470 }, { "epoch": 2.395049339354407, - "grad_norm": 10.285711021808112, + "grad_norm": 8.062583397766922, "learning_rate": 1.1865139739173635e-07, - "logits/chosen": -0.17131993174552917, - "logits/rejected": 0.04961296170949936, - "logps/chosen": -1.2460860013961792, - "logps/rejected": -1.560231328010559, - "loss": 1.9621, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2460860013961792, - "rewards/margins": 0.31414538621902466, - "rewards/rejected": -1.560231328010559, - "semantic_entropy": 0.7961411476135254, + "logits/chosen": -0.2398650348186493, + "logits/rejected": -0.048020146787166595, + "logps/chosen": -1.2048254013061523, + "logps/rejected": -1.4288790225982666, + "loss": 1.5512, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2048254013061523, + "rewards/margins": 0.22405338287353516, + "rewards/rejected": -1.4288790225982666, "step": 4475 }, { "epoch": 2.3977253721358087, - "grad_norm": 12.552218630123326, + "grad_norm": 8.999501778068137, "learning_rate": 1.1764598984796187e-07, - "logits/chosen": -0.15812304615974426, - "logits/rejected": -0.07235546410083771, - "logps/chosen": -1.180234432220459, - "logps/rejected": -1.4472579956054688, - "loss": 1.9585, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.180234432220459, - "rewards/margins": 0.26702359318733215, - "rewards/rejected": -1.4472579956054688, - "semantic_entropy": 0.8320823907852173, + "logits/chosen": -0.19769065082073212, + "logits/rejected": -0.13914498686790466, + "logps/chosen": -1.1431434154510498, + "logps/rejected": -1.3402740955352783, + "loss": 1.5346, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1431434154510498, + "rewards/margins": 0.19713075459003448, + "rewards/rejected": -1.3402740955352783, "step": 4480 }, { "epoch": 2.4004014049172104, - "grad_norm": 14.335288849532814, + "grad_norm": 11.91169582300977, "learning_rate": 1.1664429190447095e-07, - "logits/chosen": -0.15513862669467926, - "logits/rejected": -0.05408088117837906, - "logps/chosen": -1.2913730144500732, - "logps/rejected": -1.6561282873153687, - "loss": 2.0005, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2913730144500732, - "rewards/margins": 0.364755243062973, - "rewards/rejected": -1.6561282873153687, - "semantic_entropy": 0.7903246879577637, + "logits/chosen": -0.25468355417251587, + "logits/rejected": -0.16460008919239044, + "logps/chosen": -1.2626378536224365, + "logps/rejected": -1.5172367095947266, + "loss": 1.6013, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2626378536224365, + "rewards/margins": 0.25459879636764526, + "rewards/rejected": -1.5172367095947266, "step": 4485 }, { "epoch": 2.4030774376986117, - "grad_norm": 9.286973520864837, + "grad_norm": 7.198831095781075, "learning_rate": 1.1564631327974122e-07, - "logits/chosen": -0.20581397414207458, - "logits/rejected": 0.0023672953248023987, - "logps/chosen": -1.251208782196045, - "logps/rejected": -1.6634395122528076, - "loss": 1.9302, + "logits/chosen": -0.2648671269416809, + "logits/rejected": -0.08476381003856659, + "logps/chosen": -1.2206342220306396, + "logps/rejected": -1.51417875289917, + "loss": 1.5395, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.251208782196045, - "rewards/margins": 0.412230908870697, - "rewards/rejected": -1.6634395122528076, - "semantic_entropy": 0.781873881816864, + "rewards/chosen": -1.2206342220306396, + "rewards/margins": 0.2935445010662079, + "rewards/rejected": -1.51417875289917, "step": 4490 }, { "epoch": 2.4057534704800134, - "grad_norm": 11.572794160860106, + "grad_norm": 9.336637921006375, "learning_rate": 1.1465206365616587e-07, - "logits/chosen": -0.27398204803466797, - "logits/rejected": -0.06987808644771576, - "logps/chosen": -1.2382861375808716, - "logps/rejected": -1.5518062114715576, - "loss": 1.9656, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2382861375808716, - "rewards/margins": 0.31352001428604126, - "rewards/rejected": -1.5518062114715576, - "semantic_entropy": 0.8056244850158691, + "logits/chosen": -0.31433624029159546, + "logits/rejected": -0.13823679089546204, + "logps/chosen": -1.2059760093688965, + "logps/rejected": -1.4413530826568604, + "loss": 1.5535, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2059760093688965, + "rewards/margins": 0.23537695407867432, + "rewards/rejected": -1.4413530826568604, "step": 4495 }, { "epoch": 2.408429503261415, - "grad_norm": 7.479658537224136, + "grad_norm": 6.755312518786944, "learning_rate": 1.1366155267995887e-07, - "logits/chosen": -0.0675680860877037, - "logits/rejected": -0.06864787638187408, - "logps/chosen": -1.2645561695098877, - "logps/rejected": -1.5393455028533936, - "loss": 1.9899, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2645561695098877, - "rewards/margins": 0.2747894823551178, - "rewards/rejected": -1.5393455028533936, - "semantic_entropy": 0.7883538007736206, + "logits/chosen": -0.12695498764514923, + "logits/rejected": -0.13664838671684265, + "logps/chosen": -1.2372138500213623, + "logps/rejected": -1.433397650718689, + "loss": 1.5933, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2372138500213623, + "rewards/margins": 0.19618380069732666, + "rewards/rejected": -1.433397650718689, "step": 4500 }, { "epoch": 2.4111055360428164, - "grad_norm": 12.011976990634052, + "grad_norm": 10.439826793339416, "learning_rate": 1.1267478996106228e-07, - "logits/chosen": -0.21940676867961884, - "logits/rejected": -0.0764898955821991, - "logps/chosen": -1.252802848815918, - "logps/rejected": -1.6330541372299194, - "loss": 1.9569, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.252802848815918, - "rewards/margins": 0.3802511990070343, - "rewards/rejected": -1.6330541372299194, - "semantic_entropy": 0.7870553135871887, + "logits/chosen": -0.273817241191864, + "logits/rejected": -0.15390698611736298, + "logps/chosen": -1.2211644649505615, + "logps/rejected": -1.5253816843032837, + "loss": 1.5589, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2211644649505615, + "rewards/margins": 0.30421727895736694, + "rewards/rejected": -1.5253816843032837, "step": 4505 }, { "epoch": 2.413781568824218, - "grad_norm": 13.002078309301691, + "grad_norm": 11.234889573389628, "learning_rate": 1.116917850730521e-07, - "logits/chosen": -0.22240960597991943, - "logits/rejected": -0.09078870713710785, - "logps/chosen": -1.2161216735839844, - "logps/rejected": -1.4928818941116333, - "loss": 1.9765, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2161216735839844, - "rewards/margins": 0.27676016092300415, - "rewards/rejected": -1.4928818941116333, - "semantic_entropy": 0.8293957710266113, + "logits/chosen": -0.3066954016685486, + "logits/rejected": -0.20369398593902588, + "logps/chosen": -1.1758203506469727, + "logps/rejected": -1.3948097229003906, + "loss": 1.5495, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1758203506469727, + "rewards/margins": 0.21898940205574036, + "rewards/rejected": -1.3948097229003906, "step": 4510 }, { "epoch": 2.41645760160562, - "grad_norm": 6.662239576361183, + "grad_norm": 6.189244157909716, "learning_rate": 1.1071254755304637e-07, - "logits/chosen": -0.19843384623527527, - "logits/rejected": -0.12356531620025635, - "logps/chosen": -1.2522376775741577, - "logps/rejected": -1.585841417312622, - "loss": 1.96, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2522376775741577, - "rewards/margins": 0.3336038589477539, - "rewards/rejected": -1.585841417312622, - "semantic_entropy": 0.7871413230895996, + "logits/chosen": -0.2799038290977478, + "logits/rejected": -0.22861607372760773, + "logps/chosen": -1.2325727939605713, + "logps/rejected": -1.4817126989364624, + "loss": 1.5636, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2325727939605713, + "rewards/margins": 0.24913974106311798, + "rewards/rejected": -1.4817126989364624, "step": 4515 }, { "epoch": 2.419133634387021, - "grad_norm": 9.333033231682505, + "grad_norm": 7.701897641900147, "learning_rate": 1.0973708690161143e-07, - "logits/chosen": -0.16699953377246857, - "logits/rejected": -0.07821333408355713, - "logps/chosen": -1.233888864517212, - "logps/rejected": -1.620827317237854, - "loss": 1.9267, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.233888864517212, - "rewards/margins": 0.38693851232528687, - "rewards/rejected": -1.620827317237854, - "semantic_entropy": 0.7967000007629395, + "logits/chosen": -0.24316546320915222, + "logits/rejected": -0.18140380084514618, + "logps/chosen": -1.2099978923797607, + "logps/rejected": -1.4995787143707275, + "loss": 1.5331, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2099978923797607, + "rewards/margins": 0.28958070278167725, + "rewards/rejected": -1.4995787143707275, "step": 4520 }, { "epoch": 2.421809667168423, - "grad_norm": 11.82695740036326, + "grad_norm": 10.315218953869728, "learning_rate": 1.0876541258267119e-07, - "logits/chosen": -0.24223777651786804, - "logits/rejected": -0.06421225517988205, - "logps/chosen": -1.353435754776001, - "logps/rejected": -1.679324746131897, - "loss": 2.0156, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.353435754776001, - "rewards/margins": 0.32588905096054077, - "rewards/rejected": -1.679324746131897, - "semantic_entropy": 0.7549802660942078, + "logits/chosen": -0.3166322112083435, + "logits/rejected": -0.16723382472991943, + "logps/chosen": -1.3230161666870117, + "logps/rejected": -1.5725752115249634, + "loss": 1.6338, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3230161666870117, + "rewards/margins": 0.2495591938495636, + "rewards/rejected": -1.5725752115249634, "step": 4525 }, { "epoch": 2.4244856999498245, - "grad_norm": 6.298179954701871, + "grad_norm": 6.37458432411914, "learning_rate": 1.0779753402341379e-07, - "logits/chosen": -0.21974477171897888, - "logits/rejected": -0.12402723729610443, - "logps/chosen": -1.2834287881851196, - "logps/rejected": -1.5347378253936768, - "loss": 2.0122, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2834287881851196, - "rewards/margins": 0.25130897760391235, - "rewards/rejected": -1.5347378253936768, - "semantic_entropy": 0.793082594871521, + "logits/chosen": -0.28852128982543945, + "logits/rejected": -0.2155466377735138, + "logps/chosen": -1.2530457973480225, + "logps/rejected": -1.438676118850708, + "loss": 1.6031, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2530457973480225, + "rewards/margins": 0.18563035130500793, + "rewards/rejected": -1.438676118850708, "step": 4530 }, { "epoch": 2.427161732731226, - "grad_norm": 8.08713473417835, + "grad_norm": 6.944524704067793, "learning_rate": 1.0683346061420157e-07, - "logits/chosen": -0.0673285499215126, - "logits/rejected": 0.03805375471711159, - "logps/chosen": -1.2332861423492432, - "logps/rejected": -1.5473854541778564, - "loss": 1.9842, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2332861423492432, - "rewards/margins": 0.31409937143325806, - "rewards/rejected": -1.5473854541778564, - "semantic_entropy": 0.7957872152328491, + "logits/chosen": -0.15872615575790405, + "logits/rejected": -0.08008311688899994, + "logps/chosen": -1.2024890184402466, + "logps/rejected": -1.4294945001602173, + "loss": 1.5812, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2024890184402466, + "rewards/margins": 0.22700531780719757, + "rewards/rejected": -1.4294945001602173, "step": 4535 }, { "epoch": 2.4298377655126275, - "grad_norm": 7.452381023087329, + "grad_norm": 7.221047565580255, "learning_rate": 1.0587320170847874e-07, - "logits/chosen": -0.10687969624996185, - "logits/rejected": -0.037527017295360565, - "logps/chosen": -1.216997504234314, - "logps/rejected": -1.5355608463287354, - "loss": 1.9651, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.216997504234314, - "rewards/margins": 0.3185632824897766, - "rewards/rejected": -1.5355608463287354, - "semantic_entropy": 0.8129452466964722, + "logits/chosen": -0.1756131798028946, + "logits/rejected": -0.12765046954154968, + "logps/chosen": -1.1767234802246094, + "logps/rejected": -1.3959977626800537, + "loss": 1.5461, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1767234802246094, + "rewards/margins": 0.21927419304847717, + "rewards/rejected": -1.3959977626800537, "step": 4540 }, { "epoch": 2.4325137982940293, - "grad_norm": 8.387316558142318, + "grad_norm": 7.195056037934172, "learning_rate": 1.0491676662268156e-07, - "logits/chosen": -0.13831308484077454, - "logits/rejected": -0.007977311499416828, - "logps/chosen": -1.2193646430969238, - "logps/rejected": -1.551476240158081, - "loss": 1.978, + "logits/chosen": -0.22789350152015686, + "logits/rejected": -0.11055190861225128, + "logps/chosen": -1.1836001873016357, + "logps/rejected": -1.4245049953460693, + "loss": 1.5593, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2193646430969238, - "rewards/margins": 0.33211150765419006, - "rewards/rejected": -1.551476240158081, - "semantic_entropy": 0.8200357556343079, + "rewards/chosen": -1.1836001873016357, + "rewards/margins": 0.24090464413166046, + "rewards/rejected": -1.4245049953460693, "step": 4545 }, { "epoch": 2.4351898310754305, - "grad_norm": 11.386155774547941, + "grad_norm": 10.498404475913253, "learning_rate": 1.0396416463614732e-07, - "logits/chosen": -0.2331191748380661, - "logits/rejected": -0.1288275122642517, - "logps/chosen": -1.1922945976257324, - "logps/rejected": -1.5517548322677612, - "loss": 1.9585, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1922945976257324, - "rewards/margins": 0.35946017503738403, - "rewards/rejected": -1.5517548322677612, - "semantic_entropy": 0.8209033012390137, + "logits/chosen": -0.27692264318466187, + "logits/rejected": -0.18639354407787323, + "logps/chosen": -1.1665010452270508, + "logps/rejected": -1.4428874254226685, + "loss": 1.5453, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1665010452270508, + "rewards/margins": 0.2763864994049072, + "rewards/rejected": -1.4428874254226685, "step": 4550 }, { "epoch": 2.4378658638568322, - "grad_norm": 9.123494885824242, + "grad_norm": 7.8526590966783125, "learning_rate": 1.0301540499102479e-07, - "logits/chosen": -0.16231347620487213, - "logits/rejected": -0.09428876638412476, - "logps/chosen": -1.315527081489563, - "logps/rejected": -1.5254814624786377, - "loss": 2.0267, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.315527081489563, - "rewards/margins": 0.2099544107913971, - "rewards/rejected": -1.5254814624786377, - "semantic_entropy": 0.7753323316574097, + "logits/chosen": -0.2460877150297165, + "logits/rejected": -0.19337491691112518, + "logps/chosen": -1.281903624534607, + "logps/rejected": -1.4304178953170776, + "loss": 1.627, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.281903624534607, + "rewards/margins": 0.14851422607898712, + "rewards/rejected": -1.4304178953170776, "step": 4555 }, { "epoch": 2.440541896638234, - "grad_norm": 10.900991920728584, + "grad_norm": 9.928785492272352, "learning_rate": 1.0207049689218405e-07, - "logits/chosen": -0.1922842115163803, - "logits/rejected": -0.01617896556854248, - "logps/chosen": -1.2199078798294067, - "logps/rejected": -1.5163733959197998, - "loss": 1.9736, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2199078798294067, - "rewards/margins": 0.29646551609039307, - "rewards/rejected": -1.5163733959197998, - "semantic_entropy": 0.8171917796134949, + "logits/chosen": -0.2748548090457916, + "logits/rejected": -0.13906711339950562, + "logps/chosen": -1.1843833923339844, + "logps/rejected": -1.3841426372528076, + "loss": 1.5605, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1843833923339844, + "rewards/margins": 0.19975917041301727, + "rewards/rejected": -1.3841426372528076, "step": 4560 }, { "epoch": 2.4432179294196352, - "grad_norm": 15.26690120576109, + "grad_norm": 9.3550788719883, "learning_rate": 1.0112944950712782e-07, - "logits/chosen": -0.14947383105754852, - "logits/rejected": -0.06626245379447937, - "logps/chosen": -1.3093255758285522, - "logps/rejected": -1.574352741241455, - "loss": 2.0129, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3093255758285522, - "rewards/margins": 0.26502716541290283, - "rewards/rejected": -1.574352741241455, - "semantic_entropy": 0.7754411101341248, + "logits/chosen": -0.21005089581012726, + "logits/rejected": -0.154231995344162, + "logps/chosen": -1.285735011100769, + "logps/rejected": -1.4750357866287231, + "loss": 1.6221, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.285735011100769, + "rewards/margins": 0.1893007755279541, + "rewards/rejected": -1.4750357866287231, "step": 4565 }, { "epoch": 2.445893962201037, - "grad_norm": 10.773928342514377, + "grad_norm": 6.711557098770309, "learning_rate": 1.0019227196590174e-07, - "logits/chosen": -0.10572180896997452, - "logits/rejected": 0.0469868965446949, - "logps/chosen": -1.2659401893615723, - "logps/rejected": -1.6247440576553345, - "loss": 1.969, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2659401893615723, - "rewards/margins": 0.35880374908447266, - "rewards/rejected": -1.6247440576553345, - "semantic_entropy": 0.7877820134162903, + "logits/chosen": -0.17600271105766296, + "logits/rejected": -0.05018197372555733, + "logps/chosen": -1.2361449003219604, + "logps/rejected": -1.4916973114013672, + "loss": 1.5731, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2361449003219604, + "rewards/margins": 0.2555524706840515, + "rewards/rejected": -1.4916973114013672, "step": 4570 }, { "epoch": 2.4485699949824387, - "grad_norm": 8.452581423842831, + "grad_norm": 7.170275009538907, "learning_rate": 9.925897336100664e-08, - "logits/chosen": -0.06749458611011505, - "logits/rejected": -0.013055408373475075, - "logps/chosen": -1.2595086097717285, - "logps/rejected": -1.55685293674469, - "loss": 1.9857, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2595086097717285, - "rewards/margins": 0.2973443865776062, - "rewards/rejected": -1.55685293674469, - "semantic_entropy": 0.7922778129577637, + "logits/chosen": -0.1657809019088745, + "logits/rejected": -0.12726573646068573, + "logps/chosen": -1.2311041355133057, + "logps/rejected": -1.4348998069763184, + "loss": 1.5876, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2311041355133057, + "rewards/margins": 0.20379574596881866, + "rewards/rejected": -1.4348998069763184, "step": 4575 }, { "epoch": 2.45124602776384, - "grad_norm": 9.189277140411864, + "grad_norm": 8.412859806198218, "learning_rate": 9.832956274730946e-08, - "logits/chosen": -0.16322560608386993, - "logits/rejected": -0.12031744420528412, - "logps/chosen": -1.2119220495224, - "logps/rejected": -1.5800087451934814, - "loss": 1.9331, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2119220495224, - "rewards/margins": 0.36808669567108154, - "rewards/rejected": -1.5800087451934814, - "semantic_entropy": 0.7977925539016724, + "logits/chosen": -0.25646305084228516, + "logits/rejected": -0.22181513905525208, + "logps/chosen": -1.1789541244506836, + "logps/rejected": -1.4659755229949951, + "loss": 1.5227, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1789541244506836, + "rewards/margins": 0.2870217263698578, + "rewards/rejected": -1.4659755229949951, "step": 4580 }, { "epoch": 2.4539220605452416, - "grad_norm": 7.486266111441383, + "grad_norm": 6.681635468763091, "learning_rate": 9.740404914195633e-08, - "logits/chosen": -0.13839499652385712, - "logits/rejected": -0.005443167872726917, - "logps/chosen": -1.3202592134475708, - "logps/rejected": -1.5658947229385376, - "loss": 2.0481, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3202592134475708, - "rewards/margins": 0.24563536047935486, - "rewards/rejected": -1.5658947229385376, - "semantic_entropy": 0.7758022546768188, + "logits/chosen": -0.20302769541740417, + "logits/rejected": -0.09072286635637283, + "logps/chosen": -1.2906471490859985, + "logps/rejected": -1.4585412740707397, + "loss": 1.6575, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2906471490859985, + "rewards/margins": 0.1678941547870636, + "rewards/rejected": -1.4585412740707397, "step": 4585 }, { "epoch": 2.4565980933266434, - "grad_norm": 7.631634394696681, + "grad_norm": 6.894788190860359, "learning_rate": 9.648244152428392e-08, - "logits/chosen": -0.26290029287338257, - "logits/rejected": -0.08336208015680313, - "logps/chosen": -1.2126243114471436, - "logps/rejected": -1.52330482006073, - "loss": 1.9656, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2126243114471436, - "rewards/margins": 0.3106805682182312, - "rewards/rejected": -1.52330482006073, - "semantic_entropy": 0.8142677545547485, + "logits/chosen": -0.3316098749637604, + "logits/rejected": -0.1765943318605423, + "logps/chosen": -1.181456446647644, + "logps/rejected": -1.4261738061904907, + "loss": 1.5514, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.181456446647644, + "rewards/margins": 0.2447173297405243, + "rewards/rejected": -1.4261738061904907, "step": 4590 }, { "epoch": 2.4592741261080446, - "grad_norm": 8.523281182411818, + "grad_norm": 7.58771593401615, "learning_rate": 9.556474883573379e-08, - "logits/chosen": -0.20482425391674042, - "logits/rejected": -0.09075477719306946, - "logps/chosen": -1.264404535293579, - "logps/rejected": -1.6324313879013062, - "loss": 1.9884, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.264404535293579, - "rewards/margins": 0.3680269420146942, - "rewards/rejected": -1.6324313879013062, - "semantic_entropy": 0.7956129312515259, + "logits/chosen": -0.2860944867134094, + "logits/rejected": -0.19084931910037994, + "logps/chosen": -1.2366198301315308, + "logps/rejected": -1.508881688117981, + "loss": 1.5823, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2366198301315308, + "rewards/margins": 0.2722617983818054, + "rewards/rejected": -1.508881688117981, "step": 4595 }, { "epoch": 2.4619501588894463, - "grad_norm": 10.034992700566619, + "grad_norm": 8.280607864909799, "learning_rate": 9.465097997976412e-08, - "logits/chosen": -0.21117672324180603, - "logits/rejected": 0.03872231766581535, - "logps/chosen": -1.3046801090240479, - "logps/rejected": -1.6931383609771729, - "loss": 1.9877, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3046801090240479, - "rewards/margins": 0.38845834136009216, - "rewards/rejected": -1.6931383609771729, - "semantic_entropy": 0.769001841545105, + "logits/chosen": -0.27023738622665405, + "logits/rejected": -0.049452733248472214, + "logps/chosen": -1.28057861328125, + "logps/rejected": -1.5539510250091553, + "loss": 1.6078, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.28057861328125, + "rewards/margins": 0.2733725607395172, + "rewards/rejected": -1.5539510250091553, "step": 4600 }, { "epoch": 2.464626191670848, - "grad_norm": 10.548298161954385, + "grad_norm": 8.548402118132431, "learning_rate": 9.374114382176457e-08, - "logits/chosen": -0.13307562470436096, - "logits/rejected": -0.00199460843577981, - "logps/chosen": -1.2904552221298218, - "logps/rejected": -1.6387020349502563, - "loss": 2.0013, + "logits/chosen": -0.2092166692018509, + "logits/rejected": -0.1112133041024208, + "logps/chosen": -1.2589666843414307, + "logps/rejected": -1.5114405155181885, + "loss": 1.6017, "rewards/accuracies": 0.625, - "rewards/chosen": -1.2904552221298218, - "rewards/margins": 0.3482467532157898, - "rewards/rejected": -1.6387020349502563, - "semantic_entropy": 0.7897846698760986, + "rewards/chosen": -1.2589666843414307, + "rewards/margins": 0.2524738907814026, + "rewards/rejected": -1.5114405155181885, "step": 4605 }, { "epoch": 2.46730222445225, - "grad_norm": 9.189176275006018, + "grad_norm": 8.033466740124519, "learning_rate": 9.283524918896945e-08, - "logits/chosen": -0.14514726400375366, - "logits/rejected": 0.01468361634761095, - "logps/chosen": -1.2731126546859741, - "logps/rejected": -1.6553637981414795, - "loss": 1.952, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2731126546859741, - "rewards/margins": 0.38225096464157104, - "rewards/rejected": -1.6553637981414795, - "semantic_entropy": 0.7842705845832825, + "logits/chosen": -0.24130284786224365, + "logits/rejected": -0.11197362095117569, + "logps/chosen": -1.2412917613983154, + "logps/rejected": -1.5120117664337158, + "loss": 1.5583, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2412917613983154, + "rewards/margins": 0.2707201838493347, + "rewards/rejected": -1.5120117664337158, "step": 4610 }, { "epoch": 2.469978257233651, - "grad_norm": 8.431537133463804, + "grad_norm": 7.046513210480663, "learning_rate": 9.193330487037232e-08, - "logits/chosen": -0.10504718869924545, - "logits/rejected": 0.026652539148926735, - "logps/chosen": -1.288559913635254, - "logps/rejected": -1.7076818943023682, - "loss": 1.9672, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.288559913635254, - "rewards/margins": 0.4191219210624695, - "rewards/rejected": -1.7076818943023682, - "semantic_entropy": 0.7821754217147827, + "logits/chosen": -0.18820837140083313, + "logits/rejected": -0.07801246643066406, + "logps/chosen": -1.2589250802993774, + "logps/rejected": -1.547240972518921, + "loss": 1.577, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2589250802993774, + "rewards/margins": 0.28831595182418823, + "rewards/rejected": -1.547240972518921, "step": 4615 }, { "epoch": 2.4726542900150528, - "grad_norm": 9.264085580424796, + "grad_norm": 7.508135876465592, "learning_rate": 9.103531961664118e-08, - "logits/chosen": -0.11797323077917099, - "logits/rejected": 0.07366932928562164, - "logps/chosen": -1.18318772315979, - "logps/rejected": -1.5540214776992798, - "loss": 1.9128, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.18318772315979, - "rewards/margins": 0.37083372473716736, - "rewards/rejected": -1.5540214776992798, - "semantic_entropy": 0.8131403923034668, + "logits/chosen": -0.20133718848228455, + "logits/rejected": -0.03770292550325394, + "logps/chosen": -1.1520895957946777, + "logps/rejected": -1.4470627307891846, + "loss": 1.499, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1520895957946777, + "rewards/margins": 0.29497310519218445, + "rewards/rejected": -1.4470627307891846, "step": 4620 }, { "epoch": 2.475330322796454, - "grad_norm": 6.970011750620556, + "grad_norm": 6.466550652115422, "learning_rate": 9.014130214003269e-08, - "logits/chosen": -0.24690942466259003, - "logits/rejected": -0.19920854270458221, - "logps/chosen": -1.297821283340454, - "logps/rejected": -1.5923163890838623, - "loss": 1.982, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.297821283340454, - "rewards/margins": 0.2944951057434082, - "rewards/rejected": -1.5923163890838623, - "semantic_entropy": 0.7800196409225464, + "logits/chosen": -0.32645565271377563, + "logits/rejected": -0.28997671604156494, + "logps/chosen": -1.2724857330322266, + "logps/rejected": -1.4714481830596924, + "loss": 1.5952, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2724857330322266, + "rewards/margins": 0.1989624947309494, + "rewards/rejected": -1.4714481830596924, "step": 4625 }, { "epoch": 2.4780063555778558, - "grad_norm": 12.967707284344408, + "grad_norm": 11.735900395678097, "learning_rate": 8.925126111430848e-08, - "logits/chosen": -0.10687782615423203, - "logits/rejected": -0.025706391781568527, - "logps/chosen": -1.2271325588226318, - "logps/rejected": -1.5873157978057861, - "loss": 1.9681, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2271325588226318, - "rewards/margins": 0.3601832687854767, - "rewards/rejected": -1.5873157978057861, - "semantic_entropy": 0.8013485670089722, + "logits/chosen": -0.19582371413707733, + "logits/rejected": -0.12573130428791046, + "logps/chosen": -1.198366403579712, + "logps/rejected": -1.4611613750457764, + "loss": 1.5604, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.198366403579712, + "rewards/margins": 0.2627950608730316, + "rewards/rejected": -1.4611613750457764, "step": 4630 }, { "epoch": 2.4806823883592575, - "grad_norm": 12.220403329312555, + "grad_norm": 9.902997434035893, "learning_rate": 8.83652051746504e-08, - "logits/chosen": -0.052401043474674225, - "logits/rejected": 0.09878160059452057, - "logps/chosen": -1.2342487573623657, - "logps/rejected": -1.657254934310913, - "loss": 1.9672, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2342487573623657, - "rewards/margins": 0.42300620675086975, - "rewards/rejected": -1.657254934310913, - "semantic_entropy": 0.7962632179260254, + "logits/chosen": -0.13688859343528748, + "logits/rejected": -0.017780795693397522, + "logps/chosen": -1.2002695798873901, + "logps/rejected": -1.4998315572738647, + "loss": 1.564, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2002695798873901, + "rewards/margins": 0.2995621860027313, + "rewards/rejected": -1.4998315572738647, "step": 4635 }, { "epoch": 2.483358421140659, - "grad_norm": 7.803313803458224, + "grad_norm": 7.351790490600173, "learning_rate": 8.748314291757696e-08, - "logits/chosen": -0.10491069406270981, - "logits/rejected": -0.0009174048900604248, - "logps/chosen": -1.259453296661377, - "logps/rejected": -1.5711205005645752, - "loss": 1.9676, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.259453296661377, - "rewards/margins": 0.3116670548915863, - "rewards/rejected": -1.5711205005645752, - "semantic_entropy": 0.7823437452316284, + "logits/chosen": -0.1672445833683014, + "logits/rejected": -0.07436300069093704, + "logps/chosen": -1.2341772317886353, + "logps/rejected": -1.4727962017059326, + "loss": 1.5704, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2341772317886353, + "rewards/margins": 0.2386188954114914, + "rewards/rejected": -1.4727962017059326, "step": 4640 }, { "epoch": 2.4860344539220605, - "grad_norm": 16.97457748415567, + "grad_norm": 9.39778011100013, "learning_rate": 8.660508290086032e-08, - "logits/chosen": -0.12484784424304962, - "logits/rejected": 0.010466617532074451, - "logps/chosen": -1.2484409809112549, - "logps/rejected": -1.5821198225021362, - "loss": 1.969, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2484409809112549, - "rewards/margins": 0.3336789011955261, - "rewards/rejected": -1.5821198225021362, - "semantic_entropy": 0.7921417951583862, + "logits/chosen": -0.2063862830400467, + "logits/rejected": -0.0940953716635704, + "logps/chosen": -1.2115062475204468, + "logps/rejected": -1.47126305103302, + "loss": 1.5636, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2115062475204468, + "rewards/margins": 0.25975683331489563, + "rewards/rejected": -1.47126305103302, "step": 4645 }, { "epoch": 2.488710486703462, - "grad_norm": 13.315787077562575, + "grad_norm": 8.479287513353329, "learning_rate": 8.573103364344231e-08, - "logits/chosen": -0.17663438618183136, - "logits/rejected": 0.06814370304346085, - "logps/chosen": -1.268634557723999, - "logps/rejected": -1.6621404886245728, - "loss": 1.9485, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.268634557723999, - "rewards/margins": 0.3935058116912842, - "rewards/rejected": -1.6621404886245728, - "semantic_entropy": 0.7878329753875732, + "logits/chosen": -0.24118804931640625, + "logits/rejected": -0.03551122546195984, + "logps/chosen": -1.242187738418579, + "logps/rejected": -1.541870355606079, + "loss": 1.5544, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.242187738418579, + "rewards/margins": 0.2996828556060791, + "rewards/rejected": -1.541870355606079, "step": 4650 }, { "epoch": 2.4913865194848634, - "grad_norm": 9.25210617668145, + "grad_norm": 7.894086829697386, "learning_rate": 8.486100362535292e-08, - "logits/chosen": -0.15664373338222504, - "logits/rejected": -0.006619095802307129, - "logps/chosen": -1.241071343421936, - "logps/rejected": -1.499688744544983, - "loss": 1.9827, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.241071343421936, - "rewards/margins": 0.25861743092536926, - "rewards/rejected": -1.499688744544983, - "semantic_entropy": 0.8064199686050415, + "logits/chosen": -0.25328195095062256, + "logits/rejected": -0.1307801753282547, + "logps/chosen": -1.2145090103149414, + "logps/rejected": -1.3983103036880493, + "loss": 1.5689, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2145090103149414, + "rewards/margins": 0.18380117416381836, + "rewards/rejected": -1.3983103036880493, "step": 4655 }, { "epoch": 2.494062552266265, - "grad_norm": 7.998057347357029, + "grad_norm": 6.646646491542126, "learning_rate": 8.399500128762693e-08, - "logits/chosen": -0.1831817924976349, - "logits/rejected": -0.06365980952978134, - "logps/chosen": -1.3237661123275757, - "logps/rejected": -1.558573603630066, - "loss": 2.0178, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3237661123275757, - "rewards/margins": 0.23480753600597382, - "rewards/rejected": -1.558573603630066, - "semantic_entropy": 0.771876335144043, + "logits/chosen": -0.23050686717033386, + "logits/rejected": -0.12764891982078552, + "logps/chosen": -1.2896080017089844, + "logps/rejected": -1.4467109441757202, + "loss": 1.6302, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2896080017089844, + "rewards/margins": 0.1571030169725418, + "rewards/rejected": -1.4467109441757202, "step": 4660 }, { "epoch": 2.496738585047667, - "grad_norm": 7.958903616032643, + "grad_norm": 7.353091005940125, "learning_rate": 8.313303503222313e-08, - "logits/chosen": -0.18261002004146576, - "logits/rejected": -0.12167014926671982, - "logps/chosen": -1.3508309125900269, - "logps/rejected": -1.6414368152618408, - "loss": 2.0504, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3508309125900269, - "rewards/margins": 0.2906058430671692, - "rewards/rejected": -1.6414368152618408, - "semantic_entropy": 0.7719950675964355, + "logits/chosen": -0.2040676325559616, + "logits/rejected": -0.14961931109428406, + "logps/chosen": -1.3131300210952759, + "logps/rejected": -1.5262869596481323, + "loss": 1.6443, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3131300210952759, + "rewards/margins": 0.21315693855285645, + "rewards/rejected": -1.5262869596481323, "step": 4665 }, { "epoch": 2.4994146178290686, - "grad_norm": 11.210113296265586, + "grad_norm": 8.29195579521603, "learning_rate": 8.227511322194164e-08, - "logits/chosen": -0.16465625166893005, - "logits/rejected": -0.0402381494641304, - "logps/chosen": -1.3120533227920532, - "logps/rejected": -1.5447263717651367, - "loss": 2.016, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3120533227920532, - "rewards/margins": 0.23267300426959991, - "rewards/rejected": -1.5447263717651367, - "semantic_entropy": 0.7820819616317749, + "logits/chosen": -0.21293942630290985, + "logits/rejected": -0.10854591429233551, + "logps/chosen": -1.2803435325622559, + "logps/rejected": -1.4527544975280762, + "loss": 1.6204, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2803435325622559, + "rewards/margins": 0.17241086065769196, + "rewards/rejected": -1.4527544975280762, "step": 4670 }, { "epoch": 2.50209065061047, - "grad_norm": 7.5048010774702005, + "grad_norm": 6.8924834068001415, "learning_rate": 8.142124418034385e-08, - "logits/chosen": -0.11845330893993378, - "logits/rejected": 0.023418676108121872, - "logps/chosen": -1.2196156978607178, - "logps/rejected": -1.5629394054412842, - "loss": 1.9521, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2196156978607178, - "rewards/margins": 0.34332379698753357, - "rewards/rejected": -1.5629394054412842, - "semantic_entropy": 0.8040634989738464, + "logits/chosen": -0.16460387408733368, + "logits/rejected": -0.043806709349155426, + "logps/chosen": -1.186786413192749, + "logps/rejected": -1.441338300704956, + "loss": 1.5477, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.186786413192749, + "rewards/margins": 0.2545519173145294, + "rewards/rejected": -1.441338300704956, "step": 4675 }, { "epoch": 2.5047666833918716, - "grad_norm": 8.96692595819377, + "grad_norm": 8.946938575965241, "learning_rate": 8.057143619167073e-08, - "logits/chosen": -0.12013927847146988, - "logits/rejected": -0.011678531765937805, - "logps/chosen": -1.2727725505828857, - "logps/rejected": -1.6061283349990845, - "loss": 1.9921, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2727725505828857, - "rewards/margins": 0.3333559036254883, - "rewards/rejected": -1.6061283349990845, - "semantic_entropy": 0.7913545966148376, + "logits/chosen": -0.1750279814004898, + "logits/rejected": -0.08176834881305695, + "logps/chosen": -1.2342102527618408, + "logps/rejected": -1.4532325267791748, + "loss": 1.5877, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2342102527618408, + "rewards/margins": 0.21902227401733398, + "rewards/rejected": -1.4532325267791748, "step": 4680 }, { "epoch": 2.507442716173273, - "grad_norm": 7.332356348791178, + "grad_norm": 6.154934974488931, "learning_rate": 7.97256975007633e-08, - "logits/chosen": -0.191193088889122, - "logits/rejected": 0.005521965213119984, - "logps/chosen": -1.2749216556549072, - "logps/rejected": -1.5832358598709106, - "loss": 1.9818, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2749216556549072, - "rewards/margins": 0.30831417441368103, - "rewards/rejected": -1.5832358598709106, - "semantic_entropy": 0.7828585505485535, + "logits/chosen": -0.2888779044151306, + "logits/rejected": -0.13938526809215546, + "logps/chosen": -1.244964838027954, + "logps/rejected": -1.4479520320892334, + "loss": 1.5898, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.244964838027954, + "rewards/margins": 0.20298714935779572, + "rewards/rejected": -1.4479520320892334, "step": 4685 }, { "epoch": 2.5101187489546746, - "grad_norm": 10.237642721143716, + "grad_norm": 9.230252365717345, "learning_rate": 7.888403631298186e-08, - "logits/chosen": -0.11178383976221085, - "logits/rejected": -0.070093534886837, - "logps/chosen": -1.287856936454773, - "logps/rejected": -1.567197561264038, - "loss": 2.0032, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.287856936454773, - "rewards/margins": 0.27934056520462036, - "rewards/rejected": -1.567197561264038, - "semantic_entropy": 0.7800172567367554, + "logits/chosen": -0.20475709438323975, + "logits/rejected": -0.17029550671577454, + "logps/chosen": -1.263304352760315, + "logps/rejected": -1.4428774118423462, + "loss": 1.6101, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.263304352760315, + "rewards/margins": 0.17957308888435364, + "rewards/rejected": -1.4428774118423462, "step": 4690 }, { "epoch": 2.5127947817360763, - "grad_norm": 8.237172823392987, + "grad_norm": 8.54427486444681, "learning_rate": 7.804646079412719e-08, - "logits/chosen": -0.1218346506357193, - "logits/rejected": 0.06685183197259903, - "logps/chosen": -1.2876611948013306, - "logps/rejected": -1.5944554805755615, - "loss": 1.9876, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2876611948013306, - "rewards/margins": 0.30679452419281006, - "rewards/rejected": -1.5944554805755615, - "semantic_entropy": 0.7881008982658386, + "logits/chosen": -0.19619551301002502, + "logits/rejected": -0.035579193383455276, + "logps/chosen": -1.2540932893753052, + "logps/rejected": -1.4471417665481567, + "loss": 1.5896, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2540932893753052, + "rewards/margins": 0.19304853677749634, + "rewards/rejected": -1.4471417665481567, "step": 4695 }, { "epoch": 2.515470814517478, - "grad_norm": 7.592310399773866, + "grad_norm": 7.084263280815062, "learning_rate": 7.72129790703604e-08, - "logits/chosen": -0.2171975076198578, - "logits/rejected": -0.0944719910621643, - "logps/chosen": -1.2436479330062866, - "logps/rejected": -1.553193211555481, - "loss": 1.9712, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2436479330062866, - "rewards/margins": 0.30954509973526, - "rewards/rejected": -1.553193211555481, - "semantic_entropy": 0.8062686920166016, + "logits/chosen": -0.28747788071632385, + "logits/rejected": -0.19399094581604004, + "logps/chosen": -1.2100915908813477, + "logps/rejected": -1.4136308431625366, + "loss": 1.5647, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2100915908813477, + "rewards/margins": 0.20353949069976807, + "rewards/rejected": -1.4136308431625366, "step": 4700 }, { "epoch": 2.5181468472988793, - "grad_norm": 10.166467740095245, + "grad_norm": 8.845836063800892, "learning_rate": 7.638359922812504e-08, - "logits/chosen": -0.08607305586338043, - "logits/rejected": -0.05110453441739082, - "logps/chosen": -1.3230863809585571, - "logps/rejected": -1.6123307943344116, - "loss": 2.0252, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3230863809585571, - "rewards/margins": 0.28924453258514404, - "rewards/rejected": -1.6123307943344116, - "semantic_entropy": 0.7647134065628052, + "logits/chosen": -0.14692391455173492, + "logits/rejected": -0.13063344359397888, + "logps/chosen": -1.2915900945663452, + "logps/rejected": -1.4874224662780762, + "loss": 1.6382, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2915900945663452, + "rewards/margins": 0.19583231210708618, + "rewards/rejected": -1.4874224662780762, "step": 4705 }, { "epoch": 2.520822880080281, - "grad_norm": 11.52393554481213, + "grad_norm": 11.014357071728812, "learning_rate": 7.555832931406774e-08, - "logits/chosen": -0.182388037443161, - "logits/rejected": -0.031195128336548805, - "logps/chosen": -1.2906062602996826, - "logps/rejected": -1.5945990085601807, - "loss": 1.9873, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2906062602996826, - "rewards/margins": 0.3039928078651428, - "rewards/rejected": -1.5945990085601807, - "semantic_entropy": 0.7708030939102173, + "logits/chosen": -0.2220824509859085, + "logits/rejected": -0.10035743564367294, + "logps/chosen": -1.2579433917999268, + "logps/rejected": -1.4760496616363525, + "loss": 1.5911, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2579433917999268, + "rewards/margins": 0.2181061953306198, + "rewards/rejected": -1.4760496616363525, "step": 4710 }, { "epoch": 2.5234989128616827, - "grad_norm": 7.291958045293431, + "grad_norm": 7.105325272287114, "learning_rate": 7.47371773349611e-08, - "logits/chosen": -0.17179132997989655, - "logits/rejected": -0.13924379646778107, - "logps/chosen": -1.347267508506775, - "logps/rejected": -1.642747163772583, - "loss": 2.0267, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.347267508506775, - "rewards/margins": 0.2954796850681305, - "rewards/rejected": -1.642747163772583, - "semantic_entropy": 0.7599186897277832, + "logits/chosen": -0.22952616214752197, + "logits/rejected": -0.21391217410564423, + "logps/chosen": -1.3182785511016846, + "logps/rejected": -1.5258429050445557, + "loss": 1.649, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3182785511016846, + "rewards/margins": 0.20756426453590393, + "rewards/rejected": -1.5258429050445557, "step": 4715 }, { "epoch": 2.526174945643084, - "grad_norm": 11.405677460777579, + "grad_norm": 9.793604028079923, "learning_rate": 7.392015125762496e-08, - "logits/chosen": -0.14520487189292908, - "logits/rejected": -0.04902344197034836, - "logps/chosen": -1.197825312614441, - "logps/rejected": -1.5305838584899902, - "loss": 1.9289, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.197825312614441, - "rewards/margins": 0.33275845646858215, - "rewards/rejected": -1.5305838584899902, - "semantic_entropy": 0.8214031457901001, + "logits/chosen": -0.20207378268241882, + "logits/rejected": -0.13237614929676056, + "logps/chosen": -1.1758029460906982, + "logps/rejected": -1.4300979375839233, + "loss": 1.5208, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1758029460906982, + "rewards/margins": 0.2542952001094818, + "rewards/rejected": -1.4300979375839233, "step": 4720 }, { "epoch": 2.5288509784244857, - "grad_norm": 9.447896137190856, + "grad_norm": 8.129358537041927, "learning_rate": 7.310725900885018e-08, - "logits/chosen": -0.20286524295806885, - "logits/rejected": -0.1489933729171753, - "logps/chosen": -1.2946898937225342, - "logps/rejected": -1.5375152826309204, - "loss": 2.0094, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2946898937225342, - "rewards/margins": 0.2428254783153534, - "rewards/rejected": -1.5375152826309204, - "semantic_entropy": 0.7823761105537415, + "logits/chosen": -0.24870195984840393, + "logits/rejected": -0.19785283505916595, + "logps/chosen": -1.2585840225219727, + "logps/rejected": -1.4193689823150635, + "loss": 1.6106, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2585840225219727, + "rewards/margins": 0.16078488528728485, + "rewards/rejected": -1.4193689823150635, "step": 4725 }, { "epoch": 2.5315270112058874, - "grad_norm": 9.610255064005486, + "grad_norm": 8.531145146991735, "learning_rate": 7.229850847532076e-08, - "logits/chosen": -0.13549140095710754, - "logits/rejected": -0.020894240587949753, - "logps/chosen": -1.183825969696045, - "logps/rejected": -1.5215822458267212, - "loss": 1.9285, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.183825969696045, - "rewards/margins": 0.33775612711906433, - "rewards/rejected": -1.5215822458267212, - "semantic_entropy": 0.8025429844856262, + "logits/chosen": -0.21363727748394012, + "logits/rejected": -0.12141523510217667, + "logps/chosen": -1.1580511331558228, + "logps/rejected": -1.3999485969543457, + "loss": 1.5276, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1580511331558228, + "rewards/margins": 0.2418973743915558, + "rewards/rejected": -1.3999485969543457, "step": 4730 }, { "epoch": 2.5342030439872887, - "grad_norm": 8.168702490510011, + "grad_norm": 7.250451028832405, "learning_rate": 7.149390750353779e-08, - "logits/chosen": -0.12599562108516693, - "logits/rejected": -0.18436439335346222, - "logps/chosen": -1.3078311681747437, - "logps/rejected": -1.533595085144043, - "loss": 2.0131, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3078311681747437, - "rewards/margins": 0.22576388716697693, - "rewards/rejected": -1.533595085144043, - "semantic_entropy": 0.7823559641838074, + "logits/chosen": -0.2147807627916336, + "logits/rejected": -0.27469998598098755, + "logps/chosen": -1.277307152748108, + "logps/rejected": -1.4533443450927734, + "loss": 1.6183, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.277307152748108, + "rewards/margins": 0.17603713274002075, + "rewards/rejected": -1.4533443450927734, "step": 4735 }, { "epoch": 2.5368790767686904, - "grad_norm": 8.254126468029884, + "grad_norm": 8.974364799626747, "learning_rate": 7.069346389974374e-08, - "logits/chosen": -0.20458774268627167, - "logits/rejected": -0.06394211202859879, - "logps/chosen": -1.3023638725280762, - "logps/rejected": -1.6589126586914062, - "loss": 2.0008, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3023638725280762, - "rewards/margins": 0.35654881596565247, - "rewards/rejected": -1.6589126586914062, - "semantic_entropy": 0.7820869088172913, + "logits/chosen": -0.26902931928634644, + "logits/rejected": -0.15223777294158936, + "logps/chosen": -1.2735518217086792, + "logps/rejected": -1.530316948890686, + "loss": 1.6067, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2735518217086792, + "rewards/margins": 0.25676512718200684, + "rewards/rejected": -1.530316948890686, "step": 4740 }, { "epoch": 2.539555109550092, - "grad_norm": 8.474729875307984, + "grad_norm": 7.666730812815336, "learning_rate": 6.989718542984563e-08, - "logits/chosen": -0.14035843312740326, - "logits/rejected": -0.09613653272390366, - "logps/chosen": -1.3093879222869873, - "logps/rejected": -1.60835862159729, - "loss": 1.9904, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3093879222869873, - "rewards/margins": 0.29897063970565796, - "rewards/rejected": -1.60835862159729, - "semantic_entropy": 0.7700130343437195, + "logits/chosen": -0.20124347507953644, + "logits/rejected": -0.16961103677749634, + "logps/chosen": -1.2784777879714966, + "logps/rejected": -1.5050462484359741, + "loss": 1.6, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2784777879714966, + "rewards/margins": 0.2265685349702835, + "rewards/rejected": -1.5050462484359741, "step": 4745 }, { "epoch": 2.5422311423314934, - "grad_norm": 7.371448992180354, + "grad_norm": 6.24962077094031, "learning_rate": 6.9105079819341e-08, - "logits/chosen": -0.10261497646570206, - "logits/rejected": 0.09542595595121384, - "logps/chosen": -1.2490769624710083, - "logps/rejected": -1.7567031383514404, - "loss": 1.9241, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2490769624710083, - "rewards/margins": 0.5076262950897217, - "rewards/rejected": -1.7567031383514404, - "semantic_entropy": 0.7950537800788879, + "logits/chosen": -0.17841601371765137, + "logits/rejected": -0.024863218888640404, + "logps/chosen": -1.218346357345581, + "logps/rejected": -1.6240530014038086, + "loss": 1.5231, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.218346357345581, + "rewards/margins": 0.4057064950466156, + "rewards/rejected": -1.6240530014038086, "step": 4750 }, { "epoch": 2.544907175112895, - "grad_norm": 6.68530095678338, + "grad_norm": 6.504494487290139, "learning_rate": 6.831715475324163e-08, - "logits/chosen": -0.18308278918266296, - "logits/rejected": -0.01879027858376503, - "logps/chosen": -1.1776317358016968, - "logps/rejected": -1.644796371459961, - "loss": 1.9031, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1776317358016968, - "rewards/margins": 0.46716445684432983, - "rewards/rejected": -1.644796371459961, - "semantic_entropy": 0.8084004521369934, + "logits/chosen": -0.23551194369792938, + "logits/rejected": -0.0994628295302391, + "logps/chosen": -1.1515239477157593, + "logps/rejected": -1.4859282970428467, + "loss": 1.498, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1515239477157593, + "rewards/margins": 0.334404319524765, + "rewards/rejected": -1.4859282970428467, "step": 4755 }, { "epoch": 2.547583207894297, - "grad_norm": 7.494571745163121, + "grad_norm": 7.107319168779414, "learning_rate": 6.753341787600026e-08, - "logits/chosen": -0.2286044806241989, - "logits/rejected": -0.09256772696971893, - "logps/chosen": -1.1995595693588257, - "logps/rejected": -1.5113732814788818, - "loss": 1.951, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1995595693588257, - "rewards/margins": 0.31181374192237854, - "rewards/rejected": -1.5113732814788818, - "semantic_entropy": 0.8131352663040161, + "logits/chosen": -0.26813262701034546, + "logits/rejected": -0.14674730598926544, + "logps/chosen": -1.166414499282837, + "logps/rejected": -1.3925293684005737, + "loss": 1.5349, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.166414499282837, + "rewards/margins": 0.22611498832702637, + "rewards/rejected": -1.3925293684005737, "step": 4760 }, { "epoch": 2.5502592406756985, - "grad_norm": 10.150397112366806, + "grad_norm": 8.378977507372232, "learning_rate": 6.67538767914353e-08, - "logits/chosen": -0.24224725365638733, - "logits/rejected": -0.06934880465269089, - "logps/chosen": -1.3018126487731934, - "logps/rejected": -1.5710437297821045, - "loss": 2.0319, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3018126487731934, - "rewards/margins": 0.2692311406135559, - "rewards/rejected": -1.5710437297821045, - "semantic_entropy": 0.7827295064926147, + "logits/chosen": -0.28723758459091187, + "logits/rejected": -0.13835971057415009, + "logps/chosen": -1.2562470436096191, + "logps/rejected": -1.4372190237045288, + "loss": 1.6221, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2562470436096191, + "rewards/margins": 0.1809719353914261, + "rewards/rejected": -1.4372190237045288, "step": 4765 }, { "epoch": 2.5529352734571, - "grad_norm": 9.013106585065046, + "grad_norm": 8.238179195381722, "learning_rate": 6.597853906265793e-08, - "logits/chosen": -0.18994879722595215, - "logits/rejected": -0.08487293869256973, - "logps/chosen": -1.292405605316162, - "logps/rejected": -1.6529300212860107, - "loss": 1.9845, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.292405605316162, - "rewards/margins": 0.3605244755744934, - "rewards/rejected": -1.6529300212860107, - "semantic_entropy": 0.7887686491012573, + "logits/chosen": -0.22701935470104218, + "logits/rejected": -0.14575409889221191, + "logps/chosen": -1.2587096691131592, + "logps/rejected": -1.5133172273635864, + "loss": 1.5878, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2587096691131592, + "rewards/margins": 0.25460749864578247, + "rewards/rejected": -1.5133172273635864, "step": 4770 }, { "epoch": 2.5556113062385015, - "grad_norm": 6.103315524429753, + "grad_norm": 5.54586496974905, "learning_rate": 6.5207412211998e-08, - "logits/chosen": -0.05231186002492905, - "logits/rejected": 0.053858887404203415, - "logps/chosen": -1.2260487079620361, - "logps/rejected": -1.6088154315948486, - "loss": 1.9389, + "logits/chosen": -0.09544014185667038, + "logits/rejected": -0.0030840635299682617, + "logps/chosen": -1.1870863437652588, + "logps/rejected": -1.4353337287902832, + "loss": 1.5336, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2260487079620361, - "rewards/margins": 0.3827666640281677, - "rewards/rejected": -1.6088154315948486, - "semantic_entropy": 0.8044970631599426, + "rewards/chosen": -1.1870863437652588, + "rewards/margins": 0.24824753403663635, + "rewards/rejected": -1.4353337287902832, "step": 4775 }, { "epoch": 2.558287339019903, - "grad_norm": 6.102321488627527, + "grad_norm": 5.120314060587667, "learning_rate": 6.444050372093186e-08, - "logits/chosen": -0.13956710696220398, - "logits/rejected": -0.05834800750017166, - "logps/chosen": -1.2656276226043701, - "logps/rejected": -1.5484168529510498, - "loss": 1.9897, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2656276226043701, - "rewards/margins": 0.28278931975364685, - "rewards/rejected": -1.5484168529510498, - "semantic_entropy": 0.7972785234451294, + "logits/chosen": -0.18808437883853912, + "logits/rejected": -0.12491951137781143, + "logps/chosen": -1.2355691194534302, + "logps/rejected": -1.4354352951049805, + "loss": 1.585, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2355691194534302, + "rewards/margins": 0.1998661756515503, + "rewards/rejected": -1.4354352951049805, "step": 4780 }, { "epoch": 2.5609633718013045, - "grad_norm": 11.168837796629282, + "grad_norm": 10.532994020488944, "learning_rate": 6.367782103000873e-08, - "logits/chosen": -0.1260305792093277, - "logits/rejected": -0.06556358188390732, - "logps/chosen": -1.2994545698165894, - "logps/rejected": -1.5403183698654175, - "loss": 2.0189, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2994545698165894, - "rewards/margins": 0.2408638894557953, - "rewards/rejected": -1.5403183698654175, - "semantic_entropy": 0.7918559312820435, + "logits/chosen": -0.18425069749355316, + "logits/rejected": -0.14060840010643005, + "logps/chosen": -1.2594051361083984, + "logps/rejected": -1.4246543645858765, + "loss": 1.6132, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2594051361083984, + "rewards/margins": 0.16524925827980042, + "rewards/rejected": -1.4246543645858765, "step": 4785 }, { "epoch": 2.5636394045827062, - "grad_norm": 6.517739089204047, + "grad_norm": 6.432786636668071, "learning_rate": 6.29193715387798e-08, - "logits/chosen": -0.23115773499011993, - "logits/rejected": -0.09202119708061218, - "logps/chosen": -1.2879234552383423, - "logps/rejected": -1.6790473461151123, - "loss": 1.9696, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2879234552383423, - "rewards/margins": 0.3911239504814148, - "rewards/rejected": -1.6790473461151123, - "semantic_entropy": 0.7856905460357666, + "logits/chosen": -0.27169930934906006, + "logits/rejected": -0.15257199108600616, + "logps/chosen": -1.2576349973678589, + "logps/rejected": -1.5024917125701904, + "loss": 1.5822, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2576349973678589, + "rewards/margins": 0.2448567897081375, + "rewards/rejected": -1.5024917125701904, "step": 4790 }, { "epoch": 2.566315437364108, - "grad_norm": 13.27204614413823, + "grad_norm": 10.938717134054878, "learning_rate": 6.216516260572502e-08, - "logits/chosen": -0.09322036802768707, - "logits/rejected": -0.020159423351287842, - "logps/chosen": -1.3097180128097534, - "logps/rejected": -1.6087557077407837, - "loss": 2.0216, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3097180128097534, - "rewards/margins": 0.29903754591941833, - "rewards/rejected": -1.6087557077407837, - "semantic_entropy": 0.7815183401107788, + "logits/chosen": -0.17738035321235657, + "logits/rejected": -0.12643186748027802, + "logps/chosen": -1.2729908227920532, + "logps/rejected": -1.4721641540527344, + "loss": 1.6237, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2729908227920532, + "rewards/margins": 0.19917339086532593, + "rewards/rejected": -1.4721641540527344, "step": 4795 }, { "epoch": 2.568991470145509, - "grad_norm": 11.004880769709997, + "grad_norm": 8.268168215582811, "learning_rate": 6.141520154818297e-08, - "logits/chosen": -0.14857342839241028, - "logits/rejected": -0.031513821333646774, - "logps/chosen": -1.218299150466919, - "logps/rejected": -1.5542099475860596, - "loss": 1.9434, + "logits/chosen": -0.21264667809009552, + "logits/rejected": -0.1251252144575119, + "logps/chosen": -1.1901899576187134, + "logps/rejected": -1.4447002410888672, + "loss": 1.5433, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.218299150466919, - "rewards/margins": 0.33591070771217346, - "rewards/rejected": -1.5542099475860596, - "semantic_entropy": 0.7982202172279358, + "rewards/chosen": -1.1901899576187134, + "rewards/margins": 0.2545102834701538, + "rewards/rejected": -1.4447002410888672, "step": 4800 }, { "epoch": 2.568991470145509, - "eval_logits/chosen": 0.17427968978881836, - "eval_logits/rejected": 0.2611408829689026, - "eval_logps/chosen": -1.3386183977127075, - "eval_logps/rejected": -1.6198391914367676, - "eval_loss": 2.0458827018737793, - "eval_rewards/accuracies": 0.5942136645317078, - "eval_rewards/chosen": -1.3386183977127075, - "eval_rewards/margins": 0.2812207043170929, - "eval_rewards/rejected": -1.6198391914367676, - "eval_runtime": 34.75, - "eval_samples_per_second": 38.705, - "eval_semantic_entropy": 0.7744117975234985, - "eval_steps_per_second": 9.698, + "eval_logits/chosen": 0.08799929171800613, + "eval_logits/rejected": 0.1608801633119583, + "eval_logps/chosen": -1.3037397861480713, + "eval_logps/rejected": -1.5017025470733643, + "eval_loss": 1.647184133529663, + "eval_rewards/accuracies": 0.5727003216743469, + "eval_rewards/chosen": -1.3037397861480713, + "eval_rewards/margins": 0.19796282052993774, + "eval_rewards/rejected": -1.5017025470733643, + "eval_runtime": 40.3397, + "eval_samples_per_second": 33.342, + "eval_steps_per_second": 8.354, "step": 4800 }, { "epoch": 2.571667502926911, - "grad_norm": 11.503730963858805, + "grad_norm": 10.004139345742539, "learning_rate": 6.066949564227897e-08, - "logits/chosen": -0.224358469247818, - "logits/rejected": -0.10400880873203278, - "logps/chosen": -1.3242347240447998, - "logps/rejected": -1.7001020908355713, - "loss": 2.0023, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3242347240447998, - "rewards/margins": 0.3758672773838043, - "rewards/rejected": -1.7001020908355713, - "semantic_entropy": 0.7753196358680725, + "logits/chosen": -0.27661585807800293, + "logits/rejected": -0.17738543450832367, + "logps/chosen": -1.298222541809082, + "logps/rejected": -1.5642954111099243, + "loss": 1.6178, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.298222541809082, + "rewards/margins": 0.2660728991031647, + "rewards/rejected": -1.5642954111099243, "step": 4805 }, { "epoch": 2.574343535708312, - "grad_norm": 9.11957728544841, + "grad_norm": 6.853794272791884, "learning_rate": 5.992805212285523e-08, - "logits/chosen": -0.090094193816185, - "logits/rejected": 0.04532430320978165, - "logps/chosen": -1.3409744501113892, - "logps/rejected": -1.6526081562042236, - "loss": 2.0314, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3409744501113892, - "rewards/margins": 0.31163376569747925, - "rewards/rejected": -1.6526081562042236, - "semantic_entropy": 0.7715145349502563, + "logits/chosen": -0.18757258355617523, + "logits/rejected": -0.07676851749420166, + "logps/chosen": -1.3012917041778564, + "logps/rejected": -1.524106740951538, + "loss": 1.6321, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3012917041778564, + "rewards/margins": 0.22281484305858612, + "rewards/rejected": -1.524106740951538, "step": 4810 }, { "epoch": 2.577019568489714, - "grad_norm": 11.01571030763835, + "grad_norm": 8.424319277147559, "learning_rate": 5.9190878183399684e-08, - "logits/chosen": -0.09173809736967087, - "logits/rejected": 0.03900834172964096, - "logps/chosen": -1.1757429838180542, - "logps/rejected": -1.646345853805542, - "loss": 1.8842, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1757429838180542, - "rewards/margins": 0.47060298919677734, - "rewards/rejected": -1.646345853805542, - "semantic_entropy": 0.8084053993225098, + "logits/chosen": -0.18077567219734192, + "logits/rejected": -0.09020836651325226, + "logps/chosen": -1.1457854509353638, + "logps/rejected": -1.5183522701263428, + "loss": 1.4771, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1457854509353638, + "rewards/margins": 0.37256690859794617, + "rewards/rejected": -1.5183522701263428, "step": 4815 }, { "epoch": 2.5796956012711156, - "grad_norm": 9.665148218805912, + "grad_norm": 8.710967305387417, "learning_rate": 5.845798097597748e-08, - "logits/chosen": -0.0845046415925026, - "logits/rejected": -0.007058621849864721, - "logps/chosen": -1.3429194688796997, - "logps/rejected": -1.614829421043396, - "loss": 2.023, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3429194688796997, - "rewards/margins": 0.2719099223613739, - "rewards/rejected": -1.614829421043396, - "semantic_entropy": 0.7687947154045105, + "logits/chosen": -0.19583284854888916, + "logits/rejected": -0.1298530250787735, + "logps/chosen": -1.3064498901367188, + "logps/rejected": -1.4972361326217651, + "loss": 1.6298, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3064498901367188, + "rewards/margins": 0.19078609347343445, + "rewards/rejected": -1.4972361326217651, "step": 4820 }, { "epoch": 2.5823716340525174, - "grad_norm": 15.917517242871915, + "grad_norm": 9.934704564510238, "learning_rate": 5.772936761116026e-08, - "logits/chosen": -0.07637304067611694, - "logits/rejected": 0.04699677973985672, - "logps/chosen": -1.242073655128479, - "logps/rejected": -1.5095151662826538, - "loss": 1.9891, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.242073655128479, - "rewards/margins": 0.26744160056114197, - "rewards/rejected": -1.5095151662826538, - "semantic_entropy": 0.8077836036682129, + "logits/chosen": -0.1697060763835907, + "logits/rejected": -0.0799294114112854, + "logps/chosen": -1.2031030654907227, + "logps/rejected": -1.3971145153045654, + "loss": 1.571, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2031030654907227, + "rewards/margins": 0.19401133060455322, + "rewards/rejected": -1.3971145153045654, "step": 4825 }, { "epoch": 2.5850476668339186, - "grad_norm": 5.713470101292522, + "grad_norm": 5.558747942426494, "learning_rate": 5.700504515795829e-08, - "logits/chosen": -0.14389801025390625, - "logits/rejected": 0.005694887135177851, - "logps/chosen": -1.3192150592803955, - "logps/rejected": -1.5601211786270142, - "loss": 2.0238, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3192150592803955, - "rewards/margins": 0.24090604484081268, - "rewards/rejected": -1.5601211786270142, - "semantic_entropy": 0.7768298387527466, + "logits/chosen": -0.2419288605451584, + "logits/rejected": -0.12492205202579498, + "logps/chosen": -1.2873618602752686, + "logps/rejected": -1.44034743309021, + "loss": 1.6322, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2873618602752686, + "rewards/margins": 0.15298554301261902, + "rewards/rejected": -1.44034743309021, "step": 4830 }, { "epoch": 2.5877236996153203, - "grad_norm": 11.826039372971426, + "grad_norm": 10.588648455241415, "learning_rate": 5.628502064375101e-08, - "logits/chosen": -0.24123668670654297, - "logits/rejected": -0.05609578639268875, - "logps/chosen": -1.264898657798767, - "logps/rejected": -1.575839638710022, - "loss": 1.9891, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.264898657798767, - "rewards/margins": 0.31094080209732056, - "rewards/rejected": -1.575839638710022, - "semantic_entropy": 0.8016520738601685, + "logits/chosen": -0.34422606229782104, + "logits/rejected": -0.20205549895763397, + "logps/chosen": -1.2288485765457153, + "logps/rejected": -1.431166172027588, + "loss": 1.5837, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2288485765457153, + "rewards/margins": 0.20231764018535614, + "rewards/rejected": -1.431166172027588, "step": 4835 }, { "epoch": 2.5903997323967216, - "grad_norm": 22.226769592188973, + "grad_norm": 17.41639008267266, "learning_rate": 5.55693010542197e-08, - "logits/chosen": -0.1912364810705185, - "logits/rejected": 0.011147690005600452, - "logps/chosen": -1.2765791416168213, - "logps/rejected": -1.5787981748580933, - "loss": 1.9914, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2765791416168213, - "rewards/margins": 0.30221912264823914, - "rewards/rejected": -1.5787981748580933, - "semantic_entropy": 0.7809934020042419, + "logits/chosen": -0.27671462297439575, + "logits/rejected": -0.12988321483135223, + "logps/chosen": -1.2465171813964844, + "logps/rejected": -1.4433683156967163, + "loss": 1.5895, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2465171813964844, + "rewards/margins": 0.19685105979442596, + "rewards/rejected": -1.4433683156967163, "step": 4840 }, { "epoch": 2.5930757651781233, - "grad_norm": 7.512142408229105, + "grad_norm": 6.963596714901478, "learning_rate": 5.485789333327856e-08, - "logits/chosen": -0.14734336733818054, - "logits/rejected": -0.035708121955394745, - "logps/chosen": -1.252044439315796, - "logps/rejected": -1.5797019004821777, - "loss": 1.9539, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.252044439315796, - "rewards/margins": 0.3276575803756714, - "rewards/rejected": -1.5797019004821777, - "semantic_entropy": 0.8009160161018372, + "logits/chosen": -0.27144211530685425, + "logits/rejected": -0.16773834824562073, + "logps/chosen": -1.2030506134033203, + "logps/rejected": -1.4597923755645752, + "loss": 1.532, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2030506134033203, + "rewards/margins": 0.2567417025566101, + "rewards/rejected": -1.4597923755645752, "step": 4845 }, { "epoch": 2.595751797959525, - "grad_norm": 12.7462575378253, + "grad_norm": 10.818305557457998, "learning_rate": 5.4150804383008675e-08, - "logits/chosen": -0.26015961170196533, - "logits/rejected": -0.09845755994319916, - "logps/chosen": -1.2424649000167847, - "logps/rejected": -1.6106786727905273, - "loss": 1.9626, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2424649000167847, - "rewards/margins": 0.3682136833667755, - "rewards/rejected": -1.6106786727905273, - "semantic_entropy": 0.8059800863265991, + "logits/chosen": -0.3696734309196472, + "logits/rejected": -0.24278569221496582, + "logps/chosen": -1.1997416019439697, + "logps/rejected": -1.4444429874420166, + "loss": 1.5504, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1997416019439697, + "rewards/margins": 0.24470162391662598, + "rewards/rejected": -1.4444429874420166, "step": 4850 }, { "epoch": 2.5984278307409268, - "grad_norm": 18.122164837107288, + "grad_norm": 11.424581546227913, "learning_rate": 5.344804106359002e-08, - "logits/chosen": -0.09411051124334335, - "logits/rejected": 0.06524305045604706, - "logps/chosen": -1.2202339172363281, - "logps/rejected": -1.5384161472320557, - "loss": 1.9856, + "logits/chosen": -0.2225084751844406, + "logits/rejected": -0.10350972414016724, + "logps/chosen": -1.1904308795928955, + "logps/rejected": -1.3949419260025024, + "loss": 1.5781, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2202339172363281, - "rewards/margins": 0.31818222999572754, - "rewards/rejected": -1.5384161472320557, - "semantic_entropy": 0.805999755859375, + "rewards/chosen": -1.1904308795928955, + "rewards/margins": 0.20451101660728455, + "rewards/rejected": -1.3949419260025024, "step": 4855 }, { "epoch": 2.601103863522328, - "grad_norm": 12.956987300274212, + "grad_norm": 10.680690720703248, "learning_rate": 5.274961019323559e-08, - "logits/chosen": -0.13933970034122467, - "logits/rejected": -0.07091189920902252, - "logps/chosen": -1.1746845245361328, - "logps/rejected": -1.6169769763946533, - "loss": 1.8937, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1746845245361328, - "rewards/margins": 0.4422924518585205, - "rewards/rejected": -1.6169769763946533, - "semantic_entropy": 0.8040294647216797, + "logits/chosen": -0.2484455406665802, + "logits/rejected": -0.2033940851688385, + "logps/chosen": -1.1515166759490967, + "logps/rejected": -1.5101279020309448, + "loss": 1.4904, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1515166759490967, + "rewards/margins": 0.35861116647720337, + "rewards/rejected": -1.5101279020309448, "step": 4860 }, { "epoch": 2.6037798963037297, - "grad_norm": 9.04594431262119, + "grad_norm": 7.766892508530793, "learning_rate": 5.205551854812451e-08, - "logits/chosen": -0.23502866923809052, - "logits/rejected": -0.14637461304664612, - "logps/chosen": -1.2417190074920654, - "logps/rejected": -1.627515435218811, - "loss": 1.9502, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2417190074920654, - "rewards/margins": 0.38579636812210083, - "rewards/rejected": -1.627515435218811, - "semantic_entropy": 0.7967454791069031, + "logits/chosen": -0.33021020889282227, + "logits/rejected": -0.2669282555580139, + "logps/chosen": -1.2164661884307861, + "logps/rejected": -1.5209804773330688, + "loss": 1.5522, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2164661884307861, + "rewards/margins": 0.3045143187046051, + "rewards/rejected": -1.5209804773330688, "step": 4865 }, { "epoch": 2.606455929085131, - "grad_norm": 12.300434612266919, + "grad_norm": 10.5121533070436, "learning_rate": 5.1365772862337177e-08, - "logits/chosen": -0.09480637311935425, - "logits/rejected": 0.03487342968583107, - "logps/chosen": -1.2769651412963867, - "logps/rejected": -1.583880066871643, - "loss": 2.0036, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2769651412963867, - "rewards/margins": 0.3069148361682892, - "rewards/rejected": -1.583880066871643, - "semantic_entropy": 0.7944985628128052, + "logits/chosen": -0.20856007933616638, + "logits/rejected": -0.1037999615073204, + "logps/chosen": -1.2410916090011597, + "logps/rejected": -1.4437034130096436, + "loss": 1.5985, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2410916090011597, + "rewards/margins": 0.20261195302009583, + "rewards/rejected": -1.4437034130096436, "step": 4870 }, { "epoch": 2.6091319618665327, - "grad_norm": 8.881988481533577, + "grad_norm": 8.287978704385738, "learning_rate": 5.068037982778905e-08, - "logits/chosen": 0.027025306597352028, - "logits/rejected": 0.09920650720596313, - "logps/chosen": -1.240953803062439, - "logps/rejected": -1.6911252737045288, - "loss": 1.9596, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.240953803062439, - "rewards/margins": 0.45017147064208984, - "rewards/rejected": -1.6911252737045288, - "semantic_entropy": 0.7965016961097717, + "logits/chosen": -0.09526379406452179, + "logits/rejected": -0.04287983849644661, + "logps/chosen": -1.2036092281341553, + "logps/rejected": -1.5484676361083984, + "loss": 1.5442, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2036092281341553, + "rewards/margins": 0.34485840797424316, + "rewards/rejected": -1.5484676361083984, "step": 4875 }, { "epoch": 2.6118079946479344, - "grad_norm": 9.975007837075847, + "grad_norm": 8.942252036294636, "learning_rate": 4.999934609416656e-08, - "logits/chosen": -0.05672222375869751, - "logits/rejected": 0.05705530196428299, - "logps/chosen": -1.2177865505218506, - "logps/rejected": -1.640758752822876, - "loss": 1.9217, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2177865505218506, - "rewards/margins": 0.4229722023010254, - "rewards/rejected": -1.640758752822876, - "semantic_entropy": 0.7946897745132446, + "logits/chosen": -0.15273435413837433, + "logits/rejected": -0.06922443211078644, + "logps/chosen": -1.1939882040023804, + "logps/rejected": -1.490462064743042, + "loss": 1.5264, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1939882040023804, + "rewards/margins": 0.29647380113601685, + "rewards/rejected": -1.490462064743042, "step": 4880 }, { "epoch": 2.614484027429336, - "grad_norm": 10.150182185057119, + "grad_norm": 8.136911208657267, "learning_rate": 4.932267826886183e-08, - "logits/chosen": -0.03622272610664368, - "logits/rejected": 0.041126228868961334, - "logps/chosen": -1.3333375453948975, - "logps/rejected": -1.683574914932251, - "loss": 2.0232, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3333375453948975, - "rewards/margins": 0.35023751854896545, - "rewards/rejected": -1.683574914932251, - "semantic_entropy": 0.7540980577468872, + "logits/chosen": -0.17341378331184387, + "logits/rejected": -0.11079660803079605, + "logps/chosen": -1.3053932189941406, + "logps/rejected": -1.567337155342102, + "loss": 1.6352, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3053932189941406, + "rewards/margins": 0.26194387674331665, + "rewards/rejected": -1.567337155342102, "step": 4885 }, { "epoch": 2.6171600602107374, - "grad_norm": 10.730980754841438, + "grad_norm": 9.058068642708994, "learning_rate": 4.8650382916909206e-08, - "logits/chosen": -0.2052614986896515, - "logits/rejected": -0.034829698503017426, - "logps/chosen": -1.263850212097168, - "logps/rejected": -1.6367954015731812, - "loss": 1.9642, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.263850212097168, - "rewards/margins": 0.37294501066207886, - "rewards/rejected": -1.6367954015731812, - "semantic_entropy": 0.7861441373825073, + "logits/chosen": -0.2817673981189728, + "logits/rejected": -0.14578992128372192, + "logps/chosen": -1.2361321449279785, + "logps/rejected": -1.505322813987732, + "loss": 1.5715, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2361321449279785, + "rewards/margins": 0.2691906988620758, + "rewards/rejected": -1.505322813987732, "step": 4890 }, { "epoch": 2.619836092992139, - "grad_norm": 7.368577885518247, + "grad_norm": 6.941136983346939, "learning_rate": 4.7982466560920976e-08, - "logits/chosen": -0.1294822245836258, - "logits/rejected": -0.03858218714594841, - "logps/chosen": -1.3017622232437134, - "logps/rejected": -1.5246281623840332, - "loss": 2.023, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3017622232437134, - "rewards/margins": 0.22286593914031982, - "rewards/rejected": -1.5246281623840332, - "semantic_entropy": 0.7833074331283569, + "logits/chosen": -0.2306617945432663, + "logits/rejected": -0.15258941054344177, + "logps/chosen": -1.2696197032928467, + "logps/rejected": -1.424823522567749, + "loss": 1.6234, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2696197032928467, + "rewards/margins": 0.15520384907722473, + "rewards/rejected": -1.424823522567749, "step": 4895 }, { "epoch": 2.622512125773541, - "grad_norm": 5.928285248215213, + "grad_norm": 6.028115156070567, "learning_rate": 4.7318935681024685e-08, - "logits/chosen": -0.08297251164913177, - "logits/rejected": 0.05140919238328934, - "logps/chosen": -1.244057536125183, - "logps/rejected": -1.6229749917984009, - "loss": 1.9445, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.244057536125183, - "rewards/margins": 0.378917396068573, - "rewards/rejected": -1.6229749917984009, - "semantic_entropy": 0.7902035117149353, + "logits/chosen": -0.17112164199352264, + "logits/rejected": -0.056985218077898026, + "logps/chosen": -1.2183990478515625, + "logps/rejected": -1.4961133003234863, + "loss": 1.5523, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2183990478515625, + "rewards/margins": 0.27771419286727905, + "rewards/rejected": -1.4961133003234863, "step": 4900 }, { "epoch": 2.625188158554942, - "grad_norm": 6.2002842955489115, + "grad_norm": 6.413105815703295, "learning_rate": 4.6659796714799745e-08, - "logits/chosen": -0.11116425693035126, - "logits/rejected": 0.042223114520311356, - "logps/chosen": -1.2663477659225464, - "logps/rejected": -1.6212007999420166, - "loss": 1.9768, + "logits/chosen": -0.21311946213245392, + "logits/rejected": -0.09097801893949509, + "logps/chosen": -1.2271919250488281, + "logps/rejected": -1.490767002105713, + "loss": 1.5694, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2663477659225464, - "rewards/margins": 0.3548528552055359, - "rewards/rejected": -1.6212007999420166, - "semantic_entropy": 0.7857402563095093, + "rewards/chosen": -1.2271919250488281, + "rewards/margins": 0.26357507705688477, + "rewards/rejected": -1.490767002105713, "step": 4905 }, { "epoch": 2.627864191336344, - "grad_norm": 8.281462378102345, + "grad_norm": 7.187362492638208, "learning_rate": 4.60050560572155e-08, - "logits/chosen": -0.14414629340171814, - "logits/rejected": -0.16859321296215057, - "logps/chosen": -1.2700074911117554, - "logps/rejected": -1.735364317893982, - "loss": 1.9645, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2700074911117554, - "rewards/margins": 0.4653567373752594, - "rewards/rejected": -1.735364317893982, - "semantic_entropy": 0.789490818977356, + "logits/chosen": -0.2537928521633148, + "logits/rejected": -0.28167393803596497, + "logps/chosen": -1.2362048625946045, + "logps/rejected": -1.567087173461914, + "loss": 1.5652, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2362048625946045, + "rewards/margins": 0.33088216185569763, + "rewards/rejected": -1.567087173461914, "step": 4910 }, { "epoch": 2.6305402241177456, - "grad_norm": 9.741518580503993, + "grad_norm": 8.332445387426878, "learning_rate": 4.535472006056834e-08, - "logits/chosen": -0.06438411772251129, - "logits/rejected": 0.03645631670951843, - "logps/chosen": -1.1731632947921753, - "logps/rejected": -1.563751220703125, - "loss": 1.897, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1731632947921753, - "rewards/margins": 0.3905879557132721, - "rewards/rejected": -1.563751220703125, - "semantic_entropy": 0.8115876317024231, + "logits/chosen": -0.1759515255689621, + "logits/rejected": -0.10240741074085236, + "logps/chosen": -1.1517741680145264, + "logps/rejected": -1.447211742401123, + "loss": 1.4964, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1517741680145264, + "rewards/margins": 0.2954375743865967, + "rewards/rejected": -1.447211742401123, "step": 4915 }, { "epoch": 2.6332162568991473, - "grad_norm": 9.399338449222602, + "grad_norm": 10.163277147162006, "learning_rate": 4.470879503442132e-08, - "logits/chosen": -0.06840424239635468, - "logits/rejected": 0.012408537790179253, - "logps/chosen": -1.2594525814056396, - "logps/rejected": -1.536690354347229, - "loss": 2.0095, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2594525814056396, - "rewards/margins": 0.2772377133369446, - "rewards/rejected": -1.536690354347229, - "semantic_entropy": 0.8033695220947266, + "logits/chosen": -0.1583867073059082, + "logits/rejected": -0.10426501929759979, + "logps/chosen": -1.2298393249511719, + "logps/rejected": -1.414184808731079, + "loss": 1.5992, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2298393249511719, + "rewards/margins": 0.18434542417526245, + "rewards/rejected": -1.414184808731079, "step": 4920 }, { "epoch": 2.6358922896805486, - "grad_norm": 11.481991942896826, + "grad_norm": 8.515773110713946, "learning_rate": 4.406728724554154e-08, - "logits/chosen": -0.3004254996776581, - "logits/rejected": -0.050283849239349365, - "logps/chosen": -1.2204532623291016, - "logps/rejected": -1.577864646911621, - "loss": 1.9369, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2204532623291016, - "rewards/margins": 0.3574114441871643, - "rewards/rejected": -1.577864646911621, - "semantic_entropy": 0.7966974973678589, + "logits/chosen": -0.3868665397167206, + "logits/rejected": -0.18404386937618256, + "logps/chosen": -1.1990516185760498, + "logps/rejected": -1.4621487855911255, + "loss": 1.5456, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1990516185760498, + "rewards/margins": 0.26309722661972046, + "rewards/rejected": -1.4621487855911255, "step": 4925 }, { "epoch": 2.6385683224619503, - "grad_norm": 6.512321673294674, + "grad_norm": 6.398725501498622, "learning_rate": 4.3430202917840664e-08, - "logits/chosen": -0.09682625532150269, - "logits/rejected": 0.04881001263856888, - "logps/chosen": -1.3104798793792725, - "logps/rejected": -1.7430320978164673, - "loss": 1.9896, + "logits/chosen": -0.14923153817653656, + "logits/rejected": -0.03317415714263916, + "logps/chosen": -1.2743452787399292, + "logps/rejected": -1.5962750911712646, + "loss": 1.5927, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3104798793792725, - "rewards/margins": 0.4325522482395172, - "rewards/rejected": -1.7430320978164673, - "semantic_entropy": 0.7690348625183105, + "rewards/chosen": -1.2743452787399292, + "rewards/margins": 0.32192978262901306, + "rewards/rejected": -1.5962750911712646, "step": 4930 }, { "epoch": 2.6412443552433515, - "grad_norm": 16.111192810330614, + "grad_norm": 11.077780630597667, "learning_rate": 4.279754823231346e-08, - "logits/chosen": -0.21507367491722107, - "logits/rejected": -0.06278637796640396, - "logps/chosen": -1.2675609588623047, - "logps/rejected": -1.5637404918670654, - "loss": 1.9877, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2675609588623047, - "rewards/margins": 0.2961795926094055, - "rewards/rejected": -1.5637404918670654, - "semantic_entropy": 0.7901414632797241, + "logits/chosen": -0.23723450303077698, + "logits/rejected": -0.1026090532541275, + "logps/chosen": -1.2323095798492432, + "logps/rejected": -1.4321863651275635, + "loss": 1.5876, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2323095798492432, + "rewards/margins": 0.19987669587135315, + "rewards/rejected": -1.4321863651275635, "step": 4935 }, { "epoch": 2.6439203880247533, - "grad_norm": 6.774101283588008, + "grad_norm": 6.697254262643319, "learning_rate": 4.216932932697859e-08, - "logits/chosen": -0.13060271739959717, - "logits/rejected": -0.05203498527407646, - "logps/chosen": -1.2672488689422607, - "logps/rejected": -1.4680770635604858, - "loss": 1.9994, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2672488689422607, - "rewards/margins": 0.2008281946182251, - "rewards/rejected": -1.4680770635604858, - "semantic_entropy": 0.8053982853889465, + "logits/chosen": -0.2031714916229248, + "logits/rejected": -0.14960214495658875, + "logps/chosen": -1.2334299087524414, + "logps/rejected": -1.3718469142913818, + "loss": 1.5928, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2334299087524414, + "rewards/margins": 0.13841703534126282, + "rewards/rejected": -1.3718469142913818, "step": 4940 }, { "epoch": 2.646596420806155, - "grad_norm": 8.441149104353624, + "grad_norm": 7.426199038991483, "learning_rate": 4.154555229681844e-08, - "logits/chosen": -0.15102128684520721, - "logits/rejected": 0.04940281808376312, - "logps/chosen": -1.2585450410842896, - "logps/rejected": -1.7040252685546875, - "loss": 1.9433, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2585450410842896, - "rewards/margins": 0.4454802870750427, - "rewards/rejected": -1.7040252685546875, - "semantic_entropy": 0.7860039472579956, + "logits/chosen": -0.2201036959886551, + "logits/rejected": -0.04712003469467163, + "logps/chosen": -1.2281110286712646, + "logps/rejected": -1.5359797477722168, + "loss": 1.549, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2281110286712646, + "rewards/margins": 0.3078688085079193, + "rewards/rejected": -1.5359797477722168, "step": 4945 }, { "epoch": 2.6492724535875567, - "grad_norm": 14.170996265703746, + "grad_norm": 12.306575673857681, "learning_rate": 4.092622319372069e-08, - "logits/chosen": -0.15491417050361633, - "logits/rejected": -0.032993149012327194, - "logps/chosen": -1.2248013019561768, - "logps/rejected": -1.553957462310791, - "loss": 1.9588, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2248013019561768, - "rewards/margins": 0.3291561007499695, - "rewards/rejected": -1.553957462310791, - "semantic_entropy": 0.7976547479629517, + "logits/chosen": -0.24709157645702362, + "logits/rejected": -0.1545828878879547, + "logps/chosen": -1.191224455833435, + "logps/rejected": -1.4444421529769897, + "loss": 1.5518, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.191224455833435, + "rewards/margins": 0.2532176971435547, + "rewards/rejected": -1.4444421529769897, "step": 4950 }, { "epoch": 2.651948486368958, - "grad_norm": 12.696191026034306, + "grad_norm": 8.610634145118038, "learning_rate": 4.031134802641889e-08, - "logits/chosen": -0.1630629301071167, - "logits/rejected": -0.1627640724182129, - "logps/chosen": -1.2603944540023804, - "logps/rejected": -1.6164932250976562, - "loss": 1.9535, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2603944540023804, - "rewards/margins": 0.3560987114906311, - "rewards/rejected": -1.6164932250976562, - "semantic_entropy": 0.7857304215431213, + "logits/chosen": -0.22103190422058105, + "logits/rejected": -0.2326325923204422, + "logps/chosen": -1.2279322147369385, + "logps/rejected": -1.5125610828399658, + "loss": 1.5511, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2279322147369385, + "rewards/margins": 0.28462880849838257, + "rewards/rejected": -1.5125610828399658, "step": 4955 }, { "epoch": 2.6546245191503597, - "grad_norm": 7.404253383249965, + "grad_norm": 7.307535491883655, "learning_rate": 3.970093276043468e-08, - "logits/chosen": -0.03796197846531868, - "logits/rejected": 0.059105951339006424, - "logps/chosen": -1.2659322023391724, - "logps/rejected": -1.5594804286956787, - "loss": 1.9762, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2659322023391724, - "rewards/margins": 0.29354843497276306, - "rewards/rejected": -1.5594804286956787, - "semantic_entropy": 0.7827733755111694, + "logits/chosen": -0.13648240268230438, + "logits/rejected": -0.060978494584560394, + "logps/chosen": -1.2389017343521118, + "logps/rejected": -1.4436099529266357, + "loss": 1.5848, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2389017343521118, + "rewards/margins": 0.2047082930803299, + "rewards/rejected": -1.4436099529266357, "step": 4960 }, { "epoch": 2.657300551931761, - "grad_norm": 9.56732882815061, + "grad_norm": 8.703302267157248, "learning_rate": 3.9094983318019584e-08, - "logits/chosen": -0.18269088864326477, - "logits/rejected": -0.05542262643575668, - "logps/chosen": -1.1641151905059814, - "logps/rejected": -1.533266544342041, - "loss": 1.8922, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1641151905059814, - "rewards/margins": 0.3691512942314148, - "rewards/rejected": -1.533266544342041, - "semantic_entropy": 0.8211628794670105, + "logits/chosen": -0.24506616592407227, + "logits/rejected": -0.14053188264369965, + "logps/chosen": -1.1402724981307983, + "logps/rejected": -1.392337441444397, + "loss": 1.4885, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1402724981307983, + "rewards/margins": 0.25206464529037476, + "rewards/rejected": -1.392337441444397, "step": 4965 }, { "epoch": 2.6599765847131627, - "grad_norm": 13.076802645156093, + "grad_norm": 10.685508236641486, "learning_rate": 3.849350557809789e-08, - "logits/chosen": -0.0741979256272316, - "logits/rejected": -0.005497545935213566, - "logps/chosen": -1.2310014963150024, - "logps/rejected": -1.601223349571228, - "loss": 1.9484, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2310014963150024, - "rewards/margins": 0.37022197246551514, - "rewards/rejected": -1.601223349571228, - "semantic_entropy": 0.8146511316299438, + "logits/chosen": -0.15796777606010437, + "logits/rejected": -0.10449770838022232, + "logps/chosen": -1.1950023174285889, + "logps/rejected": -1.4939477443695068, + "loss": 1.5286, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1950023174285889, + "rewards/margins": 0.2989455759525299, + "rewards/rejected": -1.4939477443695068, "step": 4970 }, { "epoch": 2.6626526174945644, - "grad_norm": 9.43279246779188, + "grad_norm": 9.097216461292135, "learning_rate": 3.789650537620903e-08, - "logits/chosen": -0.10716526210308075, - "logits/rejected": -0.0495462603867054, - "logps/chosen": -1.304774522781372, - "logps/rejected": -1.5925801992416382, - "loss": 2.0143, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.304774522781372, - "rewards/margins": 0.2878056764602661, - "rewards/rejected": -1.5925801992416382, - "semantic_entropy": 0.7799090147018433, + "logits/chosen": -0.22597603499889374, + "logits/rejected": -0.1907559335231781, + "logps/chosen": -1.2609063386917114, + "logps/rejected": -1.4642679691314697, + "loss": 1.6028, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2609063386917114, + "rewards/margins": 0.20336155593395233, + "rewards/rejected": -1.4642679691314697, "step": 4975 }, { "epoch": 2.665328650275966, - "grad_norm": 10.763393837295931, + "grad_norm": 9.941018126511944, "learning_rate": 3.730398850445182e-08, - "logits/chosen": -0.025856634601950645, - "logits/rejected": 0.041945237666368484, - "logps/chosen": -1.3856089115142822, - "logps/rejected": -1.7057222127914429, - "loss": 2.0548, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3856089115142822, - "rewards/margins": 0.3201134204864502, - "rewards/rejected": -1.7057222127914429, - "semantic_entropy": 0.7589296102523804, + "logits/chosen": -0.10990214347839355, + "logits/rejected": -0.05437646433711052, + "logps/chosen": -1.343835473060608, + "logps/rejected": -1.556043267250061, + "loss": 1.6562, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.343835473060608, + "rewards/margins": 0.21220776438713074, + "rewards/rejected": -1.556043267250061, "step": 4980 }, { "epoch": 2.6680046830573674, - "grad_norm": 13.416545083305897, + "grad_norm": 11.559828017336898, "learning_rate": 3.671596071142735e-08, - "logits/chosen": -0.11143984645605087, - "logits/rejected": 0.05205560848116875, - "logps/chosen": -1.2196766138076782, - "logps/rejected": -1.6010757684707642, - "loss": 1.9485, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2196766138076782, - "rewards/margins": 0.3813990652561188, - "rewards/rejected": -1.6010757684707642, - "semantic_entropy": 0.7851622700691223, + "logits/chosen": -0.17771419882774353, + "logits/rejected": -0.04531203210353851, + "logps/chosen": -1.1872304677963257, + "logps/rejected": -1.4659850597381592, + "loss": 1.5465, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1872304677963257, + "rewards/margins": 0.27875441312789917, + "rewards/rejected": -1.4659850597381592, "step": 4985 }, { "epoch": 2.670680715838769, - "grad_norm": 15.420414884524414, + "grad_norm": 9.282636949865253, "learning_rate": 3.6132427702183996e-08, - "logits/chosen": -0.2223101556301117, - "logits/rejected": -0.00972900353372097, - "logps/chosen": -1.2277615070343018, - "logps/rejected": -1.6345646381378174, - "loss": 1.9466, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2277615070343018, - "rewards/margins": 0.4068028926849365, - "rewards/rejected": -1.6345646381378174, - "semantic_entropy": 0.8061507940292358, + "logits/chosen": -0.27501699328422546, + "logits/rejected": -0.09257388114929199, + "logps/chosen": -1.1860332489013672, + "logps/rejected": -1.4876549243927002, + "loss": 1.5319, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1860332489013672, + "rewards/margins": 0.3016217052936554, + "rewards/rejected": -1.4876549243927002, "step": 4990 }, { "epoch": 2.6733567486201704, - "grad_norm": 10.77387194862738, + "grad_norm": 9.805693856644686, "learning_rate": 3.555339513816147e-08, - "logits/chosen": -0.16990795731544495, - "logits/rejected": -0.15832272171974182, - "logps/chosen": -1.279496431350708, - "logps/rejected": -1.5370452404022217, - "loss": 2.0197, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.279496431350708, - "rewards/margins": 0.25754863023757935, - "rewards/rejected": -1.5370452404022217, - "semantic_entropy": 0.7910841703414917, + "logits/chosen": -0.26975321769714355, + "logits/rejected": -0.26140162348747253, + "logps/chosen": -1.237966537475586, + "logps/rejected": -1.4422686100006104, + "loss": 1.5988, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.237966537475586, + "rewards/margins": 0.20430207252502441, + "rewards/rejected": -1.4422686100006104, "step": 4995 }, { "epoch": 2.676032781401572, - "grad_norm": 8.193883232131919, + "grad_norm": 8.328654351915757, "learning_rate": 3.497886863713639e-08, - "logits/chosen": -0.1418825089931488, - "logits/rejected": -0.11908824741840363, - "logps/chosen": -1.2753219604492188, - "logps/rejected": -1.6523866653442383, - "loss": 1.9877, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2753219604492188, - "rewards/margins": 0.3770645260810852, - "rewards/rejected": -1.6523866653442383, - "semantic_entropy": 0.7843011617660522, + "logits/chosen": -0.2382686585187912, + "logits/rejected": -0.23010282218456268, + "logps/chosen": -1.2322697639465332, + "logps/rejected": -1.5280249118804932, + "loss": 1.5755, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2322697639465332, + "rewards/margins": 0.2957552969455719, + "rewards/rejected": -1.5280249118804932, "step": 5000 }, { "epoch": 2.678708814182974, - "grad_norm": 11.719300104728754, + "grad_norm": 9.857601718828215, "learning_rate": 3.440885377316721e-08, - "logits/chosen": -0.05981435626745224, - "logits/rejected": -0.013632726855576038, - "logps/chosen": -1.2534570693969727, - "logps/rejected": -1.5460814237594604, - "loss": 1.9694, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2534570693969727, - "rewards/margins": 0.2926243245601654, - "rewards/rejected": -1.5460814237594604, - "semantic_entropy": 0.8032326698303223, + "logits/chosen": -0.17979669570922852, + "logits/rejected": -0.15013888478279114, + "logps/chosen": -1.2193511724472046, + "logps/rejected": -1.4303195476531982, + "loss": 1.5614, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2193511724472046, + "rewards/margins": 0.21096833050251007, + "rewards/rejected": -1.4303195476531982, "step": 5005 }, { "epoch": 2.6813848469643755, - "grad_norm": 11.762943253097305, + "grad_norm": 9.385374558005283, "learning_rate": 3.384335607654082e-08, - "logits/chosen": -0.025763485580682755, - "logits/rejected": 0.07047827541828156, - "logps/chosen": -1.3849637508392334, - "logps/rejected": -1.6733529567718506, - "loss": 2.0828, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3849637508392334, - "rewards/margins": 0.28838926553726196, - "rewards/rejected": -1.6733529567718506, - "semantic_entropy": 0.7628467082977295, + "logits/chosen": -0.1488681137561798, + "logits/rejected": -0.0743316262960434, + "logps/chosen": -1.350601315498352, + "logps/rejected": -1.5454646348953247, + "loss": 1.6932, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.350601315498352, + "rewards/margins": 0.19486349821090698, + "rewards/rejected": -1.5454646348953247, "step": 5010 }, { "epoch": 2.684060879745777, - "grad_norm": 10.864227585313829, + "grad_norm": 10.082983742251782, "learning_rate": 3.328238103371811e-08, - "logits/chosen": -0.1647218018770218, - "logits/rejected": -0.09952450543642044, - "logps/chosen": -1.2642019987106323, - "logps/rejected": -1.6397594213485718, - "loss": 1.9762, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2642019987106323, - "rewards/margins": 0.3755575716495514, - "rewards/rejected": -1.6397594213485718, - "semantic_entropy": 0.7904716730117798, + "logits/chosen": -0.2765267491340637, + "logits/rejected": -0.2307772934436798, + "logps/chosen": -1.2331907749176025, + "logps/rejected": -1.50911545753479, + "loss": 1.5738, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2331907749176025, + "rewards/margins": 0.2759249806404114, + "rewards/rejected": -1.50911545753479, "step": 5015 }, { "epoch": 2.6867369125271785, - "grad_norm": 13.4307602774936, + "grad_norm": 8.575174459256017, "learning_rate": 3.272593408728169e-08, - "logits/chosen": -0.17871826887130737, - "logits/rejected": 0.019997352734208107, - "logps/chosen": -1.2422888278961182, - "logps/rejected": -1.505932092666626, - "loss": 1.9959, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2422888278961182, - "rewards/margins": 0.26364344358444214, - "rewards/rejected": -1.505932092666626, - "semantic_entropy": 0.8106265068054199, + "logits/chosen": -0.28711315989494324, + "logits/rejected": -0.14060531556606293, + "logps/chosen": -1.2072677612304688, + "logps/rejected": -1.3850557804107666, + "loss": 1.582, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2072677612304688, + "rewards/margins": 0.17778778076171875, + "rewards/rejected": -1.3850557804107666, "step": 5020 }, { "epoch": 2.6894129453085798, - "grad_norm": 7.923356519451293, + "grad_norm": 7.781545018454678, "learning_rate": 3.217402063588204e-08, - "logits/chosen": -0.1537868082523346, - "logits/rejected": -0.014590066857635975, - "logps/chosen": -1.2995110750198364, - "logps/rejected": -1.5679407119750977, - "loss": 2.0147, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2995110750198364, - "rewards/margins": 0.26842954754829407, - "rewards/rejected": -1.5679407119750977, - "semantic_entropy": 0.7905632853507996, + "logits/chosen": -0.255064457654953, + "logits/rejected": -0.1399548202753067, + "logps/chosen": -1.2635090351104736, + "logps/rejected": -1.4571034908294678, + "loss": 1.6057, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2635090351104736, + "rewards/margins": 0.19359445571899414, + "rewards/rejected": -1.4571034908294678, "step": 5025 }, { "epoch": 2.6920889780899815, - "grad_norm": 7.611333717939541, + "grad_norm": 8.012243534412995, "learning_rate": 3.162664603418608e-08, - "logits/chosen": -0.10719075053930283, - "logits/rejected": -0.0370730385184288, - "logps/chosen": -1.2645071744918823, - "logps/rejected": -1.6142085790634155, - "loss": 1.9828, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2645071744918823, - "rewards/margins": 0.3497016727924347, - "rewards/rejected": -1.6142085790634155, - "semantic_entropy": 0.7915008664131165, + "logits/chosen": -0.2057885229587555, + "logits/rejected": -0.16035917401313782, + "logps/chosen": -1.229182481765747, + "logps/rejected": -1.4483156204223633, + "loss": 1.5837, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.229182481765747, + "rewards/margins": 0.21913310885429382, + "rewards/rejected": -1.4483156204223633, "step": 5030 }, { "epoch": 2.694765010871383, - "grad_norm": 17.417318056705316, + "grad_norm": 14.593694769561147, "learning_rate": 3.1083815592824416e-08, - "logits/chosen": -0.14526739716529846, - "logits/rejected": -0.02130313403904438, - "logps/chosen": -1.3097810745239258, - "logps/rejected": -1.5980546474456787, - "loss": 2.0191, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3097810745239258, - "rewards/margins": 0.2882736027240753, - "rewards/rejected": -1.5980546474456787, - "semantic_entropy": 0.7685422897338867, + "logits/chosen": -0.2511407434940338, + "logits/rejected": -0.141941636800766, + "logps/chosen": -1.2716915607452393, + "logps/rejected": -1.4837024211883545, + "loss": 1.6227, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2716915607452393, + "rewards/margins": 0.21201090514659882, + "rewards/rejected": -1.4837024211883545, "step": 5035 }, { "epoch": 2.697441043652785, - "grad_norm": 17.075460639153633, + "grad_norm": 11.589102849621897, "learning_rate": 3.054553457834053e-08, - "logits/chosen": 0.052671514451503754, - "logits/rejected": 0.008498812094330788, - "logps/chosen": -1.231005072593689, - "logps/rejected": -1.6238819360733032, - "loss": 1.941, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.231005072593689, - "rewards/margins": 0.3928767442703247, - "rewards/rejected": -1.6238819360733032, - "semantic_entropy": 0.8041082620620728, + "logits/chosen": -0.0357593409717083, + "logits/rejected": -0.08599834144115448, + "logps/chosen": -1.1936821937561035, + "logps/rejected": -1.5037119388580322, + "loss": 1.524, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1936821937561035, + "rewards/margins": 0.31002962589263916, + "rewards/rejected": -1.5037119388580322, "step": 5040 }, { "epoch": 2.700117076434186, - "grad_norm": 11.695480433858842, + "grad_norm": 8.900457362007634, "learning_rate": 3.0011808213139036e-08, - "logits/chosen": -0.09167749434709549, - "logits/rejected": -0.0660163015127182, - "logps/chosen": -1.2336540222167969, - "logps/rejected": -1.5083844661712646, - "loss": 1.9589, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2336540222167969, - "rewards/margins": 0.2747305631637573, - "rewards/rejected": -1.5083844661712646, - "semantic_entropy": 0.8070043325424194, + "logits/chosen": -0.19243542850017548, + "logits/rejected": -0.16841921210289001, + "logps/chosen": -1.1981301307678223, + "logps/rejected": -1.4053146839141846, + "loss": 1.5488, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1981301307678223, + "rewards/margins": 0.20718452334403992, + "rewards/rejected": -1.4053146839141846, "step": 5045 }, { "epoch": 2.702793109215588, - "grad_norm": 14.366463277793246, + "grad_norm": 8.919580335367286, "learning_rate": 2.948264167543568e-08, - "logits/chosen": -0.12496743351221085, - "logits/rejected": -0.06732519716024399, - "logps/chosen": -1.1421395540237427, - "logps/rejected": -1.5506408214569092, - "loss": 1.8628, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1421395540237427, - "rewards/margins": 0.4085013270378113, - "rewards/rejected": -1.5506408214569092, - "semantic_entropy": 0.824967086315155, + "logits/chosen": -0.2229524850845337, + "logits/rejected": -0.18007297813892365, + "logps/chosen": -1.1097791194915771, + "logps/rejected": -1.4422390460968018, + "loss": 1.4453, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1097791194915771, + "rewards/margins": 0.332459956407547, + "rewards/rejected": -1.4422390460968018, "step": 5050 }, { "epoch": 2.7054691419969896, - "grad_norm": 8.403527027540157, + "grad_norm": 8.349626905258832, "learning_rate": 2.8958040099206216e-08, - "logits/chosen": -0.2553723454475403, - "logits/rejected": -0.17169703543186188, - "logps/chosen": -1.1848903894424438, - "logps/rejected": -1.559682846069336, - "loss": 1.9387, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.1848903894424438, - "rewards/margins": 0.37479257583618164, - "rewards/rejected": -1.559682846069336, - "semantic_entropy": 0.8126693964004517, + "logits/chosen": -0.3001171350479126, + "logits/rejected": -0.23147349059581757, + "logps/chosen": -1.1641672849655151, + "logps/rejected": -1.4396671056747437, + "loss": 1.5375, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.1641672849655151, + "rewards/margins": 0.2754998803138733, + "rewards/rejected": -1.4396671056747437, "step": 5055 }, { "epoch": 2.708145174778391, - "grad_norm": 18.857515190754253, + "grad_norm": 11.847683979223595, "learning_rate": 2.843800857413775e-08, - "logits/chosen": -0.10081684589385986, - "logits/rejected": -0.03810068592429161, - "logps/chosen": -1.2591793537139893, - "logps/rejected": -1.5564645528793335, - "loss": 1.9713, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2591793537139893, - "rewards/margins": 0.2972853481769562, - "rewards/rejected": -1.5564645528793335, - "semantic_entropy": 0.7883800268173218, + "logits/chosen": -0.19615155458450317, + "logits/rejected": -0.14950647950172424, + "logps/chosen": -1.222930669784546, + "logps/rejected": -1.4548113346099854, + "loss": 1.5727, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.222930669784546, + "rewards/margins": 0.23188063502311707, + "rewards/rejected": -1.4548113346099854, "step": 5060 }, { "epoch": 2.7108212075597926, - "grad_norm": 12.161734792358606, + "grad_norm": 9.364344351079968, "learning_rate": 2.7922552145578203e-08, - "logits/chosen": -0.13401418924331665, - "logits/rejected": 0.1141396015882492, - "logps/chosen": -1.3173068761825562, - "logps/rejected": -1.6399991512298584, - "loss": 2.0045, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3173068761825562, - "rewards/margins": 0.32269221544265747, - "rewards/rejected": -1.6399991512298584, - "semantic_entropy": 0.7699416875839233, + "logits/chosen": -0.24645750224590302, + "logits/rejected": -0.05692873150110245, + "logps/chosen": -1.286911964416504, + "logps/rejected": -1.5462675094604492, + "loss": 1.6129, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.286911964416504, + "rewards/margins": 0.2593555748462677, + "rewards/rejected": -1.5462675094604492, "step": 5065 }, { "epoch": 2.7134972403411943, - "grad_norm": 7.90554243386142, + "grad_norm": 7.546798581760435, "learning_rate": 2.7411675814488277e-08, - "logits/chosen": -0.029569268226623535, - "logits/rejected": 0.12846367061138153, - "logps/chosen": -1.1842542886734009, - "logps/rejected": -1.4647448062896729, - "loss": 1.952, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1842542886734009, - "rewards/margins": 0.2804903984069824, - "rewards/rejected": -1.4647448062896729, - "semantic_entropy": 0.8209689259529114, + "logits/chosen": -0.12102420628070831, + "logits/rejected": 0.015786726027727127, + "logps/chosen": -1.1583021879196167, + "logps/rejected": -1.3769385814666748, + "loss": 1.5371, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1583021879196167, + "rewards/margins": 0.21863646805286407, + "rewards/rejected": -1.3769385814666748, "step": 5070 }, { "epoch": 2.7161732731225956, - "grad_norm": 14.226564064186148, + "grad_norm": 11.375516386492313, "learning_rate": 2.690538453739216e-08, - "logits/chosen": -0.11434072256088257, - "logits/rejected": -0.035854704678058624, - "logps/chosen": -1.2371490001678467, - "logps/rejected": -1.4231237173080444, - "loss": 2.0155, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2371490001678467, - "rewards/margins": 0.18597477674484253, - "rewards/rejected": -1.4231237173080444, - "semantic_entropy": 0.8113747835159302, + "logits/chosen": -0.20476698875427246, + "logits/rejected": -0.14468708634376526, + "logps/chosen": -1.2095907926559448, + "logps/rejected": -1.340696096420288, + "loss": 1.603, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2095907926559448, + "rewards/margins": 0.1311051994562149, + "rewards/rejected": -1.340696096420288, "step": 5075 }, { "epoch": 2.7188493059039973, - "grad_norm": 8.766818819474777, + "grad_norm": 7.390452082521005, "learning_rate": 2.6403683226330298e-08, - "logits/chosen": -0.17532986402511597, - "logits/rejected": -0.0436691977083683, - "logps/chosen": -1.3013237714767456, - "logps/rejected": -1.5891865491867065, - "loss": 2.0086, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3013237714767456, - "rewards/margins": 0.28786277770996094, - "rewards/rejected": -1.5891865491867065, - "semantic_entropy": 0.7856767773628235, + "logits/chosen": -0.23655863106250763, + "logits/rejected": -0.12614436447620392, + "logps/chosen": -1.2767188549041748, + "logps/rejected": -1.4665131568908691, + "loss": 1.6172, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2767188549041748, + "rewards/margins": 0.18979422748088837, + "rewards/rejected": -1.4665131568908691, "step": 5080 }, { "epoch": 2.721525338685399, - "grad_norm": 12.23805430836543, + "grad_norm": 10.44433841417844, "learning_rate": 2.5906576748810804e-08, - "logits/chosen": -0.21402184665203094, - "logits/rejected": -0.09077148139476776, - "logps/chosen": -1.1631752252578735, - "logps/rejected": -1.689199686050415, - "loss": 1.8654, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1631752252578735, - "rewards/margins": 0.5260245203971863, - "rewards/rejected": -1.689199686050415, - "semantic_entropy": 0.8219331502914429, + "logits/chosen": -0.3213575482368469, + "logits/rejected": -0.2255897969007492, + "logps/chosen": -1.1311696767807007, + "logps/rejected": -1.556115746498108, + "loss": 1.4573, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1311696767807007, + "rewards/margins": 0.42494598031044006, + "rewards/rejected": -1.556115746498108, "step": 5085 }, { "epoch": 2.7242013714668003, - "grad_norm": 13.707308783644233, + "grad_norm": 11.491098345334695, "learning_rate": 2.5414069927763016e-08, - "logits/chosen": -0.24503371119499207, - "logits/rejected": -0.09396155923604965, - "logps/chosen": -1.3182017803192139, - "logps/rejected": -1.6661937236785889, - "loss": 2.01, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3182017803192139, - "rewards/margins": 0.34799203276634216, - "rewards/rejected": -1.6661937236785889, - "semantic_entropy": 0.7787152528762817, + "logits/chosen": -0.3183497190475464, + "logits/rejected": -0.2026023417711258, + "logps/chosen": -1.2869782447814941, + "logps/rejected": -1.531435251235962, + "loss": 1.6113, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2869782447814941, + "rewards/margins": 0.2444569617509842, + "rewards/rejected": -1.531435251235962, "step": 5090 }, { "epoch": 2.726877404248202, - "grad_norm": 7.364777652105795, + "grad_norm": 6.65963325181154, "learning_rate": 2.4926167541490185e-08, - "logits/chosen": -0.27421778440475464, - "logits/rejected": -0.08374804258346558, - "logps/chosen": -1.2514071464538574, - "logps/rejected": -1.6568397283554077, - "loss": 1.9544, + "logits/chosen": -0.34064042568206787, + "logits/rejected": -0.1863982379436493, + "logps/chosen": -1.2175891399383545, + "logps/rejected": -1.5243316888809204, + "loss": 1.5504, "rewards/accuracies": 0.625, - "rewards/chosen": -1.2514071464538574, - "rewards/margins": 0.4054326117038727, - "rewards/rejected": -1.6568397283554077, - "semantic_entropy": 0.7822387218475342, + "rewards/chosen": -1.2175891399383545, + "rewards/margins": 0.3067426383495331, + "rewards/rejected": -1.5243316888809204, "step": 5095 }, { "epoch": 2.7295534370296037, - "grad_norm": 11.643911581815413, + "grad_norm": 9.51269645374707, "learning_rate": 2.4442874323623574e-08, - "logits/chosen": -0.0473858080804348, - "logits/rejected": 0.09867437928915024, - "logps/chosen": -1.2100188732147217, - "logps/rejected": -1.7059189081192017, - "loss": 1.9202, + "logits/chosen": -0.14729897677898407, + "logits/rejected": -0.039565883576869965, + "logps/chosen": -1.178842544555664, + "logps/rejected": -1.5293989181518555, + "loss": 1.5147, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2100188732147217, - "rewards/margins": 0.49590006470680237, - "rewards/rejected": -1.7059189081192017, - "semantic_entropy": 0.8004026412963867, + "rewards/chosen": -1.178842544555664, + "rewards/margins": 0.3505564332008362, + "rewards/rejected": -1.5293989181518555, "step": 5100 }, { "epoch": 2.7322294698110055, - "grad_norm": 11.921633311514181, + "grad_norm": 10.398165476971881, "learning_rate": 2.396419496307589e-08, - "logits/chosen": -0.1327304095029831, - "logits/rejected": 0.032488059252500534, - "logps/chosen": -1.279708981513977, - "logps/rejected": -1.5373016595840454, - "loss": 2.0003, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.279708981513977, - "rewards/margins": 0.25759270787239075, - "rewards/rejected": -1.5373016595840454, - "semantic_entropy": 0.7910350561141968, + "logits/chosen": -0.2192573994398117, + "logits/rejected": -0.08377514034509659, + "logps/chosen": -1.2447128295898438, + "logps/rejected": -1.412784218788147, + "loss": 1.5972, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2447128295898438, + "rewards/margins": 0.16807132959365845, + "rewards/rejected": -1.412784218788147, "step": 5105 }, { "epoch": 2.7349055025924067, - "grad_norm": 10.5803341934339, + "grad_norm": 8.038138448432369, "learning_rate": 2.349013410399653e-08, - "logits/chosen": -0.19295606017112732, - "logits/rejected": -0.04806617647409439, - "logps/chosen": -1.2397714853286743, - "logps/rejected": -1.5628407001495361, - "loss": 1.9677, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2397714853286743, - "rewards/margins": 0.3230692148208618, - "rewards/rejected": -1.5628407001495361, - "semantic_entropy": 0.7900466918945312, + "logits/chosen": -0.30199727416038513, + "logits/rejected": -0.17887206375598907, + "logps/chosen": -1.2027664184570312, + "logps/rejected": -1.4390037059783936, + "loss": 1.562, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2027664184570312, + "rewards/margins": 0.23623719811439514, + "rewards/rejected": -1.4390037059783936, "step": 5110 }, { "epoch": 2.7375815353738084, - "grad_norm": 9.2360531664133, + "grad_norm": 8.273843305880611, "learning_rate": 2.3020696345725954e-08, - "logits/chosen": -0.22518637776374817, - "logits/rejected": -0.02344553731381893, - "logps/chosen": -1.2428288459777832, - "logps/rejected": -1.6677284240722656, - "loss": 1.934, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2428288459777832, - "rewards/margins": 0.42489948868751526, - "rewards/rejected": -1.6677284240722656, - "semantic_entropy": 0.7951303124427795, + "logits/chosen": -0.30483612418174744, + "logits/rejected": -0.1412568837404251, + "logps/chosen": -1.2145841121673584, + "logps/rejected": -1.5266047716140747, + "loss": 1.5396, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2145841121673584, + "rewards/margins": 0.3120204210281372, + "rewards/rejected": -1.5266047716140747, "step": 5115 }, { "epoch": 2.7402575681552097, - "grad_norm": 18.21476960792327, + "grad_norm": 10.10204457670056, "learning_rate": 2.2555886242751398e-08, - "logits/chosen": -0.1483290195465088, - "logits/rejected": -0.092471644282341, - "logps/chosen": -1.3488088846206665, - "logps/rejected": -1.6935102939605713, - "loss": 2.0112, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3488088846206665, - "rewards/margins": 0.3447011709213257, - "rewards/rejected": -1.6935102939605713, - "semantic_entropy": 0.7674793004989624, + "logits/chosen": -0.25015076994895935, + "logits/rejected": -0.21441802382469177, + "logps/chosen": -1.3197141885757446, + "logps/rejected": -1.5619823932647705, + "loss": 1.6268, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3197141885757446, + "rewards/margins": 0.24226808547973633, + "rewards/rejected": -1.5619823932647705, "step": 5120 }, { "epoch": 2.7429336009366114, - "grad_norm": 19.542866097437877, + "grad_norm": 15.832997721578597, "learning_rate": 2.2095708304662453e-08, - "logits/chosen": -0.24939461052417755, - "logits/rejected": -0.03163083642721176, - "logps/chosen": -1.2250001430511475, - "logps/rejected": -1.5612132549285889, - "loss": 1.9479, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.2250001430511475, - "rewards/margins": 0.3362131416797638, - "rewards/rejected": -1.5612132549285889, - "semantic_entropy": 0.7868286371231079, + "logits/chosen": -0.31810659170150757, + "logits/rejected": -0.13229572772979736, + "logps/chosen": -1.1907761096954346, + "logps/rejected": -1.4618098735809326, + "loss": 1.5408, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.1907761096954346, + "rewards/margins": 0.27103400230407715, + "rewards/rejected": -1.4618098735809326, "step": 5125 }, { "epoch": 2.745609633718013, - "grad_norm": 7.4810492997908495, + "grad_norm": 6.913405186133222, "learning_rate": 2.16401669961076e-08, - "logits/chosen": -0.29958608746528625, - "logits/rejected": -0.10414047539234161, - "logps/chosen": -1.2389986515045166, - "logps/rejected": -1.6375420093536377, - "loss": 1.9318, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2389986515045166, - "rewards/margins": 0.39854365587234497, - "rewards/rejected": -1.6375420093536377, - "semantic_entropy": 0.8021961450576782, + "logits/chosen": -0.37828677892684937, + "logits/rejected": -0.21424618363380432, + "logps/chosen": -1.2096235752105713, + "logps/rejected": -1.5016435384750366, + "loss": 1.53, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2096235752105713, + "rewards/margins": 0.29201993346214294, + "rewards/rejected": -1.5016435384750366, "step": 5130 }, { "epoch": 2.748285666499415, - "grad_norm": 10.75047665436145, + "grad_norm": 8.644851992143936, "learning_rate": 2.1189266736750532e-08, - "logits/chosen": -0.07831167429685593, - "logits/rejected": -0.0135183185338974, - "logps/chosen": -1.2490636110305786, - "logps/rejected": -1.5595505237579346, - "loss": 1.9829, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2490636110305786, - "rewards/margins": 0.3104868233203888, - "rewards/rejected": -1.5595505237579346, - "semantic_entropy": 0.7954851388931274, + "logits/chosen": -0.14419499039649963, + "logits/rejected": -0.08842798322439194, + "logps/chosen": -1.2097914218902588, + "logps/rejected": -1.4539039134979248, + "loss": 1.5729, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2097914218902588, + "rewards/margins": 0.24411258101463318, + "rewards/rejected": -1.4539039134979248, "step": 5135 }, { "epoch": 2.750961699280816, - "grad_norm": 8.63114102715087, + "grad_norm": 6.738111830624236, "learning_rate": 2.0743011901227623e-08, - "logits/chosen": -0.08205153793096542, - "logits/rejected": 0.05202709510922432, - "logps/chosen": -1.3270334005355835, - "logps/rejected": -1.5784505605697632, - "loss": 2.042, + "logits/chosen": -0.16246473789215088, + "logits/rejected": -0.05987215042114258, + "logps/chosen": -1.2946544885635376, + "logps/rejected": -1.4763842821121216, + "loss": 1.6436, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3270334005355835, - "rewards/margins": 0.25141724944114685, - "rewards/rejected": -1.5784505605697632, - "semantic_entropy": 0.7848073244094849, + "rewards/chosen": -1.2946544885635376, + "rewards/margins": 0.18172983825206757, + "rewards/rejected": -1.4763842821121216, "step": 5140 }, { "epoch": 2.753637732062218, - "grad_norm": 14.004505980024575, + "grad_norm": 11.981029031922487, "learning_rate": 2.030140681910508e-08, - "logits/chosen": -0.15839698910713196, - "logits/rejected": 0.012042008340358734, - "logps/chosen": -1.2330915927886963, - "logps/rejected": -1.5381847620010376, - "loss": 1.9663, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2330915927886963, - "rewards/margins": 0.30509334802627563, - "rewards/rejected": -1.5381847620010376, - "semantic_entropy": 0.8043828010559082, + "logits/chosen": -0.23299722373485565, + "logits/rejected": -0.0805606096982956, + "logps/chosen": -1.2046631574630737, + "logps/rejected": -1.4061990976333618, + "loss": 1.5615, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2046631574630737, + "rewards/margins": 0.20153579115867615, + "rewards/rejected": -1.4061990976333618, "step": 5145 }, { "epoch": 2.756313764843619, - "grad_norm": 7.444852079176876, + "grad_norm": 5.553668431643315, "learning_rate": 1.986445577483753e-08, - "logits/chosen": -0.18917617201805115, - "logits/rejected": -0.06302474439144135, - "logps/chosen": -1.2250678539276123, - "logps/rejected": -1.555406093597412, - "loss": 1.9536, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2250678539276123, - "rewards/margins": 0.3303382992744446, - "rewards/rejected": -1.555406093597412, - "semantic_entropy": 0.8019511103630066, + "logits/chosen": -0.2729364037513733, + "logits/rejected": -0.17603769898414612, + "logps/chosen": -1.197695016860962, + "logps/rejected": -1.4483814239501953, + "loss": 1.5524, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.197695016860962, + "rewards/margins": 0.25068649649620056, + "rewards/rejected": -1.4483814239501953, "step": 5150 }, { "epoch": 2.758989797625021, - "grad_norm": 8.680130105829013, + "grad_norm": 7.683718213238564, "learning_rate": 1.9432163007725765e-08, - "logits/chosen": -0.18140801787376404, - "logits/rejected": -0.07545305788516998, - "logps/chosen": -1.2941920757293701, - "logps/rejected": -1.6018486022949219, - "loss": 1.985, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2941920757293701, - "rewards/margins": 0.30765634775161743, - "rewards/rejected": -1.6018486022949219, - "semantic_entropy": 0.7732634544372559, + "logits/chosen": -0.31468385457992554, + "logits/rejected": -0.23054513335227966, + "logps/chosen": -1.2684756517410278, + "logps/rejected": -1.5039026737213135, + "loss": 1.599, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2684756517410278, + "rewards/margins": 0.23542693257331848, + "rewards/rejected": -1.5039026737213135, "step": 5155 }, { "epoch": 2.7616658304064226, - "grad_norm": 11.22410178800859, + "grad_norm": 9.212426590229022, "learning_rate": 1.9004532711876297e-08, - "logits/chosen": -0.1650318056344986, - "logits/rejected": -0.10949975252151489, - "logps/chosen": -1.2166471481323242, - "logps/rejected": -1.654528021812439, - "loss": 1.8955, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2166471481323242, - "rewards/margins": 0.43788084387779236, - "rewards/rejected": -1.654528021812439, - "semantic_entropy": 0.7718899250030518, + "logits/chosen": -0.2610294222831726, + "logits/rejected": -0.20872995257377625, + "logps/chosen": -1.1865425109863281, + "logps/rejected": -1.550987958908081, + "loss": 1.5008, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1865425109863281, + "rewards/margins": 0.364445298910141, + "rewards/rejected": -1.550987958908081, "step": 5160 }, { "epoch": 2.7643418631878243, - "grad_norm": 9.025154317523384, + "grad_norm": 8.081361937247129, "learning_rate": 1.8581569036159928e-08, - "logits/chosen": -0.18271777033805847, - "logits/rejected": 0.0020973458886146545, - "logps/chosen": -1.2153449058532715, - "logps/rejected": -1.56462824344635, - "loss": 1.931, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2153449058532715, - "rewards/margins": 0.3492833971977234, - "rewards/rejected": -1.56462824344635, - "semantic_entropy": 0.8019517660140991, + "logits/chosen": -0.2479631006717682, + "logits/rejected": -0.09326585382223129, + "logps/chosen": -1.1892836093902588, + "logps/rejected": -1.4418988227844238, + "loss": 1.5311, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1892836093902588, + "rewards/margins": 0.25261521339416504, + "rewards/rejected": -1.4418988227844238, "step": 5165 }, { "epoch": 2.7670178959692255, - "grad_norm": 10.287380361776217, + "grad_norm": 8.992632637265395, "learning_rate": 1.8163276084172285e-08, - "logits/chosen": -0.14131216704845428, - "logits/rejected": -0.015068145468831062, - "logps/chosen": -1.3193836212158203, - "logps/rejected": -1.6025813817977905, - "loss": 2.0232, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3193836212158203, - "rewards/margins": 0.2831977307796478, - "rewards/rejected": -1.6025813817977905, - "semantic_entropy": 0.7736366987228394, + "logits/chosen": -0.23023590445518494, + "logits/rejected": -0.12915799021720886, + "logps/chosen": -1.2853100299835205, + "logps/rejected": -1.4848616123199463, + "loss": 1.63, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2853100299835205, + "rewards/margins": 0.199551522731781, + "rewards/rejected": -1.4848616123199463, "step": 5170 }, { "epoch": 2.7696939287506273, - "grad_norm": 10.355275847231685, + "grad_norm": 8.50942739888827, "learning_rate": 1.7749657914193194e-08, - "logits/chosen": -0.20772120356559753, - "logits/rejected": -0.10938359797000885, - "logps/chosen": -1.313234567642212, - "logps/rejected": -1.560950517654419, - "loss": 2.0326, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.313234567642212, - "rewards/margins": 0.24771595001220703, - "rewards/rejected": -1.560950517654419, - "semantic_entropy": 0.7819124460220337, + "logits/chosen": -0.2819350063800812, + "logits/rejected": -0.19880495965480804, + "logps/chosen": -1.2908051013946533, + "logps/rejected": -1.4433648586273193, + "loss": 1.6442, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2908051013946533, + "rewards/margins": 0.15255983173847198, + "rewards/rejected": -1.4433648586273193, "step": 5175 }, { "epoch": 2.7723699615320285, - "grad_norm": 7.703346828402844, + "grad_norm": 7.09993439182778, "learning_rate": 1.7340718539148203e-08, - "logits/chosen": -0.08773273974657059, - "logits/rejected": -0.04304978996515274, - "logps/chosen": -1.3063275814056396, - "logps/rejected": -1.6553189754486084, - "loss": 1.9855, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3063275814056396, - "rewards/margins": 0.3489914834499359, - "rewards/rejected": -1.6553189754486084, - "semantic_entropy": 0.7809012532234192, + "logits/chosen": -0.18955400586128235, + "logits/rejected": -0.16120107471942902, + "logps/chosen": -1.2785166501998901, + "logps/rejected": -1.5583308935165405, + "loss": 1.5895, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2785166501998901, + "rewards/margins": 0.2798142433166504, + "rewards/rejected": -1.5583308935165405, "step": 5180 }, { "epoch": 2.7750459943134302, - "grad_norm": 16.638627028289388, + "grad_norm": 11.962208827975902, "learning_rate": 1.6936461926568724e-08, - "logits/chosen": -0.10791786015033722, - "logits/rejected": 0.013361009769141674, - "logps/chosen": -1.1710784435272217, - "logps/rejected": -1.5865617990493774, - "loss": 1.9282, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1710784435272217, - "rewards/margins": 0.41548317670822144, - "rewards/rejected": -1.5865617990493774, - "semantic_entropy": 0.8109432458877563, + "logits/chosen": -0.2284051477909088, + "logits/rejected": -0.14482834935188293, + "logps/chosen": -1.1373264789581299, + "logps/rejected": -1.4195890426635742, + "loss": 1.512, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1373264789581299, + "rewards/margins": 0.28226256370544434, + "rewards/rejected": -1.4195890426635742, "step": 5185 }, { "epoch": 2.777722027094832, - "grad_norm": 9.333197939438142, + "grad_norm": 8.616815056941201, "learning_rate": 1.6536891998554346e-08, - "logits/chosen": -0.22189322113990784, - "logits/rejected": -0.05813136696815491, - "logps/chosen": -1.2453774213790894, - "logps/rejected": -1.5646049976348877, - "loss": 1.9451, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2453774213790894, - "rewards/margins": 0.3192276954650879, - "rewards/rejected": -1.5646049976348877, - "semantic_entropy": 0.7913827896118164, + "logits/chosen": -0.3416827917098999, + "logits/rejected": -0.22021794319152832, + "logps/chosen": -1.224382996559143, + "logps/rejected": -1.4130111932754517, + "loss": 1.5654, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.224382996559143, + "rewards/margins": 0.18862825632095337, + "rewards/rejected": -1.4130111932754517, "step": 5190 }, { "epoch": 2.7803980598762337, - "grad_norm": 18.888524794439114, + "grad_norm": 11.488081919358322, "learning_rate": 1.6142012631734093e-08, - "logits/chosen": -0.13538241386413574, - "logits/rejected": -0.004059618804603815, - "logps/chosen": -1.2707325220108032, - "logps/rejected": -1.5524711608886719, - "loss": 1.9975, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2707325220108032, - "rewards/margins": 0.28173863887786865, - "rewards/rejected": -1.5524711608886719, - "semantic_entropy": 0.8077930212020874, + "logits/chosen": -0.2214185744524002, + "logits/rejected": -0.11094842851161957, + "logps/chosen": -1.2285008430480957, + "logps/rejected": -1.447881817817688, + "loss": 1.5802, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2285008430480957, + "rewards/margins": 0.2193809449672699, + "rewards/rejected": -1.447881817817688, "step": 5195 }, { "epoch": 2.783074092657635, - "grad_norm": 10.319272436486468, + "grad_norm": 9.903452577728135, "learning_rate": 1.575182765722949e-08, - "logits/chosen": -0.22574253380298615, - "logits/rejected": -0.07820216566324234, - "logps/chosen": -1.2026981115341187, - "logps/rejected": -1.5477980375289917, - "loss": 1.9666, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2026981115341187, - "rewards/margins": 0.34509986639022827, - "rewards/rejected": -1.5477980375289917, - "semantic_entropy": 0.8090740442276001, + "logits/chosen": -0.291803777217865, + "logits/rejected": -0.17823100090026855, + "logps/chosen": -1.174984335899353, + "logps/rejected": -1.419689416885376, + "loss": 1.5671, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.174984335899353, + "rewards/margins": 0.2447052001953125, + "rewards/rejected": -1.419689416885376, "step": 5200 }, { "epoch": 2.783074092657635, - "eval_logits/chosen": 0.202718585729599, - "eval_logits/rejected": 0.29181304574012756, - "eval_logps/chosen": -1.3354370594024658, - "eval_logps/rejected": -1.6131740808486938, - "eval_loss": 2.0453600883483887, - "eval_rewards/accuracies": 0.5942136645317078, - "eval_rewards/chosen": -1.3354370594024658, - "eval_rewards/margins": 0.27773699164390564, - "eval_rewards/rejected": -1.6131740808486938, - "eval_runtime": 34.6109, - "eval_samples_per_second": 38.861, - "eval_semantic_entropy": 0.7754858136177063, - "eval_steps_per_second": 9.737, + "eval_logits/chosen": 0.025222957134246826, + "eval_logits/rejected": 0.0927000492811203, + "eval_logps/chosen": -1.30295991897583, + "eval_logps/rejected": -1.4993714094161987, + "eval_loss": 1.6473075151443481, + "eval_rewards/accuracies": 0.5719584822654724, + "eval_rewards/chosen": -1.30295991897583, + "eval_rewards/margins": 0.19641143083572388, + "eval_rewards/rejected": -1.4993714094161987, + "eval_runtime": 40.3052, + "eval_samples_per_second": 33.37, + "eval_steps_per_second": 8.361, "step": 5200 }, { "epoch": 2.7857501254390367, - "grad_norm": 8.406302503630188, + "grad_norm": 8.17255312752598, "learning_rate": 1.536634086061672e-08, - "logits/chosen": -0.07829969376325607, - "logits/rejected": -0.03902588412165642, - "logps/chosen": -1.2435556650161743, - "logps/rejected": -1.53193199634552, - "loss": 1.9881, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.2435556650161743, - "rewards/margins": 0.28837618231773376, - "rewards/rejected": -1.53193199634552, - "semantic_entropy": 0.8077479600906372, + "logits/chosen": -0.1715945303440094, + "logits/rejected": -0.15807819366455078, + "logps/chosen": -1.2178270816802979, + "logps/rejected": -1.4111155271530151, + "loss": 1.5908, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.2178270816802979, + "rewards/margins": 0.1932884007692337, + "rewards/rejected": -1.4111155271530151, "step": 5205 }, { "epoch": 2.788426158220438, - "grad_norm": 10.640420609743641, + "grad_norm": 8.538930468238522, "learning_rate": 1.4985555981890495e-08, - "logits/chosen": -0.15689179301261902, - "logits/rejected": -0.07102429121732712, - "logps/chosen": -1.2550675868988037, - "logps/rejected": -1.6058406829833984, - "loss": 1.964, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2550675868988037, - "rewards/margins": 0.3507730960845947, - "rewards/rejected": -1.6058406829833984, - "semantic_entropy": 0.7976831197738647, + "logits/chosen": -0.22177401185035706, + "logits/rejected": -0.16164055466651917, + "logps/chosen": -1.222161054611206, + "logps/rejected": -1.4369077682495117, + "loss": 1.568, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.222161054611206, + "rewards/margins": 0.21474671363830566, + "rewards/rejected": -1.4369077682495117, "step": 5210 }, { "epoch": 2.7911021910018396, - "grad_norm": 12.183156183909475, + "grad_norm": 7.820754902549526, "learning_rate": 1.4609476715427226e-08, - "logits/chosen": -0.15650925040245056, - "logits/rejected": -0.06442428380250931, - "logps/chosen": -1.2369836568832397, - "logps/rejected": -1.6222703456878662, - "loss": 1.9431, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2369836568832397, - "rewards/margins": 0.38528645038604736, - "rewards/rejected": -1.6222703456878662, - "semantic_entropy": 0.7931517362594604, + "logits/chosen": -0.22256644070148468, + "logits/rejected": -0.1498219519853592, + "logps/chosen": -1.2076025009155273, + "logps/rejected": -1.5045273303985596, + "loss": 1.5444, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2076025009155273, + "rewards/margins": 0.29692476987838745, + "rewards/rejected": -1.5045273303985596, "step": 5215 }, { "epoch": 2.7937782237832414, - "grad_norm": 12.507724678721072, + "grad_norm": 10.773504638231776, "learning_rate": 1.4238106709949792e-08, - "logits/chosen": -0.17691577970981598, - "logits/rejected": -0.11024241149425507, - "logps/chosen": -1.2268879413604736, - "logps/rejected": -1.6865144968032837, - "loss": 1.9251, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2268879413604736, - "rewards/margins": 0.45962634682655334, - "rewards/rejected": -1.6865144968032837, - "semantic_entropy": 0.7949502468109131, + "logits/chosen": -0.25352656841278076, + "logits/rejected": -0.2084973305463791, + "logps/chosen": -1.1974619626998901, + "logps/rejected": -1.5066121816635132, + "loss": 1.5284, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.1974619626998901, + "rewards/margins": 0.3091502785682678, + "rewards/rejected": -1.5066121816635132, "step": 5220 }, { "epoch": 2.796454256564643, - "grad_norm": 10.289025431525555, + "grad_norm": 8.96727868563191, "learning_rate": 1.3871449568491511e-08, - "logits/chosen": -0.10499472916126251, - "logits/rejected": 0.023125629872083664, - "logps/chosen": -1.3095515966415405, - "logps/rejected": -1.64128839969635, - "loss": 1.9828, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3095515966415405, - "rewards/margins": 0.3317367434501648, - "rewards/rejected": -1.64128839969635, - "semantic_entropy": 0.7708584666252136, + "logits/chosen": -0.19537881016731262, + "logits/rejected": -0.09505072236061096, + "logps/chosen": -1.2803305387496948, + "logps/rejected": -1.519592046737671, + "loss": 1.5941, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2803305387496948, + "rewards/margins": 0.23926150798797607, + "rewards/rejected": -1.519592046737671, "step": 5225 }, { "epoch": 2.7991302893460444, - "grad_norm": 9.972604594873996, + "grad_norm": 9.775911172018159, "learning_rate": 1.3509508848361606e-08, - "logits/chosen": -0.2620421350002289, - "logits/rejected": -0.12123207747936249, - "logps/chosen": -1.2767547369003296, - "logps/rejected": -1.5348087549209595, - "loss": 1.9974, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2767547369003296, - "rewards/margins": 0.25805383920669556, - "rewards/rejected": -1.5348087549209595, - "semantic_entropy": 0.794947624206543, + "logits/chosen": -0.30756497383117676, + "logits/rejected": -0.1980385184288025, + "logps/chosen": -1.242377519607544, + "logps/rejected": -1.4381451606750488, + "loss": 1.5894, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.242377519607544, + "rewards/margins": 0.19576765596866608, + "rewards/rejected": -1.4381451606750488, "step": 5230 }, { "epoch": 2.801806322127446, - "grad_norm": 6.219131068931544, + "grad_norm": 6.16390638342754, "learning_rate": 1.3152288061110517e-08, - "logits/chosen": -0.20478801429271698, - "logits/rejected": -0.09910500794649124, - "logps/chosen": -1.281906247138977, - "logps/rejected": -1.5762264728546143, - "loss": 1.9851, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.281906247138977, - "rewards/margins": 0.294320285320282, - "rewards/rejected": -1.5762264728546143, - "semantic_entropy": 0.7873275279998779, + "logits/chosen": -0.2471589744091034, + "logits/rejected": -0.17163333296775818, + "logps/chosen": -1.2506660223007202, + "logps/rejected": -1.4531687498092651, + "loss": 1.5882, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2506660223007202, + "rewards/margins": 0.20250268280506134, + "rewards/rejected": -1.4531687498092651, "step": 5235 }, { "epoch": 2.804482354908848, - "grad_norm": 10.6291757278702, + "grad_norm": 9.416370335380098, "learning_rate": 1.2799790672495814e-08, - "logits/chosen": -0.1837000548839569, - "logits/rejected": 0.024717776104807854, - "logps/chosen": -1.2773462533950806, - "logps/rejected": -1.5936458110809326, - "loss": 2.0204, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2773462533950806, - "rewards/margins": 0.3162994682788849, - "rewards/rejected": -1.5936458110809326, - "semantic_entropy": 0.790886402130127, + "logits/chosen": -0.24961507320404053, + "logits/rejected": -0.07604103535413742, + "logps/chosen": -1.2434465885162354, + "logps/rejected": -1.4642319679260254, + "loss": 1.617, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2434465885162354, + "rewards/margins": 0.22078534960746765, + "rewards/rejected": -1.4642319679260254, "step": 5240 }, { "epoch": 2.807158387690249, - "grad_norm": 6.864364966283842, + "grad_norm": 6.819545176184575, "learning_rate": 1.2452020102448835e-08, - "logits/chosen": -0.10664012283086777, - "logits/rejected": -0.060091882944107056, - "logps/chosen": -1.2358882427215576, - "logps/rejected": -1.5273553133010864, - "loss": 1.994, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2358882427215576, - "rewards/margins": 0.2914671301841736, - "rewards/rejected": -1.5273553133010864, - "semantic_entropy": 0.8177428245544434, + "logits/chosen": -0.16510526835918427, + "logits/rejected": -0.13053981959819794, + "logps/chosen": -1.1947574615478516, + "logps/rejected": -1.3906234502792358, + "loss": 1.5771, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1947574615478516, + "rewards/margins": 0.1958661526441574, + "rewards/rejected": -1.3906234502792358, "step": 5245 }, { "epoch": 2.8098344204716508, - "grad_norm": 9.134418757055355, + "grad_norm": 7.398688825817576, "learning_rate": 1.2108979725041103e-08, - "logits/chosen": -0.20931419730186462, - "logits/rejected": -0.08170606940984726, - "logps/chosen": -1.306196689605713, - "logps/rejected": -1.6155498027801514, - "loss": 2.0098, + "logits/chosen": -0.26923614740371704, + "logits/rejected": -0.16045144200325012, + "logps/chosen": -1.2702943086624146, + "logps/rejected": -1.473854422569275, + "loss": 1.6096, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.306196689605713, - "rewards/margins": 0.3093533515930176, - "rewards/rejected": -1.6155498027801514, - "semantic_entropy": 0.7775629758834839, + "rewards/chosen": -1.2702943086624146, + "rewards/margins": 0.20355994999408722, + "rewards/rejected": -1.473854422569275, "step": 5250 }, { "epoch": 2.8125104532530525, - "grad_norm": 13.00788199652425, + "grad_norm": 8.283410973305017, "learning_rate": 1.1770672868451958e-08, - "logits/chosen": -0.17584146559238434, - "logits/rejected": 0.033514510840177536, - "logps/chosen": -1.2859491109848022, - "logps/rejected": -1.563084363937378, - "loss": 1.9978, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2859491109848022, - "rewards/margins": 0.27713528275489807, - "rewards/rejected": -1.563084363937378, - "semantic_entropy": 0.7916868329048157, + "logits/chosen": -0.23493504524230957, + "logits/rejected": -0.06386389583349228, + "logps/chosen": -1.2502615451812744, + "logps/rejected": -1.4193909168243408, + "loss": 1.6, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2502615451812744, + "rewards/margins": 0.16912934184074402, + "rewards/rejected": -1.4193909168243408, "step": 5255 }, { "epoch": 2.8151864860344538, - "grad_norm": 13.501783180882384, + "grad_norm": 10.62770644227685, "learning_rate": 1.1437102814935872e-08, - "logits/chosen": -0.15966646373271942, - "logits/rejected": -0.0897841602563858, - "logps/chosen": -1.2488842010498047, - "logps/rejected": -1.6675317287445068, - "loss": 1.9729, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2488842010498047, - "rewards/margins": 0.41864776611328125, - "rewards/rejected": -1.6675317287445068, - "semantic_entropy": 0.785289466381073, + "logits/chosen": -0.22654405236244202, + "logits/rejected": -0.16856110095977783, + "logps/chosen": -1.2229833602905273, + "logps/rejected": -1.5214375257492065, + "loss": 1.5809, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2229833602905273, + "rewards/margins": 0.2984543740749359, + "rewards/rejected": -1.5214375257492065, "step": 5260 }, { "epoch": 2.8178625188158555, - "grad_norm": 8.89645599420778, + "grad_norm": 8.053002223568207, "learning_rate": 1.1108272800791018e-08, - "logits/chosen": -0.27480974793434143, - "logits/rejected": -0.06155538558959961, - "logps/chosen": -1.437657356262207, - "logps/rejected": -1.6343066692352295, - "loss": 2.1194, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.437657356262207, - "rewards/margins": 0.19664961099624634, - "rewards/rejected": -1.6343066692352295, - "semantic_entropy": 0.7450114488601685, + "logits/chosen": -0.3435817360877991, + "logits/rejected": -0.18410401046276093, + "logps/chosen": -1.4058219194412231, + "logps/rejected": -1.521140456199646, + "loss": 1.7366, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4058219194412231, + "rewards/margins": 0.1153184324502945, + "rewards/rejected": -1.521140456199646, "step": 5265 }, { "epoch": 2.820538551597257, - "grad_norm": 7.272694732690356, + "grad_norm": 6.410906678426636, "learning_rate": 1.078418601632769e-08, - "logits/chosen": -0.14676833152770996, - "logits/rejected": -0.007456362247467041, - "logps/chosen": -1.2117810249328613, - "logps/rejected": -1.5890445709228516, - "loss": 1.9195, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2117810249328613, - "rewards/margins": 0.37726354598999023, - "rewards/rejected": -1.5890445709228516, - "semantic_entropy": 0.8097552061080933, + "logits/chosen": -0.2007797211408615, + "logits/rejected": -0.08656108379364014, + "logps/chosen": -1.1756919622421265, + "logps/rejected": -1.483074426651001, + "loss": 1.5043, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1756919622421265, + "rewards/margins": 0.30738240480422974, + "rewards/rejected": -1.483074426651001, "step": 5270 }, { "epoch": 2.8232145843786585, - "grad_norm": 9.385479739829243, + "grad_norm": 8.061132634743135, "learning_rate": 1.0464845605837159e-08, - "logits/chosen": -0.12716984748840332, - "logits/rejected": 0.021417586132884026, - "logps/chosen": -1.2831096649169922, - "logps/rejected": -1.6105226278305054, - "loss": 1.9832, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2831096649169922, - "rewards/margins": 0.32741299271583557, - "rewards/rejected": -1.6105226278305054, - "semantic_entropy": 0.7869722247123718, + "logits/chosen": -0.22029061615467072, + "logits/rejected": -0.09311743080615997, + "logps/chosen": -1.2549231052398682, + "logps/rejected": -1.4947597980499268, + "loss": 1.5859, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2549231052398682, + "rewards/margins": 0.23983661830425262, + "rewards/rejected": -1.4947597980499268, "step": 5275 }, { "epoch": 2.82589061716006, - "grad_norm": 9.165378787106281, + "grad_norm": 9.12189454063848, "learning_rate": 1.0150254667561642e-08, - "logits/chosen": -0.133694127202034, - "logits/rejected": 0.03595340996980667, - "logps/chosen": -1.3388880491256714, - "logps/rejected": -1.6667951345443726, - "loss": 2.0435, + "logits/chosen": -0.21285684406757355, + "logits/rejected": -0.0851225033402443, + "logps/chosen": -1.2914241552352905, + "logps/rejected": -1.4883497953414917, + "loss": 1.636, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3388880491256714, - "rewards/margins": 0.3279072344303131, - "rewards/rejected": -1.6667951345443726, - "semantic_entropy": 0.7781317830085754, + "rewards/chosen": -1.2914241552352905, + "rewards/margins": 0.19692568480968475, + "rewards/rejected": -1.4883497953414917, "step": 5280 }, { "epoch": 2.828566649941462, - "grad_norm": 8.646734065478284, + "grad_norm": 7.9275295644342005, "learning_rate": 9.840416253663719e-09, - "logits/chosen": -0.19486698508262634, - "logits/rejected": -0.09732584655284882, - "logps/chosen": -1.2249208688735962, - "logps/rejected": -1.6000207662582397, - "loss": 1.9347, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2249208688735962, - "rewards/margins": 0.37510010600090027, - "rewards/rejected": -1.6000207662582397, - "semantic_entropy": 0.8043219447135925, + "logits/chosen": -0.24292925000190735, + "logits/rejected": -0.16514852643013, + "logps/chosen": -1.194392204284668, + "logps/rejected": -1.4477427005767822, + "loss": 1.536, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.194392204284668, + "rewards/margins": 0.2533505856990814, + "rewards/rejected": -1.4477427005767822, "step": 5285 }, { "epoch": 2.8312426827228636, - "grad_norm": 8.266243745454222, + "grad_norm": 6.94883158701022, "learning_rate": 9.535333370197074e-09, - "logits/chosen": -0.1298667937517166, - "logits/rejected": 0.01398382056504488, - "logps/chosen": -1.2848511934280396, - "logps/rejected": -1.5456931591033936, - "loss": 2.0081, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2848511934280396, - "rewards/margins": 0.26084208488464355, - "rewards/rejected": -1.5456931591033936, - "semantic_entropy": 0.7974156141281128, + "logits/chosen": -0.2350497990846634, + "logits/rejected": -0.12720319628715515, + "logps/chosen": -1.2613648176193237, + "logps/rejected": -1.454443097114563, + "loss": 1.6147, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2613648176193237, + "rewards/margins": 0.1930783987045288, + "rewards/rejected": -1.454443097114563, "step": 5290 }, { "epoch": 2.833918715504265, - "grad_norm": 10.837295569106445, + "grad_norm": 6.377123568788396, "learning_rate": 9.23500897707713e-09, - "logits/chosen": -0.20668502151966095, - "logits/rejected": -0.016653254628181458, - "logps/chosen": -1.3635807037353516, - "logps/rejected": -1.6869462728500366, - "loss": 2.0255, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3635807037353516, - "rewards/margins": 0.32336559891700745, - "rewards/rejected": -1.6869462728500366, - "semantic_entropy": 0.7563022971153259, + "logits/chosen": -0.27777573466300964, + "logits/rejected": -0.12587803602218628, + "logps/chosen": -1.3249562978744507, + "logps/rejected": -1.5746320486068726, + "loss": 1.6309, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3249562978744507, + "rewards/margins": 0.24967575073242188, + "rewards/rejected": -1.5746320486068726, "step": 5295 }, { "epoch": 2.8365947482856666, - "grad_norm": 8.619693420401413, + "grad_norm": 7.856262939713161, "learning_rate": 8.939445988052574e-09, - "logits/chosen": -0.170781672000885, - "logits/rejected": -0.12549729645252228, - "logps/chosen": -1.2618080377578735, - "logps/rejected": -1.6614129543304443, - "loss": 1.962, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2618080377578735, - "rewards/margins": 0.3996048867702484, - "rewards/rejected": -1.6614129543304443, - "semantic_entropy": 0.7871630191802979, + "logits/chosen": -0.24618788063526154, + "logits/rejected": -0.2185472697019577, + "logps/chosen": -1.2331881523132324, + "logps/rejected": -1.558468222618103, + "loss": 1.5617, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2331881523132324, + "rewards/margins": 0.32528001070022583, + "rewards/rejected": -1.558468222618103, "step": 5300 }, { "epoch": 2.839270781067068, - "grad_norm": 10.523548641438046, + "grad_norm": 8.297563864396986, "learning_rate": 8.648647270676656e-09, - "logits/chosen": -0.14557881653308868, - "logits/rejected": -0.011387373320758343, - "logps/chosen": -1.3090760707855225, - "logps/rejected": -1.6498647928237915, - "loss": 2.0204, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3090760707855225, - "rewards/margins": 0.3407888114452362, - "rewards/rejected": -1.6498647928237915, - "semantic_entropy": 0.7654205560684204, + "logits/chosen": -0.22748489677906036, + "logits/rejected": -0.11356830596923828, + "logps/chosen": -1.2860530614852905, + "logps/rejected": -1.5371191501617432, + "loss": 1.6336, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2860530614852905, + "rewards/margins": 0.2510661780834198, + "rewards/rejected": -1.5371191501617432, "step": 5305 }, { "epoch": 2.8419468138484696, - "grad_norm": 6.399675035852637, + "grad_norm": 5.8219485752748765, "learning_rate": 8.362615646279991e-09, - "logits/chosen": -0.31894922256469727, - "logits/rejected": -0.07944828271865845, - "logps/chosen": -1.213585615158081, - "logps/rejected": -1.628617525100708, - "loss": 1.942, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.213585615158081, - "rewards/margins": 0.41503196954727173, - "rewards/rejected": -1.628617525100708, - "semantic_entropy": 0.7991796135902405, + "logits/chosen": -0.34974604845046997, + "logits/rejected": -0.14192189276218414, + "logps/chosen": -1.1785976886749268, + "logps/rejected": -1.4980539083480835, + "loss": 1.5294, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1785976886749268, + "rewards/margins": 0.3194560408592224, + "rewards/rejected": -1.4980539083480835, "step": 5310 }, { "epoch": 2.8446228466298713, - "grad_norm": 11.93961968956636, + "grad_norm": 9.582578031201527, "learning_rate": 8.081353889942466e-09, - "logits/chosen": -0.06523782014846802, - "logits/rejected": 0.050230931490659714, - "logps/chosen": -1.2478545904159546, - "logps/rejected": -1.4929559230804443, - "loss": 1.9888, + "logits/chosen": -0.12283550202846527, + "logits/rejected": -0.03970341011881828, + "logps/chosen": -1.2159522771835327, + "logps/rejected": -1.3907673358917236, + "loss": 1.5766, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2478545904159546, - "rewards/margins": 0.2451011836528778, - "rewards/rejected": -1.4929559230804443, - "semantic_entropy": 0.8178671598434448, + "rewards/chosen": -1.2159522771835327, + "rewards/margins": 0.17481514811515808, + "rewards/rejected": -1.3907673358917236, "step": 5315 }, { "epoch": 2.847298879411273, - "grad_norm": 9.99117359013494, + "grad_norm": 9.036023167800439, "learning_rate": 7.804864730467042e-09, - "logits/chosen": -0.08324486017227173, - "logits/rejected": -0.010287337005138397, - "logps/chosen": -1.2432942390441895, - "logps/rejected": -1.4651365280151367, - "loss": 2.0026, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2432942390441895, - "rewards/margins": 0.2218422144651413, - "rewards/rejected": -1.4651365280151367, - "semantic_entropy": 0.8178890347480774, + "logits/chosen": -0.13198763132095337, + "logits/rejected": -0.07314762473106384, + "logps/chosen": -1.2157835960388184, + "logps/rejected": -1.3626552820205688, + "loss": 1.5912, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2157835960388184, + "rewards/margins": 0.14687182009220123, + "rewards/rejected": -1.3626552820205688, "step": 5320 }, { "epoch": 2.8499749121926743, - "grad_norm": 7.974653615981402, + "grad_norm": 7.859939253955737, "learning_rate": 7.533150850352665e-09, - "logits/chosen": -0.13628502190113068, - "logits/rejected": -0.0066189453937113285, - "logps/chosen": -1.2962652444839478, - "logps/rejected": -1.6911674737930298, - "loss": 1.9772, + "logits/chosen": -0.18645405769348145, + "logits/rejected": -0.08493912220001221, + "logps/chosen": -1.2615854740142822, + "logps/rejected": -1.5630425214767456, + "loss": 1.5806, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2962652444839478, - "rewards/margins": 0.39490213990211487, - "rewards/rejected": -1.6911674737930298, - "semantic_entropy": 0.7716677188873291, + "rewards/chosen": -1.2615854740142822, + "rewards/margins": 0.301457017660141, + "rewards/rejected": -1.5630425214767456, "step": 5325 }, { "epoch": 2.852650944974076, - "grad_norm": 12.160552400520343, + "grad_norm": 9.732168161037025, "learning_rate": 7.2662148857686175e-09, - "logits/chosen": -0.08609539270401001, - "logits/rejected": -0.019694218412041664, - "logps/chosen": -1.2226903438568115, - "logps/rejected": -1.6135330200195312, - "loss": 1.9282, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2226903438568115, - "rewards/margins": 0.3908424973487854, - "rewards/rejected": -1.6135330200195312, - "semantic_entropy": 0.8041356801986694, + "logits/chosen": -0.1656079888343811, + "logits/rejected": -0.11995343863964081, + "logps/chosen": -1.1951570510864258, + "logps/rejected": -1.4527256488800049, + "loss": 1.5305, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1951570510864258, + "rewards/margins": 0.25756850838661194, + "rewards/rejected": -1.4527256488800049, "step": 5330 }, { "epoch": 2.8553269777554773, - "grad_norm": 13.713048310150327, + "grad_norm": 14.634501129963239, "learning_rate": 7.0040594265287635e-09, - "logits/chosen": -0.054106198251247406, - "logits/rejected": -0.07883013784885406, - "logps/chosen": -1.274173617362976, - "logps/rejected": -1.532825231552124, - "loss": 2.008, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.274173617362976, - "rewards/margins": 0.25865158438682556, - "rewards/rejected": -1.532825231552124, - "semantic_entropy": 0.8100245594978333, + "logits/chosen": -0.14711536467075348, + "logits/rejected": -0.1679435521364212, + "logps/chosen": -1.2382829189300537, + "logps/rejected": -1.4174609184265137, + "loss": 1.5958, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2382829189300537, + "rewards/margins": 0.17917793989181519, + "rewards/rejected": -1.4174609184265137, "step": 5335 }, { "epoch": 2.858003010536879, - "grad_norm": 8.870225797744203, + "grad_norm": 8.177412551257165, "learning_rate": 6.746687016066566e-09, - "logits/chosen": -0.1050344929099083, - "logits/rejected": -0.07736798375844955, - "logps/chosen": -1.2842061519622803, - "logps/rejected": -1.4983108043670654, - "loss": 2.0291, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2842061519622803, - "rewards/margins": 0.21410492062568665, - "rewards/rejected": -1.4983108043670654, - "semantic_entropy": 0.7951924204826355, + "logits/chosen": -0.1711091846227646, + "logits/rejected": -0.16347761452198029, + "logps/chosen": -1.2568700313568115, + "logps/rejected": -1.40218985080719, + "loss": 1.6302, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2568700313568115, + "rewards/margins": 0.14531996846199036, + "rewards/rejected": -1.40218985080719, "step": 5340 }, { "epoch": 2.8606790433182807, - "grad_norm": 7.400982060956275, + "grad_norm": 6.8081120753218825, "learning_rate": 6.494100151410276e-09, - "logits/chosen": -0.25141647458076477, - "logits/rejected": -0.08219093829393387, - "logps/chosen": -1.214036226272583, - "logps/rejected": -1.517608880996704, - "loss": 1.9179, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.214036226272583, - "rewards/margins": 0.3035728335380554, - "rewards/rejected": -1.517608880996704, - "semantic_entropy": 0.8065169453620911, + "logits/chosen": -0.31831425428390503, + "logits/rejected": -0.1833570897579193, + "logps/chosen": -1.1927130222320557, + "logps/rejected": -1.4259960651397705, + "loss": 1.5188, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1927130222320557, + "rewards/margins": 0.233283132314682, + "rewards/rejected": -1.4259960651397705, "step": 5345 }, { "epoch": 2.8633550760996824, - "grad_norm": 12.611434321679779, + "grad_norm": 10.222984434332936, "learning_rate": 6.246301283158728e-09, - "logits/chosen": -0.08424808084964752, - "logits/rejected": -0.10715341567993164, - "logps/chosen": -1.3435392379760742, - "logps/rejected": -1.6009973287582397, - "loss": 2.0603, + "logits/chosen": -0.1781303584575653, + "logits/rejected": -0.2083595246076584, + "logps/chosen": -1.3082549571990967, + "logps/rejected": -1.506154179573059, + "loss": 1.6577, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3435392379760742, - "rewards/margins": 0.257457971572876, - "rewards/rejected": -1.6009973287582397, - "semantic_entropy": 0.7735006213188171, + "rewards/chosen": -1.3082549571990967, + "rewards/margins": 0.19789907336235046, + "rewards/rejected": -1.506154179573059, "step": 5350 }, { "epoch": 2.8660311088810837, - "grad_norm": 9.774989125777068, + "grad_norm": 8.988075682315458, "learning_rate": 6.0032928154576944e-09, - "logits/chosen": -0.17313161492347717, - "logits/rejected": -0.08946957439184189, - "logps/chosen": -1.2829643487930298, - "logps/rejected": -1.5711157321929932, - "loss": 2.0195, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2829643487930298, - "rewards/margins": 0.28815120458602905, - "rewards/rejected": -1.5711157321929932, - "semantic_entropy": 0.8007059097290039, + "logits/chosen": -0.2713666558265686, + "logits/rejected": -0.19798487424850464, + "logps/chosen": -1.239227533340454, + "logps/rejected": -1.455033779144287, + "loss": 1.5997, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.239227533340454, + "rewards/margins": 0.21580609679222107, + "rewards/rejected": -1.455033779144287, "step": 5355 }, { "epoch": 2.8687071416624854, - "grad_norm": 12.97437785965202, + "grad_norm": 9.768561838045548, "learning_rate": 5.76507710597629e-09, - "logits/chosen": -0.16783717274665833, - "logits/rejected": 0.017929133027791977, - "logps/chosen": -1.2705787420272827, - "logps/rejected": -1.6087089776992798, - "loss": 1.994, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2705787420272827, - "rewards/margins": 0.3381301462650299, - "rewards/rejected": -1.6087089776992798, - "semantic_entropy": 0.7808153629302979, + "logits/chosen": -0.23024603724479675, + "logits/rejected": -0.0810357853770256, + "logps/chosen": -1.240037202835083, + "logps/rejected": -1.486128568649292, + "loss": 1.6004, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.240037202835083, + "rewards/margins": 0.24609121680259705, + "rewards/rejected": -1.486128568649292, "step": 5360 }, { "epoch": 2.8713831744438867, - "grad_norm": 7.617332004084715, + "grad_norm": 7.270990516203394, "learning_rate": 5.531656465884438e-09, - "logits/chosen": -0.21880276501178741, - "logits/rejected": -0.06172800809144974, - "logps/chosen": -1.2684425115585327, - "logps/rejected": -1.650254487991333, - "loss": 1.9603, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2684425115585327, - "rewards/margins": 0.38181185722351074, - "rewards/rejected": -1.650254487991333, - "semantic_entropy": 0.7841113805770874, + "logits/chosen": -0.2629620432853699, + "logits/rejected": -0.12923569977283478, + "logps/chosen": -1.2358171939849854, + "logps/rejected": -1.5153964757919312, + "loss": 1.5649, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2358171939849854, + "rewards/margins": 0.27957937121391296, + "rewards/rejected": -1.5153964757919312, "step": 5365 }, { "epoch": 2.8740592072252884, - "grad_norm": 11.370064221928539, + "grad_norm": 9.998922677571656, "learning_rate": 5.303033159830217e-09, - "logits/chosen": -0.0645141750574112, - "logits/rejected": -0.03054744563996792, - "logps/chosen": -1.2745743989944458, - "logps/rejected": -1.4681590795516968, - "loss": 2.027, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2745743989944458, - "rewards/margins": 0.19358472526073456, - "rewards/rejected": -1.4681590795516968, - "semantic_entropy": 0.8125517964363098, + "logits/chosen": -0.14417660236358643, + "logits/rejected": -0.12314148247241974, + "logps/chosen": -1.2316958904266357, + "logps/rejected": -1.3660876750946045, + "loss": 1.6054, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2316958904266357, + "rewards/margins": 0.13439175486564636, + "rewards/rejected": -1.3660876750946045, "step": 5370 }, { "epoch": 2.87673524000669, - "grad_norm": 8.848175566333316, + "grad_norm": 8.099282766800515, "learning_rate": 5.079209405917939e-09, - "logits/chosen": -0.15581555664539337, - "logits/rejected": -0.06532729417085648, - "logps/chosen": -1.217376947402954, - "logps/rejected": -1.6858150959014893, - "loss": 1.9122, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.217376947402954, - "rewards/margins": 0.4684379994869232, - "rewards/rejected": -1.6858150959014893, - "semantic_entropy": 0.7822630405426025, + "logits/chosen": -0.21907174587249756, + "logits/rejected": -0.1412518322467804, + "logps/chosen": -1.1827168464660645, + "logps/rejected": -1.5288317203521729, + "loss": 1.5092, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1827168464660645, + "rewards/margins": 0.34611478447914124, + "rewards/rejected": -1.5288317203521729, "step": 5375 }, { "epoch": 2.879411272788092, - "grad_norm": 8.075282901662117, + "grad_norm": 6.138067696587353, "learning_rate": 4.860187375686664e-09, - "logits/chosen": -0.180607870221138, - "logits/rejected": 0.031008031219244003, - "logps/chosen": -1.3817294836044312, - "logps/rejected": -1.6795692443847656, - "loss": 2.0293, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3817294836044312, - "rewards/margins": 0.29783961176872253, - "rewards/rejected": -1.6795692443847656, - "semantic_entropy": 0.7511149644851685, + "logits/chosen": -0.2404029667377472, + "logits/rejected": -0.07165710628032684, + "logps/chosen": -1.359360694885254, + "logps/rejected": -1.547788381576538, + "loss": 1.6632, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.359360694885254, + "rewards/margins": 0.18842774629592896, + "rewards/rejected": -1.547788381576538, "step": 5380 }, { "epoch": 2.882087305569493, - "grad_norm": 8.302562710027024, + "grad_norm": 7.384617991891714, "learning_rate": 4.64596919408905e-09, - "logits/chosen": -0.10999952256679535, - "logits/rejected": -0.03438536450266838, - "logps/chosen": -1.2459831237792969, - "logps/rejected": -1.5046635866165161, - "loss": 1.9805, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2459831237792969, - "rewards/margins": 0.25868046283721924, - "rewards/rejected": -1.5046635866165161, - "semantic_entropy": 0.7971447706222534, + "logits/chosen": -0.18356771767139435, + "logits/rejected": -0.12828607857227325, + "logps/chosen": -1.2168136835098267, + "logps/rejected": -1.3994461297988892, + "loss": 1.5823, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2168136835098267, + "rewards/margins": 0.1826324164867401, + "rewards/rejected": -1.3994461297988892, "step": 5385 }, { "epoch": 2.884763338350895, - "grad_norm": 8.099658967264567, + "grad_norm": 6.76595372965001, "learning_rate": 4.436556939470814e-09, - "logits/chosen": -0.12270566076040268, - "logits/rejected": -0.01083715446293354, - "logps/chosen": -1.2990280389785767, - "logps/rejected": -1.554796814918518, - "loss": 1.9949, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2990280389785767, - "rewards/margins": 0.255768746137619, - "rewards/rejected": -1.554796814918518, - "semantic_entropy": 0.7722955942153931, + "logits/chosen": -0.18755054473876953, + "logits/rejected": -0.10399661958217621, + "logps/chosen": -1.2633230686187744, + "logps/rejected": -1.4344213008880615, + "loss": 1.6064, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2633230686187744, + "rewards/margins": 0.17109842598438263, + "rewards/rejected": -1.4344213008880615, "step": 5390 }, { "epoch": 2.887439371132296, - "grad_norm": 7.111781634638527, + "grad_norm": 6.6443189435045795, "learning_rate": 4.23195264355064e-09, - "logits/chosen": -0.29376253485679626, - "logits/rejected": -0.1047648936510086, - "logps/chosen": -1.2102776765823364, - "logps/rejected": -1.550250768661499, - "loss": 1.9123, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2102776765823364, - "rewards/margins": 0.3399733006954193, - "rewards/rejected": -1.550250768661499, - "semantic_entropy": 0.800680935382843, + "logits/chosen": -0.33120718598365784, + "logits/rejected": -0.17057207226753235, + "logps/chosen": -1.18402898311615, + "logps/rejected": -1.4530203342437744, + "loss": 1.5123, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.18402898311615, + "rewards/margins": 0.26899123191833496, + "rewards/rejected": -1.4530203342437744, "step": 5395 }, { "epoch": 2.890115403913698, - "grad_norm": 9.097021482847795, + "grad_norm": 7.705572513303001, "learning_rate": 4.032158291400245e-09, - "logits/chosen": -0.2035544365644455, - "logits/rejected": 0.0319555401802063, - "logps/chosen": -1.2738873958587646, - "logps/rejected": -1.7766224145889282, - "loss": 1.9624, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2738873958587646, - "rewards/margins": 0.5027349591255188, - "rewards/rejected": -1.7766224145889282, - "semantic_entropy": 0.7796919345855713, + "logits/chosen": -0.25371435284614563, + "logits/rejected": -0.04801352694630623, + "logps/chosen": -1.2450672388076782, + "logps/rejected": -1.6382758617401123, + "loss": 1.5736, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2450672388076782, + "rewards/margins": 0.3932085931301117, + "rewards/rejected": -1.6382758617401123, "step": 5400 }, { "epoch": 2.8927914366950995, - "grad_norm": 10.821844566960877, + "grad_norm": 9.116026612594158, "learning_rate": 3.837175821425398e-09, - "logits/chosen": -0.10578795522451401, - "logits/rejected": -0.06705756485462189, - "logps/chosen": -1.3472933769226074, - "logps/rejected": -1.668858528137207, - "loss": 2.0212, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3472933769226074, - "rewards/margins": 0.3215652108192444, - "rewards/rejected": -1.668858528137207, - "semantic_entropy": 0.7665466070175171, + "logits/chosen": -0.1768065094947815, + "logits/rejected": -0.15552476048469543, + "logps/chosen": -1.3003408908843994, + "logps/rejected": -1.5687168836593628, + "loss": 1.6192, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3003408908843994, + "rewards/margins": 0.2683759927749634, + "rewards/rejected": -1.5687168836593628, "step": 5405 }, { "epoch": 2.8954674694765012, - "grad_norm": 9.511240934829583, + "grad_norm": 7.581508989260258, "learning_rate": 3.6470071253467683e-09, - "logits/chosen": -0.09955920279026031, - "logits/rejected": 0.02503589354455471, - "logps/chosen": -1.250042200088501, - "logps/rejected": -1.6926124095916748, - "loss": 1.9493, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.250042200088501, - "rewards/margins": 0.44257020950317383, - "rewards/rejected": -1.6926124095916748, - "semantic_entropy": 0.7760982513427734, + "logits/chosen": -0.16222859919071198, + "logits/rejected": -0.06536058336496353, + "logps/chosen": -1.22344172000885, + "logps/rejected": -1.5434781312942505, + "loss": 1.56, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.22344172000885, + "rewards/margins": 0.3200364410877228, + "rewards/rejected": -1.5434781312942505, "step": 5410 }, { "epoch": 2.8981435022579025, - "grad_norm": 21.910192532646565, + "grad_norm": 7.875764868054461, "learning_rate": 3.461654048181939e-09, - "logits/chosen": -0.15834368765354156, - "logits/rejected": 0.009910332970321178, - "logps/chosen": -1.2846863269805908, - "logps/rejected": -1.5575840473175049, - "loss": 2.0001, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2846863269805908, - "rewards/margins": 0.2728978991508484, - "rewards/rejected": -1.5575840473175049, - "semantic_entropy": 0.7935778498649597, + "logits/chosen": -0.24861487746238708, + "logits/rejected": -0.11518082767724991, + "logps/chosen": -1.2504818439483643, + "logps/rejected": -1.4509363174438477, + "loss": 1.5929, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2504818439483643, + "rewards/margins": 0.2004544734954834, + "rewards/rejected": -1.4509363174438477, "step": 5415 }, { "epoch": 2.9008195350393042, - "grad_norm": 12.313994859625039, + "grad_norm": 10.540014977561935, "learning_rate": 3.281118388227255e-09, - "logits/chosen": -0.12874098122119904, - "logits/rejected": -0.06065679341554642, - "logps/chosen": -1.1999180316925049, - "logps/rejected": -1.5103440284729004, - "loss": 1.9509, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1999180316925049, - "rewards/margins": 0.3104260265827179, - "rewards/rejected": -1.5103440284729004, - "semantic_entropy": 0.808806300163269, + "logits/chosen": -0.20477533340454102, + "logits/rejected": -0.1502620428800583, + "logps/chosen": -1.1682026386260986, + "logps/rejected": -1.3941516876220703, + "loss": 1.5416, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1682026386260986, + "rewards/margins": 0.22594909369945526, + "rewards/rejected": -1.3941516876220703, "step": 5420 }, { "epoch": 2.903495567820706, - "grad_norm": 26.48957031686247, + "grad_norm": 12.9123691767093, "learning_rate": 3.1054018970405048e-09, - "logits/chosen": -0.13419437408447266, - "logits/rejected": -0.0012315213680267334, - "logps/chosen": -1.3119560480117798, - "logps/rejected": -1.630934476852417, - "loss": 2.0138, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3119560480117798, - "rewards/margins": 0.31897860765457153, - "rewards/rejected": -1.630934476852417, - "semantic_entropy": 0.7729989886283875, + "logits/chosen": -0.20697562396526337, + "logits/rejected": -0.1116500049829483, + "logps/chosen": -1.272203803062439, + "logps/rejected": -1.5195564031600952, + "loss": 1.6129, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.272203803062439, + "rewards/margins": 0.24735260009765625, + "rewards/rejected": -1.5195564031600952, "step": 5425 }, { "epoch": 2.906171600602107, - "grad_norm": 10.56228413619259, + "grad_norm": 9.672107526238177, "learning_rate": 2.9345062794238207e-09, - "logits/chosen": -0.16747361421585083, - "logits/rejected": -0.005986332893371582, - "logps/chosen": -1.2478454113006592, - "logps/rejected": -1.6454213857650757, - "loss": 1.9365, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2478454113006592, - "rewards/margins": 0.3975757956504822, - "rewards/rejected": -1.6454213857650757, - "semantic_entropy": 0.7814873456954956, + "logits/chosen": -0.23657703399658203, + "logits/rejected": -0.10220441967248917, + "logps/chosen": -1.2120410203933716, + "logps/rejected": -1.5323138236999512, + "loss": 1.5388, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2120410203933716, + "rewards/margins": 0.3202727437019348, + "rewards/rejected": -1.5323138236999512, "step": 5430 }, { "epoch": 2.908847633383509, - "grad_norm": 16.477061576992455, + "grad_norm": 13.32645795297891, "learning_rate": 2.7684331934072492e-09, - "logits/chosen": -0.2565791606903076, - "logits/rejected": -0.15934288501739502, - "logps/chosen": -1.2421395778656006, - "logps/rejected": -1.634916067123413, - "loss": 1.9308, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2421395778656006, - "rewards/margins": 0.3927767276763916, - "rewards/rejected": -1.634916067123413, - "semantic_entropy": 0.7934702634811401, + "logits/chosen": -0.3036276698112488, + "logits/rejected": -0.2224380522966385, + "logps/chosen": -1.211949348449707, + "logps/rejected": -1.5303452014923096, + "loss": 1.5299, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.211949348449707, + "rewards/margins": 0.31839582324028015, + "rewards/rejected": -1.5303452014923096, "step": 5435 }, { "epoch": 2.9115236661649107, - "grad_norm": 8.614201099283617, + "grad_norm": 8.30113982964633, "learning_rate": 2.6071842502326526e-09, - "logits/chosen": -0.20132644474506378, - "logits/rejected": -0.08974252641201019, - "logps/chosen": -1.2590628862380981, - "logps/rejected": -1.6281840801239014, - "loss": 1.9777, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2590628862380981, - "rewards/margins": 0.36912113428115845, - "rewards/rejected": -1.6281840801239014, - "semantic_entropy": 0.7918053865432739, + "logits/chosen": -0.24548538029193878, + "logits/rejected": -0.1568877398967743, + "logps/chosen": -1.2283847332000732, + "logps/rejected": -1.5463618040084839, + "loss": 1.5675, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2283847332000732, + "rewards/margins": 0.3179771304130554, + "rewards/rejected": -1.5463618040084839, "step": 5440 }, { "epoch": 2.9141996989463124, - "grad_norm": 8.001790955825099, + "grad_norm": 7.124171847107691, "learning_rate": 2.450761014337888e-09, - "logits/chosen": 0.013191893696784973, - "logits/rejected": 0.02028048411011696, - "logps/chosen": -1.2388643026351929, - "logps/rejected": -1.761318564414978, - "loss": 1.9365, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2388643026351929, - "rewards/margins": 0.5224540829658508, - "rewards/rejected": -1.761318564414978, - "semantic_entropy": 0.788965106010437, + "logits/chosen": -0.07658245414495468, + "logits/rejected": -0.08402254432439804, + "logps/chosen": -1.2022011280059814, + "logps/rejected": -1.5855541229248047, + "loss": 1.5345, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2022011280059814, + "rewards/margins": 0.383353054523468, + "rewards/rejected": -1.5855541229248047, "step": 5445 }, { "epoch": 2.9168757317277136, - "grad_norm": 9.697218288679418, + "grad_norm": 9.291362822335682, "learning_rate": 2.299165003341985e-09, - "logits/chosen": -0.056405842304229736, - "logits/rejected": 0.04429206997156143, - "logps/chosen": -1.2803170680999756, - "logps/rejected": -1.6816307306289673, - "loss": 1.9666, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2803170680999756, - "rewards/margins": 0.401313453912735, - "rewards/rejected": -1.6816307306289673, - "semantic_entropy": 0.7831650376319885, + "logits/chosen": -0.16195708513259888, + "logits/rejected": -0.07419019192457199, + "logps/chosen": -1.243790626525879, + "logps/rejected": -1.559624433517456, + "loss": 1.566, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.243790626525879, + "rewards/margins": 0.3158337473869324, + "rewards/rejected": -1.559624433517456, "step": 5450 }, { "epoch": 2.9195517645091154, - "grad_norm": 8.665580772903214, + "grad_norm": 7.5983784368398055, "learning_rate": 2.1523976880299945e-09, - "logits/chosen": -0.16580912470817566, - "logits/rejected": -0.004101097583770752, - "logps/chosen": -1.2580012083053589, - "logps/rejected": -1.508253574371338, - "loss": 1.9929, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2580012083053589, - "rewards/margins": 0.2502524256706238, - "rewards/rejected": -1.508253574371338, - "semantic_entropy": 0.8098133206367493, + "logits/chosen": -0.24264466762542725, + "logits/rejected": -0.09939133375883102, + "logps/chosen": -1.2212355136871338, + "logps/rejected": -1.3946906328201294, + "loss": 1.5814, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2212355136871338, + "rewards/margins": 0.17345522344112396, + "rewards/rejected": -1.3946906328201294, "step": 5455 }, { "epoch": 2.9222277972905166, - "grad_norm": 12.695614547089772, + "grad_norm": 8.042593806805142, "learning_rate": 2.010460492339161e-09, - "logits/chosen": -0.14466659724712372, - "logits/rejected": -0.04582420736551285, - "logps/chosen": -1.26729154586792, - "logps/rejected": -1.6190316677093506, - "loss": 1.9557, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.26729154586792, - "rewards/margins": 0.35174015164375305, - "rewards/rejected": -1.6190316677093506, - "semantic_entropy": 0.7803028225898743, + "logits/chosen": -0.22303660213947296, + "logits/rejected": -0.15075257420539856, + "logps/chosen": -1.2438392639160156, + "logps/rejected": -1.5116565227508545, + "loss": 1.568, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2438392639160156, + "rewards/margins": 0.2678173780441284, + "rewards/rejected": -1.5116565227508545, "step": 5460 }, { "epoch": 2.9249038300719183, - "grad_norm": 11.8295707458819, + "grad_norm": 8.522258152746392, "learning_rate": 1.8733547933446614e-09, - "logits/chosen": -0.21898195147514343, - "logits/rejected": -0.03481177240610123, - "logps/chosen": -1.3300405740737915, - "logps/rejected": -1.571028470993042, - "loss": 2.0446, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3300405740737915, - "rewards/margins": 0.240988090634346, - "rewards/rejected": -1.571028470993042, - "semantic_entropy": 0.7773094177246094, + "logits/chosen": -0.28066182136535645, + "logits/rejected": -0.13141262531280518, + "logps/chosen": -1.2957508563995361, + "logps/rejected": -1.4685115814208984, + "loss": 1.6427, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2957508563995361, + "rewards/margins": 0.1727607548236847, + "rewards/rejected": -1.4685115814208984, "step": 5465 }, { "epoch": 2.92757986285332, - "grad_norm": 11.067365059351255, + "grad_norm": 10.245778076063425, "learning_rate": 1.7410819212467231e-09, - "logits/chosen": -0.12249497324228287, - "logits/rejected": -0.040003784000873566, - "logps/chosen": -1.2132530212402344, - "logps/rejected": -1.5384536981582642, - "loss": 1.9432, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2132530212402344, - "rewards/margins": 0.3252008259296417, - "rewards/rejected": -1.5384536981582642, - "semantic_entropy": 0.82013338804245, + "logits/chosen": -0.1834983080625534, + "logits/rejected": -0.11405984312295914, + "logps/chosen": -1.1761218309402466, + "logps/rejected": -1.4256806373596191, + "loss": 1.5236, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1761218309402466, + "rewards/margins": 0.249558687210083, + "rewards/rejected": -1.4256806373596191, "step": 5470 }, { "epoch": 2.9302558956347218, - "grad_norm": 9.52612200576922, + "grad_norm": 8.151394853140006, "learning_rate": 1.613643159357192e-09, - "logits/chosen": -0.08768186718225479, - "logits/rejected": -0.1272381991147995, - "logps/chosen": -1.1816210746765137, - "logps/rejected": -1.5005667209625244, - "loss": 1.9438, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.1816210746765137, - "rewards/margins": 0.3189457654953003, - "rewards/rejected": -1.5005667209625244, - "semantic_entropy": 0.8184798955917358, + "logits/chosen": -0.15573295950889587, + "logits/rejected": -0.1990688145160675, + "logps/chosen": -1.1482346057891846, + "logps/rejected": -1.3903645277023315, + "loss": 1.5217, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1482346057891846, + "rewards/margins": 0.24212996661663055, + "rewards/rejected": -1.3903645277023315, "step": 5475 }, { "epoch": 2.932931928416123, - "grad_norm": 8.608822633708195, + "grad_norm": 7.154479535819093, "learning_rate": 1.4910397440875967e-09, - "logits/chosen": -0.13726715743541718, - "logits/rejected": -0.033788178116083145, - "logps/chosen": -1.2667338848114014, - "logps/rejected": -1.5312033891677856, - "loss": 1.9977, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2667338848114014, - "rewards/margins": 0.26446956396102905, - "rewards/rejected": -1.5312033891677856, - "semantic_entropy": 0.7881470918655396, + "logits/chosen": -0.19441083073616028, + "logits/rejected": -0.11569847166538239, + "logps/chosen": -1.2421464920043945, + "logps/rejected": -1.416693925857544, + "loss": 1.6012, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2421464920043945, + "rewards/margins": 0.17454750835895538, + "rewards/rejected": -1.416693925857544, "step": 5480 }, { "epoch": 2.9356079611975248, - "grad_norm": 10.212245406357974, + "grad_norm": 8.165676270910435, "learning_rate": 1.3732728649368253e-09, - "logits/chosen": -0.08094757795333862, - "logits/rejected": 0.08302084356546402, - "logps/chosen": -1.253039836883545, - "logps/rejected": -1.552504062652588, - "loss": 1.9854, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.253039836883545, - "rewards/margins": 0.2994643449783325, - "rewards/rejected": -1.552504062652588, - "semantic_entropy": 0.811646580696106, + "logits/chosen": -0.17287132143974304, + "logits/rejected": -0.04290672019124031, + "logps/chosen": -1.2198317050933838, + "logps/rejected": -1.4292641878128052, + "loss": 1.577, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2198317050933838, + "rewards/margins": 0.2094324380159378, + "rewards/rejected": -1.4292641878128052, "step": 5485 }, { "epoch": 2.938283993978926, - "grad_norm": 15.191474078296592, + "grad_norm": 10.173550581014478, "learning_rate": 1.260343664479524e-09, - "logits/chosen": -0.15114764869213104, - "logits/rejected": -0.1167159229516983, - "logps/chosen": -1.2422395944595337, - "logps/rejected": -1.5190513134002686, - "loss": 1.9799, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2422395944595337, - "rewards/margins": 0.27681177854537964, - "rewards/rejected": -1.5190513134002686, - "semantic_entropy": 0.8092382550239563, + "logits/chosen": -0.23846104741096497, + "logits/rejected": -0.21791306138038635, + "logps/chosen": -1.2163054943084717, + "logps/rejected": -1.3821016550064087, + "loss": 1.5776, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2163054943084717, + "rewards/margins": 0.16579614579677582, + "rewards/rejected": -1.3821016550064087, "step": 5490 }, { "epoch": 2.9409600267603278, - "grad_norm": 9.324040014918797, + "grad_norm": 7.329033604357603, "learning_rate": 1.1522532383554384e-09, - "logits/chosen": -0.21880176663398743, - "logits/rejected": -0.016771014779806137, - "logps/chosen": -1.2089433670043945, - "logps/rejected": -1.6005357503890991, - "loss": 1.9122, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2089433670043945, - "rewards/margins": 0.3915923535823822, - "rewards/rejected": -1.6005357503890991, - "semantic_entropy": 0.8016504049301147, + "logits/chosen": -0.30749398469924927, + "logits/rejected": -0.140707865357399, + "logps/chosen": -1.1801255941390991, + "logps/rejected": -1.4713468551635742, + "loss": 1.5143, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1801255941390991, + "rewards/margins": 0.29122108221054077, + "rewards/rejected": -1.4713468551635742, "step": 5495 }, { "epoch": 2.9436360595417295, - "grad_norm": 7.290687595120567, + "grad_norm": 7.202329760268409, "learning_rate": 1.049002635258256e-09, - "logits/chosen": -0.12340692430734634, - "logits/rejected": -0.015453631989657879, - "logps/chosen": -1.3105392456054688, - "logps/rejected": -1.5042502880096436, - "loss": 2.0316, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3105392456054688, - "rewards/margins": 0.19371113181114197, - "rewards/rejected": -1.5042502880096436, - "semantic_entropy": 0.7904552817344666, + "logits/chosen": -0.22415634989738464, + "logits/rejected": -0.13418297469615936, + "logps/chosen": -1.2656097412109375, + "logps/rejected": -1.3961628675460815, + "loss": 1.6224, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2656097412109375, + "rewards/margins": 0.1305532157421112, + "rewards/rejected": -1.3961628675460815, "step": 5500 }, { "epoch": 2.946312092323131, - "grad_norm": 6.2992627661020295, + "grad_norm": 6.303726793421413, "learning_rate": 9.505928569258358e-10, - "logits/chosen": -0.09954291582107544, - "logits/rejected": -0.0960114449262619, - "logps/chosen": -1.2732259035110474, - "logps/rejected": -1.577926754951477, - "loss": 1.965, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2732259035110474, - "rewards/margins": 0.30470094084739685, - "rewards/rejected": -1.577926754951477, - "semantic_entropy": 0.7664059400558472, + "logits/chosen": -0.1686837077140808, + "logits/rejected": -0.17766132950782776, + "logps/chosen": -1.2433160543441772, + "logps/rejected": -1.4891825914382935, + "loss": 1.5733, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2433160543441772, + "rewards/margins": 0.24586646258831024, + "rewards/rejected": -1.4891825914382935, "step": 5505 }, { "epoch": 2.9489881251045325, - "grad_norm": 9.891123526520488, + "grad_norm": 9.119218389688942, "learning_rate": 8.57024858130273e-10, - "logits/chosen": -0.18454578518867493, - "logits/rejected": -0.08242712914943695, - "logps/chosen": -1.2829363346099854, - "logps/rejected": -1.7573705911636353, - "loss": 1.9526, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2829363346099854, - "rewards/margins": 0.4744341969490051, - "rewards/rejected": -1.7573705911636353, - "semantic_entropy": 0.7814774513244629, + "logits/chosen": -0.25223469734191895, + "logits/rejected": -0.17962117493152618, + "logps/chosen": -1.2419198751449585, + "logps/rejected": -1.5848525762557983, + "loss": 1.5492, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2419198751449585, + "rewards/margins": 0.34293264150619507, + "rewards/rejected": -1.5848525762557983, "step": 5510 }, { "epoch": 2.951664157885934, - "grad_norm": 13.079909249227054, + "grad_norm": 9.600831043124986, "learning_rate": 7.682995466686826e-10, - "logits/chosen": -0.24678631126880646, - "logits/rejected": -0.11660070717334747, - "logps/chosen": -1.235660433769226, - "logps/rejected": -1.7078622579574585, - "loss": 1.9316, + "logits/chosen": -0.2802756428718567, + "logits/rejected": -0.17401473224163055, + "logps/chosen": -1.2063791751861572, + "logps/rejected": -1.5460056066513062, + "loss": 1.534, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.235660433769226, - "rewards/margins": 0.47220176458358765, - "rewards/rejected": -1.7078622579574585, - "semantic_entropy": 0.7808048129081726, + "rewards/chosen": -1.2063791751861572, + "rewards/margins": 0.3396264612674713, + "rewards/rejected": -1.5460056066513062, "step": 5515 }, { "epoch": 2.9543401906673354, - "grad_norm": 13.000873712454275, + "grad_norm": 9.948839107290668, "learning_rate": 6.844177833543741e-10, - "logits/chosen": -0.16099712252616882, - "logits/rejected": -0.1062990203499794, - "logps/chosen": -1.2108200788497925, - "logps/rejected": -1.515880823135376, - "loss": 1.9706, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2108200788497925, - "rewards/margins": 0.3050605058670044, - "rewards/rejected": -1.515880823135376, - "semantic_entropy": 0.8106793165206909, + "logits/chosen": -0.22035236656665802, + "logits/rejected": -0.1833755075931549, + "logps/chosen": -1.1798374652862549, + "logps/rejected": -1.4181078672409058, + "loss": 1.5562, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1798374652862549, + "rewards/margins": 0.2382703721523285, + "rewards/rejected": -1.4181078672409058, "step": 5520 }, { "epoch": 2.957016223448737, - "grad_norm": 7.343368786137372, + "grad_norm": 6.746792031918658, "learning_rate": 6.053803820087467e-10, - "logits/chosen": -0.15496540069580078, - "logits/rejected": -0.03871823102235794, - "logps/chosen": -1.269761323928833, - "logps/rejected": -1.7179292440414429, - "loss": 1.9548, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.269761323928833, - "rewards/margins": 0.44816771149635315, - "rewards/rejected": -1.7179292440414429, - "semantic_entropy": 0.7783750891685486, + "logits/chosen": -0.23057572543621063, + "logits/rejected": -0.13543887436389923, + "logps/chosen": -1.225454568862915, + "logps/rejected": -1.559378981590271, + "loss": 1.5537, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.225454568862915, + "rewards/margins": 0.33392423391342163, + "rewards/rejected": -1.559378981590271, "step": 5525 }, { "epoch": 2.959692256230139, - "grad_norm": 13.65669677320013, + "grad_norm": 12.319687335187103, "learning_rate": 5.311881094528514e-10, - "logits/chosen": -0.18310664594173431, - "logits/rejected": 0.036461032927036285, - "logps/chosen": -1.361135721206665, - "logps/rejected": -1.5666468143463135, - "loss": 2.0688, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.361135721206665, - "rewards/margins": 0.20551109313964844, - "rewards/rejected": -1.5666468143463135, - "semantic_entropy": 0.7689114809036255, + "logits/chosen": -0.2521504759788513, + "logits/rejected": -0.07061384618282318, + "logps/chosen": -1.3217427730560303, + "logps/rejected": -1.453894019126892, + "loss": 1.6689, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3217427730560303, + "rewards/margins": 0.13215124607086182, + "rewards/rejected": -1.453894019126892, "step": 5530 }, { "epoch": 2.9623682890115406, - "grad_norm": 15.509024609341324, + "grad_norm": 10.749252456170122, "learning_rate": 4.6184168550050806e-10, - "logits/chosen": -0.13887521624565125, - "logits/rejected": -0.09963810443878174, - "logps/chosen": -1.234496831893921, - "logps/rejected": -1.4671688079833984, - "loss": 1.9948, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.234496831893921, - "rewards/margins": 0.23267188668251038, - "rewards/rejected": -1.4671688079833984, - "semantic_entropy": 0.8018797636032104, + "logits/chosen": -0.24004845321178436, + "logits/rejected": -0.21623799204826355, + "logps/chosen": -1.2044014930725098, + "logps/rejected": -1.3565657138824463, + "loss": 1.5856, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2044014930725098, + "rewards/margins": 0.15216435492038727, + "rewards/rejected": -1.3565657138824463, "step": 5535 }, { "epoch": 2.965044321792942, - "grad_norm": 10.718545297303793, + "grad_norm": 8.928875115417165, "learning_rate": 3.973417829510328e-10, - "logits/chosen": -0.25150126218795776, - "logits/rejected": -0.09888546913862228, - "logps/chosen": -1.3156968355178833, - "logps/rejected": -1.5430917739868164, - "loss": 2.0171, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3156968355178833, - "rewards/margins": 0.2273949384689331, - "rewards/rejected": -1.5430917739868164, - "semantic_entropy": 0.7827169895172119, + "logits/chosen": -0.32898491621017456, + "logits/rejected": -0.19997842609882355, + "logps/chosen": -1.2778807878494263, + "logps/rejected": -1.4223525524139404, + "loss": 1.6185, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2778807878494263, + "rewards/margins": 0.14447186887264252, + "rewards/rejected": -1.4223525524139404, "step": 5540 }, { "epoch": 2.9677203545743436, - "grad_norm": 8.659532986487385, + "grad_norm": 7.423246640192279, "learning_rate": 3.3768902758274377e-10, - "logits/chosen": -0.15638864040374756, - "logits/rejected": -0.04953031614422798, - "logps/chosen": -1.2060474157333374, - "logps/rejected": -1.4767600297927856, - "loss": 1.9638, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2060474157333374, - "rewards/margins": 0.2707127630710602, - "rewards/rejected": -1.4767600297927856, - "semantic_entropy": 0.8145160675048828, + "logits/chosen": -0.23562900722026825, + "logits/rejected": -0.1592407524585724, + "logps/chosen": -1.1718533039093018, + "logps/rejected": -1.3776428699493408, + "loss": 1.5518, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.1718533039093018, + "rewards/margins": 0.20578965544700623, + "rewards/rejected": -1.3776428699493408, "step": 5545 }, { "epoch": 2.970396387355745, - "grad_norm": 10.443161949595284, + "grad_norm": 7.789031939079042, "learning_rate": 2.8288399814691e-10, - "logits/chosen": -0.08594232052564621, - "logits/rejected": 0.02057645097374916, - "logps/chosen": -1.3247028589248657, - "logps/rejected": -1.6422897577285767, - "loss": 2.0071, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3247028589248657, - "rewards/margins": 0.3175868093967438, - "rewards/rejected": -1.6422897577285767, - "semantic_entropy": 0.7709945440292358, + "logits/chosen": -0.16586032509803772, + "logits/rejected": -0.07216402888298035, + "logps/chosen": -1.2948392629623413, + "logps/rejected": -1.5134284496307373, + "loss": 1.6148, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2948392629623413, + "rewards/margins": 0.21858926117420197, + "rewards/rejected": -1.5134284496307373, "step": 5550 }, { "epoch": 2.9730724201371466, - "grad_norm": 11.926017722679141, + "grad_norm": 9.7236517150733, "learning_rate": 2.3292722636220066e-10, - "logits/chosen": -0.17937864363193512, - "logits/rejected": 0.0162881501019001, - "logps/chosen": -1.352497935295105, - "logps/rejected": -1.6892569065093994, - "loss": 2.0197, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.352497935295105, - "rewards/margins": 0.3367590308189392, - "rewards/rejected": -1.6892569065093994, - "semantic_entropy": 0.7497475743293762, + "logits/chosen": -0.2562541365623474, + "logits/rejected": -0.09349828958511353, + "logps/chosen": -1.3202444314956665, + "logps/rejected": -1.5408875942230225, + "loss": 1.6399, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3202444314956665, + "rewards/margins": 0.22064313292503357, + "rewards/rejected": -1.5408875942230225, "step": 5555 }, { "epoch": 2.9757484529185483, - "grad_norm": 9.346052854797605, + "grad_norm": 7.980336065142897, "learning_rate": 1.8781919690946668e-10, - "logits/chosen": -0.11221225559711456, - "logits/rejected": -0.10421918332576752, - "logps/chosen": -1.2898824214935303, - "logps/rejected": -1.4666532278060913, - "loss": 2.0231, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2898824214935303, - "rewards/margins": 0.17677077651023865, - "rewards/rejected": -1.4666532278060913, - "semantic_entropy": 0.794800877571106, + "logits/chosen": -0.20060701668262482, + "logits/rejected": -0.20425963401794434, + "logps/chosen": -1.2538707256317139, + "logps/rejected": -1.3540282249450684, + "loss": 1.6204, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2538707256317139, + "rewards/margins": 0.10015746206045151, + "rewards/rejected": -1.3540282249450684, "step": 5560 }, { "epoch": 2.97842448569995, - "grad_norm": 14.60377778075287, + "grad_norm": 11.045387212434024, "learning_rate": 1.4756034742696711e-10, - "logits/chosen": -0.2185247242450714, - "logits/rejected": -0.17179787158966064, - "logps/chosen": -1.2479779720306396, - "logps/rejected": -1.4974956512451172, - "loss": 1.9906, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2479779720306396, - "rewards/margins": 0.2495175302028656, - "rewards/rejected": -1.4974956512451172, - "semantic_entropy": 0.814715564250946, + "logits/chosen": -0.2866879999637604, + "logits/rejected": -0.2636876106262207, + "logps/chosen": -1.2187350988388062, + "logps/rejected": -1.39274001121521, + "loss": 1.5827, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2187350988388062, + "rewards/margins": 0.17400483787059784, + "rewards/rejected": -1.39274001121521, "step": 5565 }, { "epoch": 2.9811005184813513, - "grad_norm": 9.99143896346487, + "grad_norm": 9.409135351575326, "learning_rate": 1.12151068506261e-10, - "logits/chosen": -0.13437816500663757, - "logits/rejected": -0.010555913671851158, - "logps/chosen": -1.2261104583740234, - "logps/rejected": -1.6780275106430054, - "loss": 1.9285, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2261104583740234, - "rewards/margins": 0.4519170820713043, - "rewards/rejected": -1.6780275106430054, - "semantic_entropy": 0.783943772315979, + "logits/chosen": -0.2014199197292328, + "logits/rejected": -0.10790824890136719, + "logps/chosen": -1.1877915859222412, + "logps/rejected": -1.5239720344543457, + "loss": 1.5295, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1877915859222412, + "rewards/margins": 0.33618029952049255, + "rewards/rejected": -1.5239720344543457, "step": 5570 }, { "epoch": 2.983776551262753, - "grad_norm": 11.481288312845669, + "grad_norm": 9.875788186643785, "learning_rate": 8.159170368826629e-11, - "logits/chosen": -0.18372122943401337, - "logits/rejected": -0.04386281967163086, - "logps/chosen": -1.2467625141143799, - "logps/rejected": -1.6158266067504883, - "loss": 1.9529, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2467625141143799, - "rewards/margins": 0.3690639138221741, - "rewards/rejected": -1.6158266067504883, - "semantic_entropy": 0.7892996072769165, + "logits/chosen": -0.2554752826690674, + "logits/rejected": -0.1442861109972, + "logps/chosen": -1.2132186889648438, + "logps/rejected": -1.4958226680755615, + "loss": 1.5554, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2132186889648438, + "rewards/margins": 0.2826038599014282, + "rewards/rejected": -1.4958226680755615, "step": 5575 }, { "epoch": 2.9864525840441547, - "grad_norm": 8.394770887084695, + "grad_norm": 8.273370345518318, "learning_rate": 5.588254946015114e-11, - "logits/chosen": -0.2578745484352112, - "logits/rejected": -0.018613124266266823, - "logps/chosen": -1.1846576929092407, - "logps/rejected": -1.6049606800079346, - "loss": 1.9194, + "logits/chosen": -0.31946608424186707, + "logits/rejected": -0.12483116239309311, + "logps/chosen": -1.1476317644119263, + "logps/rejected": -1.49470055103302, + "loss": 1.5048, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1846576929092407, - "rewards/margins": 0.4203028678894043, - "rewards/rejected": -1.6049606800079346, - "semantic_entropy": 0.8201394081115723, + "rewards/chosen": -1.1476317644119263, + "rewards/margins": 0.3470688462257385, + "rewards/rejected": -1.49470055103302, "step": 5580 }, { "epoch": 2.989128616825556, - "grad_norm": 11.272528683106293, + "grad_norm": 9.002525495716036, "learning_rate": 3.502385525216978e-11, - "logits/chosen": -0.2203395813703537, - "logits/rejected": -0.06656957417726517, - "logps/chosen": -1.2808977365493774, - "logps/rejected": -1.6387157440185547, - "loss": 1.9687, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2808977365493774, - "rewards/margins": 0.357818067073822, - "rewards/rejected": -1.6387157440185547, - "semantic_entropy": 0.7756115794181824, + "logits/chosen": -0.2967630922794342, + "logits/rejected": -0.17088404297828674, + "logps/chosen": -1.2522767782211304, + "logps/rejected": -1.4954627752304077, + "loss": 1.5749, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2522767782211304, + "rewards/margins": 0.24318604171276093, + "rewards/rejected": -1.4954627752304077, "step": 5585 }, { "epoch": 2.9918046496069577, - "grad_norm": 7.44528259986258, + "grad_norm": 6.989817783688385, "learning_rate": 1.901582343555308e-11, - "logits/chosen": -0.1454438716173172, - "logits/rejected": -0.1031564474105835, - "logps/chosen": -1.3283231258392334, - "logps/rejected": -1.5797977447509766, - "loss": 2.0296, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3283231258392334, - "rewards/margins": 0.2514745593070984, - "rewards/rejected": -1.5797977447509766, - "semantic_entropy": 0.7781016230583191, + "logits/chosen": -0.23328891396522522, + "logits/rejected": -0.20442518591880798, + "logps/chosen": -1.2976136207580566, + "logps/rejected": -1.4591684341430664, + "loss": 1.637, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2976136207580566, + "rewards/margins": 0.16155479848384857, + "rewards/rejected": -1.4591684341430664, "step": 5590 }, { "epoch": 2.9944806823883594, - "grad_norm": 12.137871603428197, + "grad_norm": 9.159023001208364, "learning_rate": 7.858609320232634e-12, - "logits/chosen": -0.17230184376239777, - "logits/rejected": -0.04320326820015907, - "logps/chosen": -1.2008966207504272, - "logps/rejected": -1.540555715560913, - "loss": 1.9434, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2008966207504272, - "rewards/margins": 0.3396591544151306, - "rewards/rejected": -1.540555715560913, - "semantic_entropy": 0.8178640604019165, + "logits/chosen": -0.24357542395591736, + "logits/rejected": -0.1504550278186798, + "logps/chosen": -1.166772484779358, + "logps/rejected": -1.4286892414093018, + "loss": 1.5248, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.166772484779358, + "rewards/margins": 0.26191678643226624, + "rewards/rejected": -1.4286892414093018, "step": 5595 }, { "epoch": 2.9971567151697607, - "grad_norm": 12.303664266654412, + "grad_norm": 9.005783452720413, "learning_rate": 1.5523211535639624e-12, - "logits/chosen": -0.16199703514575958, - "logits/rejected": -0.0670817419886589, - "logps/chosen": -1.2330716848373413, - "logps/rejected": -1.6515203714370728, - "loss": 1.9498, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2330716848373413, - "rewards/margins": 0.4184487462043762, - "rewards/rejected": -1.6515203714370728, - "semantic_entropy": 0.7843411564826965, + "logits/chosen": -0.2362523078918457, + "logits/rejected": -0.1721603125333786, + "logps/chosen": -1.2057607173919678, + "logps/rejected": -1.5258591175079346, + "loss": 1.5482, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2057607173919678, + "rewards/margins": 0.32009822130203247, + "rewards/rejected": -1.5258591175079346, "step": 5600 }, { "epoch": 2.9971567151697607, - "eval_logits/chosen": 0.1504385620355606, - "eval_logits/rejected": 0.23508602380752563, - "eval_logps/chosen": -1.3358336687088013, - "eval_logps/rejected": -1.611994743347168, - "eval_loss": 2.0457005500793457, - "eval_rewards/accuracies": 0.5927299857139587, - "eval_rewards/chosen": -1.3358336687088013, - "eval_rewards/margins": 0.27616116404533386, - "eval_rewards/rejected": -1.611994743347168, - "eval_runtime": 34.6344, - "eval_samples_per_second": 38.834, - "eval_semantic_entropy": 0.7755205631256104, - "eval_steps_per_second": 9.73, + "eval_logits/chosen": 0.016507524996995926, + "eval_logits/rejected": 0.083260677754879, + "eval_logps/chosen": -1.3030215501785278, + "eval_logps/rejected": -1.4992130994796753, + "eval_loss": 1.6474518775939941, + "eval_rewards/accuracies": 0.5712166428565979, + "eval_rewards/chosen": -1.3030215501785278, + "eval_rewards/margins": 0.19619165360927582, + "eval_rewards/rejected": -1.4992130994796753, + "eval_runtime": 40.3232, + "eval_samples_per_second": 33.355, + "eval_steps_per_second": 8.357, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, - "train_loss": 2.028473087245443, - "train_runtime": 28359.3461, - "train_samples_per_second": 6.325, - "train_steps_per_second": 0.198 + "train_loss": 1.6264250374623148, + "train_runtime": 30148.9193, + "train_samples_per_second": 5.949, + "train_steps_per_second": 0.186 } ], "logging_steps": 5,