{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 1556, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0706638115631692e-09, "logits/chosen": -3.0633435249328613, "logits/rejected": -3.0370049476623535, "logps/chosen": -237.29315185546875, "logps/rejected": -251.69747924804688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.070663811563169e-08, "logits/chosen": -2.990461826324463, "logits/rejected": -3.0024797916412354, "logps/chosen": -356.6201171875, "logps/rejected": -390.87042236328125, "loss": 0.6911, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": -0.004924382548779249, "rewards/margins": 0.009135871194303036, "rewards/rejected": -0.014060255140066147, "step": 10 }, { "epoch": 0.03, "learning_rate": 2.141327623126338e-08, "logits/chosen": -3.002528429031372, "logits/rejected": -3.0017483234405518, "logps/chosen": -350.7555847167969, "logps/rejected": -393.46014404296875, "loss": 0.6801, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.050124846398830414, "rewards/margins": 0.028588850051164627, "rewards/rejected": 0.02153599075973034, "step": 20 }, { "epoch": 0.04, "learning_rate": 3.2119914346895076e-08, "logits/chosen": -2.975447416305542, "logits/rejected": -3.0126380920410156, "logps/chosen": -375.95391845703125, "logps/rejected": -432.83587646484375, "loss": 0.6435, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2032477855682373, "rewards/margins": 0.1010356992483139, "rewards/rejected": 0.1022120863199234, "step": 30 }, { "epoch": 0.05, "learning_rate": 4.282655246252676e-08, "logits/chosen": -3.0026869773864746, "logits/rejected": -2.9945485591888428, "logps/chosen": -383.3456115722656, "logps/rejected": -392.7911376953125, "loss": 0.5784, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.4273909628391266, "rewards/margins": 0.30088725686073303, "rewards/rejected": 0.12650372087955475, "step": 40 }, { "epoch": 0.06, "learning_rate": 5.353319057815846e-08, "logits/chosen": -2.989891529083252, "logits/rejected": -2.996675968170166, "logps/chosen": -339.07513427734375, "logps/rejected": -373.727783203125, "loss": 0.5345, "rewards/accuracies": 0.75, "rewards/chosen": 0.6149066686630249, "rewards/margins": 0.39920732378959656, "rewards/rejected": 0.21569931507110596, "step": 50 }, { "epoch": 0.08, "learning_rate": 6.423982869379015e-08, "logits/chosen": -3.026094913482666, "logits/rejected": -2.9982128143310547, "logps/chosen": -327.8692321777344, "logps/rejected": -375.9877624511719, "loss": 0.4485, "rewards/accuracies": 0.875, "rewards/chosen": 0.6913961172103882, "rewards/margins": 0.7190420031547546, "rewards/rejected": -0.02764584682881832, "step": 60 }, { "epoch": 0.09, "learning_rate": 7.494646680942184e-08, "logits/chosen": -2.974823474884033, "logits/rejected": -2.980032444000244, "logps/chosen": -351.2728576660156, "logps/rejected": -395.68609619140625, "loss": 0.3966, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.8828132748603821, "rewards/margins": 0.9640719294548035, "rewards/rejected": -0.08125858008861542, "step": 70 }, { "epoch": 0.1, "learning_rate": 8.565310492505352e-08, "logits/chosen": -2.977529287338257, "logits/rejected": -2.9725558757781982, "logps/chosen": -359.2842712402344, "logps/rejected": -405.7890625, "loss": 0.3519, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.091180682182312, "rewards/margins": 1.2520115375518799, "rewards/rejected": -0.16083075106143951, "step": 80 }, { "epoch": 0.12, "learning_rate": 9.635974304068522e-08, "logits/chosen": -2.979015827178955, "logits/rejected": -2.9813497066497803, "logps/chosen": -309.3511047363281, "logps/rejected": -358.91607666015625, "loss": 0.3201, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2030521631240845, "rewards/margins": 1.6773903369903564, "rewards/rejected": -0.4743381440639496, "step": 90 }, { "epoch": 0.13, "learning_rate": 1.0706638115631692e-07, "logits/chosen": -2.941194534301758, "logits/rejected": -2.9548678398132324, "logps/chosen": -343.6178894042969, "logps/rejected": -463.1512145996094, "loss": 0.2696, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.2106283903121948, "rewards/margins": 1.9713561534881592, "rewards/rejected": -0.7607278823852539, "step": 100 }, { "epoch": 0.13, "eval_logits/chosen": -2.977161169052124, "eval_logits/rejected": -2.957442045211792, "eval_logps/chosen": -296.8330383300781, "eval_logps/rejected": -349.66558837890625, "eval_loss": 0.2511790990829468, "eval_rewards/accuracies": 0.921875, "eval_rewards/chosen": 1.1878268718719482, "eval_rewards/margins": 1.8798556327819824, "eval_rewards/rejected": -0.6920287609100342, "eval_runtime": 38.7534, "eval_samples_per_second": 12.902, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.14, "learning_rate": 1.177730192719486e-07, "logits/chosen": -2.9442899227142334, "logits/rejected": -2.9481866359710693, "logps/chosen": -346.63873291015625, "logps/rejected": -406.31964111328125, "loss": 0.2493, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2421057224273682, "rewards/margins": 2.18147873878479, "rewards/rejected": -0.9393728971481323, "step": 110 }, { "epoch": 0.15, "learning_rate": 1.284796573875803e-07, "logits/chosen": -2.94069242477417, "logits/rejected": -2.9417574405670166, "logps/chosen": -351.788330078125, "logps/rejected": -379.61065673828125, "loss": 0.2406, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.7772680521011353, "rewards/margins": 1.8036502599716187, "rewards/rejected": -1.0263820886611938, "step": 120 }, { "epoch": 0.17, "learning_rate": 1.3918629550321198e-07, "logits/chosen": -2.926699638366699, "logits/rejected": -2.911668300628662, "logps/chosen": -327.4112548828125, "logps/rejected": -408.2745361328125, "loss": 0.2073, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.5646601915359497, "rewards/margins": 2.2064461708068848, "rewards/rejected": -1.6417862176895142, "step": 130 }, { "epoch": 0.18, "learning_rate": 1.4989293361884367e-07, "logits/chosen": -2.904219150543213, "logits/rejected": -2.921232223510742, "logps/chosen": -311.6190185546875, "logps/rejected": -411.2701110839844, "loss": 0.1967, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.46902722120285034, "rewards/margins": 2.7694640159606934, "rewards/rejected": -2.3004367351531982, "step": 140 }, { "epoch": 0.19, "learning_rate": 1.6059957173447535e-07, "logits/chosen": -2.901981830596924, "logits/rejected": -2.9112467765808105, "logps/chosen": -301.6145324707031, "logps/rejected": -391.1957092285156, "loss": 0.1723, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.218230202794075, "rewards/margins": 3.2492637634277344, "rewards/rejected": -3.031033992767334, "step": 150 }, { "epoch": 0.21, "learning_rate": 1.7130620985010704e-07, "logits/chosen": -2.8996052742004395, "logits/rejected": -2.8838694095611572, "logps/chosen": -312.6499938964844, "logps/rejected": -447.8002014160156, "loss": 0.1554, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3209637701511383, "rewards/margins": 4.501524925231934, "rewards/rejected": -4.180561065673828, "step": 160 }, { "epoch": 0.22, "learning_rate": 1.8201284796573874e-07, "logits/chosen": -2.8928513526916504, "logits/rejected": -2.9001543521881104, "logps/chosen": -329.20953369140625, "logps/rejected": -423.6446228027344, "loss": 0.1566, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.1461164504289627, "rewards/margins": 4.050145626068115, "rewards/rejected": -3.904029130935669, "step": 170 }, { "epoch": 0.23, "learning_rate": 1.9271948608137044e-07, "logits/chosen": -2.8557610511779785, "logits/rejected": -2.855731725692749, "logps/chosen": -338.60076904296875, "logps/rejected": -448.8922424316406, "loss": 0.1421, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.21355919539928436, "rewards/margins": 4.191808223724365, "rewards/rejected": -3.9782490730285645, "step": 180 }, { "epoch": 0.24, "learning_rate": 2.0342612419700214e-07, "logits/chosen": -2.8638434410095215, "logits/rejected": -2.877293825149536, "logps/chosen": -347.19573974609375, "logps/rejected": -469.17755126953125, "loss": 0.1381, "rewards/accuracies": 1.0, "rewards/chosen": -0.14256651699543, "rewards/margins": 3.96684193611145, "rewards/rejected": -4.109408378601074, "step": 190 }, { "epoch": 0.26, "learning_rate": 2.1413276231263384e-07, "logits/chosen": -2.829555034637451, "logits/rejected": -2.85453462600708, "logps/chosen": -364.0372009277344, "logps/rejected": -442.7489318847656, "loss": 0.1427, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18970072269439697, "rewards/margins": 5.029218673706055, "rewards/rejected": -4.839517593383789, "step": 200 }, { "epoch": 0.26, "eval_logits/chosen": -2.8512933254241943, "eval_logits/rejected": -2.8302505016326904, "eval_logps/chosen": -305.8147888183594, "eval_logps/rejected": -387.1728210449219, "eval_loss": 0.12157174944877625, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 0.28965064883232117, "eval_rewards/margins": 4.73240327835083, "eval_rewards/rejected": -4.442752361297607, "eval_runtime": 38.702, "eval_samples_per_second": 12.919, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.27, "learning_rate": 2.248394004282655e-07, "logits/chosen": -2.817666530609131, "logits/rejected": -2.8465371131896973, "logps/chosen": -325.3854675292969, "logps/rejected": -439.5003356933594, "loss": 0.1413, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21560493111610413, "rewards/margins": 4.514598369598389, "rewards/rejected": -4.298993110656738, "step": 210 }, { "epoch": 0.28, "learning_rate": 2.355460385438972e-07, "logits/chosen": -2.7650692462921143, "logits/rejected": -2.7801504135131836, "logps/chosen": -326.321533203125, "logps/rejected": -456.98663330078125, "loss": 0.1332, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.027444612234830856, "rewards/margins": 5.077801704406738, "rewards/rejected": -5.050357818603516, "step": 220 }, { "epoch": 0.3, "learning_rate": 2.462526766595289e-07, "logits/chosen": -2.788020610809326, "logits/rejected": -2.7895946502685547, "logps/chosen": -324.4822998046875, "logps/rejected": -439.76397705078125, "loss": 0.1356, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29427874088287354, "rewards/margins": 5.166212558746338, "rewards/rejected": -4.871933460235596, "step": 230 }, { "epoch": 0.31, "learning_rate": 2.569593147751606e-07, "logits/chosen": -2.6995949745178223, "logits/rejected": -2.7345399856567383, "logps/chosen": -356.4814758300781, "logps/rejected": -490.60931396484375, "loss": 0.1074, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.35535210371017456, "rewards/margins": 6.1955766677856445, "rewards/rejected": -6.550928592681885, "step": 240 }, { "epoch": 0.32, "learning_rate": 2.676659528907923e-07, "logits/chosen": -2.6892549991607666, "logits/rejected": -2.694087505340576, "logps/chosen": -305.6263122558594, "logps/rejected": -387.88543701171875, "loss": 0.0979, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6652821898460388, "rewards/margins": 4.929129600524902, "rewards/rejected": -5.5944108963012695, "step": 250 }, { "epoch": 0.33, "learning_rate": 2.7837259100642395e-07, "logits/chosen": -2.73167085647583, "logits/rejected": -2.7620654106140137, "logps/chosen": -408.2175598144531, "logps/rejected": -449.8201599121094, "loss": 0.1298, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9623678922653198, "rewards/margins": 5.48039436340332, "rewards/rejected": -6.4427618980407715, "step": 260 }, { "epoch": 0.35, "learning_rate": 2.890792291220557e-07, "logits/chosen": -2.7657806873321533, "logits/rejected": -2.802060604095459, "logps/chosen": -384.2090148925781, "logps/rejected": -481.82696533203125, "loss": 0.1181, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16663847863674164, "rewards/margins": 5.502591133117676, "rewards/rejected": -5.335952281951904, "step": 270 }, { "epoch": 0.36, "learning_rate": 2.9978586723768735e-07, "logits/chosen": -2.673283815383911, "logits/rejected": -2.707296848297119, "logps/chosen": -312.5271911621094, "logps/rejected": -411.64031982421875, "loss": 0.0947, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9499552845954895, "rewards/margins": 4.852605819702148, "rewards/rejected": -5.802561283111572, "step": 280 }, { "epoch": 0.37, "learning_rate": 3.1049250535331905e-07, "logits/chosen": -2.623725175857544, "logits/rejected": -2.7073614597320557, "logps/chosen": -391.2462158203125, "logps/rejected": -474.2684631347656, "loss": 0.1168, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0791637897491455, "rewards/margins": 7.065374851226807, "rewards/rejected": -8.144537925720215, "step": 290 }, { "epoch": 0.39, "learning_rate": 3.211991434689507e-07, "logits/chosen": -2.6202073097229004, "logits/rejected": -2.652608633041382, "logps/chosen": -341.9140319824219, "logps/rejected": -462.9012145996094, "loss": 0.0944, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.275757372379303, "rewards/margins": 5.93372106552124, "rewards/rejected": -6.20947790145874, "step": 300 }, { "epoch": 0.39, "eval_logits/chosen": -2.6932637691497803, "eval_logits/rejected": -2.6872053146362305, "eval_logps/chosen": -311.619873046875, "eval_logps/rejected": -409.2980041503906, "eval_loss": 0.11095032095909119, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": -0.29085665941238403, "eval_rewards/margins": 6.364411354064941, "eval_rewards/rejected": -6.65526819229126, "eval_runtime": 38.7504, "eval_samples_per_second": 12.903, "eval_steps_per_second": 0.413, "step": 300 }, { "epoch": 0.4, "learning_rate": 3.3190578158458244e-07, "logits/chosen": -2.6386542320251465, "logits/rejected": -2.7159385681152344, "logps/chosen": -368.5979919433594, "logps/rejected": -466.84783935546875, "loss": 0.131, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8093490600585938, "rewards/margins": 7.050684452056885, "rewards/rejected": -7.8600335121154785, "step": 310 }, { "epoch": 0.41, "learning_rate": 3.426124197002141e-07, "logits/chosen": -2.571882486343384, "logits/rejected": -2.6551308631896973, "logps/chosen": -361.48394775390625, "logps/rejected": -489.70989990234375, "loss": 0.0905, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7510203123092651, "rewards/margins": 8.015697479248047, "rewards/rejected": -8.766717910766602, "step": 320 }, { "epoch": 0.42, "learning_rate": 3.533190578158458e-07, "logits/chosen": -2.5930895805358887, "logits/rejected": -2.6723227500915527, "logps/chosen": -384.87664794921875, "logps/rejected": -509.010986328125, "loss": 0.1232, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.44466814398765564, "rewards/margins": 6.863368988037109, "rewards/rejected": -7.308036804199219, "step": 330 }, { "epoch": 0.44, "learning_rate": 3.640256959314775e-07, "logits/chosen": -2.5658717155456543, "logits/rejected": -2.62716007232666, "logps/chosen": -304.2865295410156, "logps/rejected": -435.2959899902344, "loss": 0.0874, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5686666369438171, "rewards/margins": 6.669247627258301, "rewards/rejected": -7.237914085388184, "step": 340 }, { "epoch": 0.45, "learning_rate": 3.747323340471092e-07, "logits/chosen": -2.584165096282959, "logits/rejected": -2.70393967628479, "logps/chosen": -364.13262939453125, "logps/rejected": -477.5604553222656, "loss": 0.1015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5823951363563538, "rewards/margins": 7.484101295471191, "rewards/rejected": -8.066494941711426, "step": 350 }, { "epoch": 0.46, "learning_rate": 3.854389721627409e-07, "logits/chosen": -2.5895907878875732, "logits/rejected": -2.646876573562622, "logps/chosen": -355.0018005371094, "logps/rejected": -442.65948486328125, "loss": 0.0896, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8058759570121765, "rewards/margins": 8.065896987915039, "rewards/rejected": -8.871771812438965, "step": 360 }, { "epoch": 0.48, "learning_rate": 3.961456102783726e-07, "logits/chosen": -2.615499973297119, "logits/rejected": -2.6612184047698975, "logps/chosen": -308.342041015625, "logps/rejected": -432.08319091796875, "loss": 0.0821, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5296161770820618, "rewards/margins": 7.243483066558838, "rewards/rejected": -7.773098945617676, "step": 370 }, { "epoch": 0.49, "learning_rate": 4.068522483940043e-07, "logits/chosen": -2.6956448554992676, "logits/rejected": -2.7061805725097656, "logps/chosen": -346.4541931152344, "logps/rejected": -481.19989013671875, "loss": 0.1104, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6448992490768433, "rewards/margins": 7.711002349853516, "rewards/rejected": -8.355902671813965, "step": 380 }, { "epoch": 0.5, "learning_rate": 4.175588865096359e-07, "logits/chosen": -2.6077234745025635, "logits/rejected": -2.6278557777404785, "logps/chosen": -353.8262634277344, "logps/rejected": -447.3440856933594, "loss": 0.0958, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5978514552116394, "rewards/margins": 7.370479583740234, "rewards/rejected": -7.968331336975098, "step": 390 }, { "epoch": 0.51, "learning_rate": 4.282655246252677e-07, "logits/chosen": -2.603065252304077, "logits/rejected": -2.675497531890869, "logps/chosen": -355.2611999511719, "logps/rejected": -411.75732421875, "loss": 0.1039, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2750840187072754, "rewards/margins": 7.0222907066345215, "rewards/rejected": -7.2973737716674805, "step": 400 }, { "epoch": 0.51, "eval_logits/chosen": -2.6301259994506836, "eval_logits/rejected": -2.6286230087280273, "eval_logps/chosen": -315.64288330078125, "eval_logps/rejected": -421.1318359375, "eval_loss": 0.07803654670715332, "eval_rewards/accuracies": 0.984375, "eval_rewards/chosen": -0.6931607723236084, "eval_rewards/margins": 7.145491600036621, "eval_rewards/rejected": -7.83865213394165, "eval_runtime": 38.7861, "eval_samples_per_second": 12.891, "eval_steps_per_second": 0.413, "step": 400 }, { "epoch": 0.53, "learning_rate": 4.389721627408993e-07, "logits/chosen": -2.5576305389404297, "logits/rejected": -2.602813243865967, "logps/chosen": -361.10797119140625, "logps/rejected": -468.213134765625, "loss": 0.1042, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9836179614067078, "rewards/margins": 6.6080522537231445, "rewards/rejected": -7.591670989990234, "step": 410 }, { "epoch": 0.54, "learning_rate": 4.49678800856531e-07, "logits/chosen": -2.521080732345581, "logits/rejected": -2.5644307136535645, "logps/chosen": -325.7511901855469, "logps/rejected": -407.7994384765625, "loss": 0.1057, "rewards/accuracies": 0.9375, "rewards/chosen": -0.45771685242652893, "rewards/margins": 7.0977678298950195, "rewards/rejected": -7.555483818054199, "step": 420 }, { "epoch": 0.55, "learning_rate": 4.603854389721627e-07, "logits/chosen": -2.5245959758758545, "logits/rejected": -2.559770107269287, "logps/chosen": -340.15087890625, "logps/rejected": -485.052490234375, "loss": 0.084, "rewards/accuracies": 0.9375, "rewards/chosen": -0.994246780872345, "rewards/margins": 7.357940673828125, "rewards/rejected": -8.35218620300293, "step": 430 }, { "epoch": 0.57, "learning_rate": 4.710920770877944e-07, "logits/chosen": -2.401303768157959, "logits/rejected": -2.548125743865967, "logps/chosen": -358.9648742675781, "logps/rejected": -462.87890625, "loss": 0.1172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.293526530265808, "rewards/margins": 7.095101833343506, "rewards/rejected": -8.388628005981445, "step": 440 }, { "epoch": 0.58, "learning_rate": 4.817987152034261e-07, "logits/chosen": -2.4654183387756348, "logits/rejected": -2.560048818588257, "logps/chosen": -291.2701721191406, "logps/rejected": -362.7830505371094, "loss": 0.0959, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4040035009384155, "rewards/margins": 5.726696968078613, "rewards/rejected": -7.130700588226318, "step": 450 }, { "epoch": 0.59, "learning_rate": 4.925053533190578e-07, "logits/chosen": -2.489262104034424, "logits/rejected": -2.5457305908203125, "logps/chosen": -356.9480285644531, "logps/rejected": -435.594970703125, "loss": 0.1132, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3584586381912231, "rewards/margins": 6.3141889572143555, "rewards/rejected": -7.672647953033447, "step": 460 }, { "epoch": 0.6, "learning_rate": 4.996429421566293e-07, "logits/chosen": -2.5229034423828125, "logits/rejected": -2.565725326538086, "logps/chosen": -326.0317077636719, "logps/rejected": -448.7723083496094, "loss": 0.1051, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5931789875030518, "rewards/margins": 7.049294471740723, "rewards/rejected": -8.642473220825195, "step": 470 }, { "epoch": 0.62, "learning_rate": 4.98452749345394e-07, "logits/chosen": -2.5022709369659424, "logits/rejected": -2.555453062057495, "logps/chosen": -361.46563720703125, "logps/rejected": -498.7660217285156, "loss": 0.1386, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8986074924468994, "rewards/margins": 6.340726375579834, "rewards/rejected": -8.239333152770996, "step": 480 }, { "epoch": 0.63, "learning_rate": 4.972625565341585e-07, "logits/chosen": -2.4549243450164795, "logits/rejected": -2.5045337677001953, "logps/chosen": -320.4005432128906, "logps/rejected": -437.33612060546875, "loss": 0.0958, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8746875524520874, "rewards/margins": 6.6805620193481445, "rewards/rejected": -8.555249214172363, "step": 490 }, { "epoch": 0.64, "learning_rate": 4.960723637229232e-07, "logits/chosen": -2.448908567428589, "logits/rejected": -2.458101272583008, "logps/chosen": -355.0153503417969, "logps/rejected": -504.32330322265625, "loss": 0.0762, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4554470777511597, "rewards/margins": 8.177068710327148, "rewards/rejected": -9.632516860961914, "step": 500 }, { "epoch": 0.64, "eval_logits/chosen": -2.504735231399536, "eval_logits/rejected": -2.5092720985412598, "eval_logps/chosen": -323.16851806640625, "eval_logps/rejected": -433.9158020019531, "eval_loss": 0.08059512078762054, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -1.4457205533981323, "eval_rewards/margins": 7.671328544616699, "eval_rewards/rejected": -9.117048263549805, "eval_runtime": 38.7512, "eval_samples_per_second": 12.903, "eval_steps_per_second": 0.413, "step": 500 }, { "epoch": 0.66, "learning_rate": 4.948821709116876e-07, "logits/chosen": -2.376183032989502, "logits/rejected": -2.455298900604248, "logps/chosen": -485.12603759765625, "logps/rejected": -551.7554931640625, "loss": 0.1056, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5901005268096924, "rewards/margins": 8.771623611450195, "rewards/rejected": -10.361722946166992, "step": 510 }, { "epoch": 0.67, "learning_rate": 4.936919781004522e-07, "logits/chosen": -2.470151424407959, "logits/rejected": -2.5587172508239746, "logps/chosen": -377.3062438964844, "logps/rejected": -507.6141052246094, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": -1.233569860458374, "rewards/margins": 8.123286247253418, "rewards/rejected": -9.356857299804688, "step": 520 }, { "epoch": 0.68, "learning_rate": 4.925017852892168e-07, "logits/chosen": -2.5230183601379395, "logits/rejected": -2.603940725326538, "logps/chosen": -362.92333984375, "logps/rejected": -481.7613220214844, "loss": 0.0683, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.9068357944488525, "rewards/margins": 7.921334743499756, "rewards/rejected": -9.828168869018555, "step": 530 }, { "epoch": 0.69, "learning_rate": 4.913115924779814e-07, "logits/chosen": -2.438596248626709, "logits/rejected": -2.562830924987793, "logps/chosen": -386.5306701660156, "logps/rejected": -499.86444091796875, "loss": 0.0677, "rewards/accuracies": 1.0, "rewards/chosen": -1.2040196657180786, "rewards/margins": 8.880427360534668, "rewards/rejected": -10.084446907043457, "step": 540 }, { "epoch": 0.71, "learning_rate": 4.90121399666746e-07, "logits/chosen": -2.4589312076568604, "logits/rejected": -2.524345874786377, "logps/chosen": -332.1251220703125, "logps/rejected": -433.63787841796875, "loss": 0.1309, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1124681234359741, "rewards/margins": 7.2715253829956055, "rewards/rejected": -8.383993148803711, "step": 550 }, { "epoch": 0.72, "learning_rate": 4.889312068555106e-07, "logits/chosen": -2.58622407913208, "logits/rejected": -2.60271954536438, "logps/chosen": -271.59014892578125, "logps/rejected": -417.29833984375, "loss": 0.1275, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1244533061981201, "rewards/margins": 6.384497165679932, "rewards/rejected": -7.508950710296631, "step": 560 }, { "epoch": 0.73, "learning_rate": 4.877410140442752e-07, "logits/chosen": -2.4364261627197266, "logits/rejected": -2.4858317375183105, "logps/chosen": -350.3711853027344, "logps/rejected": -449.4051818847656, "loss": 0.0982, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0685746669769287, "rewards/margins": 7.424908638000488, "rewards/rejected": -8.49348258972168, "step": 570 }, { "epoch": 0.75, "learning_rate": 4.865508212330398e-07, "logits/chosen": -2.441240072250366, "logits/rejected": -2.527020215988159, "logps/chosen": -366.98150634765625, "logps/rejected": -525.4156494140625, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": -0.9126319885253906, "rewards/margins": 9.27831745147705, "rewards/rejected": -10.190949440002441, "step": 580 }, { "epoch": 0.76, "learning_rate": 4.853606284218044e-07, "logits/chosen": -2.3090662956237793, "logits/rejected": -2.3255538940429688, "logps/chosen": -371.3923034667969, "logps/rejected": -526.1776123046875, "loss": 0.1095, "rewards/accuracies": 0.9375, "rewards/chosen": -2.190389633178711, "rewards/margins": 9.64104175567627, "rewards/rejected": -11.831432342529297, "step": 590 }, { "epoch": 0.77, "learning_rate": 4.841704356105689e-07, "logits/chosen": -2.334197521209717, "logits/rejected": -2.423285484313965, "logps/chosen": -369.0033264160156, "logps/rejected": -506.4518127441406, "loss": 0.0959, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7217298746109009, "rewards/margins": 8.07056999206543, "rewards/rejected": -9.7923002243042, "step": 600 }, { "epoch": 0.77, "eval_logits/chosen": -2.467820405960083, "eval_logits/rejected": -2.440288782119751, "eval_logps/chosen": -318.6737060546875, "eval_logps/rejected": -428.9326171875, "eval_loss": 0.07413332909345627, "eval_rewards/accuracies": 0.984375, "eval_rewards/chosen": -0.9962404370307922, "eval_rewards/margins": 7.622487545013428, "eval_rewards/rejected": -8.618727684020996, "eval_runtime": 38.7439, "eval_samples_per_second": 12.905, "eval_steps_per_second": 0.413, "step": 600 }, { "epoch": 0.78, "learning_rate": 4.829802427993334e-07, "logits/chosen": -2.3268227577209473, "logits/rejected": -2.3746628761291504, "logps/chosen": -404.0111083984375, "logps/rejected": -492.5167541503906, "loss": 0.0859, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8345616459846497, "rewards/margins": 8.07560920715332, "rewards/rejected": -8.910171508789062, "step": 610 }, { "epoch": 0.8, "learning_rate": 4.81790049988098e-07, "logits/chosen": -2.415301561355591, "logits/rejected": -2.4919333457946777, "logps/chosen": -388.5622253417969, "logps/rejected": -531.6051025390625, "loss": 0.0631, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3940558433532715, "rewards/margins": 7.842892646789551, "rewards/rejected": -9.23694896697998, "step": 620 }, { "epoch": 0.81, "learning_rate": 4.805998571768626e-07, "logits/chosen": -2.310925245285034, "logits/rejected": -2.42446231842041, "logps/chosen": -342.0956115722656, "logps/rejected": -516.9351196289062, "loss": 0.1142, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3800750970840454, "rewards/margins": 8.762998580932617, "rewards/rejected": -10.143075942993164, "step": 630 }, { "epoch": 0.82, "learning_rate": 4.794096643656272e-07, "logits/chosen": -2.280027151107788, "logits/rejected": -2.31703782081604, "logps/chosen": -409.70379638671875, "logps/rejected": -529.5406494140625, "loss": 0.0723, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3363559246063232, "rewards/margins": 10.320574760437012, "rewards/rejected": -11.656930923461914, "step": 640 }, { "epoch": 0.84, "learning_rate": 4.782194715543918e-07, "logits/chosen": -2.276779890060425, "logits/rejected": -2.343441963195801, "logps/chosen": -348.50531005859375, "logps/rejected": -521.2000122070312, "loss": 0.0902, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.5391457080841064, "rewards/margins": 9.673690795898438, "rewards/rejected": -11.212836265563965, "step": 650 }, { "epoch": 0.85, "learning_rate": 4.770292787431564e-07, "logits/chosen": -2.3436553478240967, "logits/rejected": -2.3175175189971924, "logps/chosen": -386.4251403808594, "logps/rejected": -530.1958618164062, "loss": 0.0787, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.0843443870544434, "rewards/margins": 9.766562461853027, "rewards/rejected": -11.850906372070312, "step": 660 }, { "epoch": 0.86, "learning_rate": 4.7583908593192097e-07, "logits/chosen": -2.2515616416931152, "logits/rejected": -2.2762718200683594, "logps/chosen": -396.88751220703125, "logps/rejected": -541.3609619140625, "loss": 0.0841, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.3059911727905273, "rewards/margins": 9.442736625671387, "rewards/rejected": -11.748727798461914, "step": 670 }, { "epoch": 0.87, "learning_rate": 4.746488931206855e-07, "logits/chosen": -2.304055690765381, "logits/rejected": -2.3429813385009766, "logps/chosen": -353.8645935058594, "logps/rejected": -520.8157348632812, "loss": 0.0793, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6302597522735596, "rewards/margins": 10.208868980407715, "rewards/rejected": -12.839129447937012, "step": 680 }, { "epoch": 0.89, "learning_rate": 4.734587003094501e-07, "logits/chosen": -2.326953887939453, "logits/rejected": -2.4166040420532227, "logps/chosen": -377.34356689453125, "logps/rejected": -494.58782958984375, "loss": 0.1041, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1931746006011963, "rewards/margins": 9.344148635864258, "rewards/rejected": -11.537323951721191, "step": 690 }, { "epoch": 0.9, "learning_rate": 4.722685074982147e-07, "logits/chosen": -2.3279807567596436, "logits/rejected": -2.38569974899292, "logps/chosen": -320.0870056152344, "logps/rejected": -498.17706298828125, "loss": 0.0814, "rewards/accuracies": 1.0, "rewards/chosen": -1.5494163036346436, "rewards/margins": 10.105340957641602, "rewards/rejected": -11.654756546020508, "step": 700 }, { "epoch": 0.9, "eval_logits/chosen": -2.498293399810791, "eval_logits/rejected": -2.4712274074554443, "eval_logps/chosen": -323.183837890625, "eval_logps/rejected": -441.4797058105469, "eval_loss": 0.055789634585380554, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.4472523927688599, "eval_rewards/margins": 8.426188468933105, "eval_rewards/rejected": -9.87343978881836, "eval_runtime": 38.7758, "eval_samples_per_second": 12.895, "eval_steps_per_second": 0.413, "step": 700 }, { "epoch": 0.91, "learning_rate": 4.710783146869793e-07, "logits/chosen": -2.3991808891296387, "logits/rejected": -2.4218363761901855, "logps/chosen": -314.1746520996094, "logps/rejected": -519.7462768554688, "loss": 0.0819, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.8268877267837524, "rewards/margins": 10.331625938415527, "rewards/rejected": -12.158514022827148, "step": 710 }, { "epoch": 0.93, "learning_rate": 4.698881218757438e-07, "logits/chosen": -2.363438606262207, "logits/rejected": -2.3997836112976074, "logps/chosen": -305.2399597167969, "logps/rejected": -481.65582275390625, "loss": 0.0786, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5438249111175537, "rewards/margins": 8.62690544128418, "rewards/rejected": -10.17072868347168, "step": 720 }, { "epoch": 0.94, "learning_rate": 4.6869792906450845e-07, "logits/chosen": -2.3670878410339355, "logits/rejected": -2.4363322257995605, "logps/chosen": -342.06622314453125, "logps/rejected": -468.9805603027344, "loss": 0.0719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5114291906356812, "rewards/margins": 8.608851432800293, "rewards/rejected": -10.120282173156738, "step": 730 }, { "epoch": 0.95, "learning_rate": 4.67507736253273e-07, "logits/chosen": -2.2785589694976807, "logits/rejected": -2.3089492321014404, "logps/chosen": -407.75048828125, "logps/rejected": -557.4127197265625, "loss": 0.0903, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8981235027313232, "rewards/margins": 10.704629898071289, "rewards/rejected": -12.602753639221191, "step": 740 }, { "epoch": 0.96, "learning_rate": 4.6631754344203763e-07, "logits/chosen": -2.3073747158050537, "logits/rejected": -2.383291244506836, "logps/chosen": -357.61492919921875, "logps/rejected": -522.1990356445312, "loss": 0.1043, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.5501503944396973, "rewards/margins": 8.703204154968262, "rewards/rejected": -11.253355026245117, "step": 750 }, { "epoch": 0.98, "learning_rate": 4.6512735063080217e-07, "logits/chosen": -2.492027521133423, "logits/rejected": -2.534536361694336, "logps/chosen": -430.7220764160156, "logps/rejected": -559.482666015625, "loss": 0.0971, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.385508418083191, "rewards/margins": 9.584807395935059, "rewards/rejected": -10.970315933227539, "step": 760 }, { "epoch": 0.99, "learning_rate": 4.6393715781956676e-07, "logits/chosen": -2.3780312538146973, "logits/rejected": -2.37473201751709, "logps/chosen": -326.2506103515625, "logps/rejected": -496.7969665527344, "loss": 0.0865, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0338951349258423, "rewards/margins": 9.33600902557373, "rewards/rejected": -10.369903564453125, "step": 770 }, { "epoch": 1.0, "learning_rate": 4.6274696500833135e-07, "logits/chosen": -2.4264612197875977, "logits/rejected": -2.45288348197937, "logps/chosen": -368.6007385253906, "logps/rejected": -534.6527709960938, "loss": 0.0645, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9812146425247192, "rewards/margins": 9.224861145019531, "rewards/rejected": -10.206075668334961, "step": 780 }, { "epoch": 1.02, "learning_rate": 4.6155677219709594e-07, "logits/chosen": -2.383737087249756, "logits/rejected": -2.4557416439056396, "logps/chosen": -401.9710388183594, "logps/rejected": -555.4797973632812, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -1.5994548797607422, "rewards/margins": 12.170892715454102, "rewards/rejected": -13.770347595214844, "step": 790 }, { "epoch": 1.03, "learning_rate": 4.603665793858605e-07, "logits/chosen": -2.4060428142547607, "logits/rejected": -2.4426844120025635, "logps/chosen": -366.8950500488281, "logps/rejected": -558.5940551757812, "loss": 0.0164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.232177972793579, "rewards/margins": 12.297248840332031, "rewards/rejected": -13.529426574707031, "step": 800 }, { "epoch": 1.03, "eval_logits/chosen": -2.492385149002075, "eval_logits/rejected": -2.468630313873291, "eval_logps/chosen": -324.3902587890625, "eval_logps/rejected": -453.6976623535156, "eval_loss": 0.06341304630041122, "eval_rewards/accuracies": 0.984375, "eval_rewards/chosen": -1.5678963661193848, "eval_rewards/margins": 9.527338981628418, "eval_rewards/rejected": -11.095235824584961, "eval_runtime": 38.5408, "eval_samples_per_second": 12.973, "eval_steps_per_second": 0.415, "step": 800 }, { "epoch": 1.04, "learning_rate": 4.5917638657462507e-07, "logits/chosen": -2.33616042137146, "logits/rejected": -2.3640098571777344, "logps/chosen": -373.46905517578125, "logps/rejected": -514.2394409179688, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -1.4009530544281006, "rewards/margins": 10.871899604797363, "rewards/rejected": -12.272851943969727, "step": 810 }, { "epoch": 1.05, "learning_rate": 4.5798619376338966e-07, "logits/chosen": -2.4044508934020996, "logits/rejected": -2.420480966567993, "logps/chosen": -347.3623962402344, "logps/rejected": -556.5758056640625, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -1.055593490600586, "rewards/margins": 11.897196769714355, "rewards/rejected": -12.952789306640625, "step": 820 }, { "epoch": 1.07, "learning_rate": 4.567960009521542e-07, "logits/chosen": -2.359771490097046, "logits/rejected": -2.4249939918518066, "logps/chosen": -370.0980529785156, "logps/rejected": -567.7897338867188, "loss": 0.0131, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0004942417144775, "rewards/margins": 12.14315414428711, "rewards/rejected": -14.143648147583008, "step": 830 }, { "epoch": 1.08, "learning_rate": 4.5560580814091884e-07, "logits/chosen": -2.3424394130706787, "logits/rejected": -2.342963457107544, "logps/chosen": -385.192626953125, "logps/rejected": -510.11749267578125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -2.1284375190734863, "rewards/margins": 11.841325759887695, "rewards/rejected": -13.969762802124023, "step": 840 }, { "epoch": 1.09, "learning_rate": 4.5441561532968337e-07, "logits/chosen": -2.3772830963134766, "logits/rejected": -2.414663791656494, "logps/chosen": -375.8727722167969, "logps/rejected": -580.7897338867188, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -3.0805163383483887, "rewards/margins": 12.892430305480957, "rewards/rejected": -15.972944259643555, "step": 850 }, { "epoch": 1.11, "learning_rate": 4.5322542251844796e-07, "logits/chosen": -2.3776564598083496, "logits/rejected": -2.409484386444092, "logps/chosen": -331.92431640625, "logps/rejected": -500.89739990234375, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -2.3345754146575928, "rewards/margins": 11.422739028930664, "rewards/rejected": -13.757314682006836, "step": 860 }, { "epoch": 1.12, "learning_rate": 4.5203522970721255e-07, "logits/chosen": -2.3700737953186035, "logits/rejected": -2.397162914276123, "logps/chosen": -340.53094482421875, "logps/rejected": -506.8477478027344, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4118890166282654, "rewards/margins": 12.948440551757812, "rewards/rejected": -13.360328674316406, "step": 870 }, { "epoch": 1.13, "learning_rate": 4.5084503689597714e-07, "logits/chosen": -2.41035795211792, "logits/rejected": -2.4271979331970215, "logps/chosen": -329.87933349609375, "logps/rejected": -537.0123291015625, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -1.4996788501739502, "rewards/margins": 11.888396263122559, "rewards/rejected": -13.388073921203613, "step": 880 }, { "epoch": 1.14, "learning_rate": 4.496548440847417e-07, "logits/chosen": -2.401721477508545, "logits/rejected": -2.447669506072998, "logps/chosen": -366.2709045410156, "logps/rejected": -519.80224609375, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -1.3627954721450806, "rewards/margins": 12.356982231140137, "rewards/rejected": -13.71977710723877, "step": 890 }, { "epoch": 1.16, "learning_rate": 4.484646512735063e-07, "logits/chosen": -2.4436986446380615, "logits/rejected": -2.5449397563934326, "logps/chosen": -384.5765686035156, "logps/rejected": -555.2340087890625, "loss": 0.0172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7870714068412781, "rewards/margins": 11.903576850891113, "rewards/rejected": -12.690648078918457, "step": 900 }, { "epoch": 1.16, "eval_logits/chosen": -2.5417840480804443, "eval_logits/rejected": -2.5121681690216064, "eval_logps/chosen": -326.2882080078125, "eval_logps/rejected": -464.37054443359375, "eval_loss": 0.06124735251069069, "eval_rewards/accuracies": 0.984375, "eval_rewards/chosen": -1.7576879262924194, "eval_rewards/margins": 10.404834747314453, "eval_rewards/rejected": -12.162521362304688, "eval_runtime": 38.6563, "eval_samples_per_second": 12.934, "eval_steps_per_second": 0.414, "step": 900 }, { "epoch": 1.17, "learning_rate": 4.4727445846227086e-07, "logits/chosen": -2.438345432281494, "logits/rejected": -2.4737024307250977, "logps/chosen": -369.38397216796875, "logps/rejected": -519.6220703125, "loss": 0.011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9280792474746704, "rewards/margins": 12.675816535949707, "rewards/rejected": -14.60389518737793, "step": 910 }, { "epoch": 1.18, "learning_rate": 4.4608426565103545e-07, "logits/chosen": -2.450275182723999, "logits/rejected": -2.462500810623169, "logps/chosen": -343.4928283691406, "logps/rejected": -515.9462280273438, "loss": 0.0221, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3710033893585205, "rewards/margins": 13.644805908203125, "rewards/rejected": -16.015810012817383, "step": 920 }, { "epoch": 1.2, "learning_rate": 4.4489407283980004e-07, "logits/chosen": -2.423760414123535, "logits/rejected": -2.385545253753662, "logps/chosen": -370.15985107421875, "logps/rejected": -515.8549194335938, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -2.4730286598205566, "rewards/margins": 12.967801094055176, "rewards/rejected": -15.440831184387207, "step": 930 }, { "epoch": 1.21, "learning_rate": 4.437038800285646e-07, "logits/chosen": -2.399423360824585, "logits/rejected": -2.418363094329834, "logps/chosen": -384.27984619140625, "logps/rejected": -549.5245971679688, "loss": 0.0156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.72330904006958, "rewards/margins": 12.818862915039062, "rewards/rejected": -16.542171478271484, "step": 940 }, { "epoch": 1.22, "learning_rate": 4.4251368721732916e-07, "logits/chosen": -2.5278353691101074, "logits/rejected": -2.5364837646484375, "logps/chosen": -329.5386657714844, "logps/rejected": -519.6696166992188, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -1.4351348876953125, "rewards/margins": 11.446606636047363, "rewards/rejected": -12.881741523742676, "step": 950 }, { "epoch": 1.23, "learning_rate": 4.413234944060938e-07, "logits/chosen": -2.527299165725708, "logits/rejected": -2.5759024620056152, "logps/chosen": -403.71063232421875, "logps/rejected": -589.4862670898438, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -0.19414202868938446, "rewards/margins": 12.035063743591309, "rewards/rejected": -12.229207038879395, "step": 960 }, { "epoch": 1.25, "learning_rate": 4.4013330159485834e-07, "logits/chosen": -2.4672398567199707, "logits/rejected": -2.4999210834503174, "logps/chosen": -334.6300048828125, "logps/rejected": -534.4932250976562, "loss": 0.0255, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7743580341339111, "rewards/margins": 12.416712760925293, "rewards/rejected": -14.191072463989258, "step": 970 }, { "epoch": 1.26, "learning_rate": 4.3894310878362293e-07, "logits/chosen": -2.447817087173462, "logits/rejected": -2.5005249977111816, "logps/chosen": -338.5157470703125, "logps/rejected": -544.09423828125, "loss": 0.0229, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8672630786895752, "rewards/margins": 12.040175437927246, "rewards/rejected": -13.907438278198242, "step": 980 }, { "epoch": 1.27, "learning_rate": 4.377529159723875e-07, "logits/chosen": -2.4685416221618652, "logits/rejected": -2.49491548538208, "logps/chosen": -366.1611022949219, "logps/rejected": -518.9093627929688, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -1.9218127727508545, "rewards/margins": 11.573265075683594, "rewards/rejected": -13.495076179504395, "step": 990 }, { "epoch": 1.29, "learning_rate": 4.365627231611521e-07, "logits/chosen": -2.470853805541992, "logits/rejected": -2.497331380844116, "logps/chosen": -405.1899719238281, "logps/rejected": -591.7445068359375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -1.9404414892196655, "rewards/margins": 13.470489501953125, "rewards/rejected": -15.410931587219238, "step": 1000 }, { "epoch": 1.29, "eval_logits/chosen": -2.5345709323883057, "eval_logits/rejected": -2.507004737854004, "eval_logps/chosen": -336.10919189453125, "eval_logps/rejected": -476.1966552734375, "eval_loss": 0.0556936077773571, "eval_rewards/accuracies": 0.984375, "eval_rewards/chosen": -2.7397918701171875, "eval_rewards/margins": 10.605344772338867, "eval_rewards/rejected": -13.345136642456055, "eval_runtime": 38.7118, "eval_samples_per_second": 12.916, "eval_steps_per_second": 0.413, "step": 1000 }, { "epoch": 1.3, "learning_rate": 4.3537253034991665e-07, "logits/chosen": -2.441990852355957, "logits/rejected": -2.4507715702056885, "logps/chosen": -329.62542724609375, "logps/rejected": -574.9547729492188, "loss": 0.0214, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.120880603790283, "rewards/margins": 13.88032054901123, "rewards/rejected": -17.001201629638672, "step": 1010 }, { "epoch": 1.31, "learning_rate": 4.3418233753868124e-07, "logits/chosen": -2.3679394721984863, "logits/rejected": -2.410681962966919, "logps/chosen": -341.8808898925781, "logps/rejected": -532.3084106445312, "loss": 0.0303, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.787487030029297, "rewards/margins": 11.951956748962402, "rewards/rejected": -14.739442825317383, "step": 1020 }, { "epoch": 1.32, "learning_rate": 4.3299214472744583e-07, "logits/chosen": -2.4356143474578857, "logits/rejected": -2.484920024871826, "logps/chosen": -378.17376708984375, "logps/rejected": -561.7147216796875, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": -2.4539060592651367, "rewards/margins": 12.572771072387695, "rewards/rejected": -15.026677131652832, "step": 1030 }, { "epoch": 1.34, "learning_rate": 4.3180195191621036e-07, "logits/chosen": -2.4165291786193848, "logits/rejected": -2.3931941986083984, "logps/chosen": -377.8540344238281, "logps/rejected": -555.7592163085938, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": -1.2512832880020142, "rewards/margins": 12.33320426940918, "rewards/rejected": -13.58448600769043, "step": 1040 }, { "epoch": 1.35, "learning_rate": 4.30611759104975e-07, "logits/chosen": -2.3533992767333984, "logits/rejected": -2.3296687602996826, "logps/chosen": -418.5027770996094, "logps/rejected": -600.8396606445312, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -2.496593952178955, "rewards/margins": 13.320207595825195, "rewards/rejected": -15.816801071166992, "step": 1050 }, { "epoch": 1.36, "learning_rate": 4.2942156629373954e-07, "logits/chosen": -2.246854782104492, "logits/rejected": -2.3130173683166504, "logps/chosen": -396.1013488769531, "logps/rejected": -553.8746337890625, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -2.86772084236145, "rewards/margins": 13.22656536102295, "rewards/rejected": -16.094287872314453, "step": 1060 }, { "epoch": 1.38, "learning_rate": 4.2823137348250413e-07, "logits/chosen": -2.1099252700805664, "logits/rejected": -2.1625306606292725, "logps/chosen": -439.188232421875, "logps/rejected": -567.4981689453125, "loss": 0.0195, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.4562058448791504, "rewards/margins": 11.824674606323242, "rewards/rejected": -15.280881881713867, "step": 1070 }, { "epoch": 1.39, "learning_rate": 4.270411806712687e-07, "logits/chosen": -2.182868480682373, "logits/rejected": -2.140045642852783, "logps/chosen": -414.1625061035156, "logps/rejected": -590.7791748046875, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -2.0633182525634766, "rewards/margins": 13.505340576171875, "rewards/rejected": -15.568659782409668, "step": 1080 }, { "epoch": 1.4, "learning_rate": 4.258509878600333e-07, "logits/chosen": -2.301701068878174, "logits/rejected": -2.3724331855773926, "logps/chosen": -318.6136779785156, "logps/rejected": -549.11572265625, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -1.7360296249389648, "rewards/margins": 12.463074684143066, "rewards/rejected": -14.199106216430664, "step": 1090 }, { "epoch": 1.41, "learning_rate": 4.2466079504879785e-07, "logits/chosen": -2.3375637531280518, "logits/rejected": -2.371568202972412, "logps/chosen": -355.43218994140625, "logps/rejected": -497.6923828125, "loss": 0.0296, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8382488489151, "rewards/margins": 11.133204460144043, "rewards/rejected": -12.971455574035645, "step": 1100 }, { "epoch": 1.41, "eval_logits/chosen": -2.422253131866455, "eval_logits/rejected": -2.3856472969055176, "eval_logps/chosen": -327.49688720703125, "eval_logps/rejected": -458.99761962890625, "eval_loss": 0.0712868794798851, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -1.8785579204559326, "eval_rewards/margins": 9.746674537658691, "eval_rewards/rejected": -11.625232696533203, "eval_runtime": 38.5688, "eval_samples_per_second": 12.964, "eval_steps_per_second": 0.415, "step": 1100 }, { "epoch": 1.43, "learning_rate": 4.234706022375625e-07, "logits/chosen": -2.335549831390381, "logits/rejected": -2.3633885383605957, "logps/chosen": -334.0445251464844, "logps/rejected": -532.0367431640625, "loss": 0.0173, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1568909883499146, "rewards/margins": 12.151830673217773, "rewards/rejected": -13.308721542358398, "step": 1110 }, { "epoch": 1.44, "learning_rate": 4.2228040942632703e-07, "logits/chosen": -2.2730376720428467, "logits/rejected": -2.279794931411743, "logps/chosen": -372.47711181640625, "logps/rejected": -565.377197265625, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.948188066482544, "rewards/margins": 12.926470756530762, "rewards/rejected": -14.874661445617676, "step": 1120 }, { "epoch": 1.45, "learning_rate": 4.210902166150916e-07, "logits/chosen": -2.1850171089172363, "logits/rejected": -2.2554237842559814, "logps/chosen": -330.89398193359375, "logps/rejected": -572.4408569335938, "loss": 0.0152, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2409050464630127, "rewards/margins": 15.152783393859863, "rewards/rejected": -17.393688201904297, "step": 1130 }, { "epoch": 1.47, "learning_rate": 4.199000238038562e-07, "logits/chosen": -2.2348155975341797, "logits/rejected": -2.276552200317383, "logps/chosen": -391.0440673828125, "logps/rejected": -562.8758544921875, "loss": 0.0083, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.518620014190674, "rewards/margins": 13.422780990600586, "rewards/rejected": -15.941401481628418, "step": 1140 }, { "epoch": 1.48, "learning_rate": 4.187098309926208e-07, "logits/chosen": -2.234314441680908, "logits/rejected": -2.273665428161621, "logps/chosen": -379.77752685546875, "logps/rejected": -609.7650146484375, "loss": 0.0167, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2126364707946777, "rewards/margins": 14.33509635925293, "rewards/rejected": -16.547733306884766, "step": 1150 }, { "epoch": 1.49, "learning_rate": 4.1751963818138534e-07, "logits/chosen": -2.2460713386535645, "logits/rejected": -2.28529953956604, "logps/chosen": -391.7981872558594, "logps/rejected": -584.82373046875, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.583667278289795, "rewards/margins": 13.928072929382324, "rewards/rejected": -16.511741638183594, "step": 1160 }, { "epoch": 1.5, "learning_rate": 4.1632944537015e-07, "logits/chosen": -2.312187671661377, "logits/rejected": -2.313152313232422, "logps/chosen": -332.22418212890625, "logps/rejected": -550.9510498046875, "loss": 0.0151, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8060202598571777, "rewards/margins": 13.428415298461914, "rewards/rejected": -16.23443603515625, "step": 1170 }, { "epoch": 1.52, "learning_rate": 4.151392525589145e-07, "logits/chosen": -2.269207715988159, "logits/rejected": -2.2718236446380615, "logps/chosen": -332.3182067871094, "logps/rejected": -509.44085693359375, "loss": 0.0267, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1180636882781982, "rewards/margins": 12.261663436889648, "rewards/rejected": -15.379727363586426, "step": 1180 }, { "epoch": 1.53, "learning_rate": 4.139490597476791e-07, "logits/chosen": -2.2478084564208984, "logits/rejected": -2.3000128269195557, "logps/chosen": -337.1382141113281, "logps/rejected": -537.2418212890625, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -2.009748935699463, "rewards/margins": 12.527368545532227, "rewards/rejected": -14.537118911743164, "step": 1190 }, { "epoch": 1.54, "learning_rate": 4.127588669364437e-07, "logits/chosen": -2.313680648803711, "logits/rejected": -2.327012538909912, "logps/chosen": -291.6064758300781, "logps/rejected": -546.3372802734375, "loss": 0.0148, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.4880402088165283, "rewards/margins": 14.52784252166748, "rewards/rejected": -18.01588249206543, "step": 1200 }, { "epoch": 1.54, "eval_logits/chosen": -2.3877577781677246, "eval_logits/rejected": -2.35882568359375, "eval_logps/chosen": -347.202880859375, "eval_logps/rejected": -496.7171325683594, "eval_loss": 0.07778895646333694, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -3.8491578102111816, "eval_rewards/margins": 11.548023223876953, "eval_rewards/rejected": -15.397181510925293, "eval_runtime": 38.6215, "eval_samples_per_second": 12.946, "eval_steps_per_second": 0.414, "step": 1200 }, { "epoch": 1.56, "learning_rate": 4.115686741252083e-07, "logits/chosen": -2.292132616043091, "logits/rejected": -2.347907781600952, "logps/chosen": -362.74481201171875, "logps/rejected": -558.0933837890625, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -2.724060297012329, "rewards/margins": 13.57036304473877, "rewards/rejected": -16.294422149658203, "step": 1210 }, { "epoch": 1.57, "learning_rate": 4.103784813139728e-07, "logits/chosen": -2.3167264461517334, "logits/rejected": -2.3449079990386963, "logps/chosen": -369.4256591796875, "logps/rejected": -566.0360107421875, "loss": 0.0155, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4625415802001953, "rewards/margins": 13.401751518249512, "rewards/rejected": -16.86429214477539, "step": 1220 }, { "epoch": 1.58, "learning_rate": 4.091882885027374e-07, "logits/chosen": -2.3674325942993164, "logits/rejected": -2.455508232116699, "logps/chosen": -381.26068115234375, "logps/rejected": -550.90625, "loss": 0.0244, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.13775897026062, "rewards/margins": 13.795980453491211, "rewards/rejected": -16.933740615844727, "step": 1230 }, { "epoch": 1.59, "learning_rate": 4.07998095691502e-07, "logits/chosen": -2.3083391189575195, "logits/rejected": -2.330939769744873, "logps/chosen": -362.44171142578125, "logps/rejected": -523.51171875, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.1269755363464355, "rewards/margins": 12.574740409851074, "rewards/rejected": -15.701716423034668, "step": 1240 }, { "epoch": 1.61, "learning_rate": 4.0680790288026654e-07, "logits/chosen": -2.3918001651763916, "logits/rejected": -2.426542282104492, "logps/chosen": -420.2566833496094, "logps/rejected": -605.3551025390625, "loss": 0.0202, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.9122402667999268, "rewards/margins": 13.67309856414795, "rewards/rejected": -16.585338592529297, "step": 1250 }, { "epoch": 1.62, "learning_rate": 4.056177100690312e-07, "logits/chosen": -2.2674708366394043, "logits/rejected": -2.2906508445739746, "logps/chosen": -390.3266296386719, "logps/rejected": -587.2613525390625, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -4.047337532043457, "rewards/margins": 13.966493606567383, "rewards/rejected": -18.013832092285156, "step": 1260 }, { "epoch": 1.63, "learning_rate": 4.044275172577957e-07, "logits/chosen": -2.256685733795166, "logits/rejected": -2.283980131149292, "logps/chosen": -307.6758728027344, "logps/rejected": -536.929931640625, "loss": 0.0251, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8640975952148438, "rewards/margins": 14.663250923156738, "rewards/rejected": -17.527347564697266, "step": 1270 }, { "epoch": 1.65, "learning_rate": 4.0323732444656036e-07, "logits/chosen": -2.2302117347717285, "logits/rejected": -2.319187641143799, "logps/chosen": -377.6014099121094, "logps/rejected": -592.4954223632812, "loss": 0.0208, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1300606727600098, "rewards/margins": 14.494562149047852, "rewards/rejected": -16.624622344970703, "step": 1280 }, { "epoch": 1.66, "learning_rate": 4.020471316353249e-07, "logits/chosen": -2.3077661991119385, "logits/rejected": -2.34450364112854, "logps/chosen": -384.89007568359375, "logps/rejected": -577.9298095703125, "loss": 0.0126, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4994373321533203, "rewards/margins": 12.733844757080078, "rewards/rejected": -14.233282089233398, "step": 1290 }, { "epoch": 1.67, "learning_rate": 4.008569388240895e-07, "logits/chosen": -2.230447292327881, "logits/rejected": -2.283294677734375, "logps/chosen": -346.1694641113281, "logps/rejected": -534.3992919921875, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -1.3882415294647217, "rewards/margins": 14.5983247756958, "rewards/rejected": -15.986566543579102, "step": 1300 }, { "epoch": 1.67, "eval_logits/chosen": -2.403440475463867, "eval_logits/rejected": -2.378675699234009, "eval_logps/chosen": -332.9962463378906, "eval_logps/rejected": -477.9118957519531, "eval_loss": 0.07047431915998459, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -2.4284939765930176, "eval_rewards/margins": 11.088165283203125, "eval_rewards/rejected": -13.516657829284668, "eval_runtime": 38.6695, "eval_samples_per_second": 12.93, "eval_steps_per_second": 0.414, "step": 1300 }, { "epoch": 1.68, "learning_rate": 3.996667460128541e-07, "logits/chosen": -2.31799578666687, "logits/rejected": -2.3302206993103027, "logps/chosen": -333.87261962890625, "logps/rejected": -506.0113220214844, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -1.8233000040054321, "rewards/margins": 13.524618148803711, "rewards/rejected": -15.347920417785645, "step": 1310 }, { "epoch": 1.7, "learning_rate": 3.9847655320161867e-07, "logits/chosen": -2.3380274772644043, "logits/rejected": -2.3655543327331543, "logps/chosen": -330.939453125, "logps/rejected": -566.5387573242188, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -3.293247938156128, "rewards/margins": 13.109285354614258, "rewards/rejected": -16.402530670166016, "step": 1320 }, { "epoch": 1.71, "learning_rate": 3.972863603903832e-07, "logits/chosen": -2.4296791553497314, "logits/rejected": -2.395019054412842, "logps/chosen": -368.58843994140625, "logps/rejected": -550.57177734375, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -2.1659107208251953, "rewards/margins": 14.171772956848145, "rewards/rejected": -16.337684631347656, "step": 1330 }, { "epoch": 1.72, "learning_rate": 3.9609616757914784e-07, "logits/chosen": -2.386429786682129, "logits/rejected": -2.401638984680176, "logps/chosen": -347.26214599609375, "logps/rejected": -538.3074951171875, "loss": 0.0162, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.806589126586914, "rewards/margins": 12.520380973815918, "rewards/rejected": -15.326970100402832, "step": 1340 }, { "epoch": 1.74, "learning_rate": 3.949059747679124e-07, "logits/chosen": -2.3784899711608887, "logits/rejected": -2.42669939994812, "logps/chosen": -364.69512939453125, "logps/rejected": -592.1053466796875, "loss": 0.0159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1103992462158203, "rewards/margins": 15.538830757141113, "rewards/rejected": -18.649229049682617, "step": 1350 }, { "epoch": 1.75, "learning_rate": 3.9371578195667697e-07, "logits/chosen": -2.4179718494415283, "logits/rejected": -2.4337425231933594, "logps/chosen": -338.0289001464844, "logps/rejected": -522.47412109375, "loss": 0.0343, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.291858196258545, "rewards/margins": 14.658024787902832, "rewards/rejected": -16.949880599975586, "step": 1360 }, { "epoch": 1.76, "learning_rate": 3.9252558914544156e-07, "logits/chosen": -2.37274169921875, "logits/rejected": -2.376906633377075, "logps/chosen": -371.0089111328125, "logps/rejected": -562.0587158203125, "loss": 0.0236, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.699599504470825, "rewards/margins": 14.066309928894043, "rewards/rejected": -16.76590919494629, "step": 1370 }, { "epoch": 1.77, "learning_rate": 3.9133539633420615e-07, "logits/chosen": -2.3570303916931152, "logits/rejected": -2.4414098262786865, "logps/chosen": -347.50531005859375, "logps/rejected": -606.2113647460938, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -2.3379924297332764, "rewards/margins": 14.867982864379883, "rewards/rejected": -17.205974578857422, "step": 1380 }, { "epoch": 1.79, "learning_rate": 3.901452035229707e-07, "logits/chosen": -2.373347043991089, "logits/rejected": -2.4218459129333496, "logps/chosen": -421.48187255859375, "logps/rejected": -606.8762817382812, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -2.6556594371795654, "rewards/margins": 14.492483139038086, "rewards/rejected": -17.148143768310547, "step": 1390 }, { "epoch": 1.8, "learning_rate": 3.8895501071173533e-07, "logits/chosen": -2.3142504692077637, "logits/rejected": -2.3538806438446045, "logps/chosen": -325.9708557128906, "logps/rejected": -511.67449951171875, "loss": 0.0214, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7798726558685303, "rewards/margins": 13.109631538391113, "rewards/rejected": -15.889503479003906, "step": 1400 }, { "epoch": 1.8, "eval_logits/chosen": -2.3960964679718018, "eval_logits/rejected": -2.3517098426818848, "eval_logps/chosen": -346.35821533203125, "eval_logps/rejected": -495.85186767578125, "eval_loss": 0.07910314947366714, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": -3.7646918296813965, "eval_rewards/margins": 11.545960426330566, "eval_rewards/rejected": -15.310651779174805, "eval_runtime": 38.7173, "eval_samples_per_second": 12.914, "eval_steps_per_second": 0.413, "step": 1400 }, { "epoch": 1.81, "learning_rate": 3.8776481790049987e-07, "logits/chosen": -2.3062312602996826, "logits/rejected": -2.3327383995056152, "logps/chosen": -354.59381103515625, "logps/rejected": -503.6541442871094, "loss": 0.0196, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5312271118164062, "rewards/margins": 13.276026725769043, "rewards/rejected": -16.807254791259766, "step": 1410 }, { "epoch": 1.83, "learning_rate": 3.865746250892644e-07, "logits/chosen": -2.396146774291992, "logits/rejected": -2.3744444847106934, "logps/chosen": -397.74609375, "logps/rejected": -583.1174926757812, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -3.2743606567382812, "rewards/margins": 15.37347412109375, "rewards/rejected": -18.647836685180664, "step": 1420 }, { "epoch": 1.84, "learning_rate": 3.8538443227802905e-07, "logits/chosen": -2.3621578216552734, "logits/rejected": -2.3470935821533203, "logps/chosen": -374.19757080078125, "logps/rejected": -564.0121459960938, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.340083122253418, "rewards/margins": 13.78313159942627, "rewards/rejected": -18.123212814331055, "step": 1430 }, { "epoch": 1.85, "learning_rate": 3.841942394667936e-07, "logits/chosen": -2.304884672164917, "logits/rejected": -2.4029793739318848, "logps/chosen": -369.39898681640625, "logps/rejected": -578.387451171875, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7867379188537598, "rewards/margins": 14.443509101867676, "rewards/rejected": -17.23024559020996, "step": 1440 }, { "epoch": 1.86, "learning_rate": 3.8300404665555817e-07, "logits/chosen": -2.2816107273101807, "logits/rejected": -2.2829480171203613, "logps/chosen": -374.7585144042969, "logps/rejected": -540.5015869140625, "loss": 0.0164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.205556869506836, "rewards/margins": 14.464788436889648, "rewards/rejected": -16.670345306396484, "step": 1450 }, { "epoch": 1.88, "learning_rate": 3.8181385384432276e-07, "logits/chosen": -2.282743453979492, "logits/rejected": -2.2942354679107666, "logps/chosen": -394.46502685546875, "logps/rejected": -594.6571044921875, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -2.892620325088501, "rewards/margins": 14.386013984680176, "rewards/rejected": -17.27863311767578, "step": 1460 }, { "epoch": 1.89, "learning_rate": 3.8062366103308735e-07, "logits/chosen": -2.2720725536346436, "logits/rejected": -2.245262622833252, "logps/chosen": -342.9836730957031, "logps/rejected": -546.7418212890625, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": -2.7557284832000732, "rewards/margins": 14.667689323425293, "rewards/rejected": -17.423416137695312, "step": 1470 }, { "epoch": 1.9, "learning_rate": 3.794334682218519e-07, "logits/chosen": -2.295213222503662, "logits/rejected": -2.3375067710876465, "logps/chosen": -370.61798095703125, "logps/rejected": -474.4059143066406, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.196665048599243, "rewards/margins": 12.084269523620605, "rewards/rejected": -15.28093433380127, "step": 1480 }, { "epoch": 1.92, "learning_rate": 3.7824327541061653e-07, "logits/chosen": -2.4100170135498047, "logits/rejected": -2.4586007595062256, "logps/chosen": -358.7035217285156, "logps/rejected": -547.9478149414062, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -3.731393337249756, "rewards/margins": 12.000238418579102, "rewards/rejected": -15.73162841796875, "step": 1490 }, { "epoch": 1.93, "learning_rate": 3.7705308259938107e-07, "logits/chosen": -2.432584047317505, "logits/rejected": -2.430572032928467, "logps/chosen": -400.4476318359375, "logps/rejected": -589.388427734375, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.0731418132781982, "rewards/margins": 13.324457168579102, "rewards/rejected": -16.397600173950195, "step": 1500 }, { "epoch": 1.93, "eval_logits/chosen": -2.4233508110046387, "eval_logits/rejected": -2.3732004165649414, "eval_logps/chosen": -345.49517822265625, "eval_logps/rejected": -491.72662353515625, "eval_loss": 0.08803335577249527, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": -3.678384304046631, "eval_rewards/margins": 11.219746589660645, "eval_rewards/rejected": -14.89813232421875, "eval_runtime": 38.608, "eval_samples_per_second": 12.951, "eval_steps_per_second": 0.414, "step": 1500 }, { "epoch": 1.94, "learning_rate": 3.7586288978814566e-07, "logits/chosen": -2.335282564163208, "logits/rejected": -2.330732583999634, "logps/chosen": -388.20806884765625, "logps/rejected": -580.2225341796875, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -3.1078379154205322, "rewards/margins": 13.242405891418457, "rewards/rejected": -16.350242614746094, "step": 1510 }, { "epoch": 1.95, "learning_rate": 3.7467269697691025e-07, "logits/chosen": -2.3464579582214355, "logits/rejected": -2.3436694145202637, "logps/chosen": -335.885986328125, "logps/rejected": -532.0635986328125, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -3.1488466262817383, "rewards/margins": 13.591397285461426, "rewards/rejected": -16.740243911743164, "step": 1520 }, { "epoch": 1.97, "learning_rate": 3.7348250416567484e-07, "logits/chosen": -2.2621750831604004, "logits/rejected": -2.2600533962249756, "logps/chosen": -415.00982666015625, "logps/rejected": -549.5345458984375, "loss": 0.0264, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.42472767829895, "rewards/margins": 13.469167709350586, "rewards/rejected": -15.893896102905273, "step": 1530 }, { "epoch": 1.98, "learning_rate": 3.722923113544394e-07, "logits/chosen": -2.361262559890747, "logits/rejected": -2.315338611602783, "logps/chosen": -394.708740234375, "logps/rejected": -578.1019287109375, "loss": 0.0251, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.082348585128784, "rewards/margins": 15.23118782043457, "rewards/rejected": -17.31353759765625, "step": 1540 }, { "epoch": 1.99, "learning_rate": 3.71102118543204e-07, "logits/chosen": -2.315455913543701, "logits/rejected": -2.284585952758789, "logps/chosen": -367.0815734863281, "logps/rejected": -577.2198486328125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -2.139265537261963, "rewards/margins": 14.051069259643555, "rewards/rejected": -16.19033432006836, "step": 1550 } ], "logging_steps": 10, "max_steps": 4668, "num_train_epochs": 6, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }